From 04ebae907ab4892ea79a20219a7231c425643f86 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 20 Aug 2024 22:14:50 -0400 Subject: [PATCH 001/426] [gn] tblgen opts for llvm-cgdata This should have been part of 5ec73b7dcf7f232. --- llvm/utils/gn/secondary/llvm/tools/llvm-cgdata/BUILD.gn | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-cgdata/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-cgdata/BUILD.gn index ebb064fb396413..a6ba4f2fb7856c 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-cgdata/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-cgdata/BUILD.gn @@ -1,11 +1,19 @@ +import("//llvm/utils/TableGen/tablegen.gni") import("//llvm/utils/gn/build/driver_executable.gni") +tablegen("Opts") { + visibility = [ ":llvm-cgdata" ] + args = [ "-gen-opt-parser-defs" ] +} + driver_executable("llvm-cgdata") { deps = [ + ":Opts", "//llvm/lib/CGData", "//llvm/lib/CodeGen", "//llvm/lib/IR", "//llvm/lib/Object", + "//llvm/lib/Option", "//llvm/lib/Support", ] sources = [ "llvm-cgdata.cpp" ] From 12d4c89e88bf9349a063fdd992233b29adeb8241 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 20 Aug 2024 19:21:13 -0700 Subject: [PATCH 002/426] [ELF,test] Improve error-handling-script-linux.test * Use split-file * Remove -o /dev/null * Avoid `{ list; }` compound command not supported by the lit internal shell (#102382) * Don't test "ld.lld" before "error:" as per convention Pull Request: https://github.com/llvm/llvm-project/pull/105454 --- lld/test/ELF/error-handling-script-linux.test | 56 +++++++++++-------- .../ELF/error-handling-script-windows.bat | 6 +- 2 files changed, 36 insertions(+), 26 deletions(-) mode change 100755 => 100644 lld/test/ELF/error-handling-script-linux.test diff --git a/lld/test/ELF/error-handling-script-linux.test b/lld/test/ELF/error-handling-script-linux.test old mode 100755 new mode 100644 index 54e1b29ab236a6..a43cfa5e5b6c33 --- a/lld/test/ELF/error-handling-script-linux.test +++ 
b/lld/test/ELF/error-handling-script-linux.test @@ -1,46 +1,56 @@ -#!/bin/sh # REQUIRES: x86 # UNSUPPORTED: system-windows -# RUN: llvm-mc -filetype=obj -triple=x86_64 /dev/null -o %t0.o -# RUN: not ld.lld -o /dev/null -lidontexist --error-handling-script=%s %t0.o 2>&1 |\ -# RUN: FileCheck --check-prefix=CHECK-LIB %s -# RUN: not ld.lld -o /dev/null -lidontexist --error-handling-script=%s.nope %t0.o 2>&1 |\ -# RUN: FileCheck --check-prefix=CHECK-SCRIPT-DOES-NOT-EXIST -DFILE=%s.nope %s +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: chmod +x a.sh +# RUN: llvm-mc -filetype=obj -triple=x86_64 /dev/null -o 0.o +# RUN: not ld.lld -lidontexist --error-handling-script=%t/a.sh 0.o 2>&1 |\ +# RUN: FileCheck --check-prefix=CHECK-LIB %s --match-full-lines --strict-whitespace +# RUN: not ld.lld -lidontexist --error-handling-script=./notexist 0.o 2>&1 |\ +# RUN: FileCheck --check-prefix=CHECK-SCRIPT-DOES-NOT-EXIST %s -# RUN: echo 'bar: movl a(%rip), %eax' | llvm-mc -filetype=obj -triple=x86_64 - -o %t1.o -# RUN: not ld.lld -o /dev/null --error-handling-script=%s %t1.o 2>&1 |\ +# RUN: llvm-mc -filetype=obj -triple=x86_64 1.s -o 1.o +# RUN: not ld.lld --error-handling-script=./a.sh 1.o 2>&1 |\ # RUN: FileCheck --check-prefix=CHECK-SYM-C %s -# RUN: echo 'bar: movl _Z1av(%rip), %eax' | llvm-mc -filetype=obj -triple=x86_64 - -o %t2.o -# RUN: not ld.lld -o /dev/null --demangle --error-handling-script=%s %t2.o 2>&1 |\ +# RUN: llvm-mc -filetype=obj -triple=x86_64 2.s -o 2.o +# RUN: not ld.lld --demangle --error-handling-script=./a.sh 2.o 2>&1 |\ # RUN: FileCheck --check-prefix=CHECK-SYM-CXX-DEMANGLE %s -# RUN: not ld.lld -o /dev/null --no-demangle --error-handling-script=%s %t2.o 2>&1 |\ +# RUN: not ld.lld --no-demangle --error-handling-script=./a.sh 2.o 2>&1 |\ # RUN: FileCheck --check-prefix=CHECK-SYM-CXX-NO-DEMANGLE %s -# RUN: { echo 'a_: ret'; echo 'bar: movl a(%rip), %eax' ; } | llvm-mc -filetype=obj -triple=x86_64 - -o %t3.o -# RUN: not ld.lld -o /dev/null 
--error-handling-script=%s %t3.o 2>&1 |\ -# RUN: FileCheck --check-prefix=CHECK-SYM-C-CORRECTION -DOBJ=%t3.o %s +# RUN: llvm-mc -filetype=obj -triple=x86_64 3.s -o 3.o +# RUN: not ld.lld --error-handling-script=%t/a.sh 3.o 2>&1 |\ +# RUN: FileCheck --check-prefix=CHECK-SYM-C-CORRECTION %s -# CHECK-LIB: script: info: called with missing-lib idontexist -# CHECK-LIB-NEXT: ld.lld: error: unable to find library -lidontexist +# CHECK-LIB:script: info: called with missing-lib idontexist +# CHECK-LIB-NEXT:{{.*}}error: unable to find library -lidontexist -# CHECK-SCRIPT-DOES-NOT-EXIST: ld.lld: error: unable to find library -lidontexist -# CHECK-SCRIPT-DOES-NOT-EXIST-NEXT: ld.lld: error: error handling script '[[FILE]]' failed to execute +# CHECK-SCRIPT-DOES-NOT-EXIST: error: unable to find library -lidontexist +# CHECK-SCRIPT-DOES-NOT-EXIST-NEXT: error: error handling script './notexist' failed to execute # CHECK-SYM-C: script: info: called with undefined-symbol a -# CHECK-SYM-C-NEXT: ld.lld: error: undefined symbol: a +# CHECK-SYM-C-NEXT: error: undefined symbol: a # CHECK-SYM-CXX-DEMANGLE: script: info: called with undefined-symbol _Z1av -# CHECK-SYM-CXX-DEMANGLE-NEXT: ld.lld: error: undefined symbol: a() +# CHECK-SYM-CXX-DEMANGLE-NEXT: error: undefined symbol: a() # CHECK-SYM-CXX-NO-DEMANGLE: script: info: called with undefined-symbol _Z1av -# CHECK-SYM-CXX-NO-DEMANGLE-NEXT: ld.lld: error: undefined symbol: _Z1av +# CHECK-SYM-CXX-NO-DEMANGLE-NEXT: error: undefined symbol: _Z1av # CHECK-SYM-C-CORRECTION: script: info: called with undefined-symbol a -# CHECK-SYM-C-CORRECTION-NEXT: ld.lld: error: undefined symbol: a -# CHECK-SYM-C-CORRECTION-NEXT: >>> referenced by [[OBJ]]: +# CHECK-SYM-C-CORRECTION-NEXT: error: undefined symbol: a +# CHECK-SYM-C-CORRECTION-NEXT: >>> referenced by 3.o: # CHECK-SYM-C-CORRECTION-NEXT: >>> did you mean: a_ +#--- 1.s +movl a(%rip), %eax +#--- 2.s +movl _Z1av(%rip), %eax +#--- 3.s +a_: ret +movl a(%rip), %eax +#--- a.sh +#!/bin/sh echo "script: 
info: called with $*" diff --git a/lld/test/ELF/error-handling-script-windows.bat b/lld/test/ELF/error-handling-script-windows.bat index 64c4e95f043c6b..528dabcb234562 100644 --- a/lld/test/ELF/error-handling-script-windows.bat +++ b/lld/test/ELF/error-handling-script-windows.bat @@ -6,10 +6,10 @@ :: RUN: FileCheck --check-prefix=CHECK-SCRIPT-DOES-NOT-EXIST -DFILE=%s.nope %s :: :: CHECK-LIB: script: info: called with missing-lib idontexist -:: CHECK-LIB-NEXT: ld.lld: error: unable to find library -lidontexist +:: CHECK-LIB-NEXT: error: unable to find library -lidontexist -:: CHECK-SCRIPT-DOES-NOT-EXIST: ld.lld: error: unable to find library -lidontexist -:: CHECK-SCRIPT-DOES-NOT-EXIST-NEXT: ld.lld: error: error handling script '[[FILE]]' failed to execute +:: CHECK-SCRIPT-DOES-NOT-EXIST: error: unable to find library -lidontexist +:: CHECK-SCRIPT-DOES-NOT-EXIST-NEXT: error: error handling script '[[FILE]]' failed to execute @echo off echo "script: info: called with %*" From 64d75bd100ac974e7343740b1b692be89ce0d9a5 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 21 Aug 2024 02:33:50 +0000 Subject: [PATCH 003/426] [gn build] Port 55d744eea361 --- llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 + llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Utils/BUILD.gn | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index c9b1b4d9ac3135..edd5be27900cc9 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -171,6 +171,7 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPUMachineModuleInfo.cpp", "AMDGPUMacroFusion.cpp", "AMDGPUMarkLastScratchLoad.cpp", + "AMDGPUMemoryUtils.cpp", "AMDGPUOpenCLEnqueuedBlockLowering.cpp", "AMDGPUPerfHintAnalysis.cpp", "AMDGPUPostLegalizerCombiner.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Utils/BUILD.gn 
b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Utils/BUILD.gn index 159fb222a9d517..805c29ea3d37a9 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Utils/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/Utils/BUILD.gn @@ -34,7 +34,6 @@ static_library("Utils") { "AMDGPUAsmUtils.cpp", "AMDGPUBaseInfo.cpp", "AMDGPUDelayedMCExpr.cpp", - "AMDGPUMemoryUtils.cpp", "AMDGPUPALMetadata.cpp", "AMDKernelCodeTUtils.cpp", ] From d9b6e9f1c1565d9469eb0546da8c276051175408 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 21 Aug 2024 02:33:50 +0000 Subject: [PATCH 004/426] [gn build] Port c8a678b1e486 --- llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn index ee635fd145c035..38ff30f3fab7d1 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn @@ -23,6 +23,7 @@ unittest("AnalysisTests") { "CallGraphTest.cpp", "CaptureTrackingTest.cpp", "ConstraintSystemTest.cpp", + "CtxProfAnalysisTest.cpp", "DDGTest.cpp", "DXILResourceTest.cpp", "DomTreeUpdaterTest.cpp", From 0f22d47a7a1f70ec77ea8ccdf08a6487827937db Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 20 Aug 2024 20:56:47 -0700 Subject: [PATCH 005/426] [LTO] Teach computeLTOCacheKey to return std::string (NFC) (#105331) Without this patch, computeLTOCacheKey computes SHA1, creates its hexadecimal representation with toHex, which returns std::string, and then copies it to an output parameter of type SmallString. This patch removes the redirection and teaches computeLTOCacheKey to directly return std::string computed by toHex. With the move semantics, no buffer copy should be involved. While I am at it, this patch adds a Twine to concatenate two strings. 
--- llvm/include/llvm/LTO/LTO.h | 8 +++----- llvm/lib/LTO/LTO.cpp | 17 ++++++++--------- llvm/lib/LTO/ThinLTOCodeGenerator.cpp | 8 ++++---- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index 30eda34cd7ba54..0781d57feb5a64 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -62,11 +62,9 @@ void thinLTOInternalizeAndPromoteInIndex( /// Computes a unique hash for the Module considering the current list of /// export/import and other global analysis results. -/// The hash is produced in \p Key. -void computeLTOCacheKey( - SmallString<40> &Key, const lto::Config &Conf, - const ModuleSummaryIndex &Index, StringRef ModuleID, - const FunctionImporter::ImportMapTy &ImportList, +std::string computeLTOCacheKey( + const lto::Config &Conf, const ModuleSummaryIndex &Index, + StringRef ModuleID, const FunctionImporter::ImportMapTy &ImportList, const FunctionImporter::ExportSetTy &ExportList, const std::map &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index bb3c9f7acdb8e5..f69e089edf42e7 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -88,10 +88,10 @@ extern cl::opt EnableMemProfContextDisambiguation; // Computes a unique hash for the Module considering the current list of // export/import and other global analysis results. -// The hash is produced in \p Key. -void llvm::computeLTOCacheKey( - SmallString<40> &Key, const Config &Conf, const ModuleSummaryIndex &Index, - StringRef ModuleID, const FunctionImporter::ImportMapTy &ImportList, +// Returns the hash in its hexadecimal representation. 
+std::string llvm::computeLTOCacheKey( + const Config &Conf, const ModuleSummaryIndex &Index, StringRef ModuleID, + const FunctionImporter::ImportMapTy &ImportList, const FunctionImporter::ExportSetTy &ExportList, const std::map &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, @@ -355,7 +355,7 @@ void llvm::computeLTOCacheKey( } } - Key = toHex(Hasher.result()); + return toHex(Hasher.result()); } static void thinLTOResolvePrevailingGUID( @@ -1488,11 +1488,10 @@ class InProcessThinBackend : public ThinBackendProc { // no module hash. return RunThinBackend(AddStream); - SmallString<40> Key; // The module may be cached, this helps handling it. - computeLTOCacheKey(Key, Conf, CombinedIndex, ModuleID, ImportList, - ExportList, ResolvedODR, DefinedGlobals, CfiFunctionDefs, - CfiFunctionDecls); + std::string Key = computeLTOCacheKey( + Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR, + DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls); Expected CacheAddStreamOrErr = Cache(Task, Key, ModuleID); if (Error Err = CacheAddStreamOrErr.takeError()) return Err; diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index b982df7c6e5d3f..0ba3093637aacf 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -382,13 +382,13 @@ class ModuleCacheEntry { Conf.RelocModel = TMBuilder.RelocModel; Conf.CGOptLevel = TMBuilder.CGOptLevel; Conf.Freestanding = Freestanding; - SmallString<40> Key; - computeLTOCacheKey(Key, Conf, Index, ModuleID, ImportList, ExportList, - ResolvedODR, DefinedGVSummaries); + std::string Key = + computeLTOCacheKey(Conf, Index, ModuleID, ImportList, ExportList, + ResolvedODR, DefinedGVSummaries); // This choice of file name allows the cache to be pruned (see pruneCache() // in include/llvm/Support/CachePruning.h). 
- sys::path::append(EntryPath, CachePath, "llvmcache-" + Key); + sys::path::append(EntryPath, CachePath, Twine("llvmcache-", Key)); } // Access the path to this entry in the cache. From 35cec805bfa91fd9b83c29c45f3a5877e484bd85 Mon Sep 17 00:00:00 2001 From: Will Hawkins Date: Wed, 21 Aug 2024 00:09:08 -0400 Subject: [PATCH 006/426] [lldb][test] Workaround older systems that lack gettid (#104831) Older glibc versions do not provide `gettid`. Provide our own `gettid` in these cases. Fixes a build failure caused by #104109. --- lldb/unittests/Process/elf-core/CMakeLists.txt | 11 +++++++++++ lldb/unittests/Process/elf-core/ThreadElfCoreTest.cpp | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/lldb/unittests/Process/elf-core/CMakeLists.txt b/lldb/unittests/Process/elf-core/CMakeLists.txt index b852a3ffb863c1..68ab6e0683c182 100644 --- a/lldb/unittests/Process/elf-core/CMakeLists.txt +++ b/lldb/unittests/Process/elf-core/CMakeLists.txt @@ -1,3 +1,6 @@ +include(CheckSymbolExists) +include(CMakePushCheckState) + add_lldb_unittest(ProcessElfCoreTests ThreadElfCoreTest.cpp @@ -13,3 +16,11 @@ add_lldb_unittest(ProcessElfCoreTests LINK_COMPONENTS Support ) + +cmake_push_check_state() +set(CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE) +check_symbol_exists(gettid "unistd.h" HAVE_GETTID) +if(HAVE_GETTID) + target_compile_definitions(ProcessElfCoreTests PRIVATE HAVE_GETTID) +endif() +cmake_pop_check_state() diff --git a/lldb/unittests/Process/elf-core/ThreadElfCoreTest.cpp b/lldb/unittests/Process/elf-core/ThreadElfCoreTest.cpp index 3bc8b9053d2009..288729b4470607 100644 --- a/lldb/unittests/Process/elf-core/ThreadElfCoreTest.cpp +++ b/lldb/unittests/Process/elf-core/ThreadElfCoreTest.cpp @@ -24,6 +24,11 @@ #include #include +#ifndef HAVE_GETTID +#include +pid_t gettid() { return ((pid_t)syscall(SYS_gettid)); } +#endif + using namespace lldb_private; namespace { From cbd302410e9f27013223a96edcd78dfb597979e1 Mon Sep 17 00:00:00 2001 From: ShatianWang 
<38512325+ShatianWang@users.noreply.github.com> Date: Wed, 21 Aug 2024 00:35:07 -0400 Subject: [PATCH 007/426] [BOLT] Improve BinaryFunction::inferFallThroughCounts() (#105450) This PR improves how basic block execution count is updated when using the BOLT option `-infer-fall-throughs`. Previously, if a 0-count fall-through edge is assigned a positive inferred count N, then the successor block's execution count will be incremented by N. Since the successor's execution count is calculated using information besides inflow sum (such as outflow sum), it likely is already correct, and incrementing it by an additional N would be wrong. This PR improves how the successor's execution count is updated by using the max over its current count and N. --- bolt/lib/Core/BinaryFunctionProfile.cpp | 3 +- bolt/test/X86/infer-fall-throughs.s | 45 +++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 bolt/test/X86/infer-fall-throughs.s diff --git a/bolt/lib/Core/BinaryFunctionProfile.cpp b/bolt/lib/Core/BinaryFunctionProfile.cpp index 55ebe5fc900e65..726da6a9d0829f 100644 --- a/bolt/lib/Core/BinaryFunctionProfile.cpp +++ b/bolt/lib/Core/BinaryFunctionProfile.cpp @@ -336,7 +336,8 @@ void BinaryFunction::inferFallThroughCounts() { if (SuccBI.Count == 0) { SuccBI.Count = Inferred; SuccBI.MispredictedCount = BinaryBasicBlock::COUNT_INFERRED; - Succ->ExecutionCount += Inferred; + Succ->ExecutionCount = + std::max(Succ->getKnownExecutionCount(), Inferred); } } } diff --git a/bolt/test/X86/infer-fall-throughs.s b/bolt/test/X86/infer-fall-throughs.s new file mode 100644 index 00000000000000..3289a8ea4aec51 --- /dev/null +++ b/bolt/test/X86/infer-fall-throughs.s @@ -0,0 +1,45 @@ +## Test that infer-fall-throughs would correctly infer the wrong fall-through +## edge count in the example + +# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: %clang 
%cflags %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt \ +# RUN: --print-estimate-edge-counts --data=%t.fdata \ +# RUN: 2>&1 | FileCheck --check-prefix=WITHOUTINFERENCE %s +# RUN: llvm-bolt %t.exe -o %t.bolt --infer-fall-throughs \ +# RUN: --print-estimate-edge-counts --data=%t.fdata \ +# RUN: 2>&1 | FileCheck --check-prefix=CORRECTINFERENCE %s + + +# WITHOUTINFERENCE: Binary Function "main" after estimate-edge-counts +# WITHOUTINFERENCE: {{^\.Ltmp0}} +# WITHOUTINFERENCE: Successors: .Ltmp1 (mispreds: 0, count: 10), .LFT0 (mispreds: 0, count: 0) +# WITHOUTINFERENCE: {{^\.LFT0}} +# WITHOUTINFERENCE: Exec Count : 490 + +# CORRECTINFERENCE: Binary Function "main" after estimate-edge-counts +# CORRECTINFERENCE: {{^\.Ltmp0}} +# CORRECTINFERENCE: Successors: .Ltmp1 (mispreds: 0, count: 10), .LFT0 (inferred count: 490) +# CORRECTINFERENCE: {{^\.LFT0}} +# CORRECTINFERENCE: Exec Count : 490 + + + .globl main + .type main, @function +main: +LLmain_LLstart: + jmp LLstart +# FDATA: 1 main #LLmain_LLstart# 1 main #LLstart# 0 500 +LLstart: + jge LLexit +# FDATA: 1 main #LLstart# 1 main #LLexit# 0 10 +# FDATA: 1 main #LLstart# 1 main #LLmore# 0 0 +LLmore: + movl $5, %eax +# FDATA: 1 main #LLmore# 1 main #LLexit# 0 490 +LLexit: + ret +.LLmain_end: + .size main, .LLmain_end-main From 33f1aedef878931f61208b39c0220aa0d4bc9381 Mon Sep 17 00:00:00 2001 From: Chaitanya Date: Wed, 21 Aug 2024 10:31:42 +0530 Subject: [PATCH 008/426] [AMDGPU] Update instrumentAddress method to support aligned size and unusual size accesses. (#104804) This PR updates instrumentAddress api to support properly aligned sizes and unsual size accesses. Changes ported from asan pass. 
--- .../AMDGPU/AMDGPUAsanInstrumentation.cpp | 51 ++++++++++++++++--- .../Target/AMDGPU/AMDGPUAsanInstrumentation.h | 8 +-- 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp index 593fca5bc3ed68..4c8ddbd9aabd5a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp @@ -147,11 +147,13 @@ static Value *memToShadow(Module &M, IRBuilder<> &IRB, Type *IntptrTy, return IRB.CreateAdd(Shadow, ShadowBase); } -void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns, - Instruction *InsertBefore, Value *Addr, - MaybeAlign Alignment, uint32_t TypeStoreSize, - bool IsWrite, Value *SizeArgument, bool UseCalls, - bool Recover, int AsanScale, int AsanOffset) { +static void instrumentAddressImpl(Module &M, IRBuilder<> &IRB, + Instruction *OrigIns, + Instruction *InsertBefore, Value *Addr, + Align Alignment, uint32_t TypeStoreSize, + bool IsWrite, Value *SizeArgument, + bool UseCalls, bool Recover, int AsanScale, + int AsanOffset) { Type *AddrTy = Addr->getType(); Type *IntptrTy = M.getDataLayout().getIntPtrType( M.getContext(), AddrTy->getPointerAddressSpace()); @@ -164,7 +166,7 @@ void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns, Value *ShadowPtr = memToShadow(M, IRB, IntptrTy, AddrLong, AsanScale, AsanOffset); const uint64_t ShadowAlign = - std::max(Alignment.valueOrOne().value() >> AsanScale, 1); + std::max(Alignment.value() >> AsanScale, 1); Value *ShadowValue = IRB.CreateAlignedLoad( ShadowTy, IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy), Align(ShadowAlign)); Value *Cmp = IRB.CreateIsNotNull(ShadowValue); @@ -179,6 +181,43 @@ void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns, return; } +void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns, + Instruction *InsertBefore, Value *Addr, Align Alignment, + TypeSize 
TypeStoreSize, bool IsWrite, + Value *SizeArgument, bool UseCalls, bool Recover, + int AsanScale, int AsanOffset) { + if (!TypeStoreSize.isScalable()) { + unsigned Granularity = 1 << AsanScale; + const auto FixedSize = TypeStoreSize.getFixedValue(); + switch (FixedSize) { + case 8: + case 16: + case 32: + case 64: + case 128: + if (Alignment.value() >= Granularity || + Alignment.value() >= FixedSize / 8) + return instrumentAddressImpl( + M, IRB, OrigIns, InsertBefore, Addr, Alignment, FixedSize, IsWrite, + SizeArgument, UseCalls, Recover, AsanScale, AsanOffset); + } + } + // Instrument unusual size or unusual alignment. + IRB.SetInsertPoint(InsertBefore); + Type *AddrTy = Addr->getType(); + Type *IntptrTy = M.getDataLayout().getIntPtrType(AddrTy); + Value *NumBits = IRB.CreateTypeSize(IntptrTy, TypeStoreSize); + Value *Size = IRB.CreateLShr(NumBits, ConstantInt::get(IntptrTy, 3)); + Value *AddrLong = IRB.CreatePtrToInt(Addr, IntptrTy); + Value *SizeMinusOne = IRB.CreateAdd(Size, ConstantInt::get(IntptrTy, -1)); + Value *LastByte = + IRB.CreateIntToPtr(IRB.CreateAdd(AddrLong, SizeMinusOne), AddrTy); + instrumentAddressImpl(M, IRB, OrigIns, InsertBefore, Addr, {}, 8, IsWrite, + SizeArgument, UseCalls, Recover, AsanScale, AsanOffset); + instrumentAddressImpl(M, IRB, OrigIns, InsertBefore, LastByte, {}, 8, IsWrite, + SizeArgument, UseCalls, Recover, AsanScale, AsanOffset); +} + void getInterestingMemoryOperands( Module &M, Instruction *I, SmallVectorImpl &Interesting) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.h b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.h index b10fded57b1a7e..b2b8ec19b49ece 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.h @@ -43,10 +43,10 @@ uint64_t getRedzoneSizeForGlobal(int Scale, uint64_t SizeInBytes); /// Instrument the memory operand Addr. /// Generates report blocks that catch the addressing errors. 
void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns, - Instruction *InsertBefore, Value *Addr, - MaybeAlign Alignment, uint32_t TypeStoreSize, - bool IsWrite, Value *SizeArgument, bool UseCalls, - bool Recover, int Scale, int Offset); + Instruction *InsertBefore, Value *Addr, Align Alignment, + TypeSize TypeStoreSize, bool IsWrite, + Value *SizeArgument, bool UseCalls, bool Recover, + int Scale, int Offset); /// Get all the memory operands from the instruction /// that needs to be instrumented From 69a0af756b921ad445eb9684f325d27a1c3a13b8 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Tue, 20 Aug 2024 22:10:30 -0700 Subject: [PATCH 009/426] Revert "[FunctionAttrs] deduce attr `cold` on functions if all CG paths call a `cold` function" This reverts commit b7eac8c6fea1ba3930d08011a0e5e3a262bfaece. Causing a test failure. Not 100% sure the issue so to reverting to unblock pipeline. --- llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 69 --- llvm/test/Transforms/FunctionAttrs/cold.ll | 542 +++++++-------------- 2 files changed, 178 insertions(+), 433 deletions(-) diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 603a1565e48c45..d50218aaa3b6cc 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -82,7 +82,6 @@ STATISTIC(NumNoUnwind, "Number of functions marked as nounwind"); STATISTIC(NumNoFree, "Number of functions marked as nofree"); STATISTIC(NumWillReturn, "Number of functions marked as willreturn"); STATISTIC(NumNoSync, "Number of functions marked as nosync"); -STATISTIC(NumCold, "Number of functions marked as cold"); STATISTIC(NumThinLinkNoRecurse, "Number of functions marked as norecurse during thinlink"); @@ -1746,7 +1745,6 @@ static bool canReturn(Function &F) { return false; } - // Set the noreturn function attribute if possible. 
static void addNoReturnAttrs(const SCCNodeSet &SCCNodes, SmallSet &Changed) { @@ -1762,72 +1760,6 @@ static void addNoReturnAttrs(const SCCNodeSet &SCCNodes, } } -static bool -allBBPathsGoThroughCold(BasicBlock *BB, - SmallDenseMap &Visited) { - // If BB contains a cold callsite this path through the CG is cold. - // Ignore whether the instructions actually are guranteed to transfer - // execution. Divergent behavior is considered unlikely. - if (any_of(*BB, [](Instruction &I) { - if (auto *CB = dyn_cast(&I)) - return CB->hasFnAttr(Attribute::Cold); - return false; - })) { - Visited[BB] = true; - return true; - } - - auto Succs = successors(BB); - // We found a path that doesn't go through any cold callsite. - if (Succs.empty()) - return false; - - // We didn't find a cold callsite in this BB, so check that all successors - // contain a cold callsite (or that their successors do). - // Potential TODO: We could use static branch hints to assume certain - // successor paths are inherently cold, irrespective of if they contain a cold - // callsite. - for (auto *Succ : Succs) { - // Start with false, this is necessary to ensure we don't turn loops into - // cold. - auto R = Visited.try_emplace(Succ, false); - if (!R.second) { - if (R.first->second) - continue; - return false; - } - if (!allBBPathsGoThroughCold(Succ, Visited)) - return false; - Visited[Succ] = true; - } - - return true; -} - -static bool allPathsGoThroughCold(Function &F) { - SmallDenseMap Visited; - Visited[&F.front()] = false; - return allBBPathsGoThroughCold(&F.front(), Visited); -} - -// Set the cold function attribute if possible. -static void addColdAttrs(const SCCNodeSet &SCCNodes, - SmallSet &Changed) { - for (Function *F : SCCNodes) { - if (!F || !F->hasExactDefinition() || F->hasFnAttribute(Attribute::Naked) || - F->hasFnAttribute(Attribute::Cold) || F->hasFnAttribute(Attribute::Hot)) - continue; - - // Potential TODO: We could add attribute `cold` on functions with `coldcc`. 
- if (allPathsGoThroughCold(*F)) { - F->addFnAttr(Attribute::Cold); - ++NumCold; - Changed.insert(F); - continue; - } - } -} - static bool functionWillReturn(const Function &F) { // We can infer and propagate function attributes only when we know that the // definition we'll get at link time is *exactly* the definition we see now. @@ -1921,7 +1853,6 @@ deriveAttrsInPostOrder(ArrayRef Functions, AARGetterT &&AARGetter, addArgumentAttrs(Nodes.SCCNodes, Changed); inferConvergent(Nodes.SCCNodes, Changed); addNoReturnAttrs(Nodes.SCCNodes, Changed); - addColdAttrs(Nodes.SCCNodes, Changed); addWillReturn(Nodes.SCCNodes, Changed); addNoUndefAttrs(Nodes.SCCNodes, Changed); diff --git a/llvm/test/Transforms/FunctionAttrs/cold.ll b/llvm/test/Transforms/FunctionAttrs/cold.ll index a205fbda062121..1fa8ae06797943 100644 --- a/llvm/test/Transforms/FunctionAttrs/cold.ll +++ b/llvm/test/Transforms/FunctionAttrs/cold.ll @@ -54,23 +54,14 @@ while.body2: } define void @test_no_exit() { -; FNATTRS: Function Attrs: cold noreturn -; FNATTRS-LABEL: define void @test_no_exit -; FNATTRS-SAME: () #[[ATTR3:[0-9]+]] { -; FNATTRS-NEXT: entry: -; FNATTRS-NEXT: br label [[WHILE_BODY:%.*]] -; FNATTRS: while.body: -; FNATTRS-NEXT: call void @cold0() -; FNATTRS-NEXT: br label [[WHILE_BODY]] -; -; ATTRIBUTOR: Function Attrs: noreturn -; ATTRIBUTOR-LABEL: define void @test_no_exit -; ATTRIBUTOR-SAME: () #[[ATTR2]] { -; ATTRIBUTOR-NEXT: entry: -; ATTRIBUTOR-NEXT: br label [[WHILE_BODY:%.*]] -; ATTRIBUTOR: while.body: -; ATTRIBUTOR-NEXT: call void @cold0() -; ATTRIBUTOR-NEXT: br label [[WHILE_BODY]] +; COMMON: Function Attrs: noreturn +; COMMON-LABEL: define void @test_no_exit +; COMMON-SAME: () #[[ATTR2]] { +; COMMON-NEXT: entry: +; COMMON-NEXT: br label [[WHILE_BODY:%.*]] +; COMMON: while.body: +; COMMON-NEXT: call void @cold0() +; COMMON-NEXT: br label [[WHILE_BODY]] ; entry: br label %while.body @@ -81,29 +72,17 @@ while.body: } define void @test_no_exit2() { -; FNATTRS: Function Attrs: cold 
noreturn -; FNATTRS-LABEL: define void @test_no_exit2 -; FNATTRS-SAME: () #[[ATTR3]] { -; FNATTRS-NEXT: entry: -; FNATTRS-NEXT: br label [[WHILE_BODY:%.*]] -; FNATTRS: while.body: -; FNATTRS-NEXT: call void @not_cold0() -; FNATTRS-NEXT: br label [[WHILE_BODY2:%.*]] -; FNATTRS: while.body2: -; FNATTRS-NEXT: call void @cold1() -; FNATTRS-NEXT: br label [[WHILE_BODY]] -; -; ATTRIBUTOR: Function Attrs: noreturn -; ATTRIBUTOR-LABEL: define void @test_no_exit2 -; ATTRIBUTOR-SAME: () #[[ATTR2]] { -; ATTRIBUTOR-NEXT: entry: -; ATTRIBUTOR-NEXT: br label [[WHILE_BODY:%.*]] -; ATTRIBUTOR: while.body: -; ATTRIBUTOR-NEXT: call void @not_cold0() -; ATTRIBUTOR-NEXT: br label [[WHILE_BODY2:%.*]] -; ATTRIBUTOR: while.body2: -; ATTRIBUTOR-NEXT: call void @cold1() -; ATTRIBUTOR-NEXT: br label [[WHILE_BODY]] +; COMMON: Function Attrs: noreturn +; COMMON-LABEL: define void @test_no_exit2 +; COMMON-SAME: () #[[ATTR2]] { +; COMMON-NEXT: entry: +; COMMON-NEXT: br label [[WHILE_BODY:%.*]] +; COMMON: while.body: +; COMMON-NEXT: call void @not_cold0() +; COMMON-NEXT: br label [[WHILE_BODY2:%.*]] +; COMMON: while.body2: +; COMMON-NEXT: call void @cold1() +; COMMON-NEXT: br label [[WHILE_BODY]] ; entry: br label %while.body @@ -118,32 +97,18 @@ while.body2: } define dso_local void @test_entry(i32 noundef %x) { -; FNATTRS: Function Attrs: cold -; FNATTRS-LABEL: define dso_local void @test_entry -; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] { -; FNATTRS-NEXT: entry: -; FNATTRS-NEXT: tail call void @cold0() -; FNATTRS-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 -; FNATTRS-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -; FNATTRS: if.then: -; FNATTRS-NEXT: tail call void @not_cold0() -; FNATTRS-NEXT: br label [[IF_END]] -; FNATTRS: if.end: -; FNATTRS-NEXT: tail call void @not_cold1() -; FNATTRS-NEXT: ret void -; -; ATTRIBUTOR-LABEL: define dso_local void @test_entry -; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) { -; ATTRIBUTOR-NEXT: entry: -; 
ATTRIBUTOR-NEXT: tail call void @cold0() -; ATTRIBUTOR-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 -; ATTRIBUTOR-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -; ATTRIBUTOR: if.then: -; ATTRIBUTOR-NEXT: tail call void @not_cold0() -; ATTRIBUTOR-NEXT: br label [[IF_END]] -; ATTRIBUTOR: if.end: -; ATTRIBUTOR-NEXT: tail call void @not_cold1() -; ATTRIBUTOR-NEXT: ret void +; COMMON-LABEL: define dso_local void @test_entry +; COMMON-SAME: (i32 noundef [[X:%.*]]) { +; COMMON-NEXT: entry: +; COMMON-NEXT: tail call void @cold0() +; COMMON-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 +; COMMON-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; COMMON: if.then: +; COMMON-NEXT: tail call void @not_cold0() +; COMMON-NEXT: br label [[IF_END]] +; COMMON: if.end: +; COMMON-NEXT: tail call void @not_cold1() +; COMMON-NEXT: ret void ; entry: tail call void @cold0() @@ -160,19 +125,12 @@ if.end: } define dso_local void @test_hot_fail(i32 noundef %x) hot { -; FNATTRS: Function Attrs: hot -; FNATTRS-LABEL: define dso_local void @test_hot_fail -; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR4:[0-9]+]] { -; FNATTRS-NEXT: entry: -; FNATTRS-NEXT: tail call void @cold0() -; FNATTRS-NEXT: ret void -; -; ATTRIBUTOR: Function Attrs: hot -; ATTRIBUTOR-LABEL: define dso_local void @test_hot_fail -; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) #[[ATTR3:[0-9]+]] { -; ATTRIBUTOR-NEXT: entry: -; ATTRIBUTOR-NEXT: tail call void @cold0() -; ATTRIBUTOR-NEXT: ret void +; COMMON: Function Attrs: hot +; COMMON-LABEL: define dso_local void @test_hot_fail +; COMMON-SAME: (i32 noundef [[X:%.*]]) #[[ATTR3:[0-9]+]] { +; COMMON-NEXT: entry: +; COMMON-NEXT: tail call void @cold0() +; COMMON-NEXT: ret void ; entry: tail call void @cold0() @@ -180,34 +138,19 @@ entry: } define dso_local void @test_br2(i32 noundef %x) { -; FNATTRS: Function Attrs: cold -; FNATTRS-LABEL: define dso_local void @test_br2 -; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0]] { -; 
FNATTRS-NEXT: entry: -; FNATTRS-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 -; FNATTRS-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] -; FNATTRS: if.then: -; FNATTRS-NEXT: tail call void @cold0() -; FNATTRS-NEXT: br label [[IF_END:%.*]] -; FNATTRS: if.else: -; FNATTRS-NEXT: tail call void @cold1() -; FNATTRS-NEXT: br label [[IF_END]] -; FNATTRS: if.end: -; FNATTRS-NEXT: ret void -; -; ATTRIBUTOR-LABEL: define dso_local void @test_br2 -; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) { -; ATTRIBUTOR-NEXT: entry: -; ATTRIBUTOR-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 -; ATTRIBUTOR-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] -; ATTRIBUTOR: if.then: -; ATTRIBUTOR-NEXT: tail call void @cold0() -; ATTRIBUTOR-NEXT: br label [[IF_END:%.*]] -; ATTRIBUTOR: if.else: -; ATTRIBUTOR-NEXT: tail call void @cold1() -; ATTRIBUTOR-NEXT: br label [[IF_END]] -; ATTRIBUTOR: if.end: -; ATTRIBUTOR-NEXT: ret void +; COMMON-LABEL: define dso_local void @test_br2 +; COMMON-SAME: (i32 noundef [[X:%.*]]) { +; COMMON-NEXT: entry: +; COMMON-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 +; COMMON-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; COMMON: if.then: +; COMMON-NEXT: tail call void @cold0() +; COMMON-NEXT: br label [[IF_END:%.*]] +; COMMON: if.else: +; COMMON-NEXT: tail call void @cold1() +; COMMON-NEXT: br label [[IF_END]] +; COMMON: if.end: +; COMMON-NEXT: ret void ; entry: %tobool.not = icmp eq i32 %x, 0 @@ -226,38 +169,21 @@ if.end: } define dso_local void @test_exit(i32 noundef %x) { -; FNATTRS: Function Attrs: cold -; FNATTRS-LABEL: define dso_local void @test_exit -; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0]] { -; FNATTRS-NEXT: entry: -; FNATTRS-NEXT: tail call void @not_cold0() -; FNATTRS-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 -; FNATTRS-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] -; FNATTRS: if.then: -; FNATTRS-NEXT: tail call void @not_cold1() -; 
FNATTRS-NEXT: br label [[IF_END:%.*]] -; FNATTRS: if.else: -; FNATTRS-NEXT: tail call void @not_cold2() -; FNATTRS-NEXT: br label [[IF_END]] -; FNATTRS: if.end: -; FNATTRS-NEXT: tail call void @cold0() -; FNATTRS-NEXT: ret void -; -; ATTRIBUTOR-LABEL: define dso_local void @test_exit -; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) { -; ATTRIBUTOR-NEXT: entry: -; ATTRIBUTOR-NEXT: tail call void @not_cold0() -; ATTRIBUTOR-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 -; ATTRIBUTOR-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] -; ATTRIBUTOR: if.then: -; ATTRIBUTOR-NEXT: tail call void @not_cold1() -; ATTRIBUTOR-NEXT: br label [[IF_END:%.*]] -; ATTRIBUTOR: if.else: -; ATTRIBUTOR-NEXT: tail call void @not_cold2() -; ATTRIBUTOR-NEXT: br label [[IF_END]] -; ATTRIBUTOR: if.end: -; ATTRIBUTOR-NEXT: tail call void @cold0() -; ATTRIBUTOR-NEXT: ret void +; COMMON-LABEL: define dso_local void @test_exit +; COMMON-SAME: (i32 noundef [[X:%.*]]) { +; COMMON-NEXT: entry: +; COMMON-NEXT: tail call void @not_cold0() +; COMMON-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 +; COMMON-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; COMMON: if.then: +; COMMON-NEXT: tail call void @not_cold1() +; COMMON-NEXT: br label [[IF_END:%.*]] +; COMMON: if.else: +; COMMON-NEXT: tail call void @not_cold2() +; COMMON-NEXT: br label [[IF_END]] +; COMMON: if.end: +; COMMON-NEXT: tail call void @cold0() +; COMMON-NEXT: ret void ; entry: tail call void @not_cold0() @@ -278,104 +204,54 @@ if.end: } define dso_local void @test_complex(i32 noundef %x) { -; FNATTRS: Function Attrs: cold -; FNATTRS-LABEL: define dso_local void @test_complex -; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0]] { -; FNATTRS-NEXT: entry: -; FNATTRS-NEXT: tail call void @not_cold0() -; FNATTRS-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 -; FNATTRS-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE11:%.*]], label [[IF_THEN:%.*]] -; FNATTRS: if.then: -; FNATTRS-NEXT: [[CALL:%.*]] = 
tail call i32 @get_val() -; FNATTRS-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[CALL]], 0 -; FNATTRS-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN2:%.*]] -; FNATTRS: if.then2: -; FNATTRS-NEXT: tail call void @cold1() -; FNATTRS-NEXT: br label [[IF_END12:%.*]] -; FNATTRS: if.else: -; FNATTRS-NEXT: [[CALL3:%.*]] = tail call i32 @get_val() -; FNATTRS-NEXT: [[TOBOOL4_NOT:%.*]] = icmp eq i32 [[CALL3]], 0 -; FNATTRS-NEXT: br i1 [[TOBOOL4_NOT]], label [[IF_ELSE6:%.*]], label [[IF_THEN5:%.*]] -; FNATTRS: if.then5: -; FNATTRS-NEXT: tail call void @cold0() -; FNATTRS-NEXT: br label [[IF_END12]] -; FNATTRS: if.else6: -; FNATTRS-NEXT: tail call void @not_cold0() -; FNATTRS-NEXT: [[CALL7:%.*]] = tail call i32 @get_val() -; FNATTRS-NEXT: switch i32 [[CALL7]], label [[SW_DEFAULT:%.*]] [ -; FNATTRS-NEXT: i32 0, label [[SW_BB:%.*]] -; FNATTRS-NEXT: i32 1, label [[SW_BB8:%.*]] -; FNATTRS-NEXT: i32 2, label [[SW_BB9:%.*]] -; FNATTRS-NEXT: ] -; FNATTRS: sw.bb: -; FNATTRS-NEXT: tail call void @not_cold0() -; FNATTRS-NEXT: br label [[CALL_COLD:%.*]] -; FNATTRS: sw.bb8: -; FNATTRS-NEXT: tail call void @not_cold1() -; FNATTRS-NEXT: br label [[CALL_COLD]] -; FNATTRS: sw.bb9: -; FNATTRS-NEXT: tail call void @not_cold2() -; FNATTRS-NEXT: br label [[CALL_COLD]] -; FNATTRS: sw.default: -; FNATTRS-NEXT: tail call void @cold0() -; FNATTRS-NEXT: br label [[IF_END12]] -; FNATTRS: call_cold: -; FNATTRS-NEXT: tail call void @cold_at_cb() #[[ATTR0]] -; FNATTRS-NEXT: br label [[IF_END12]] -; FNATTRS: if.else11: -; FNATTRS-NEXT: tail call void @cold0() -; FNATTRS-NEXT: br label [[IF_END12]] -; FNATTRS: if.end12: -; FNATTRS-NEXT: ret void -; -; ATTRIBUTOR-LABEL: define dso_local void @test_complex -; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) { -; ATTRIBUTOR-NEXT: entry: -; ATTRIBUTOR-NEXT: tail call void @not_cold0() -; ATTRIBUTOR-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 -; ATTRIBUTOR-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE11:%.*]], label [[IF_THEN:%.*]] -; ATTRIBUTOR: 
if.then: -; ATTRIBUTOR-NEXT: [[CALL:%.*]] = tail call i32 @get_val() -; ATTRIBUTOR-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[CALL]], 0 -; ATTRIBUTOR-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN2:%.*]] -; ATTRIBUTOR: if.then2: -; ATTRIBUTOR-NEXT: tail call void @cold1() -; ATTRIBUTOR-NEXT: br label [[IF_END12:%.*]] -; ATTRIBUTOR: if.else: -; ATTRIBUTOR-NEXT: [[CALL3:%.*]] = tail call i32 @get_val() -; ATTRIBUTOR-NEXT: [[TOBOOL4_NOT:%.*]] = icmp eq i32 [[CALL3]], 0 -; ATTRIBUTOR-NEXT: br i1 [[TOBOOL4_NOT]], label [[IF_ELSE6:%.*]], label [[IF_THEN5:%.*]] -; ATTRIBUTOR: if.then5: -; ATTRIBUTOR-NEXT: tail call void @cold0() -; ATTRIBUTOR-NEXT: br label [[IF_END12]] -; ATTRIBUTOR: if.else6: -; ATTRIBUTOR-NEXT: tail call void @not_cold0() -; ATTRIBUTOR-NEXT: [[CALL7:%.*]] = tail call i32 @get_val() -; ATTRIBUTOR-NEXT: switch i32 [[CALL7]], label [[SW_DEFAULT:%.*]] [ -; ATTRIBUTOR-NEXT: i32 0, label [[SW_BB:%.*]] -; ATTRIBUTOR-NEXT: i32 1, label [[SW_BB8:%.*]] -; ATTRIBUTOR-NEXT: i32 2, label [[SW_BB9:%.*]] -; ATTRIBUTOR-NEXT: ] -; ATTRIBUTOR: sw.bb: -; ATTRIBUTOR-NEXT: tail call void @not_cold0() -; ATTRIBUTOR-NEXT: br label [[CALL_COLD:%.*]] -; ATTRIBUTOR: sw.bb8: -; ATTRIBUTOR-NEXT: tail call void @not_cold1() -; ATTRIBUTOR-NEXT: br label [[CALL_COLD]] -; ATTRIBUTOR: sw.bb9: -; ATTRIBUTOR-NEXT: tail call void @not_cold2() -; ATTRIBUTOR-NEXT: br label [[CALL_COLD]] -; ATTRIBUTOR: sw.default: -; ATTRIBUTOR-NEXT: tail call void @cold0() -; ATTRIBUTOR-NEXT: br label [[IF_END12]] -; ATTRIBUTOR: call_cold: -; ATTRIBUTOR-NEXT: tail call void @cold_at_cb() #[[ATTR0:[0-9]+]] -; ATTRIBUTOR-NEXT: br label [[IF_END12]] -; ATTRIBUTOR: if.else11: -; ATTRIBUTOR-NEXT: tail call void @cold0() -; ATTRIBUTOR-NEXT: br label [[IF_END12]] -; ATTRIBUTOR: if.end12: -; ATTRIBUTOR-NEXT: ret void +; COMMON-LABEL: define dso_local void @test_complex +; COMMON-SAME: (i32 noundef [[X:%.*]]) { +; COMMON-NEXT: entry: +; COMMON-NEXT: tail call void @not_cold0() +; COMMON-NEXT: 
[[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 +; COMMON-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE11:%.*]], label [[IF_THEN:%.*]] +; COMMON: if.then: +; COMMON-NEXT: [[CALL:%.*]] = tail call i32 @get_val() +; COMMON-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[CALL]], 0 +; COMMON-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN2:%.*]] +; COMMON: if.then2: +; COMMON-NEXT: tail call void @cold1() +; COMMON-NEXT: br label [[IF_END12:%.*]] +; COMMON: if.else: +; COMMON-NEXT: [[CALL3:%.*]] = tail call i32 @get_val() +; COMMON-NEXT: [[TOBOOL4_NOT:%.*]] = icmp eq i32 [[CALL3]], 0 +; COMMON-NEXT: br i1 [[TOBOOL4_NOT]], label [[IF_ELSE6:%.*]], label [[IF_THEN5:%.*]] +; COMMON: if.then5: +; COMMON-NEXT: tail call void @cold0() +; COMMON-NEXT: br label [[IF_END12]] +; COMMON: if.else6: +; COMMON-NEXT: tail call void @not_cold0() +; COMMON-NEXT: [[CALL7:%.*]] = tail call i32 @get_val() +; COMMON-NEXT: switch i32 [[CALL7]], label [[SW_DEFAULT:%.*]] [ +; COMMON-NEXT: i32 0, label [[SW_BB:%.*]] +; COMMON-NEXT: i32 1, label [[SW_BB8:%.*]] +; COMMON-NEXT: i32 2, label [[SW_BB9:%.*]] +; COMMON-NEXT: ] +; COMMON: sw.bb: +; COMMON-NEXT: tail call void @not_cold0() +; COMMON-NEXT: br label [[CALL_COLD:%.*]] +; COMMON: sw.bb8: +; COMMON-NEXT: tail call void @not_cold1() +; COMMON-NEXT: br label [[CALL_COLD]] +; COMMON: sw.bb9: +; COMMON-NEXT: tail call void @not_cold2() +; COMMON-NEXT: br label [[CALL_COLD]] +; COMMON: sw.default: +; COMMON-NEXT: tail call void @cold0() +; COMMON-NEXT: br label [[IF_END12]] +; COMMON: call_cold: +; COMMON-NEXT: tail call void @cold_at_cb() #[[ATTR0:[0-9]+]] +; COMMON-NEXT: br label [[IF_END12]] +; COMMON: if.else11: +; COMMON-NEXT: tail call void @cold0() +; COMMON-NEXT: br label [[IF_END12]] +; COMMON: if.end12: +; COMMON-NEXT: ret void ; entry: tail call void @not_cold0() @@ -438,122 +314,63 @@ if.end12: } define dso_local void @test_complex2(i32 noundef %x) { -; FNATTRS: Function Attrs: cold -; FNATTRS-LABEL: define dso_local void 
@test_complex2 -; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0]] { -; FNATTRS-NEXT: entry: -; FNATTRS-NEXT: tail call void @not_cold0() -; FNATTRS-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 -; FNATTRS-NEXT: [[CALL12:%.*]] = tail call i32 @get_val() -; FNATTRS-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE11:%.*]], label [[IF_THEN:%.*]] -; FNATTRS: if.then: -; FNATTRS-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[CALL12]], 0 -; FNATTRS-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN2:%.*]] -; FNATTRS: if.then2: -; FNATTRS-NEXT: tail call void @cold1() -; FNATTRS-NEXT: br label [[IF_END16:%.*]] -; FNATTRS: if.else: -; FNATTRS-NEXT: [[CALL3:%.*]] = tail call i32 @get_val() -; FNATTRS-NEXT: [[TOBOOL4_NOT:%.*]] = icmp eq i32 [[CALL3]], 0 -; FNATTRS-NEXT: br i1 [[TOBOOL4_NOT]], label [[IF_ELSE6:%.*]], label [[IF_THEN5:%.*]] -; FNATTRS: if.then5: -; FNATTRS-NEXT: tail call void @cold0() -; FNATTRS-NEXT: br label [[IF_END16]] -; FNATTRS: if.else6: -; FNATTRS-NEXT: tail call void @not_cold0() -; FNATTRS-NEXT: [[CALL7:%.*]] = tail call i32 @get_val() -; FNATTRS-NEXT: switch i32 [[CALL7]], label [[SW_DEFAULT:%.*]] [ -; FNATTRS-NEXT: i32 0, label [[SW_BB:%.*]] -; FNATTRS-NEXT: i32 1, label [[SW_BB8:%.*]] -; FNATTRS-NEXT: i32 2, label [[SW_BB9:%.*]] -; FNATTRS-NEXT: ] -; FNATTRS: sw.bb: -; FNATTRS-NEXT: tail call void @not_cold0() -; FNATTRS-NEXT: br label [[CALL_COLD:%.*]] -; FNATTRS: sw.bb8: -; FNATTRS-NEXT: tail call void @not_cold1() -; FNATTRS-NEXT: br label [[CALL_COLD]] -; FNATTRS: sw.bb9: -; FNATTRS-NEXT: tail call void @not_cold2() -; FNATTRS-NEXT: br label [[CALL_COLD]] -; FNATTRS: sw.default: -; FNATTRS-NEXT: tail call void @cold0() -; FNATTRS-NEXT: br label [[IF_END16]] -; FNATTRS: call_cold: -; FNATTRS-NEXT: tail call void @cold_at_cb() #[[ATTR0]] -; FNATTRS-NEXT: br label [[IF_END16]] -; FNATTRS: if.else11: -; FNATTRS-NEXT: [[CMP:%.*]] = icmp slt i32 [[CALL12]], 1 -; FNATTRS-NEXT: br i1 [[CMP]], label [[IF_END14:%.*]], label [[FOR_BODY:%.*]] -; 
FNATTRS: if.end14: -; FNATTRS-NEXT: tail call void @cold1() -; FNATTRS-NEXT: br label [[IF_END16]] -; FNATTRS: for.body: -; FNATTRS-NEXT: [[I_021:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_ELSE11]] ] -; FNATTRS-NEXT: tail call void @cold0() -; FNATTRS-NEXT: [[INC]] = add nuw nsw i32 [[I_021]], 1 -; FNATTRS-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[CALL12]] -; FNATTRS-NEXT: br i1 [[EXITCOND_NOT]], label [[IF_END16]], label [[FOR_BODY]] -; FNATTRS: if.end16: -; FNATTRS-NEXT: ret void -; -; ATTRIBUTOR-LABEL: define dso_local void @test_complex2 -; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) { -; ATTRIBUTOR-NEXT: entry: -; ATTRIBUTOR-NEXT: tail call void @not_cold0() -; ATTRIBUTOR-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 -; ATTRIBUTOR-NEXT: [[CALL12:%.*]] = tail call i32 @get_val() -; ATTRIBUTOR-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE11:%.*]], label [[IF_THEN:%.*]] -; ATTRIBUTOR: if.then: -; ATTRIBUTOR-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[CALL12]], 0 -; ATTRIBUTOR-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN2:%.*]] -; ATTRIBUTOR: if.then2: -; ATTRIBUTOR-NEXT: tail call void @cold1() -; ATTRIBUTOR-NEXT: br label [[IF_END16:%.*]] -; ATTRIBUTOR: if.else: -; ATTRIBUTOR-NEXT: [[CALL3:%.*]] = tail call i32 @get_val() -; ATTRIBUTOR-NEXT: [[TOBOOL4_NOT:%.*]] = icmp eq i32 [[CALL3]], 0 -; ATTRIBUTOR-NEXT: br i1 [[TOBOOL4_NOT]], label [[IF_ELSE6:%.*]], label [[IF_THEN5:%.*]] -; ATTRIBUTOR: if.then5: -; ATTRIBUTOR-NEXT: tail call void @cold0() -; ATTRIBUTOR-NEXT: br label [[IF_END16]] -; ATTRIBUTOR: if.else6: -; ATTRIBUTOR-NEXT: tail call void @not_cold0() -; ATTRIBUTOR-NEXT: [[CALL7:%.*]] = tail call i32 @get_val() -; ATTRIBUTOR-NEXT: switch i32 [[CALL7]], label [[SW_DEFAULT:%.*]] [ -; ATTRIBUTOR-NEXT: i32 0, label [[SW_BB:%.*]] -; ATTRIBUTOR-NEXT: i32 1, label [[SW_BB8:%.*]] -; ATTRIBUTOR-NEXT: i32 2, label [[SW_BB9:%.*]] -; ATTRIBUTOR-NEXT: ] -; ATTRIBUTOR: sw.bb: -; ATTRIBUTOR-NEXT: tail call void @not_cold0() -; 
ATTRIBUTOR-NEXT: br label [[CALL_COLD:%.*]] -; ATTRIBUTOR: sw.bb8: -; ATTRIBUTOR-NEXT: tail call void @not_cold1() -; ATTRIBUTOR-NEXT: br label [[CALL_COLD]] -; ATTRIBUTOR: sw.bb9: -; ATTRIBUTOR-NEXT: tail call void @not_cold2() -; ATTRIBUTOR-NEXT: br label [[CALL_COLD]] -; ATTRIBUTOR: sw.default: -; ATTRIBUTOR-NEXT: tail call void @cold0() -; ATTRIBUTOR-NEXT: br label [[IF_END16]] -; ATTRIBUTOR: call_cold: -; ATTRIBUTOR-NEXT: tail call void @cold_at_cb() #[[ATTR0]] -; ATTRIBUTOR-NEXT: br label [[IF_END16]] -; ATTRIBUTOR: if.else11: -; ATTRIBUTOR-NEXT: [[CMP:%.*]] = icmp slt i32 [[CALL12]], 1 -; ATTRIBUTOR-NEXT: br i1 [[CMP]], label [[IF_END14:%.*]], label [[FOR_BODY:%.*]] -; ATTRIBUTOR: if.end14: -; ATTRIBUTOR-NEXT: tail call void @cold1() -; ATTRIBUTOR-NEXT: br label [[IF_END16]] -; ATTRIBUTOR: for.body: -; ATTRIBUTOR-NEXT: [[I_021:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_ELSE11]] ] -; ATTRIBUTOR-NEXT: tail call void @cold0() -; ATTRIBUTOR-NEXT: [[INC]] = add nuw nsw i32 [[I_021]], 1 -; ATTRIBUTOR-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[CALL12]] -; ATTRIBUTOR-NEXT: br i1 [[EXITCOND_NOT]], label [[IF_END16]], label [[FOR_BODY]] -; ATTRIBUTOR: if.end16: -; ATTRIBUTOR-NEXT: ret void +; COMMON-LABEL: define dso_local void @test_complex2 +; COMMON-SAME: (i32 noundef [[X:%.*]]) { +; COMMON-NEXT: entry: +; COMMON-NEXT: tail call void @not_cold0() +; COMMON-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 +; COMMON-NEXT: [[CALL12:%.*]] = tail call i32 @get_val() +; COMMON-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE11:%.*]], label [[IF_THEN:%.*]] +; COMMON: if.then: +; COMMON-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[CALL12]], 0 +; COMMON-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN2:%.*]] +; COMMON: if.then2: +; COMMON-NEXT: tail call void @cold1() +; COMMON-NEXT: br label [[IF_END16:%.*]] +; COMMON: if.else: +; COMMON-NEXT: [[CALL3:%.*]] = tail call i32 @get_val() +; COMMON-NEXT: [[TOBOOL4_NOT:%.*]] = icmp eq i32 
[[CALL3]], 0 +; COMMON-NEXT: br i1 [[TOBOOL4_NOT]], label [[IF_ELSE6:%.*]], label [[IF_THEN5:%.*]] +; COMMON: if.then5: +; COMMON-NEXT: tail call void @cold0() +; COMMON-NEXT: br label [[IF_END16]] +; COMMON: if.else6: +; COMMON-NEXT: tail call void @not_cold0() +; COMMON-NEXT: [[CALL7:%.*]] = tail call i32 @get_val() +; COMMON-NEXT: switch i32 [[CALL7]], label [[SW_DEFAULT:%.*]] [ +; COMMON-NEXT: i32 0, label [[SW_BB:%.*]] +; COMMON-NEXT: i32 1, label [[SW_BB8:%.*]] +; COMMON-NEXT: i32 2, label [[SW_BB9:%.*]] +; COMMON-NEXT: ] +; COMMON: sw.bb: +; COMMON-NEXT: tail call void @not_cold0() +; COMMON-NEXT: br label [[CALL_COLD:%.*]] +; COMMON: sw.bb8: +; COMMON-NEXT: tail call void @not_cold1() +; COMMON-NEXT: br label [[CALL_COLD]] +; COMMON: sw.bb9: +; COMMON-NEXT: tail call void @not_cold2() +; COMMON-NEXT: br label [[CALL_COLD]] +; COMMON: sw.default: +; COMMON-NEXT: tail call void @cold0() +; COMMON-NEXT: br label [[IF_END16]] +; COMMON: call_cold: +; COMMON-NEXT: tail call void @cold_at_cb() #[[ATTR0]] +; COMMON-NEXT: br label [[IF_END16]] +; COMMON: if.else11: +; COMMON-NEXT: [[CMP:%.*]] = icmp slt i32 [[CALL12]], 1 +; COMMON-NEXT: br i1 [[CMP]], label [[IF_END14:%.*]], label [[FOR_BODY:%.*]] +; COMMON: if.end14: +; COMMON-NEXT: tail call void @cold1() +; COMMON-NEXT: br label [[IF_END16]] +; COMMON: for.body: +; COMMON-NEXT: [[I_021:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_ELSE11]] ] +; COMMON-NEXT: tail call void @cold0() +; COMMON-NEXT: [[INC]] = add nuw nsw i32 [[I_021]], 1 +; COMMON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[CALL12]] +; COMMON-NEXT: br i1 [[EXITCOND_NOT]], label [[IF_END16]], label [[FOR_BODY]] +; COMMON: if.end16: +; COMMON-NEXT: ret void ; entry: tail call void @not_cold0() @@ -668,7 +485,7 @@ define dso_local void @test_complex_fail(i32 noundef %x) { ; COMMON-NEXT: tail call void @cold0() ; COMMON-NEXT: br label [[IF_END12]] ; COMMON: call_cold: -; COMMON-NEXT: tail call void @cold_at_cb() #[[ATTR0:[0-9]+]] +; 
COMMON-NEXT: tail call void @cold_at_cb() #[[ATTR0]] ; COMMON-NEXT: br label [[IF_END12]] ; COMMON: if.else11: ; COMMON-NEXT: tail call void @cold0() @@ -867,14 +684,11 @@ if.end16: } ;. -; FNATTRS: attributes #[[ATTR0]] = { cold } -; FNATTRS: attributes #[[ATTR1]] = { nofree norecurse noreturn nosync nounwind memory(none) } -; FNATTRS: attributes #[[ATTR2]] = { noreturn } -; FNATTRS: attributes #[[ATTR3]] = { cold noreturn } -; FNATTRS: attributes #[[ATTR4]] = { hot } -;. -; ATTRIBUTOR: attributes #[[ATTR0]] = { cold } -; ATTRIBUTOR: attributes #[[ATTR1]] = { nofree norecurse noreturn nosync nounwind memory(none) } -; ATTRIBUTOR: attributes #[[ATTR2]] = { noreturn } -; ATTRIBUTOR: attributes #[[ATTR3]] = { hot } +; COMMON: attributes #[[ATTR0]] = { cold } +; COMMON: attributes #[[ATTR1]] = { nofree norecurse noreturn nosync nounwind memory(none) } +; COMMON: attributes #[[ATTR2]] = { noreturn } +; COMMON: attributes #[[ATTR3]] = { hot } ;. +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; ATTRIBUTOR: {{.*}} +; FNATTRS: {{.*}} From 341d86dcd3ec4ff3073d5f666564bb2cd38f6142 Mon Sep 17 00:00:00 2001 From: Vladimir Vereschaka Date: Tue, 20 Aug 2024 22:13:16 -0700 Subject: [PATCH 010/426] [CMake] Update CMake cache file for the ARM/Aarch64 cross toolchain builds. NFC. (#103552) In order to build LLDB project added the following changes: * enable LIBCXX_ENABLE_STATIC_ABI_LIBRARY option to link the ABI library statically. * set LIBCXX_ABI_VERSION to 1 by default. 
--- clang/cmake/caches/CrossWinToARMLinux.cmake | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/cmake/caches/CrossWinToARMLinux.cmake b/clang/cmake/caches/CrossWinToARMLinux.cmake index e4d0a0c2d14cb9..87118bbd33377d 100644 --- a/clang/cmake/caches/CrossWinToARMLinux.cmake +++ b/clang/cmake/caches/CrossWinToARMLinux.cmake @@ -108,9 +108,9 @@ endif() message(STATUS "Toolchain target to build: ${LLVM_TARGETS_TO_BUILD}") -# Allow to override libc++ ABI version. Use 2 by default. +# Allow to override libc++ ABI version (1 is default). if (NOT DEFINED LIBCXX_ABI_VERSION) - set(LIBCXX_ABI_VERSION 2) + set(LIBCXX_ABI_VERSION 1) endif() message(STATUS "Toolchain's Libc++ ABI version: ${LIBCXX_ABI_VERSION}") @@ -217,6 +217,8 @@ set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ENABLE_SHARED set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ABI_VERSION ${LIBCXX_ABI_VERSION} CACHE STRING "") set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_CXX_ABI "libcxxabi" CACHE STRING "") #!!! set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS ON CACHE BOOL "") +# Merge libc++ and libc++abi libraries into the single libc++ library file. +set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ENABLE_STATIC_ABI_LIBRARY ON CACHE BOOL "") # Avoid searching for the python3 interpreter during the runtimes configuration for the cross builds. # It starts searching the python3 package using the target's sysroot path, that usually is not compatible with the build host. From 6c62ad446b2441b78ae524d9e700e351d5a76394 Mon Sep 17 00:00:00 2001 From: Vassil Vassilev Date: Wed, 21 Aug 2024 07:22:31 +0200 Subject: [PATCH 011/426] [clang-repl] [codegen] Reduce the state in TBAA. NFC for static compilation. (#98138) In incremental compilation clang works with multiple `llvm::Module`s. Our current approach is to create a CodeGenModule entity for every new module request (via StartModule). 
However, some of the state such as the mangle context needs to be preserved to keep the original semantics in the ever-growing TU. Fixes: llvm/llvm-project#95581. cc: @jeaye --- clang/lib/CodeGen/CGCall.cpp | 31 +++++++++---------- clang/lib/CodeGen/CGClass.cpp | 2 +- clang/lib/CodeGen/CodeGenModule.cpp | 15 +++++---- clang/lib/CodeGen/CodeGenModule.h | 5 +-- clang/lib/CodeGen/CodeGenTBAA.cpp | 13 +++++--- clang/lib/CodeGen/CodeGenTBAA.h | 5 +-- clang/lib/CodeGen/CodeGenTypes.cpp | 7 +++-- clang/lib/CodeGen/CodeGenTypes.h | 8 +---- clang/lib/CodeGen/MicrosoftCXXABI.cpp | 2 +- .../assigment-with-implicit-ctor.cpp | 13 ++++++++ 10 files changed, 56 insertions(+), 45 deletions(-) create mode 100644 clang/test/Interpreter/assigment-with-implicit-ctor.cpp diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index e4f221ae55eefa..34ca2227608361 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -314,7 +314,8 @@ CodeGenTypes::arrangeCXXMethodDeclaration(const CXXMethodDecl *MD) { if (MD->isImplicitObjectMemberFunction()) { // The abstract case is perfectly fine. 
- const CXXRecordDecl *ThisType = TheCXXABI.getThisArgumentTypeForMethod(MD); + const CXXRecordDecl *ThisType = + getCXXABI().getThisArgumentTypeForMethod(MD); return arrangeCXXMethodType(ThisType, prototype.getTypePtr(), MD); } @@ -337,7 +338,7 @@ CodeGenTypes::arrangeCXXStructorDeclaration(GlobalDecl GD) { SmallVector argTypes; SmallVector paramInfos; - const CXXRecordDecl *ThisType = TheCXXABI.getThisArgumentTypeForMethod(GD); + const CXXRecordDecl *ThisType = getCXXABI().getThisArgumentTypeForMethod(GD); argTypes.push_back(DeriveThisType(ThisType, MD)); bool PassParams = true; @@ -356,7 +357,7 @@ CodeGenTypes::arrangeCXXStructorDeclaration(GlobalDecl GD) { appendParameterTypes(*this, argTypes, paramInfos, FTP); CGCXXABI::AddedStructorArgCounts AddedArgs = - TheCXXABI.buildStructorSignature(GD, argTypes); + getCXXABI().buildStructorSignature(GD, argTypes); if (!paramInfos.empty()) { // Note: prefix implies after the first param. if (AddedArgs.Prefix) @@ -372,11 +373,10 @@ CodeGenTypes::arrangeCXXStructorDeclaration(GlobalDecl GD) { : RequiredArgs::All); FunctionType::ExtInfo extInfo = FTP->getExtInfo(); - CanQualType resultType = TheCXXABI.HasThisReturn(GD) - ? argTypes.front() - : TheCXXABI.hasMostDerivedReturn(GD) - ? CGM.getContext().VoidPtrTy - : Context.VoidTy; + CanQualType resultType = getCXXABI().HasThisReturn(GD) ? argTypes.front() + : getCXXABI().hasMostDerivedReturn(GD) + ? CGM.getContext().VoidPtrTy + : Context.VoidTy; return arrangeLLVMFunctionInfo(resultType, FnInfoOpts::IsInstanceMethod, argTypes, extInfo, paramInfos, required); } @@ -437,11 +437,10 @@ CodeGenTypes::arrangeCXXConstructorCall(const CallArgList &args, : RequiredArgs::All; GlobalDecl GD(D, CtorKind); - CanQualType ResultType = TheCXXABI.HasThisReturn(GD) - ? ArgTypes.front() - : TheCXXABI.hasMostDerivedReturn(GD) - ? CGM.getContext().VoidPtrTy - : Context.VoidTy; + CanQualType ResultType = getCXXABI().HasThisReturn(GD) ? ArgTypes.front() + : getCXXABI().hasMostDerivedReturn(GD) + ? 
CGM.getContext().VoidPtrTy + : Context.VoidTy; FunctionType::ExtInfo Info = FPT->getExtInfo(); llvm::SmallVector ParamInfos; @@ -806,7 +805,7 @@ const CGFunctionInfo &CodeGenTypes::arrangeLLVMFunctionInfo( } else if (info.getCC() == CC_Swift || info.getCC() == CC_SwiftAsync) { swiftcall::computeABIInfo(CGM, *FI); } else { - getABIInfo().computeInfo(*FI); + CGM.getABIInfo().computeInfo(*FI); } // Loop over all of the computed argument and return value info. If any of @@ -6033,6 +6032,6 @@ RValue CodeGenFunction::EmitVAArg(VAArgExpr *VE, Address &VAListAddr, : EmitVAListRef(VE->getSubExpr()); QualType Ty = VE->getType(); if (VE->isMicrosoftABI()) - return CGM.getTypes().getABIInfo().EmitMSVAArg(*this, VAListAddr, Ty, Slot); - return CGM.getTypes().getABIInfo().EmitVAArg(*this, VAListAddr, Ty, Slot); + return CGM.getABIInfo().EmitMSVAArg(*this, VAListAddr, Ty, Slot); + return CGM.getABIInfo().EmitVAArg(*this, VAListAddr, Ty, Slot); } diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index 667e260f2228dc..e5ba50de3462da 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -209,7 +209,7 @@ CodeGenModule::GetNonVirtualBaseClassOffset(const CXXRecordDecl *ClassDecl, return nullptr; llvm::Type *PtrDiffTy = - Types.ConvertType(getContext().getPointerDiffType()); + getTypes().ConvertType(getContext().getPointerDiffType()); return llvm::ConstantInt::get(PtrDiffTy, Offset.getQuantity()); } diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 0d3b896af8aa39..42742ae83de47b 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -341,10 +341,11 @@ CodeGenModule::CodeGenModule(ASTContext &C, : Context(C), LangOpts(C.getLangOpts()), FS(FS), HeaderSearchOpts(HSO), PreprocessorOpts(PPO), CodeGenOpts(CGO), TheModule(M), Diags(diags), Target(C.getTargetInfo()), ABI(createCXXABI(*this)), - VMContext(M.getContext()), Types(*this), VTables(*this), + 
VMContext(M.getContext()), VTables(*this), SanitizerMD(new SanitizerMetadata(*this)) { // Initialize the type cache. + Types.reset(new CodeGenTypes(*this)); llvm::LLVMContext &LLVMContext = M.getContext(); VoidTy = llvm::Type::getVoidTy(LLVMContext); Int8Ty = llvm::Type::getInt8Ty(LLVMContext); @@ -403,7 +404,7 @@ CodeGenModule::CodeGenModule(ASTContext &C, if (LangOpts.Sanitize.has(SanitizerKind::Thread) || (!CodeGenOpts.RelaxedAliasing && CodeGenOpts.OptimizationLevel > 0)) TBAA.reset(new CodeGenTBAA(Context, getTypes(), TheModule, CodeGenOpts, - getLangOpts(), getCXXABI().getMangleContext())); + getLangOpts())); // If debug info or coverage generation is enabled, create the CGDebugInfo // object. @@ -1465,12 +1466,12 @@ void CodeGenModule::EmitBackendOptionsMetadata( void CodeGenModule::UpdateCompletedType(const TagDecl *TD) { // Make sure that this type is translated. - Types.UpdateCompletedType(TD); + getTypes().UpdateCompletedType(TD); } void CodeGenModule::RefreshTypeCacheForClass(const CXXRecordDecl *RD) { // Make sure that this type is translated. - Types.RefreshTypeCacheForClass(RD); + getTypes().RefreshTypeCacheForClass(RD); } llvm::MDNode *CodeGenModule::getTBAATypeInfo(QualType QTy) { @@ -5405,6 +5406,10 @@ void CodeGenModule::maybeSetTrivialComdat(const Decl &D, GO.setComdat(TheModule.getOrInsertComdat(GO.getName())); } +const ABIInfo &CodeGenModule::getABIInfo() { + return getTargetCodeGenInfo().getABIInfo(); +} + /// Pass IsTentative as true if you want to create a tentative definition. 
void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D, bool IsTentative) { @@ -7813,7 +7818,5 @@ void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) { NewBuilder->WeakRefReferences = std::move(WeakRefReferences); - NewBuilder->TBAA = std::move(TBAA); - NewBuilder->ABI->MangleCtx = std::move(ABI->MangleCtx); } diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index 284bba823baeb4..c58bb88035ca8a 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -320,7 +320,7 @@ class CodeGenModule : public CodeGenTypeCache { // This should not be moved earlier, since its initialization depends on some // of the previous reference members being already initialized and also checks // if TheTargetCodeGenInfo is NULL - CodeGenTypes Types; + std::unique_ptr Types; /// Holds information about C++ vtables. CodeGenVTables VTables; @@ -776,6 +776,7 @@ class CodeGenModule : public CodeGenTypeCache { bool supportsCOMDAT() const; void maybeSetTrivialComdat(const Decl &D, llvm::GlobalObject &GO); + const ABIInfo &getABIInfo(); CGCXXABI &getCXXABI() const { return *ABI; } llvm::LLVMContext &getLLVMContext() { return VMContext; } @@ -783,7 +784,7 @@ class CodeGenModule : public CodeGenTypeCache { const TargetCodeGenInfo &getTargetCodeGenInfo(); - CodeGenTypes &getTypes() { return Types; } + CodeGenTypes &getTypes() { return *Types; } CodeGenVTables &getVTables() { return VTables; } diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp index b7e6a4d1adcc37..5b3393ec150e44 100644 --- a/clang/lib/CodeGen/CodeGenTBAA.cpp +++ b/clang/lib/CodeGen/CodeGenTBAA.cpp @@ -16,6 +16,7 @@ #include "CodeGenTBAA.h" #include "ABIInfoImpl.h" +#include "CGCXXABI.h" #include "CGRecordLayout.h" #include "CodeGenTypes.h" #include "clang/AST/ASTContext.h" @@ -36,10 +37,10 @@ using namespace CodeGen; CodeGenTBAA::CodeGenTBAA(ASTContext &Ctx, CodeGenTypes &CGTypes, llvm::Module &M, const 
CodeGenOptions &CGO, - const LangOptions &Features, MangleContext &MContext) + const LangOptions &Features) : Context(Ctx), CGTypes(CGTypes), Module(M), CodeGenOpts(CGO), - Features(Features), MContext(MContext), MDHelper(M.getContext()), - Root(nullptr), Char(nullptr) {} + Features(Features), MDHelper(M.getContext()), Root(nullptr), + Char(nullptr) {} CodeGenTBAA::~CodeGenTBAA() { } @@ -256,7 +257,8 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) { SmallString<256> OutName; llvm::raw_svector_ostream Out(OutName); - MContext.mangleCanonicalTypeName(QualType(ETy, 0), Out); + CGTypes.getCXXABI().getMangleContext().mangleCanonicalTypeName( + QualType(ETy, 0), Out); return createScalarTypeNode(OutName, getChar(), Size); } @@ -481,7 +483,8 @@ llvm::MDNode *CodeGenTBAA::getBaseTypeInfoHelper(const Type *Ty) { if (Features.CPlusPlus) { // Don't use the mangler for C code. llvm::raw_svector_ostream Out(OutName); - MContext.mangleCanonicalTypeName(QualType(Ty, 0), Out); + CGTypes.getCXXABI().getMangleContext().mangleCanonicalTypeName( + QualType(Ty, 0), Out); } else { OutName = RD->getName(); } diff --git a/clang/lib/CodeGen/CodeGenTBAA.h b/clang/lib/CodeGen/CodeGenTBAA.h index 5d9ecec3ff0fe2..ba74a39a4d25ee 100644 --- a/clang/lib/CodeGen/CodeGenTBAA.h +++ b/clang/lib/CodeGen/CodeGenTBAA.h @@ -24,7 +24,6 @@ namespace clang { class ASTContext; class CodeGenOptions; class LangOptions; - class MangleContext; class QualType; class Type; @@ -120,7 +119,6 @@ class CodeGenTBAA { llvm::Module &Module; const CodeGenOptions &CodeGenOpts; const LangOptions &Features; - MangleContext &MContext; // MDHelper - Helper for creating metadata. 
llvm::MDBuilder MDHelper; @@ -174,8 +172,7 @@ class CodeGenTBAA { public: CodeGenTBAA(ASTContext &Ctx, CodeGenTypes &CGTypes, llvm::Module &M, - const CodeGenOptions &CGO, const LangOptions &Features, - MangleContext &MContext); + const CodeGenOptions &CGO, const LangOptions &Features); ~CodeGenTBAA(); /// getTypeInfo - Get metadata used to describe accesses to objects of the diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 652a6d2f92ad82..c70ae3818a27a4 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -31,9 +31,8 @@ using namespace clang; using namespace CodeGen; CodeGenTypes::CodeGenTypes(CodeGenModule &cgm) - : CGM(cgm), Context(cgm.getContext()), TheModule(cgm.getModule()), - Target(cgm.getTarget()), TheCXXABI(cgm.getCXXABI()), - TheABIInfo(cgm.getTargetCodeGenInfo().getABIInfo()) { + : CGM(cgm), Context(cgm.getContext()), TheModule(cgm.getModule()), + Target(cgm.getTarget()) { SkippedLayout = false; LongDoubleReferenced = false; } @@ -44,6 +43,8 @@ CodeGenTypes::~CodeGenTypes() { delete &*I++; } +CGCXXABI &CodeGenTypes::getCXXABI() const { return getCGM().getCXXABI(); } + const CodeGenOptions &CodeGenTypes::getCodeGenOpts() const { return CGM.getCodeGenOpts(); } diff --git a/clang/lib/CodeGen/CodeGenTypes.h b/clang/lib/CodeGen/CodeGenTypes.h index cbda2628e9140f..5aebf9a2122372 100644 --- a/clang/lib/CodeGen/CodeGenTypes.h +++ b/clang/lib/CodeGen/CodeGenTypes.h @@ -57,11 +57,6 @@ class CodeGenTypes { ASTContext &Context; llvm::Module &TheModule; const TargetInfo &Target; - CGCXXABI &TheCXXABI; - - // This should not be moved earlier, since its initialization depends on some - // of the previous reference members being already initialized - const ABIInfo &TheABIInfo; /// The opaque type map for Objective-C interfaces. 
All direct /// manipulation is done by the runtime interfaces, which are @@ -106,9 +101,8 @@ class CodeGenTypes { } CodeGenModule &getCGM() const { return CGM; } ASTContext &getContext() const { return Context; } - const ABIInfo &getABIInfo() const { return TheABIInfo; } const TargetInfo &getTarget() const { return Target; } - CGCXXABI &getCXXABI() const { return TheCXXABI; } + CGCXXABI &getCXXABI() const; llvm::LLVMContext &getLLVMContext() { return TheModule.getContext(); } const CodeGenOptions &getCodeGenOpts() const; diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index cc6740edabcd3c..76d0191a7e63ad 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -1111,7 +1111,7 @@ static bool isTrivialForMSVC(const CXXRecordDecl *RD, QualType Ty, const Type *Base = nullptr; uint64_t NumElts = 0; if (CGM.getTarget().getTriple().isAArch64() && - CGM.getTypes().getABIInfo().isHomogeneousAggregate(Ty, Base, NumElts) && + CGM.getABIInfo().isHomogeneousAggregate(Ty, Base, NumElts) && isa(Base)) { return true; } diff --git a/clang/test/Interpreter/assigment-with-implicit-ctor.cpp b/clang/test/Interpreter/assigment-with-implicit-ctor.cpp new file mode 100644 index 00000000000000..24cea8ec1a4b2e --- /dev/null +++ b/clang/test/Interpreter/assigment-with-implicit-ctor.cpp @@ -0,0 +1,13 @@ +// REQUIRES: host-supports-jit +// UNSUPPORTED: system-aix +// +// RUN: cat %s | clang-repl | FileCheck %s +// RUN: cat %s | clang-repl -Xcc -O2 | FileCheck %s + +struct box { box() = default; box(int *const data) : data{data} {} int *data{}; }; + +box foo() { box ret; ret = new int{}; return ret; } + +extern "C" int printf(const char *, ...); +printf("good"); +// CHECK: good From a14c7309900f5a61f89b82f6f3d2dc5a51b3e1b4 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 21 Aug 2024 08:08:32 +0200 Subject: [PATCH 012/426] [clang][bytecode] Fix diagnostic in final ltor cast (#105292) Don't diagnose volatile 
reads but diagnose a few other accesses earlier. --- clang/lib/AST/ByteCode/Compiler.cpp | 2 +- clang/lib/AST/ByteCode/EvalEmitter.cpp | 11 +++++--- clang/lib/AST/ByteCode/Interp.cpp | 25 +++++++++++++++++++ clang/lib/AST/ByteCode/Interp.h | 1 + .../temp/temp.arg/temp.arg.nontype/p1-11.cpp | 1 + 5 files changed, 36 insertions(+), 4 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 590087e04b7474..6d05f75131640a 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -2556,7 +2556,7 @@ bool Compiler::VisitCXXConstructExpr(const CXXConstructExpr *E) { if (DiscardResult) return this->emitPopPtr(E); - return true; + return this->emitFinishInit(E); } if (T->isArrayType()) { diff --git a/clang/lib/AST/ByteCode/EvalEmitter.cpp b/clang/lib/AST/ByteCode/EvalEmitter.cpp index e36d86c814e17f..53ec8f52d4921f 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.cpp +++ b/clang/lib/AST/ByteCode/EvalEmitter.cpp @@ -165,11 +165,16 @@ template <> bool EvalEmitter::emitRet(const SourceInfo &Info) { if (ConvertResultToRValue) { if (!Ptr.isZero() && !Ptr.isDereferencable()) return false; + + if (S.getLangOpts().CPlusPlus11 && Ptr.isBlockPointer() && + !CheckFinalLoad(S, OpPC, Ptr)) { + return false; + } + // Never allow reading from a non-const pointer, unless the memory // has been created in this evaluation. 
- if (!Ptr.isZero() && Ptr.isBlockPointer() && - Ptr.block()->getEvalID() != Ctx.getEvalID() && - (!CheckLoad(S, OpPC, Ptr, AK_Read) || !Ptr.isConst())) + if (!Ptr.isZero() && !Ptr.isConst() && Ptr.isBlockPointer() && + Ptr.block()->getEvalID() != Ctx.getEvalID()) return false; if (std::optional V = diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index a0571728570d3f..aea303f0e630c9 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -559,6 +559,31 @@ bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr, return true; } +/// This is not used by any of the opcodes directly. It's used by +/// EvalEmitter to do the final lvalue-to-rvalue conversion. +bool CheckFinalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { + if (!CheckLive(S, OpPC, Ptr, AK_Read)) + return false; + if (!CheckConstant(S, OpPC, Ptr)) + return false; + + if (!CheckDummy(S, OpPC, Ptr, AK_Read)) + return false; + if (!CheckExtern(S, OpPC, Ptr)) + return false; + if (!CheckRange(S, OpPC, Ptr, AK_Read)) + return false; + if (!CheckActive(S, OpPC, Ptr, AK_Read)) + return false; + if (!CheckInitialized(S, OpPC, Ptr, AK_Read)) + return false; + if (!CheckTemporary(S, OpPC, Ptr, AK_Read)) + return false; + if (!CheckMutable(S, OpPC, Ptr)) + return false; + return true; +} + bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { if (!CheckLive(S, OpPC, Ptr, AK_Assign)) return false; diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index b805b7b246c51b..d8629881abc685 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -92,6 +92,7 @@ bool CheckMutable(InterpState &S, CodePtr OpPC, const Pointer &Ptr); /// Checks if a value can be loaded from a block. 
bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr, AccessKinds AK = AK_Read); +bool CheckFinalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr); bool CheckInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr, AccessKinds AK); diff --git a/clang/test/CXX/temp/temp.arg/temp.arg.nontype/p1-11.cpp b/clang/test/CXX/temp/temp.arg/temp.arg.nontype/p1-11.cpp index e28753c3d668cc..692958ef565cf4 100644 --- a/clang/test/CXX/temp/temp.arg/temp.arg.nontype/p1-11.cpp +++ b/clang/test/CXX/temp/temp.arg/temp.arg.nontype/p1-11.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -std=c++11 %s -verify -triple x86_64-linux-gnu +// RUN: %clang_cc1 -std=c++11 %s -verify -triple x86_64-linux-gnu -fexperimental-new-constant-interpreter namespace std { typedef decltype(nullptr) nullptr_t; From 6f456024c37424d9c8cc1cea07126a28f246588d Mon Sep 17 00:00:00 2001 From: Adrian Vogelsgesang Date: Wed, 21 Aug 2024 08:25:57 +0200 Subject: [PATCH 013/426] [lldb-dap] Mark hidden frames as "subtle" (#105457) This commit takes advantage of the recently introduced `SBFrame::IsHidden` to show those hidden frames as "subtle" frames in the UI. 
E.g., VS Code hides those stack frames by default, and renders them as grayed out frames, in case the user decides to show them in the stack trace --- .../lldb-dap/stackTrace/subtleFrames/Makefile | 3 ++ .../subtleFrames/TestDAP_subtleFrames.py | 29 +++++++++++++++++++ .../lldb-dap/stackTrace/subtleFrames/main.cpp | 13 +++++++++ lldb/tools/lldb-dap/JSONUtils.cpp | 3 ++ 4 files changed, 48 insertions(+) create mode 100644 lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/Makefile create mode 100644 lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/TestDAP_subtleFrames.py create mode 100644 lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/main.cpp diff --git a/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/Makefile b/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/Makefile new file mode 100644 index 00000000000000..99998b20bcb050 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/TestDAP_subtleFrames.py b/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/TestDAP_subtleFrames.py new file mode 100644 index 00000000000000..1e41e841e39bc8 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/TestDAP_subtleFrames.py @@ -0,0 +1,29 @@ +""" +Test lldb-dap stack trace response +""" + + +import dap_server +from lldbsuite.test.decorators import * + +import lldbdap_testcase +from lldbsuite.test.lldbtest import * + + +class TestDAP_subtleFrames(lldbdap_testcase.DAPTestCaseBase): + @add_test_categories(["libc++"]) + def test_subtleFrames(self): + """ + Internal stack frames (such as the ones used by `std::function`) are marked as "subtle". 
+ """ + program = self.getBuildArtifact("a.out") + self.build_and_launch(program) + source = "main.cpp" + self.set_source_breakpoints(source, [line_number(source, "BREAK HERE")]) + self.continue_to_next_stop() + + frames = self.get_stackFrames() + for f in frames: + if "__function" in f["name"]: + self.assertEqual(f["presentationHint"], "subtle") + self.assertTrue(any(f.get("presentationHint") == "subtle" for f in frames)) diff --git a/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/main.cpp b/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/main.cpp new file mode 100644 index 00000000000000..71944528441e38 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/main.cpp @@ -0,0 +1,13 @@ +#include +#include + +void greet() { + // BREAK HERE + std::cout << "Hello\n"; +} + +int main() { + std::function func{greet}; + func(); + return 0; +} diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index a8b85f55939e17..c080fd395b7288 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -763,6 +763,9 @@ llvm::json::Value CreateStackFrame(lldb::SBFrame &frame) { object.try_emplace("instructionPointerReference", formatted_addr); } + if (frame.IsArtificial() || frame.IsHidden()) + object.try_emplace("presentationHint", "subtle"); + return llvm::json::Value(std::move(object)); } From 86f2ec03f898950e1e41c2205f4cf7c60d118390 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 21 Aug 2024 07:29:02 +0100 Subject: [PATCH 014/426] [llvm][DWARFLinker] Don't attach DW_AT_dwo_id to CUs (#105186) This fixes a verifier error uncovered as a result of https://github.com/llvm/llvm-project/pull/101775. 
When compiling with `-gmodules`, Clang will create a skeleton CU that contains a `DW_AT_dwo_id` and a `DW_AT_dwo_name` corresponding to the path of the `.pcm` that carries the type definitions referenced in the non-skeleton CU (see the [gmodules LLDB docs](https://lldb.llvm.org/resources/extensions.html) for more details). The non-skeleton CU will also contain a `DW_AT_dwo_id` that matches that of the skeleton. `dsymutil` effectively undoes the `-gmodules` work, replacing all the module type references with definitions. I.e., we no longer create a skeleton `.dwo` CU. Prior to this patch `dsymutil` did not strip out the `DW_AT_dwo_id` on the non-skeleton CU. This now (since https://github.com/llvm/llvm-project/pull/101775) causes verification errors such as: ``` Verifying .debug_names... error: Name Index @ 0x0: Entry @ 0x9a unable to load .dwo file "None" for DWARF unit @ 0x0. error: output verification failed for x86_64 make: *** [a.out.dSYM] Error 1 ``` ...because the verifier sees the DWO ID but can't find a matching `.dwo` unit. This patch simply strips the `DW_AT_dwo_id` from the main CU. --- llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp | 6 ++++++ llvm/test/tools/dsymutil/X86/modules.m | 12 ++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp index 7510326f2e1b34..280d3f1861ff00 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp @@ -1412,6 +1412,12 @@ unsigned DWARFLinker::DIECloner::cloneScalarAttribute( unsigned AttrSize, AttributesInfo &Info) { uint64_t Value; + // We don't emit any skeleton CUs with dsymutil. So avoid emitting + // a redundant DW_AT_GNU_dwo_id on the non-skeleton CU. + if (AttrSpec.Attr == dwarf::DW_AT_GNU_dwo_id || + AttrSpec.Attr == dwarf::DW_AT_dwo_id) + return 0; + // Check for the offset to the macro table. 
If offset is incorrect then we // need to remove the attribute. if (AttrSpec.Attr == dwarf::DW_AT_macro_info) { diff --git a/llvm/test/tools/dsymutil/X86/modules.m b/llvm/test/tools/dsymutil/X86/modules.m index 9467dcb35955cc..5145e07dd65e52 100644 --- a/llvm/test/tools/dsymutil/X86/modules.m +++ b/llvm/test/tools/dsymutil/X86/modules.m @@ -31,6 +31,7 @@ #ifdef BAR_H // --------------------------------------------------------------------- // CHECK: DW_TAG_compile_unit +// CHECK-NOT: DW_AT_GNU_dwo_id // CHECK-NOT: DW_TAG // CHECK: DW_TAG_module // CHECK-NEXT: DW_AT_name{{.*}}"Bar" @@ -55,6 +56,7 @@ #ifdef FOO_H // --------------------------------------------------------------------- // CHECK: DW_TAG_compile_unit +// CHECK-NOT: DW_AT_GNU_dwo_id // CHECK-NOT: DW_TAG // CHECK: 0x0[[FOO:.*]]: DW_TAG_module // CHECK-NEXT: DW_AT_name{{.*}}"Foo" @@ -92,8 +94,9 @@ @interface Foo { #else // --------------------------------------------------------------------- -// CHECK: DW_TAG_compile_unit -// CHECK: DW_AT_low_pc +// CHECK: DW_TAG_compile_unit +// CHECK-NOT: DW_AT_GNU_dwo_id +// CHECK: DW_AT_low_pc // CHECK-NOT: DW_TAG_module // CHECK-NOT: DW_TAG_typedef // @@ -130,8 +133,9 @@ int main(int argc, char **argv) { #endif #endif -// CHECK: DW_TAG_compile_unit -// CHECK: DW_AT_name {{.*}}"odr_violation.c" +// CHECK: DW_TAG_compile_unit +// CHECK-NOT: DW_AT_GNU_dwo_id +// CHECK: DW_AT_name {{.*}}"odr_violation.c" // CHECK: DW_TAG_variable // CHECK: DW_AT_name {{.*}}"odr_violation" // CHECK: DW_AT_type [DW_FORM_ref4] ({{.*}}{0x{{0*}}[[BAR2:.*]]} From 7f7f4feaf07dd3bb4b22d0c25d34b6c99c753aa2 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 20 Aug 2024 23:37:19 -0700 Subject: [PATCH 015/426] Revert "[AArch64] Optimize when storing symmetry constants" (#105474) Reverts llvm/llvm-project#93717 Introduce stack use after return https://lab.llvm.org/buildbot/#/builders/24/builds/1003 --- .../AArch64/AArch64LoadStoreOptimizer.cpp | 178 ----------------- 
.../CodeGen/AArch64/movimm-expand-ldst.ll | 180 ------------------ .../CodeGen/AArch64/movimm-expand-ldst.mir | 128 ------------- 3 files changed, 486 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index e06a811ba5a20a..8de3f8db84ae2b 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -226,14 +226,6 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Find and merge an index ldr/st instruction into a base ld/st instruction. bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale); - // Finds and collapses loads of symmetric constant value. - bool tryFoldSymmetryConstantLoad(MachineBasicBlock::iterator &I, - unsigned Limit); - MachineBasicBlock::iterator - doFoldSymmetryConstantLoad(MachineInstr &MI, - SmallVectorImpl &MIs, - int UpperLoadIdx, int Accumulated); - bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt); bool runOnMachineFunction(MachineFunction &Fn) override; @@ -2451,155 +2443,6 @@ AArch64LoadStoreOpt::findMatchingConstOffsetBackward( return E; } -static bool isSymmetricLoadCandidate(MachineInstr &MI, Register BaseReg) { - auto MatchBaseReg = [&](unsigned Count) { - for (unsigned I = 0; I < Count; I++) { - auto OpI = MI.getOperand(I); - if (OpI.isReg() && OpI.getReg() != BaseReg) - return false; - } - return true; - }; - - unsigned Opc = MI.getOpcode(); - switch (Opc) { - default: - return false; - case AArch64::MOVZXi: - return MatchBaseReg(1); - case AArch64::MOVKXi: - return MatchBaseReg(2); - case AArch64::ORRXrs: - MachineOperand &Imm = MI.getOperand(3); - // Fourth operand of ORR must be 32 which mean - // 32bit symmetric constant load. 
- // ex) renamable $x8 = ORRXrs $x8, $x8, 32 - if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == 32) - return true; - } - - return false; -} - -MachineBasicBlock::iterator AArch64LoadStoreOpt::doFoldSymmetryConstantLoad( - MachineInstr &MI, SmallVectorImpl &MIs, - int UpperLoadIdx, int Accumulated) { - MachineBasicBlock::iterator I = MI.getIterator(); - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineBasicBlock::iterator NextI = next_nodbg(I, E); - MachineBasicBlock *MBB = MI.getParent(); - - if (!UpperLoadIdx) { - // ORR ensures that previous instructions load lower 32-bit constants. - // Remove ORR only. - (*MIs.begin())->eraseFromParent(); - } else { - // We need to remove MOV for upper of 32bit because we know these instrs - // is part of symmetric constant. - int Index = 0; - for (auto MI = MIs.begin(); Index < UpperLoadIdx; ++MI, Index++) { - (*MI)->eraseFromParent(); - } - } - - Register BaseReg = getLdStRegOp(MI).getReg(); - const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI); - Register DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32); - unsigned DstRegState = getRegState(MI.getOperand(0)); - int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm(); - BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi)) - .addReg(DstRegW, DstRegState) - .addReg(DstRegW, DstRegState) - .addReg(MO.getReg(), getRegState(MO)) - .addImm(Offset * 2) - .setMemRefs(MI.memoperands()) - .setMIFlags(MI.getFlags()); - I->eraseFromParent(); - return NextI; -} - -bool AArch64LoadStoreOpt::tryFoldSymmetryConstantLoad( - MachineBasicBlock::iterator &I, unsigned Limit) { - MachineInstr &MI = *I; - if (MI.getOpcode() != AArch64::STRXui) - return false; - - MachineBasicBlock::iterator MBBI = I; - MachineBasicBlock::iterator B = I->getParent()->begin(); - if (MBBI == B) - return false; - - TypeSize Scale(0U, false), Width(0U, false); - int64_t MinOffset, MaxOffset; - if (!AArch64InstrInfo::getMemOpInfo(AArch64::STPWi, Scale, Width, MinOffset, - 
MaxOffset)) - return false; - - // We replace the STRX instruction, which stores 64 bits, with the STPW - // instruction, which stores two consecutive 32 bits. Therefore, we compare - // the offset range with multiplied by two. - int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm(); - if (Offset * 2 < MinOffset || Offset * 2 > MaxOffset) - return false; - - Register BaseReg = getLdStRegOp(MI).getReg(); - unsigned Count = 0, UpperLoadIdx = 0; - uint64_t Accumulated = 0, Mask = 0xFFFFUL; - bool hasORR = false, Found = false; - SmallVector MIs; - ModifiedRegUnits.clear(); - UsedRegUnits.clear(); - do { - MBBI = prev_nodbg(MBBI, B); - MachineInstr &MI = *MBBI; - if (!MI.isTransient()) - ++Count; - if (!isSymmetricLoadCandidate(MI, BaseReg)) { - LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, - TRI); - if (!ModifiedRegUnits.available(BaseReg) || - !UsedRegUnits.available(BaseReg)) - return false; - continue; - } - - unsigned Opc = MI.getOpcode(); - if (Opc == AArch64::ORRXrs) { - hasORR = true; - MIs.push_back(MBBI); - continue; - } - unsigned ValueOrder = Opc == AArch64::MOVZXi ? 
1 : 2; - MachineOperand Value = MI.getOperand(ValueOrder); - MachineOperand Shift = MI.getOperand(ValueOrder + 1); - if (!Value.isImm() || !Shift.isImm()) - return false; - - uint64_t IValue = Value.getImm(); - uint64_t IShift = Shift.getImm(); - uint64_t Adder = IValue << IShift; - MIs.push_back(MBBI); - if (Adder >> 32) - UpperLoadIdx = MIs.size(); - - Accumulated -= Accumulated & (Mask << IShift); - Accumulated += Adder; - if (Accumulated != 0 && - (((Accumulated >> 32) == (Accumulated & 0xffffffffULL)) || - (hasORR && (Accumulated >> 32 == 0)))) { - Found = true; - break; - } - } while (MBBI != B && Count < Limit); - - if (Found) { - I = doFoldSymmetryConstantLoad(MI, MIs, UpperLoadIdx, Accumulated); - return true; - } - - return false; -} - bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; @@ -2910,27 +2753,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; } - // We have an opportunity to optimize the `STRXui` instruction, which loads - // the same 32-bit value into a register twice. The `STPXi` instruction allows - // us to load a 32-bit value only once. 
- // Considering : - // renamable $x8 = MOVZXi 49370, 0 - // renamable $x8 = MOVKXi $x8, 320, 16 - // renamable $x8 = ORRXrs $x8, $x8, 32 - // STRXui killed renamable $x8, killed renamable $x0, 0 - // Transform : - // $w8 = MOVZWi 49370, 0 - // $w8 = MOVKWi $w8, 320, 16 - // STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0 - for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - MBBI != E;) { - if (isMergeableLdStUpdate(*MBBI) && - tryFoldSymmetryConstantLoad(MBBI, UpdateLimit)) - Modified = true; - else - ++MBBI; - } - return Modified; } diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll index 9f4ebf5efb982a..b25ac96f97c7d0 100644 --- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll +++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll @@ -93,183 +93,3 @@ define i64 @testuu0xf555f555f555f555() { ; CHECK-NEXT: ret ret i64 u0xf555f555f555f555 } - -define void @test_store_0x1234567812345678(ptr %x) { -; CHECK-LABEL: test_store_0x1234567812345678: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #22136 // =0x5678 -; CHECK-NEXT: movk x8, #4660, lsl #16 -; CHECK-NEXT: stp w8, w8, [x0] -; CHECK-NEXT: ret - store i64 u0x1234567812345678, ptr %x - ret void -} - -define void @test_store_0xff3456ffff3456ff(ptr %x) { -; CHECK-LABEL: test_store_0xff3456ffff3456ff: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #22271 // =0x56ff -; CHECK-NEXT: movk x8, #65332, lsl #16 -; CHECK-NEXT: stp w8, w8, [x0] -; CHECK-NEXT: ret - store i64 u0xff3456ffff3456ff, ptr %x - ret void -} - -define void @test_store_0x00345600345600(ptr %x) { -; CHECK-LABEL: test_store_0x00345600345600: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #22016 // =0x5600 -; CHECK-NEXT: movk x8, #52, lsl #16 -; CHECK-NEXT: movk x8, #13398, lsl #32 -; CHECK-NEXT: str x8, [x0] -; CHECK-NEXT: ret - store i64 u0x00345600345600, ptr %x - ret void -} - -define void @test_store_0x5555555555555555(ptr %x) { -; CHECK-LABEL: 
test_store_0x5555555555555555: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 -; CHECK-NEXT: str x8, [x0] -; CHECK-NEXT: ret - store i64 u0x5555555555555555, ptr %x - ret void -} - -define void @test_store_0x5055555550555555(ptr %x) { -; CHECK-LABEL: test_store_0x5055555550555555: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 -; CHECK-NEXT: and x8, x8, #0xf0fffffff0ffffff -; CHECK-NEXT: str x8, [x0] -; CHECK-NEXT: ret - store i64 u0x5055555550555555, ptr %x - ret void -} - -define void @test_store_0x0000555555555555(ptr %x) { -; CHECK-LABEL: test_store_0x0000555555555555: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 -; CHECK-NEXT: movk x8, #0, lsl #48 -; CHECK-NEXT: str x8, [x0] -; CHECK-NEXT: ret - store i64 u0x0000555555555555, ptr %x - ret void -} - -define void @test_store_0x0000555500005555(ptr %x) { -; CHECK-LABEL: test_store_0x0000555500005555: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #21845 // =0x5555 -; CHECK-NEXT: stp w8, w8, [x0] -; CHECK-NEXT: ret - store i64 u0x0000555500005555, ptr %x - ret void -} - -define void @test_store_0x5555000055550000(ptr %x) { -; CHECK-LABEL: test_store_0x5555000055550000: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #1431633920 // =0x55550000 -; CHECK-NEXT: stp w8, w8, [x0] -; CHECK-NEXT: ret - store i64 u0x5555000055550000, ptr %x - ret void -} - -define void @test_store_u0xffff5555ffff5555(ptr %x) { -; CHECK-LABEL: test_store_u0xffff5555ffff5555: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-43691 // =0xffffffffffff5555 -; CHECK-NEXT: movk x8, #21845, lsl #32 -; CHECK-NEXT: str x8, [x0] -; CHECK-NEXT: ret - store i64 u0xffff5555ffff5555, ptr %x - ret void -} - -define void @test_store_0x8888ffff8888ffff(ptr %x) { -; CHECK-LABEL: test_store_0x8888ffff8888ffff: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-2004287489 // =0xffffffff8888ffff -; CHECK-NEXT: movk x8, #34952, lsl #48 -; CHECK-NEXT: str x8, 
[x0] -; CHECK-NEXT: ret - store i64 u0x8888ffff8888ffff, ptr %x - ret void -} - -define void @test_store_uu0xfffff555f555f555(ptr %x) { -; CHECK-LABEL: test_store_uu0xfffff555f555f555: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-2731 // =0xfffffffffffff555 -; CHECK-NEXT: movk x8, #62805, lsl #16 -; CHECK-NEXT: movk x8, #62805, lsl #32 -; CHECK-NEXT: str x8, [x0] -; CHECK-NEXT: ret - store i64 u0xfffff555f555f555, ptr %x - ret void -} - -define void @test_store_uu0xf555f555f555f555(ptr %x) { -; CHECK-LABEL: test_store_uu0xf555f555f555f555: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 -; CHECK-NEXT: orr x8, x8, #0xe001e001e001e001 -; CHECK-NEXT: str x8, [x0] -; CHECK-NEXT: ret - store i64 u0xf555f555f555f555, ptr %x - ret void -} - -define void @test_store_0x1234567812345678_offset_range(ptr %x) { -; CHECK-LABEL: test_store_0x1234567812345678_offset_range: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #22136 // =0x5678 -; CHECK-NEXT: movk x8, #4660, lsl #16 -; CHECK-NEXT: stp w8, w8, [x0, #32] -; CHECK-NEXT: ret - %g = getelementptr i64, ptr %x, i64 4 - store i64 u0x1234567812345678, ptr %g - ret void -} - -define void @test_store_0x1234567812345678_offset_min(ptr %x) { -; CHECK-LABEL: test_store_0x1234567812345678_offset_min: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #22136 // =0x5678 -; CHECK-NEXT: movk x8, #4660, lsl #16 -; CHECK-NEXT: stp w8, w8, [x0] -; CHECK-NEXT: ret - %g = getelementptr i8, ptr %x, i32 0 - store i64 u0x1234567812345678, ptr %g - ret void -} - -define void @test_store_0x1234567812345678_offset_max(ptr %x) { -; CHECK-LABEL: test_store_0x1234567812345678_offset_max: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #22136 // =0x5678 -; CHECK-NEXT: movk x8, #4660, lsl #16 -; CHECK-NEXT: stp w8, w8, [x0, #248] -; CHECK-NEXT: ret - %g = getelementptr i8, ptr %x, i32 248 - store i64 u0x1234567812345678, ptr %g - ret void -} - -define void @test_store_0x1234567812345678_offset_max_over(ptr %x) { -; CHECK-LABEL: 
test_store_0x1234567812345678_offset_max_over: -; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #22136 // =0x5678 -; CHECK-NEXT: movk x8, #4660, lsl #16 -; CHECK-NEXT: orr x8, x8, x8, lsl #32 -; CHECK-NEXT: stur x8, [x0, #249] -; CHECK-NEXT: ret - %g = getelementptr i8, ptr %x, i32 249 - store i64 u0x1234567812345678, ptr %g - ret void -} diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir index 1effb9aa63a943..72529807d5d54a 100644 --- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir +++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir @@ -32,131 +32,3 @@ body: | ; CHECK-NEXT: RET undef $lr, implicit $x0 renamable $x0 = MOVi64imm -4550323095879417536 RET_ReallyLR implicit $x0 -... ---- -name: test_fold_repeating_constant_store -tracksRegLiveness: true -body: | - bb.0: - liveins: $x0 - ; CHECK-LABEL: name: test_fold_repeating_constant_store - ; CHECK: liveins: $x0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $x8 = MOVZXi 49370, 0 - ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 320, 16 - ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0 - ; CHECK-NEXT: RET undef $lr - renamable $x8 = MOVi64imm 90284035103834330 - STRXui killed renamable $x8, killed renamable $x0, 0 - RET_ReallyLR -... ---- -name: test_fold_repeating_constant_store_neg -tracksRegLiveness: true -body: | - bb.0: - liveins: $x0 - ; CHECK-LABEL: name: test_fold_repeating_constant_store_neg - ; CHECK: liveins: $x0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $x8 = MOVZXi 320, 0 - ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 49370, 16 - ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0 - ; CHECK-NEXT: RET undef $lr - renamable $x8 = MOVi64imm -4550323095879417536 - STRXui killed renamable $x8, killed renamable $x0, 0 - RET_ReallyLR -... 
---- -name: test_fold_repeating_constant_store_16bit_unit -tracksRegLiveness: true -body: | - bb.0: - liveins: $x0 - ; CHECK-LABEL: name: test_fold_repeating_constant_store_16bit_unit - ; CHECK: liveins: $x0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $x8 = MOVZXi 21845, 16 - ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0 - ; CHECK-NEXT: RET undef $lr - renamable $x8 = MOVZXi 21845, 16 - renamable $x8 = MOVKXi $x8, 21845, 48 - STRXui killed renamable $x8, killed renamable $x0, 0 - RET undef $lr -... ---- -name: test_fold_repeating_constant_store_offset_min -tracksRegLiveness: true -body: | - bb.0: - liveins: $x0 - ; CHECK-LABEL: name: test_fold_repeating_constant_store_offset_min - ; CHECK: liveins: $x0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $x8 = MOVZXi 22136, 0 - ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 4660, 16 - ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0 - ; CHECK-NEXT: RET undef $lr - renamable $x8 = MOVZXi 22136, 0 - renamable $x8 = MOVKXi $x8, 4660, 16 - renamable $x8 = ORRXrs $x8, $x8, 32 - STRXui killed renamable $x8, killed renamable $x0, 0 - RET undef $lr -... ---- -name: test_fold_repeating_constant_store_offset_max -tracksRegLiveness: true -body: | - bb.0: - liveins: $x0 - ; CHECK-LABEL: name: test_fold_repeating_constant_store_offset_max - ; CHECK: liveins: $x0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $x8 = MOVZXi 22136, 0 - ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 4660, 16 - ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 62 - ; CHECK-NEXT: RET undef $lr - renamable $x8 = MOVZXi 22136, 0 - renamable $x8 = MOVKXi $x8, 4660, 16 - renamable $x8 = ORRXrs $x8, $x8, 32 - STRXui killed renamable $x8, killed renamable $x0, 31 - RET undef $lr -... 
---- -name: test_fold_repeating_constant_store_offset_min_lower -tracksRegLiveness: true -body: | - bb.0: - liveins: $x0 - ; CHECK-LABEL: name: test_fold_repeating_constant_store_offset_min_lower - ; CHECK: liveins: $x0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $x8 = MOVZXi 22136, 0 - ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 4660, 16 - ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0 - ; CHECK-NEXT: RET undef $lr - renamable $x8 = MOVZXi 22136, 0 - renamable $x8 = MOVKXi $x8, 4660, 16 - renamable $x8 = ORRXrs $x8, $x8, 32 - STRXui killed renamable $x8, killed renamable $x0, 0 - RET undef $lr -... ---- -name: test_fold_repeating_constant_store_offset_max_over -tracksRegLiveness: true -body: | - bb.0: - liveins: $x0 - ; CHECK-LABEL: name: test_fold_repeating_constant_store_offset_max_over - ; CHECK: liveins: $x0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $x8 = MOVZXi 22136, 0 - ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 4660, 16 - ; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32 - ; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 32 - ; CHECK-NEXT: RET undef $lr - renamable $x8 = MOVZXi 22136, 0 - renamable $x8 = MOVKXi $x8, 4660, 16 - renamable $x8 = ORRXrs $x8, $x8, 32 - STRXui killed renamable $x8, killed renamable $x0, 32 - RET undef $lr From 90556efaa2f5703920cce4a9c0ee36365e15e2ab Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 20 Aug 2024 23:37:41 -0700 Subject: [PATCH 016/426] [Driver] Use llvm::make_range(std::pair) (NFC) (#105470) --- clang/lib/Driver/ToolChains/Clang.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index f7c2f485d3fc11..53fdc29948508e 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -9088,7 +9088,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, // Pass the CUDA path to the linker wrapper tool. 
for (Action::OffloadKind Kind : {Action::OFK_Cuda, Action::OFK_OpenMP}) { auto TCRange = C.getOffloadToolChains(Kind); - for (auto &I : llvm::make_range(TCRange.first, TCRange.second)) { + for (auto &I : llvm::make_range(TCRange)) { const ToolChain *TC = I.second; if (TC->getTriple().isNVPTX()) { CudaInstallationDetector CudaInstallation(D, TheTriple, Args); From a3d41879ecf5690a73f9226951d3856c7faa34a4 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Wed, 21 Aug 2024 08:44:54 +0200 Subject: [PATCH 017/426] [mlir][ODS] Optionally generate public C++ functions for type constraints (#104577) Add `gen-type-constraint-decls` and `gen-type-constraint-defs`, which generate public C++ functions for type constraints. The name of the C++ function is specified in the `cppFunctionName` field. Type constraints are typically used for op/type/attribute verification. They are also sometimes called from builders and transformations. Until now, this required duplicating the check in C++. Note: This commit just adds the option for type constraints, but attribute constraints could be supported in the same way. Alternatives considered: 1. The C++ functions could also be generated as part of `gen-typedef-decls/defs`, but that can be confusing because type constraints may rely on type definitions from multiple `.td` files. `#include`s could cause duplicate definitions of the same type constraint. 2. The C++ functions could also be generated as static member functions of dialects, but they don't really belong to a dialect. (Because they may rely on type definitions from multiple dialects.) 
--- mlir/docs/DefiningDialects/Constraints.md | 59 +++++++++++++++++++ mlir/include/mlir/IR/BuiltinTypes.h | 1 + mlir/include/mlir/IR/BuiltinTypes.td | 14 ++--- mlir/include/mlir/IR/CMakeLists.txt | 3 + mlir/include/mlir/IR/Constraints.td | 6 +- mlir/include/mlir/TableGen/Constraint.h | 4 ++ mlir/lib/IR/BuiltinTypes.cpp | 16 +++++- mlir/lib/IR/CMakeLists.txt | 1 + mlir/lib/TableGen/Constraint.cpp | 10 +++- mlir/test/mlir-tblgen/type-constraints.td | 14 +++++ mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp | 64 +++++++++++++++++++++ 11 files changed, 181 insertions(+), 11 deletions(-) create mode 100644 mlir/docs/DefiningDialects/Constraints.md create mode 100644 mlir/test/mlir-tblgen/type-constraints.td diff --git a/mlir/docs/DefiningDialects/Constraints.md b/mlir/docs/DefiningDialects/Constraints.md new file mode 100644 index 00000000000000..52a4283d6084c6 --- /dev/null +++ b/mlir/docs/DefiningDialects/Constraints.md @@ -0,0 +1,59 @@ +# Constraints + +[TOC] + +## Attribute / Type Constraints + +When defining the arguments of an operation in TableGen, users can specify +either plain attributes/types or use attribute/type constraints to levy +additional requirements on the attribute value or operand type. + +```tablegen +def My_Type1 : MyDialect_Type<"Type1", "type1"> { ... } +def My_Type2 : MyDialect_Type<"Type2", "type2"> { ... } + +// Plain type +let arguments = (ins MyType1:$val); +// Type constraint +let arguments = (ins AnyTypeOf<[MyType1, MyType2]>:$val); +``` + +`AnyTypeOf` is an example for a type constraints. Many useful type constraints +can be found in `mlir/IR/CommonTypeConstraints.td`. Additional verification +code is generated for type/attribute constraints. Type constraints can not only +be used when defining operation arguments, but also when defining type +parameters. + +Optionally, C++ functions can be generated, so that type constraints can be +checked from C++. The name of the C++ function must be specified in the +`cppFunctionName` field. 
If no function name is specified, no C++ function is +emitted. + +```tablegen +// Example: Element type constraint for VectorType +def Builtin_VectorTypeElementType : AnyTypeOf<[AnyInteger, Index, AnyFloat]> { + let cppFunctionName = "isValidVectorTypeElementType"; +} +``` + +The above example translates into the following C++ code: +```c++ +bool isValidVectorTypeElementType(::mlir::Type type) { + return (((::llvm::isa<::mlir::IntegerType>(type))) || ((::llvm::isa<::mlir::IndexType>(type))) || ((::llvm::isa<::mlir::FloatType>(type)))); +} +``` + +An extra TableGen rule is needed to emit C++ code for type constraints. This +will generate only the declarations/definitions of the type constraints that +are defined in the specified `.td` file, but not those that are in included +`.td` files. + +```cmake +mlir_tablegen(TypeConstraints.h.inc -gen-type-constraint-decls) +mlir_tablegen(TypeConstraints.cpp.inc -gen-type-constraint-defs) +``` + +The generated `TypeConstraints.h.inc` will need to be included +wherever you are referencing the type constraint in C++. Note that no C++ +namespace will be emitted by the code generator. The `#include` statements of +the `.h.inc`/`.cpp.inc` files should be wrapped in C++ namespaces by the user. 
diff --git a/mlir/include/mlir/IR/BuiltinTypes.h b/mlir/include/mlir/IR/BuiltinTypes.h index d12522ba55c96e..eefa4279df1a01 100644 --- a/mlir/include/mlir/IR/BuiltinTypes.h +++ b/mlir/include/mlir/IR/BuiltinTypes.h @@ -198,6 +198,7 @@ class BaseMemRefType : public Type, public ShapedType::Trait { #include "mlir/IR/BuiltinTypes.h.inc" namespace mlir { +#include "mlir/IR/BuiltinTypeConstraints.h.inc" //===----------------------------------------------------------------------===// // MemRefType diff --git a/mlir/include/mlir/IR/BuiltinTypes.td b/mlir/include/mlir/IR/BuiltinTypes.td index 4b3add2035263c..1ab1bbe9bfc9b2 100644 --- a/mlir/include/mlir/IR/BuiltinTypes.td +++ b/mlir/include/mlir/IR/BuiltinTypes.td @@ -1097,6 +1097,10 @@ def Builtin_UnrankedTensor : Builtin_Type<"UnrankedTensor", "unranked_tensor", [ // VectorType //===----------------------------------------------------------------------===// +def Builtin_VectorTypeElementType : AnyTypeOf<[AnyInteger, Index, AnyFloat]> { + let cppFunctionName = "isValidVectorTypeElementType"; +} + def Builtin_Vector : Builtin_Type<"Vector", "vector", [ShapedTypeInterface, ValueSemantics], "Type"> { let summary = "Multi-dimensional SIMD vector type"; @@ -1147,7 +1151,7 @@ def Builtin_Vector : Builtin_Type<"Vector", "vector", }]; let parameters = (ins ArrayRefParameter<"int64_t">:$shape, - AnyTypeOf<[AnyInteger, Index, AnyFloat]>:$elementType, + Builtin_VectorTypeElementType:$elementType, ArrayRefParameter<"bool">:$scalableDims ); let builders = [ @@ -1171,12 +1175,8 @@ def Builtin_Vector : Builtin_Type<"Vector", "vector", class Builder; /// Returns true if the given type can be used as an element of a vector - /// type. In particular, vectors can consist of integer, index, or float - /// primitives. - static bool isValidElementType(Type t) { - // TODO: Auto-generate this function from $elementType. - return ::llvm::isa(t); - } + /// type. See "Builtin_VectorTypeElementType" for allowed types. 
+ static bool isValidElementType(Type t); /// Returns true if the vector contains scalable dimensions. bool isScalable() const { diff --git a/mlir/include/mlir/IR/CMakeLists.txt b/mlir/include/mlir/IR/CMakeLists.txt index 04a57d26a068d5..b741eb18d47916 100644 --- a/mlir/include/mlir/IR/CMakeLists.txt +++ b/mlir/include/mlir/IR/CMakeLists.txt @@ -35,6 +35,9 @@ set(LLVM_TARGET_DEFINITIONS BuiltinTypes.td) mlir_tablegen(BuiltinTypes.h.inc -gen-typedef-decls) mlir_tablegen(BuiltinTypes.cpp.inc -gen-typedef-defs) add_public_tablegen_target(MLIRBuiltinTypesIncGen) +mlir_tablegen(BuiltinTypeConstraints.h.inc -gen-type-constraint-decls) +mlir_tablegen(BuiltinTypeConstraints.cpp.inc -gen-type-constraint-defs) +add_public_tablegen_target(MLIRBuiltinTypeConstraintsIncGen) set(LLVM_TARGET_DEFINITIONS BuiltinTypeInterfaces.td) mlir_tablegen(BuiltinTypeInterfaces.h.inc -gen-type-interface-decls) diff --git a/mlir/include/mlir/IR/Constraints.td b/mlir/include/mlir/IR/Constraints.td index 39bc55db63da1a..13223aa8abcdaa 100644 --- a/mlir/include/mlir/IR/Constraints.td +++ b/mlir/include/mlir/IR/Constraints.td @@ -149,10 +149,14 @@ class Constraint { // Subclass for constraints on a type. class TypeConstraint : + string cppTypeParam = "::mlir::Type", + string cppFunctionNameParam = ""> : Constraint { // The name of the C++ Type class if known, or Type if not. string cppType = cppTypeParam; + // The name of the C++ function that is generated for this type constraint. + // If empty, no C++ function is generated. + string cppFunctionName = cppFunctionNameParam; } // Subclass for constraints on an attribute. diff --git a/mlir/include/mlir/TableGen/Constraint.h b/mlir/include/mlir/TableGen/Constraint.h index 0d0c28e651ee99..8877daaa775145 100644 --- a/mlir/include/mlir/TableGen/Constraint.h +++ b/mlir/include/mlir/TableGen/Constraint.h @@ -69,6 +69,10 @@ class Constraint { /// context on the def). 
std::string getUniqueDefName() const; + /// Returns the name of the C++ function that should be generated for this + /// constraint, or std::nullopt if no C++ function should be generated. + std::optional getCppFunctionName() const; + Kind getKind() const { return kind; } /// Return the underlying def. diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp index a3f5ece8c17369..16b53efa55fb80 100644 --- a/mlir/lib/IR/BuiltinTypes.cpp +++ b/mlir/lib/IR/BuiltinTypes.cpp @@ -32,6 +32,10 @@ using namespace mlir::detail; #define GET_TYPEDEF_CLASSES #include "mlir/IR/BuiltinTypes.cpp.inc" +namespace mlir { +#include "mlir/IR/BuiltinTypeConstraints.cpp.inc" +} // namespace mlir + //===----------------------------------------------------------------------===// // BuiltinDialect //===----------------------------------------------------------------------===// @@ -230,6 +234,10 @@ LogicalResult OpaqueType::verify(function_ref emitError, // VectorType //===----------------------------------------------------------------------===// +bool VectorType::isValidElementType(Type t) { + return isValidVectorTypeElementType(t); +} + LogicalResult VectorType::verify(function_ref emitError, ArrayRef shape, Type elementType, ArrayRef scalableDims) { @@ -278,7 +286,9 @@ Type TensorType::getElementType() const { [](auto type) { return type.getElementType(); }); } -bool TensorType::hasRank() const { return !llvm::isa(*this); } +bool TensorType::hasRank() const { + return !llvm::isa(*this); +} ArrayRef TensorType::getShape() const { return llvm::cast(*this).getShape(); @@ -365,7 +375,9 @@ Type BaseMemRefType::getElementType() const { [](auto type) { return type.getElementType(); }); } -bool BaseMemRefType::hasRank() const { return !llvm::isa(*this); } +bool BaseMemRefType::hasRank() const { + return !llvm::isa(*this); +} ArrayRef BaseMemRefType::getShape() const { return llvm::cast(*this).getShape(); diff --git a/mlir/lib/IR/CMakeLists.txt b/mlir/lib/IR/CMakeLists.txt index 
c38ce6c058a006..4cabac185171c2 100644 --- a/mlir/lib/IR/CMakeLists.txt +++ b/mlir/lib/IR/CMakeLists.txt @@ -55,6 +55,7 @@ add_mlir_library(MLIRIR MLIRBuiltinLocationAttributesIncGen MLIRBuiltinOpsIncGen MLIRBuiltinTypesIncGen + MLIRBuiltinTypeConstraintsIncGen MLIRBuiltinTypeInterfacesIncGen MLIRCallInterfacesIncGen MLIRCastInterfacesIncGen diff --git a/mlir/lib/TableGen/Constraint.cpp b/mlir/lib/TableGen/Constraint.cpp index 4ccbd0a685e09a..8cf4ed08a2d54f 100644 --- a/mlir/lib/TableGen/Constraint.cpp +++ b/mlir/lib/TableGen/Constraint.cpp @@ -30,7 +30,7 @@ Constraint::Constraint(const llvm::Record *record) kind = CK_Region; } else if (def->isSubClassOf("SuccessorConstraint")) { kind = CK_Successor; - } else if(!def->isSubClassOf("Constraint")) { + } else if (!def->isSubClassOf("Constraint")) { llvm::errs() << "Expected a constraint but got: \n" << *def << "\n"; llvm::report_fatal_error("Abort"); } @@ -109,6 +109,14 @@ std::optional Constraint::getBaseDefName() const { } } +std::optional Constraint::getCppFunctionName() const { + std::optional name = + def->getValueAsOptionalString("cppFunctionName"); + if (!name || *name == "") + return std::nullopt; + return name; +} + AppliedConstraint::AppliedConstraint(Constraint &&constraint, llvm::StringRef self, std::vector &&entities) diff --git a/mlir/test/mlir-tblgen/type-constraints.td b/mlir/test/mlir-tblgen/type-constraints.td new file mode 100644 index 00000000000000..7ce80653077716 --- /dev/null +++ b/mlir/test/mlir-tblgen/type-constraints.td @@ -0,0 +1,14 @@ +// RUN: mlir-tblgen -gen-type-constraint-decls -I %S/../../include %s | FileCheck %s --check-prefix=DECL +// RUN: mlir-tblgen -gen-type-constraint-defs -I %S/../../include %s | FileCheck %s --check-prefix=DEF + +include "mlir/IR/CommonTypeConstraints.td" + +def DummyConstraint : AnyTypeOf<[AnyInteger, Index, AnyFloat]> { + let cppFunctionName = "isValidDummy"; +} + +// DECL: bool isValidDummy(::mlir::Type type); + +// DEF: bool isValidDummy(::mlir::Type type) 
{ +// DEF: return (((::llvm::isa<::mlir::IntegerType>(type))) || ((::llvm::isa<::mlir::IndexType>(type))) || ((::llvm::isa<::mlir::FloatType>(type)))); +// DEF: } diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp index 71ba6a5c73da9e..eccd8029d950ff 100644 --- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp +++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp @@ -1023,6 +1023,55 @@ bool DefGenerator::emitDefs(StringRef selectedDialect) { return false; } +//===----------------------------------------------------------------------===// +// Type Constraints +//===----------------------------------------------------------------------===// + +/// Find all type constraints for which a C++ function should be generated. +static std::vector +getAllTypeConstraints(const llvm::RecordKeeper &records) { + std::vector result; + for (llvm::Record *def : + records.getAllDerivedDefinitionsIfDefined("TypeConstraint")) { + // Ignore constraints defined outside of the top-level file. + if (llvm::SrcMgr.FindBufferContainingLoc(def->getLoc()[0]) != + llvm::SrcMgr.getMainFileID()) + continue; + Constraint constr(def); + // Generate C++ function only if "cppFunctionName" is set. 
+ if (!constr.getCppFunctionName()) + continue; + result.push_back(constr); + } + return result; +} + +static void emitTypeConstraintDecls(const llvm::RecordKeeper &records, + raw_ostream &os) { + static const char *const typeConstraintDecl = R"( +bool {0}(::mlir::Type type); +)"; + + for (Constraint constr : getAllTypeConstraints(records)) + os << strfmt(typeConstraintDecl, *constr.getCppFunctionName()); +} + +static void emitTypeConstraintDefs(const llvm::RecordKeeper &records, + raw_ostream &os) { + static const char *const typeConstraintDef = R"( +bool {0}(::mlir::Type type) { + return ({1}); +} +)"; + + for (Constraint constr : getAllTypeConstraints(records)) { + FmtContext ctx; + ctx.withSelf("type"); + std::string condition = tgfmt(constr.getConditionTemplate(), &ctx); + os << strfmt(typeConstraintDef, *constr.getCppFunctionName(), condition); + } +} + //===----------------------------------------------------------------------===// // GEN: Registration hooks //===----------------------------------------------------------------------===// @@ -1070,3 +1119,18 @@ static mlir::GenRegistration TypeDefGenerator generator(records, os); return generator.emitDecls(typeDialect); }); + +static mlir::GenRegistration + genTypeConstrDefs("gen-type-constraint-defs", + "Generate type constraint definitions", + [](const llvm::RecordKeeper &records, raw_ostream &os) { + emitTypeConstraintDefs(records, os); + return false; + }); +static mlir::GenRegistration + genTypeConstrDecls("gen-type-constraint-decls", + "Generate type constraint declarations", + [](const llvm::RecordKeeper &records, raw_ostream &os) { + emitTypeConstraintDecls(records, os); + return false; + }); From 947b9f55b5f327e14368a48fb6ce10242ea29bf3 Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Wed, 21 Aug 2024 09:09:06 +0200 Subject: [PATCH 018/426] [clang-repl] Fix printing preprocessed tokens and macros (#104964) --- clang/lib/Frontend/PrintPreprocessedOutput.cpp | 10 ++++------ 
clang/test/Interpreter/preprocessor.cpp | 4 ++++ 2 files changed, 8 insertions(+), 6 deletions(-) create mode 100644 clang/test/Interpreter/preprocessor.cpp diff --git a/clang/lib/Frontend/PrintPreprocessedOutput.cpp b/clang/lib/Frontend/PrintPreprocessedOutput.cpp index 135dca0e6a1775..383d4356084916 100644 --- a/clang/lib/Frontend/PrintPreprocessedOutput.cpp +++ b/clang/lib/Frontend/PrintPreprocessedOutput.cpp @@ -916,8 +916,7 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok, PP.Lex(Tok); continue; } else if (Tok.is(tok::annot_repl_input_end)) { - PP.Lex(Tok); - continue; + // Fall through to exit the loop. } else if (Tok.is(tok::eod)) { // Don't print end of directive tokens, since they are typically newlines // that mess up our line tracking. These come from unknown pre-processor @@ -1025,7 +1024,8 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok, Callbacks->setEmittedTokensOnThisLine(); IsStartOfLine = false; - if (Tok.is(tok::eof)) break; + if (Tok.is(tok::eof) || Tok.is(tok::annot_repl_input_end)) + break; PP.Lex(Tok); // If lexing that token causes us to need to skip future tokens, do so now. @@ -1048,9 +1048,7 @@ static void DoPrintMacros(Preprocessor &PP, raw_ostream *OS) { // the macro table at the end. 
PP.EnterMainSourceFile(); - Token Tok; - do PP.Lex(Tok); - while (Tok.isNot(tok::eof)); + PP.LexTokensUntilEOF(); SmallVector MacrosByID; for (Preprocessor::macro_iterator I = PP.macro_begin(), E = PP.macro_end(); diff --git a/clang/test/Interpreter/preprocessor.cpp b/clang/test/Interpreter/preprocessor.cpp new file mode 100644 index 00000000000000..8239fd45e661b0 --- /dev/null +++ b/clang/test/Interpreter/preprocessor.cpp @@ -0,0 +1,4 @@ +// RUN: %clang_cc1 -fincremental-extensions -E %s +// RUN: %clang_cc1 -fincremental-extensions -E -dD %s +// RUN: %clang_cc1 -fincremental-extensions -E -dI %s +// RUN: %clang_cc1 -fincremental-extensions -E -dM %s From 89b1468345a74d2095616a8be2306cf0b08fa43a Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 21 Aug 2024 00:10:31 -0700 Subject: [PATCH 019/426] [ELF] Move ppc64noTocRelax to Ctx. NFC Ctx was introduced in March 2022 as a more suitable place for such singletons. --- lld/ELF/Arch/PPC64.cpp | 2 +- lld/ELF/Config.h | 5 +++++ lld/ELF/Driver.cpp | 1 + lld/ELF/InputSection.cpp | 2 -- lld/ELF/InputSection.h | 6 ------ lld/ELF/Relocations.cpp | 2 +- lld/ELF/Writer.cpp | 1 - 7 files changed, 8 insertions(+), 11 deletions(-) diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index 3188772f7c4904..753ced698a05c0 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -1593,7 +1593,7 @@ void PPC64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { // entry, there may be R_PPC64_TOC16_HA not paired with // R_PPC64_TOC16_LO_DS. Don't relax. This loses some relaxation // opportunities but is safe. 
- if (ppc64noTocRelax.count({rel.sym, rel.addend}) || + if (ctx.ppc64noTocRelax.count({rel.sym, rel.addend}) || !tryRelaxPPC64TocIndirection(rel, loc)) relocate(loc, rel, val); break; diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 0ddac5f6358781..035b385ba37ec3 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -582,6 +582,11 @@ struct Ctx { unsigned scriptSymOrderCounter = 1; llvm::DenseMap scriptSymOrder; + // The set of TOC entries (.toc + addend) for which we should not apply + // toc-indirect to toc-relative relaxation. const Symbol * refers to the + // STT_SECTION symbol associated to the .toc input section. + llvm::DenseSet> ppc64noTocRelax; + void reset(); llvm::raw_fd_ostream openAuxiliaryFile(llvm::StringRef, std::error_code &); diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 8aa2380ba3a177..36552e4bb035af 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -123,6 +123,7 @@ void Ctx::reset() { needsTlsLd.store(false, std::memory_order_relaxed); scriptSymOrderCounter = 1; scriptSymOrder.clear(); + ppc64noTocRelax.clear(); ltoAllVtablesHaveTypeInfos = false; } diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index a7bb9bd47299e2..fd3e947428388b 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -34,8 +34,6 @@ using namespace llvm::sys; using namespace lld; using namespace lld::elf; -DenseSet> elf::ppc64noTocRelax; - // Returns a string to construct an error message. 
std::string lld::toString(const InputSectionBase *sec) { return (toString(sec->file) + ":(" + sec->name + ")").str(); diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index e3b7af13d066da..60c8d57b8db86a 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -498,12 +498,6 @@ inline bool isDebugSection(const InputSectionBase &sec) { return (sec.flags & llvm::ELF::SHF_ALLOC) == 0 && sec.name.starts_with(".debug"); } - -// The set of TOC entries (.toc + addend) for which we should not apply -// toc-indirect to toc-relative relaxation. const Symbol * refers to the -// STT_SECTION symbol associated to the .toc input section. -extern llvm::DenseSet> ppc64noTocRelax; - } // namespace elf std::string toString(const elf::InputSectionBase *); diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index b60b00cee4a602..9dbb4567495a81 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -1495,7 +1495,7 @@ void RelocationScanner::scanOne(typename Relocs::const_iterator &i) { // InputSectionBase::relocateAlloc(). if (type == R_PPC64_TOC16_LO && sym.isSection() && isa(sym) && cast(sym).section->name == ".toc") - ppc64noTocRelax.insert({&sym, addend}); + ctx.ppc64noTocRelax.insert({&sym, addend}); if ((type == R_PPC64_TLSGD && expr == R_TLSDESC_CALL) || (type == R_PPC64_TLSLD && expr == R_TLSLD_HINT)) { diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 4c0b4df5bea170..82d9ea24d9bd3f 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -1800,7 +1800,6 @@ template void Writer::finalizeSections() { // that we can correctly decide if a dynamic relocation is needed. This is // called after processSymbolAssignments() because it needs to know whether // a linker-script-defined symbol is absolute. 
- ppc64noTocRelax.clear(); scanRelocations(); reportUndefinedSymbols(); postScanRelocations(); From d7c84d7b71fc5ea89b87480ff5d727496288799c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 21 Aug 2024 08:28:52 +0100 Subject: [PATCH 020/426] [LAA] Collect loop guards only once in MemoryDepChecker (NFCI). This on its own gives small compile-time improvements in some configs and enables using loop guards at more places in the future while keeping compile-time impact low. https://llvm-compile-time-tracker.com/compare.php?from=c44202574ff9a8c0632aba30c2765b134557435f&to=55ffc3dd920fa9af439fd39f8f9cc13509531420&stat=instructions:u --- llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 3 +++ llvm/lib/Analysis/LoopAccessAnalysis.cpp | 8 ++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 87c70abba30fcd..73d9c26ed6b1b7 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -334,6 +334,9 @@ class MemoryDepChecker { std::pair> PointerBounds; + /// Cache for the loop guards of InnermostLoop. + std::optional LoopGuards; + /// Check whether there is a plausible dependence between the two /// accesses. /// diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 872bc52b82cca7..980f142f113265 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -2054,8 +2054,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n"); return Dependence::NoDep; } - } else - Dist = SE.applyLoopGuards(Dist, InnermostLoop); + } else { + if (!LoopGuards) + LoopGuards.emplace( + ScalarEvolution::LoopGuards::collect(InnermostLoop, SE)); + Dist = SE.applyLoopGuards(Dist, *LoopGuards); + } // Negative distances are not plausible dependencies. 
if (SE.isKnownNonPositive(Dist)) { From f47966b1de459a095b01ac2f9fa975076b609c06 Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Wed, 21 Aug 2024 08:37:26 +0100 Subject: [PATCH 021/426] [compiler-rt] Reland "SetThreadName implementation for Fuchsia" (#105179) --- compiler-rt/lib/fuzzer/FuzzerUtilFuchsia.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/fuzzer/FuzzerUtilFuchsia.cpp b/compiler-rt/lib/fuzzer/FuzzerUtilFuchsia.cpp index fe79e1908d6029..735d1555d30532 100644 --- a/compiler-rt/lib/fuzzer/FuzzerUtilFuchsia.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerUtilFuchsia.cpp @@ -607,7 +607,11 @@ size_t PageSize() { } void SetThreadName(std::thread &thread, const std::string &name) { - // TODO ? + if (zx_status_t s = zx_object_set_property( + thread.native_handle(), ZX_PROP_NAME, name.data(), name.size()); + s != ZX_OK) + Printf("SetThreadName for name %s failed: %s", name.c_str(), + zx_status_get_string(s)); } } // namespace fuzzer From b6686e764c02b1373359bbd80d9c0e1a834d1a64 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 21 Aug 2024 07:42:18 +0000 Subject: [PATCH 022/426] [Flang][Runtime] Handle missing definitions in (#101242) According to the C99 standard, may not define FE_INVALID and the likes. Even if C++11 mandate them, musl and emscripten don't provide them, so handle that case. --- flang/runtime/edit-input.cpp | 11 +++++++++++ flang/runtime/exceptions.cpp | 24 +++++++++++++++++++++++- flang/runtime/stop.cpp | 10 ++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp index 71021dd8a01588..61b070bde80e6f 100644 --- a/flang/runtime/edit-input.cpp +++ b/flang/runtime/edit-input.cpp @@ -507,18 +507,29 @@ static RT_API_ATTRS void RaiseFPExceptions( #define RAISE std::feraiseexcept #endif #endif // !defined(RT_DEVICE_COMPILATION) + +// Some environment (e.g. 
emscripten, musl) don't define FE_OVERFLOW as allowed +// by c99 (but not c++11) :-/ +#if defined(FE_OVERFLOW) || defined(RT_DEVICE_COMPILATION) if (flags & decimal::ConversionResultFlags::Overflow) { RAISE(FE_OVERFLOW); } +#endif +#if defined(FE_UNDERFLOW) || defined(RT_DEVICE_COMPILATION) if (flags & decimal::ConversionResultFlags::Underflow) { RAISE(FE_UNDERFLOW); } +#endif +#if defined(FE_INEXACT) || defined(RT_DEVICE_COMPILATION) if (flags & decimal::ConversionResultFlags::Inexact) { RAISE(FE_INEXACT); } +#endif +#if defined(FE_INVALID) || defined(RT_DEVICE_COMPILATION) if (flags & decimal::ConversionResultFlags::Invalid) { RAISE(FE_INVALID); } +#endif #undef RAISE } diff --git a/flang/runtime/exceptions.cpp b/flang/runtime/exceptions.cpp index 2032ce7b122429..8239c556bcea97 100644 --- a/flang/runtime/exceptions.cpp +++ b/flang/runtime/exceptions.cpp @@ -12,9 +12,26 @@ #include "terminator.h" #include +// When not supported, these macro are undefined in cfenv.h, +// set them to zero in that case. 
+#ifndef FE_INVALID +#define FE_INVALID 0 +#endif #ifndef __FE_DENORM #define __FE_DENORM 0 // denorm is nonstandard #endif +#ifndef FE_DIVBYZERO +#define FE_DIVBYZERO 0 +#endif +#ifndef FE_OVERFLOW +#define FE_OVERFLOW 0 +#endif +#ifndef FE_UNDERFLOW +#define FE_UNDERFLOW 0 +#endif +#ifndef FE_INEXACT +#define FE_INEXACT 0 +#endif namespace Fortran::runtime { @@ -45,7 +62,12 @@ uint32_t RTNAME(MapException)(uint32_t excepts) { if (excepts == 0 || excepts >= mapSize) { terminator.Crash("Invalid excepts value: %d", excepts); } - return map[excepts]; + uint32_t except_value = map[excepts]; + if (except_value == 0) { + terminator.Crash( + "Excepts value %d not supported by flang runtime", excepts); + } + return except_value; } // Verify that the size of ieee_modes_type and ieee_status_type objects from diff --git a/flang/runtime/stop.cpp b/flang/runtime/stop.cpp index 98324da1d91e16..cfb36b40840200 100644 --- a/flang/runtime/stop.cpp +++ b/flang/runtime/stop.cpp @@ -26,21 +26,31 @@ static void DescribeIEEESignaledExceptions() { #endif if (excepts) { std::fputs("IEEE arithmetic exceptions signaled:", stderr); +#ifdef FE_DIVBYZERO if (excepts & FE_DIVBYZERO) { std::fputs(" DIVBYZERO", stderr); } +#endif +#ifdef FE_INEXACT if (excepts & FE_INEXACT) { std::fputs(" INEXACT", stderr); } +#endif +#ifdef FE_INVALID if (excepts & FE_INVALID) { std::fputs(" INVALID", stderr); } +#endif +#ifdef FE_OVERFLOW if (excepts & FE_OVERFLOW) { std::fputs(" OVERFLOW", stderr); } +#endif +#ifdef FE_UNDERFLOW if (excepts & FE_UNDERFLOW) { std::fputs(" UNDERFLOW", stderr); } +#endif std::fputc('\n', stderr); } } From 7c4cadfc4333df8a20bb5a66b0ba4560bb4bd91c Mon Sep 17 00:00:00 2001 From: Freddy Ye Date: Wed, 21 Aug 2024 15:44:06 +0800 Subject: [PATCH 023/426] [X86][AVX10.2] Support AVX10.2-CONVERT new instructions. 
(#101600) Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965 --- clang/include/clang/Basic/BuiltinsX86.def | 44 + clang/lib/Headers/CMakeLists.txt | 2 + clang/lib/Headers/avx10_2_512convertintrin.h | 325 ++++ clang/lib/Headers/avx10_2convertintrin.h | 598 +++++++ clang/lib/Headers/immintrin.h | 2 + clang/lib/Sema/SemaX86.cpp | 2 + .../CodeGen/X86/avx10_2_512convert-builtins.c | 318 ++++ .../CodeGen/X86/avx10_2convert-builtins.c | 612 +++++++ llvm/include/llvm/IR/IntrinsicsX86.td | 130 ++ llvm/lib/Target/X86/X86ISelLowering.cpp | 39 +- llvm/lib/Target/X86/X86ISelLowering.h | 27 +- llvm/lib/Target/X86/X86InstrAVX10.td | 286 ++++ llvm/lib/Target/X86/X86InstrAVX512.td | 118 +- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 90 +- llvm/lib/Target/X86/X86IntrinsicsInfo.h | 91 +- .../X86/avx10_2_512convert-intrinsics.ll | 677 ++++++++ .../CodeGen/X86/avx10_2convert-intrinsics.ll | 1324 +++++++++++++++ .../MC/Disassembler/X86/avx10.2convert-32.txt | 1491 +++++++++++++++++ .../MC/Disassembler/X86/avx10.2convert-64.txt | 1491 +++++++++++++++++ llvm/test/MC/X86/avx10.2convert-32-att.s | 1490 ++++++++++++++++ llvm/test/MC/X86/avx10.2convert-32-intel.s | 1490 ++++++++++++++++ llvm/test/MC/X86/avx10.2convert-64-att.s | 1490 ++++++++++++++++ llvm/test/MC/X86/avx10.2convert-64-intel.s | 1490 ++++++++++++++++ llvm/test/TableGen/x86-fold-tables.inc | 243 +++ 24 files changed, 13806 insertions(+), 64 deletions(-) create mode 100644 clang/lib/Headers/avx10_2_512convertintrin.h create mode 100644 clang/lib/Headers/avx10_2convertintrin.h create mode 100644 clang/test/CodeGen/X86/avx10_2_512convert-builtins.c create mode 100644 clang/test/CodeGen/X86/avx10_2convert-builtins.c create mode 100644 llvm/test/CodeGen/X86/avx10_2_512convert-intrinsics.ll create mode 100644 llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll create mode 100644 llvm/test/MC/Disassembler/X86/avx10.2convert-32.txt create mode 100644 llvm/test/MC/Disassembler/X86/avx10.2convert-64.txt create mode 100644 
llvm/test/MC/X86/avx10.2convert-32-att.s create mode 100644 llvm/test/MC/X86/avx10.2convert-32-intel.s create mode 100644 llvm/test/MC/X86/avx10.2convert-64-att.s create mode 100644 llvm/test/MC/X86/avx10.2convert-64-intel.s diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def index a696cf117908e2..e4aa8661b9a806 100644 --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -2217,6 +2217,50 @@ TARGET_BUILTIN(__builtin_ia32_vcvttps2ibs512_mask, "V16UiV16fV16UiUsIi", "nV:512 TARGET_BUILTIN(__builtin_ia32_vcvttps2iubs128_mask, "V4UiV4fV4UiUc", "nV:128:", "avx10.2-256") TARGET_BUILTIN(__builtin_ia32_vcvttps2iubs256_mask, "V8UiV8fV8UiUcIi", "nV:256:", "avx10.2-256") TARGET_BUILTIN(__builtin_ia32_vcvttps2iubs512_mask, "V16UiV16fV16UiUsIi", "nV:512:", "avx10.2-512") + +// AVX10.2 CONVERT +TARGET_BUILTIN(__builtin_ia32_vcvt2ps2phx128_mask, "V8xV4fV4fV8xUc", "ncV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvt2ps2phx256_mask, "V16xV8fV8fV16xUsIi", "ncV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvt2ps2phx512_mask, "V32xV16fV16fV32xUiIi", "ncV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8s_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8s_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8s_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8_256_mask, "V16cV32cV16xV16cUs", "nV:256:", 
"avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8s_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8s_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8s_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8_128, "V16cV8xV8x", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8_256, "V32cV16xV16x", "nV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8_512, "V64cV32xV32x", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8s_128, "V16cV8xV8x", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8s_256, "V32cV16xV16x", "nV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8s_512, "V64cV32xV32x", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8_128, "V16cV8xV8x", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8_256, "V32cV16xV16x", "nV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8_512, "V64cV32xV32x", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8s_128, "V16cV8xV8x", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8s_256, "V32cV16xV16x", "nV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8s_512, "V64cV32xV32x", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcvthf8_2ph128_mask, "V8xV16cV8xUc", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvthf8_2ph256_mask, "V16xV16cV16xUs", "nV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvthf8_2ph512_mask, "V32xV32cV32xUi", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8_256_mask, "V16cV16xV16cUs", 
"nV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256") +TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512") #undef BUILTIN #undef TARGET_BUILTIN #undef TARGET_HEADER_BUILTIN diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 598bc556e8330a..5a62538792f301 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -147,9 +147,11 @@ set(x86_files amxcomplexintrin.h amxfp16intrin.h amxintrin.h + avx10_2_512convertintrin.h avx10_2_512minmaxintrin.h avx10_2_512niintrin.h avx10_2_512satcvtintrin.h + avx10_2convertintrin.h avx10_2minmaxintrin.h avx10_2niintrin.h avx10_2satcvtintrin.h diff --git a/clang/lib/Headers/avx10_2_512convertintrin.h b/clang/lib/Headers/avx10_2_512convertintrin.h new file mode 100644 index 00000000000000..a34e135fa30473 --- /dev/null +++ b/clang/lib/Headers/avx10_2_512convertintrin.h @@ -0,0 +1,325 @@ +/*===--------- avx10_2_512convertintrin.h - AVX10_2_512CONVERT -------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif // __IMMINTRIN_H + +#ifdef __SSE2__ + +#ifndef __AVX10_2_512CONVERTINTRIN_H +#define __AVX10_2_512CONVERTINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS512 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \ + __min_vector_width__(512))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_cvtx2ps_ph(__m512 __A, + __m512 __B) { + return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask( + (__v16sf)__A, (__v16sf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)(-1), + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtx2ps_ph(__m512h __W, __mmask32 __U, __m512 __A, __m512 __B) { + return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask( + (__v16sf)__A, (__v16sf)__B, (__v32hf)__W, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtx2ps_ph(__mmask32 __U, __m512 __A, __m512 __B) { + return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask( + (__v16sf)__A, (__v16sf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtx_round2ps_ph(A, B, R) \ + ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask( \ + (__v16sf)(A), (__v16sf)(B), (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)(-1), (const int)(R))) + +#define _mm512_mask_cvtx_round2ps_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask((__v16sf)(A), (__v16sf)(B), \ + (__v32hf)(W), (__mmask32)(U), \ + (const int)(R))) + +#define _mm512_maskz_cvtx_round2ps_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask( \ + (__v16sf)(A), (__v16sf)(B), (__v32hf)_mm512_setzero_ph(), \ + 
(__mmask32)(U), (const int)(R))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtbiasph_pbf8(__m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(), + (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiasph_pbf8( + __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtbiasph_pbf8(__mmask32 __U, __m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(), + (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtbiassph_pbf8(__m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(), + (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiassph_pbf8( + __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtbiassph_pbf8(__mmask32 __U, __m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(), + (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtbiasph_phf8(__m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(), + (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiasph_phf8( + __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { 
+ return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtbiasph_phf8(__mmask32 __U, __m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(), + (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtbiassph_phf8(__m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(), + (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiassph_phf8( + __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtbiassph_phf8(__mmask32 __U, __m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(), + (__mmask32)__U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtne2ph_pbf8(__m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_vcvtne2ph2bf8_512((__v32hf)(__A), + (__v32hf)(__B)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtne2ph_pbf8( + __m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvtne2ph_pbf8(__A, __B), (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtne2ph_pbf8(__mmask64 __U, __m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvtne2ph_pbf8(__A, __B), + (__v64qi)(__m512i)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 
+_mm512_cvtnes2ph_pbf8(__m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_vcvtne2ph2bf8s_512((__v32hf)(__A), + (__v32hf)(__B)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtnes2ph_pbf8( + __m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvtnes2ph_pbf8(__A, __B), (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtnes2ph_pbf8(__mmask64 __U, __m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvtnes2ph_pbf8(__A, __B), + (__v64qi)(__m512i)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtne2ph_phf8(__m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_vcvtne2ph2hf8_512((__v32hf)(__A), + (__v32hf)(__B)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtne2ph_phf8( + __m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvtne2ph_phf8(__A, __B), (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtne2ph_phf8(__mmask64 __U, __m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvtne2ph_phf8(__A, __B), + (__v64qi)(__m512i)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtnes2ph_phf8(__m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_vcvtne2ph2hf8s_512((__v32hf)(__A), + (__v32hf)(__B)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtnes2ph_phf8( + __m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvtnes2ph_phf8(__A, __B), (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtnes2ph_phf8(__mmask64 __U, __m512h __A, __m512h __B) { + return 
(__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvtnes2ph_phf8(__A, __B), + (__v64qi)(__m512i)_mm512_setzero_si512()); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_cvtnehf8_ph(__m256i __A) { + return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask( + (__v32qi)__A, (__v32hf)(__m512h)_mm512_undefined_ph(), (__mmask32)-1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtnehf8_ph(__m512h __W, __mmask32 __U, __m256i __A) { + return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask( + (__v32qi)__A, (__v32hf)(__m512h)__W, (__mmask32)__U); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtnehf8_ph(__mmask32 __U, __m256i __A) { + return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask( + (__v32qi)__A, (__v32hf)(__m512h)_mm512_setzero_ph(), (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtneph_pbf8(__m512h __A) { + return (__m256i)__builtin_ia32_vcvtneph2bf8_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtneph_pbf8(__m256i __W, __mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtneph2bf8_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtneph_pbf8(__mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtneph2bf8_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtnesph_pbf8(__m512h __A) { + return (__m256i)__builtin_ia32_vcvtneph2bf8s_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtnesph_pbf8(__m256i __W, __mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtneph2bf8s_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static 
__inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtnesph_pbf8(__mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtneph2bf8s_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtneph_phf8(__m512h __A) { + return (__m256i)__builtin_ia32_vcvtneph2hf8_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtneph_phf8(__m256i __W, __mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtneph2hf8_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtneph_phf8(__mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtneph2hf8_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtnesph_phf8(__m512h __A) { + return (__m256i)__builtin_ia32_vcvtneph2hf8s_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtnesph_phf8(__m256i __W, __mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtneph2hf8s_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtnesph_phf8(__mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtneph2hf8s_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); +} + +static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_cvtpbf8_ph(__m256i __A) { + return _mm512_castsi512_ph(_mm512_slli_epi16(_mm512_cvtepi8_epi16(__A), 8)); +} + +static __inline __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtpbf8_ph(__m512h __S, __mmask16 __U, __m256i __A) { + return _mm512_castsi512_ph( + _mm512_mask_slli_epi16((__m512i)__S, __U, 
_mm512_cvtepi8_epi16(__A), 8)); +} + +static __inline __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtpbf8_ph(__mmask16 __U, __m256i __A) { + return _mm512_castsi512_ph( + _mm512_slli_epi16(_mm512_maskz_cvtepi8_epi16(__U, __A), 8)); +} + +#undef __DEFAULT_FN_ATTRS512 + +#endif // __AVX10_2_512CONVERTINTRIN_H +#endif // __SSE2__ diff --git a/clang/lib/Headers/avx10_2convertintrin.h b/clang/lib/Headers/avx10_2convertintrin.h new file mode 100644 index 00000000000000..134adb2850c8de --- /dev/null +++ b/clang/lib/Headers/avx10_2convertintrin.h @@ -0,0 +1,598 @@ +/*===--------------- avx10_2convertintrin.h - AVX10_2CONVERT ---------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif // __IMMINTRIN_H + +#ifdef __SSE2__ + +#ifndef __AVX10_2CONVERTINTRIN_H +#define __AVX10_2CONVERTINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(256))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtx2ps_ph(__m128 __A, + __m128 __B) { + return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask( + (__v4sf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)(-1)); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtx2ps_ph(__m128h __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask( + (__v4sf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtx2ps_ph(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask( + (__v4sf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtx2ps_ph(__m256 __A, + __m256 __B) { + return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask( + (__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)(-1), + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) { + return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask( + (__v8sf)__A, (__v8sf)__B, (__v16hf)__W, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) { + return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask( + (__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm256_cvtx_round2ps_ph(A, B, R) \ + ((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \ + (__v8sf)(A), (__v8sf)(B), (__v16hf)_mm256_undefined_ph(), \ + (__mmask16)(-1), (const int)(R))) + +#define 
_mm256_mask_cvtx_round2ps_ph(W, U, A, B, R) \ + ((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \ + (__v8sf)(A), (__v8sf)(B), (__v16hf)(W), (__mmask16)(U), (const int)(R))) + +#define _mm256_maskz_cvtx_round2ps_ph(U, A, B, R) \ + ((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \ + (__v8sf)(A), (__v8sf)(B), (__v16hf)(_mm256_setzero_ph()), \ + (__mmask16)(U), (const int)(R))) + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtbiasph_pbf8(__m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtbiasph_pbf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtbiasph_pbf8(__mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtbiasph_pbf8(__m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(), + (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_pbf8( + __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtbiasph_pbf8(__mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
+_mm_cvtbiassph_pbf8(__m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtbiassph_pbf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtbiassph_pbf8(__mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtbiassph_pbf8(__m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(), + (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiassph_pbf8( + __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtbiassph_pbf8(__mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtbiasph_phf8(__m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtbiasph_phf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, 
(__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtbiasph_phf8(__mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtbiasph_phf8(__m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(), + (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_phf8( + __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtbiasph_phf8(__mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtbiassph_phf8(__m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtbiassph_phf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtbiassph_phf8(__mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtbiassph_phf8(__m256i __A, __m256h __B) { + return 
(__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(), + (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiassph_phf8( + __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtbiassph_phf8(__mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtne2ph_pbf8(__m128h __A, + __m128h __B) { + return (__m128i)__builtin_ia32_vcvtne2ph2bf8_128((__v8hf)(__A), + (__v8hf)(__B)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtne2ph_pbf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvtne2ph_pbf8(__A, __B), (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtne2ph_pbf8(__mmask16 __U, __m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvtne2ph_pbf8(__A, __B), + (__v16qi)(__m128i)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtne2ph_pbf8(__m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_vcvtne2ph2bf8_256((__v16hf)(__A), + (__v16hf)(__B)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtne2ph_pbf8( + __m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvtne2ph_pbf8(__A, __B), (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtne2ph_pbf8(__mmask32 __U, __m256h __A, __m256h __B) { + return 
(__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvtne2ph_pbf8(__A, __B), + (__v32qi)(__m256i)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtnes2ph_pbf8(__m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtne2ph2bf8s_128((__v8hf)(__A), + (__v8hf)(__B)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtnes2ph_pbf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvtnes2ph_pbf8(__A, __B), (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtnes2ph_pbf8(__mmask16 __U, __m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvtnes2ph_pbf8(__A, __B), + (__v16qi)(__m128i)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtnes2ph_pbf8(__m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_vcvtne2ph2bf8s_256((__v16hf)(__A), + (__v16hf)(__B)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtnes2ph_pbf8( + __m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvtnes2ph_pbf8(__A, __B), (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtnes2ph_pbf8(__mmask32 __U, __m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvtnes2ph_pbf8(__A, __B), + (__v32qi)(__m256i)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtne2ph_phf8(__m128h __A, + __m128h __B) { + return (__m128i)__builtin_ia32_vcvtne2ph2hf8_128((__v8hf)(__A), + (__v8hf)(__B)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtne2ph_phf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvtne2ph_phf8(__A, 
__B), (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtne2ph_phf8(__mmask16 __U, __m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvtne2ph_phf8(__A, __B), + (__v16qi)(__m128i)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtne2ph_phf8(__m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_vcvtne2ph2hf8_256((__v16hf)(__A), + (__v16hf)(__B)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtne2ph_phf8( + __m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvtne2ph_phf8(__A, __B), (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtne2ph_phf8(__mmask32 __U, __m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvtne2ph_phf8(__A, __B), + (__v32qi)(__m256i)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtnes2ph_phf8(__m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtne2ph2hf8s_128((__v8hf)(__A), + (__v8hf)(__B)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtnes2ph_phf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvtnes2ph_phf8(__A, __B), (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtnes2ph_phf8(__mmask16 __U, __m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvtnes2ph_phf8(__A, __B), + (__v16qi)(__m128i)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtnes2ph_phf8(__m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_vcvtne2ph2hf8s_256((__v16hf)(__A), + (__v16hf)(__B)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtnes2ph_phf8( + 
__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvtnes2ph_phf8(__A, __B), (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtnes2ph_phf8(__mmask32 __U, __m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvtnes2ph_phf8(__A, __B), + (__v32qi)(__m256i)_mm256_setzero_si256()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtnehf8_ph(__m128i __A) { + return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask( + (__v16qi)__A, (__v8hf)(__m128h)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtnehf8_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask( + (__v16qi)__A, (__v8hf)(__m128h)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtnehf8_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask( + (__v16qi)__A, (__v8hf)(__m128h)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_cvtnehf8_ph(__m128i __A) { + return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask( + (__v16qi)__A, (__v16hf)(__m256h)_mm256_undefined_ph(), (__mmask16)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtnehf8_ph(__m256h __W, __mmask16 __U, __m128i __A) { + return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask( + (__v16qi)__A, (__v16hf)(__m256h)__W, (__mmask16)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtnehf8_ph(__mmask16 __U, __m128i __A) { + return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask( + (__v16qi)__A, (__v16hf)(__m256h)_mm256_setzero_ph(), (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtneph_pbf8(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtneph2bf8_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), 
(__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtneph_pbf8(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtneph2bf8_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtneph_pbf8(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtneph2bf8_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtneph_pbf8(__m256h __A) { + return (__m128i)__builtin_ia32_vcvtneph2bf8_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtneph_pbf8(__m128i __W, __mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtneph2bf8_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtneph_pbf8(__mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtneph2bf8_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtnesph_pbf8(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtneph2bf8s_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtnesph_pbf8(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtneph2bf8s_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtnesph_pbf8(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtneph2bf8s_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtnesph_pbf8(__m256h __A) { + return (__m128i)__builtin_ia32_vcvtneph2bf8s_256_mask( 
+ (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtnesph_pbf8(__m128i __W, __mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtneph2bf8s_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtnesph_pbf8(__mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtneph2bf8s_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtneph_phf8(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtneph2hf8_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtneph_phf8(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtneph2hf8_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtneph_phf8(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtneph2hf8_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtneph_phf8(__m256h __A) { + return (__m128i)__builtin_ia32_vcvtneph2hf8_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtneph_phf8(__m128i __W, __mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtneph2hf8_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtneph_phf8(__mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtneph2hf8_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtnesph_phf8(__m128h 
__A) { + return (__m128i)__builtin_ia32_vcvtneph2hf8s_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtnesph_phf8(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtneph2hf8s_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtnesph_phf8(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtneph2hf8s_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtnesph_phf8(__m256h __A) { + return (__m128i)__builtin_ia32_vcvtneph2hf8s_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtnesph_phf8(__m128i __W, __mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtneph2hf8s_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtnesph_phf8(__mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtneph2hf8s_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpbf8_ph(__m128i __A) { + return _mm_castsi128_ph(_mm_slli_epi16(_mm_cvtepi8_epi16(__A), 8)); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtpbf8_ph(__m128h __S, __mmask8 __U, __m128i __A) { + return _mm_castsi128_ph( + _mm_mask_slli_epi16((__m128i)__S, __U, _mm_cvtepi8_epi16(__A), 8)); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtpbf8_ph(__mmask8 __U, __m128i __A) { + return _mm_castsi128_ph(_mm_slli_epi16(_mm_maskz_cvtepi8_epi16(__U, __A), 8)); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtpbf8_ph(__m128i __A) { + return 
_mm256_castsi256_ph(_mm256_slli_epi16(_mm256_cvtepi8_epi16(__A), 8)); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtpbf8_ph(__m256h __S, __mmask8 __U, __m128i __A) { + return _mm256_castsi256_ph( + _mm256_mask_slli_epi16((__m256i)__S, __U, _mm256_cvtepi8_epi16(__A), 8)); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtpbf8_ph(__mmask8 __U, __m128i __A) { + return _mm256_castsi256_ph( + _mm256_slli_epi16(_mm256_maskz_cvtepi8_epi16(__U, __A), 8)); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif // __AVX10_2CONVERTINTRIN_H +#endif // __SSE2__ diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index f570c5752db4b9..a922056622e79f 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -649,12 +649,14 @@ _storebe_i64(void * __P, long long __D) { #endif #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__) +#include #include #include #include #endif #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2_512__) +#include #include #include #include diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index 45c4a1c80b4083..311e574537059d 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -430,6 +430,8 @@ bool SemaX86::CheckBuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_vfmulcph512_mask: case X86::BI__builtin_ia32_vfcmulcsh_mask: case X86::BI__builtin_ia32_vfcmulcph512_mask: + case X86::BI__builtin_ia32_vcvt2ps2phx256_mask: + case X86::BI__builtin_ia32_vcvt2ps2phx512_mask: ArgNum = 4; HasRC = true; break; diff --git a/clang/test/CodeGen/X86/avx10_2_512convert-builtins.c b/clang/test/CodeGen/X86/avx10_2_512convert-builtins.c new file mode 100644 index 00000000000000..e71cc0c9ad6b02 --- /dev/null +++ b/clang/test/CodeGen/X86/avx10_2_512convert-builtins.c @@ -0,0 +1,318 @@ +// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding 
-triple=x86_64 -target-feature +avx10.2-512 \ +// RUN: -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=i386 -target-feature +avx10.2-512 \ +// RUN: -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s + +#include + +__m512h test_mm512_cvtx2ps_ph(__m512 __A, __m512 __B) { + // CHECK-LABEL: @test_mm512_cvtx2ps_ph( + // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.512 + return _mm512_cvtx2ps_ph(__A, __B); +} + +__m512h test_mm512_mask_cvtx2ps_ph(__m512h __W, __mmask32 __U, __m512 __A, __m512 __B) { + // CHECK-LABEL: @test_mm512_mask_cvtx2ps_ph( + // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.512 + return _mm512_mask_cvtx2ps_ph(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_cvtx2ps_ph(__mmask32 __U, __m512 __A, __m512 __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtx2ps_ph( + // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.512 + return _mm512_maskz_cvtx2ps_ph(__U, __A, __B); +} + +__m512h test_mm512_cvtx_round2ps_ph(__m512 __A, __m512 __B) { + // CHECK-LABEL: @test_mm512_cvtx_round2ps_ph( + // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.512 + return _mm512_cvtx_round2ps_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_cvtx_round2ps_ph(__m512h __W, __mmask32 __U, __m512 __A, __m512 __B) { +// CHECK-LABEL: @test_mm512_mask_cvtx_round2ps_ph( +// CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.512 + return _mm512_mask_cvtx_round2ps_ph(__W, __U, __A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_cvtx_round2ps_ph(__mmask32 __U, __m512 __A, __m512 __B) { +// CHECK-LABEL: @test_mm512_maskz_cvtx_round2ps_ph( +// CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.512 + return _mm512_maskz_cvtx_round2ps_ph(__U, __A, __B, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} + +__m256i 
test_mm512_cvtbiasph_pbf8(__m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_cvtbiasph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8512( + return _mm512_cvtbiasph_pbf8(__A, __B); +} + +__m256i test_mm512_mask_cvtbiasph_pbf8(__m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvtbiasph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8512( + return _mm512_mask_cvtbiasph_pbf8(__W, __U, __A, __B); +} + +__m256i test_mm512_maskz_cvtbiasph_pbf8(__mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtbiasph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8512( + return _mm512_maskz_cvtbiasph_pbf8(__U, __A, __B); +} + +__m256i test_mm512_cvtbiassph_pbf8(__m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_cvtbiassph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s512( + return _mm512_cvtbiassph_pbf8(__A, __B); +} + +__m256i test_mm512_mask_cvtbiassph_pbf8(__m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvtbiassph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s512( + return _mm512_mask_cvtbiassph_pbf8(__W, __U, __A, __B); +} + +__m256i test_mm512_maskz_cvtbiassph_pbf8(__mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtbiassph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s512( + return _mm512_maskz_cvtbiassph_pbf8(__U, __A, __B); +} + +__m256i test_mm512_cvtbiasph_phf8(__m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_cvtbiasph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8512( + return _mm512_cvtbiasph_phf8(__A, __B); +} + +__m256i test_mm512_mask_cvtbiasph_phf8(__m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvtbiasph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8512( + return 
_mm512_mask_cvtbiasph_phf8(__W, __U, __A, __B); +} + +__m256i test_mm512_maskz_cvtbiasph_phf8(__mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtbiasph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8512( + return _mm512_maskz_cvtbiasph_phf8(__U, __A, __B); +} + +__m256i test_mm512_cvtbiassph_phf8(__m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_cvtbiassph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s512( + return _mm512_cvtbiassph_phf8(__A, __B); +} + +__m256i test_mm512_mask_cvtbiassph_phf8(__m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvtbiassph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s512( + return _mm512_mask_cvtbiassph_phf8(__W, __U, __A, __B); +} + +__m256i test_mm512_maskz_cvtbiassph_phf8(__mmask32 __U, __m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtbiassph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s512( + return _mm512_maskz_cvtbiassph_phf8(__U, __A, __B); +} + +__m512i test_mm512_cvtne2ph_pbf8(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_cvtne2ph_pbf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8512( + return _mm512_cvtne2ph_pbf8(__A, __B); +} + +__m512i test_mm512_mask_cvtne2ph_pbf8(__m512i __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvtne2ph_pbf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8512( + // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} + // CHECK: ret <8 x i64> %{{.*}} + return _mm512_mask_cvtne2ph_pbf8(__W, __U, __A, __B); +} + +__m512i test_mm512_maskz_cvtne2ph_pbf8(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtne2ph_pbf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8512( + // CHECK: zeroinitializer + // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} + return 
_mm512_maskz_cvtne2ph_pbf8(__U, __A, __B); +} + +__m512i test_mm512_cvtnes2ph_pbf8(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_cvtnes2ph_pbf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s512( + return _mm512_cvtnes2ph_pbf8(__A, __B); +} + +__m512i test_mm512_mask_cvtnes2ph_pbf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvtnes2ph_pbf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s512( + // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} + // CHECK: ret <8 x i64> %{{.*}} + return _mm512_mask_cvtnes2ph_pbf8(__W, __U, __A, __B); +} + +__m512i test_mm512_maskz_cvtnes2ph_pbf8(__mmask64 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtnes2ph_pbf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s512( + // CHECK: zeroinitializer + // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} + return _mm512_maskz_cvtnes2ph_pbf8(__U, __A, __B); +} + +__m512i test_mm512_cvtne2ph_phf8(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_cvtne2ph_phf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8512( + return _mm512_cvtne2ph_phf8(__A, __B); +} + +__m512i test_mm512_mask_cvtne2ph_phf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvtne2ph_phf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8512( + // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} + // CHECK: ret <8 x i64> %{{.*}} + return _mm512_mask_cvtne2ph_phf8(__W, __U, __A, __B); +} + +__m512i test_mm512_maskz_cvtne2ph_phf8(__mmask64 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtne2ph_phf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8512( + // CHECK: zeroinitializer + // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} + return _mm512_maskz_cvtne2ph_phf8(__U, __A, __B); +} + +__m512i test_mm512_cvtnes2ph_phf8(__m512h __A, 
__m512h __B) { + // CHECK-LABEL: @test_mm512_cvtnes2ph_phf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s512( + return _mm512_cvtnes2ph_phf8(__A, __B); +} + +__m512i test_mm512_mask_cvtnes2ph_phf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_cvtnes2ph_phf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s512( + // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} + // CHECK: ret <8 x i64> %{{.*}} + return _mm512_mask_cvtnes2ph_phf8(__W, __U, __A, __B); +} + +__m512i test_mm512_maskz_cvtnes2ph_phf8(__mmask64 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtnes2ph_phf8( + // CHECK: call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s512( + // CHECK: zeroinitializer + // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} + return _mm512_maskz_cvtnes2ph_phf8(__U, __A, __B); +} + +__m512h test_mm512_cvtnehf8_ph(__m256i __A) { + // CHECK-LABEL: @test_mm512_cvtnehf8_ph( + // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512( + return _mm512_cvtnehf8_ph(__A); +} + +__m512h test_mm512_mask_cvtnehf8_ph(__m512h __A, __mmask32 __B, __m256i __C) { + // CHECK-LABEL: @test_mm512_mask_cvtnehf8_ph( + // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512( + return _mm512_mask_cvtnehf8_ph(__A, __B, __C); +} + +__m512h test_mm512_maskz_cvtnehf8_ph(__mmask32 __A, __m256i __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtnehf8_ph( + // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512( + return _mm512_maskz_cvtnehf8_ph(__A, __B); +} + +__m256i test_mm512_cvtneph_pbf8(__m512h __A) { + // CHECK-LABEL: @test_mm512_cvtneph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8512( + return _mm512_cvtneph_pbf8(__A); +} + +__m256i test_mm512_mask_cvtneph_pbf8(__m256i __A, __mmask32 __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_cvtneph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8512( + return 
_mm512_mask_cvtneph_pbf8(__A, __B, __C); +} + +__m256i test_mm512_maskz_cvtneph_pbf8(__mmask32 __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtneph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8512( + return _mm512_maskz_cvtneph_pbf8(__A, __B); +} + +__m256i test_mm512_cvtnesph_pbf8(__m512h __A) { + // CHECK-LABEL: @test_mm512_cvtnesph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s512( + return _mm512_cvtnesph_pbf8(__A); +} + +__m256i test_mm512_mask_cvtnesph_pbf8(__m256i __A, __mmask32 __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_cvtnesph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s512( + return _mm512_mask_cvtnesph_pbf8(__A, __B, __C); +} + +__m256i test_mm512_maskz_cvtnesph_pbf8(__mmask32 __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtnesph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s512( + return _mm512_maskz_cvtnesph_pbf8(__A, __B); +} + +__m256i test_mm512_cvtneph_phf8(__m512h __A) { + // CHECK-LABEL: @test_mm512_cvtneph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8512( + return _mm512_cvtneph_phf8(__A); +} + +__m256i test_mm512_mask_cvtneph_phf8(__m256i __A, __mmask32 __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_cvtneph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8512( + return _mm512_mask_cvtneph_phf8(__A, __B, __C); +} + +__m256i test_mm512_maskz_cvtneph_phf8(__mmask32 __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtneph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8512( + return _mm512_maskz_cvtneph_phf8(__A, __B); +} + +__m256i test_mm512_cvtnesph_phf8(__m512h __A) { + // CHECK-LABEL: @test_mm512_cvtnesph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s512( + return _mm512_cvtnesph_phf8(__A); +} + +__m256i test_mm512_mask_cvtnesph_phf8(__m256i __A, __mmask32 __B, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_cvtnesph_phf8( + // 
CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s512( + return _mm512_mask_cvtnesph_phf8(__A, __B, __C); +} + +__m256i test_mm512_maskz_cvtnesph_phf8(__mmask32 __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_cvtnesph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s512( + return _mm512_maskz_cvtnesph_phf8(__A, __B); +} + +__m512h test_mm512_cvtpbf8_ph(__m256i A) { + // CHECK-LABEL: @test_mm512_cvtpbf8_ph + // CHECK: sext <32 x i8> %{{.*}} to <32 x i16> + // CHECK: @llvm.x86.avx512.pslli.w.512 + // CHECK: ret <32 x half> %{{.*}} + return _mm512_cvtpbf8_ph(A); +} + +__m512h test_mm512_mask_cvtpbf8_ph(__m512h S, __mmask16 M, __m256i A) { + // CHECK-LABEL: @test_mm512_mask_cvtpbf8_ph + // CHECK: sext <32 x i8> %{{.*}} to <32 x i16> + // CHECK: @llvm.x86.avx512.pslli.w.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} + // CHECK: ret <32 x half> %{{.*}} + return _mm512_mask_cvtpbf8_ph(S, M, A); +} + +__m512h test_mm512_maskz_cvtpbf8_ph(__mmask16 M, __m256i A) { + // CHECK-LABEL: @test_mm512_maskz_cvtpbf8_ph + // CHECK: sext <32 x i8> %{{.*}} to <32 x i16> + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} + // CHECK: @llvm.x86.avx512.pslli.w.512 + // CHECK: ret <32 x half> %{{.*}} + return _mm512_maskz_cvtpbf8_ph(M, A); +} diff --git a/clang/test/CodeGen/X86/avx10_2convert-builtins.c b/clang/test/CodeGen/X86/avx10_2convert-builtins.c new file mode 100644 index 00000000000000..8086c1b5d33993 --- /dev/null +++ b/clang/test/CodeGen/X86/avx10_2convert-builtins.c @@ -0,0 +1,612 @@ +// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64 -target-feature +avx10.2-256 \ +// RUN: -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=i386 -target-feature +avx10.2-256 \ +// RUN: -emit-llvm -o - -Wno-invalid-feature-combination -Wall -Werror | FileCheck %s + +#include + 
+__m128h test_mm_cvtx2ps_ph(__m128 __A, __m128 __B) { + // CHECK-LABEL: @test_mm_cvtx2ps_ph( + // CHECK: call <8 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.128 + return _mm_cvtx2ps_ph(__A, __B); +} + +__m128h test_mm_mask_cvtx2ps_ph(__m128h __W, __mmask8 __U, __m128 __A, __m128 __B) { + // CHECK-LABEL: @test_mm_mask_cvtx2ps_ph( + // CHECK: call <8 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.128 + return _mm_mask_cvtx2ps_ph(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_cvtx2ps_ph(__mmask8 __U, __m128 __A, __m128 __B) { + // CHECK-LABEL: @test_mm_maskz_cvtx2ps_ph( + // CHECK: call <8 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.128 + return _mm_maskz_cvtx2ps_ph(__U, __A, __B); +} + +__m256h test_mm256_cvtx2ps_ph(__m256 __A, __m256 __B) { + // CHECK-LABEL: @test_mm256_cvtx2ps_ph( + // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256 + return _mm256_cvtx2ps_ph(__A, __B); +} + +__m256h test_mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) { + // CHECK-LABEL: @test_mm256_mask_cvtx2ps_ph( + // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256 + return _mm256_mask_cvtx2ps_ph(__W, __U, __A, __B); +} + +__m256h test_mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtx2ps_ph( + // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256 + return _mm256_maskz_cvtx2ps_ph(__U, __A, __B); +} + +__m256h test_mm256_cvtx_round2ps_ph(__m256 __A, __m256 __B) { + // CHECK-LABEL: @test_mm256_cvtx_round2ps_ph( + // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256( + return _mm256_cvtx_round2ps_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m256h test_mm256_mask_cvtx_round2ps_ph(__m256h __W, __mmask8 __U, __m256 __A, __m256 __B) { + // CHECK-LABEL: @test_mm256_mask_cvtx_round2ps_ph( + // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256( + return _mm256_mask_cvtx_round2ps_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m256h 
test_mm256_maskz_cvtx_round2ps_ph(__mmask8 __U, __m256 __A, __m256 __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtx_round2ps_ph( + // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256( + return _mm256_maskz_cvtx_round2ps_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128i test_mm_cvtbiasph_pbf8(__m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_cvtbiasph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8128( + return _mm_cvtbiasph_pbf8(__A, __B); +} + +__m128i test_mm_mask_cvtbiasph_pbf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_cvtbiasph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8128( + return _mm_mask_cvtbiasph_pbf8(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_cvtbiasph_pbf8(__mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtbiasph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8128( + return _mm_maskz_cvtbiasph_pbf8(__U, __A, __B); +} + +__m128i test_mm256_cvtbiasph_pbf8(__m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvtbiasph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8256( + return _mm256_cvtbiasph_pbf8(__A, __B); +} + +__m128i test_mm256_mask_cvtbiasph_pbf8(__m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvtbiasph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8256( + return _mm256_mask_cvtbiasph_pbf8(__W, __U, __A, __B); +} + +__m128i test_mm256_maskz_cvtbiasph_pbf8(__mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtbiasph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8256( + return _mm256_maskz_cvtbiasph_pbf8(__U, __A, __B); +} + +__m128i test_mm_cvtbiassph_pbf8(__m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_cvtbiassph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s128( + return _mm_cvtbiassph_pbf8(__A, __B); 
+} + +__m128i test_mm_mask_cvtbiassph_pbf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_cvtbiassph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s128( + return _mm_mask_cvtbiassph_pbf8(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_cvtbiassph_pbf8(__mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtbiassph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s128( + return _mm_maskz_cvtbiassph_pbf8(__U, __A, __B); +} + +__m128i test_mm256_cvtbiassph_pbf8(__m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvtbiassph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s256( + return _mm256_cvtbiassph_pbf8(__A, __B); +} + +__m128i test_mm256_mask_cvtbiassph_pbf8(__m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvtbiassph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s256( + return _mm256_mask_cvtbiassph_pbf8(__W, __U, __A, __B); +} + +__m128i test_mm256_maskz_cvtbiassph_pbf8(__mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtbiassph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s256( + return _mm256_maskz_cvtbiassph_pbf8(__U, __A, __B); +} + +__m128i test_mm_cvtbiasph_phf8(__m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_cvtbiasph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8128( + return _mm_cvtbiasph_phf8(__A, __B); +} + +__m128i test_mm_mask_cvtbiasph_phf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_cvtbiasph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8128( + return _mm_mask_cvtbiasph_phf8(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_cvtbiasph_phf8(__mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtbiasph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8128( + return 
_mm_maskz_cvtbiasph_phf8(__U, __A, __B); +} + +__m128i test_mm256_cvtbiasph_phf8(__m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvtbiasph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8256( + return _mm256_cvtbiasph_phf8(__A, __B); +} + +__m128i test_mm256_mask_cvtbiasph_phf8(__m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvtbiasph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8256( + return _mm256_mask_cvtbiasph_phf8(__W, __U, __A, __B); +} + +__m128i test_mm256_maskz_cvtbiasph_phf8(__mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtbiasph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8256( + return _mm256_maskz_cvtbiasph_phf8(__U, __A, __B); +} + +__m128i test_mm_cvtbiassph_phf8(__m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_cvtbiassph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s128( + return _mm_cvtbiassph_phf8(__A, __B); +} + +__m128i test_mm_mask_cvtbiassph_phf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_cvtbiassph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s128( + return _mm_mask_cvtbiassph_phf8(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_cvtbiassph_phf8(__mmask8 __U, __m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtbiassph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s128( + return _mm_maskz_cvtbiassph_phf8(__U, __A, __B); +} + +__m128i test_mm256_cvtbiassph_phf8(__m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvtbiassph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s256( + return _mm256_cvtbiassph_phf8(__A, __B); +} + +__m128i test_mm256_mask_cvtbiassph_phf8(__m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvtbiassph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s256( + 
return _mm256_mask_cvtbiassph_phf8(__W, __U, __A, __B); +} + +__m128i test_mm256_maskz_cvtbiassph_phf8(__mmask16 __U, __m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtbiassph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s256( + return _mm256_maskz_cvtbiassph_phf8(__U, __A, __B); +} + +__m128i test_mm_cvtne2ph_pbf8(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_cvtne2ph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8128( + return _mm_cvtne2ph_pbf8(__A, __B); +} + +__m128i test_mm_mask_cvtne2ph_pbf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_cvtne2ph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8128( + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + // CHECK: ret <2 x i64> %{{.*}} + return _mm_mask_cvtne2ph_pbf8(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_cvtne2ph_pbf8(__mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtne2ph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8128( + // CHECK: zeroinitializer + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm_maskz_cvtne2ph_pbf8(__U, __A, __B); +} + +__m256i test_mm256_cvtne2ph_pbf8(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvtne2ph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8256( + return _mm256_cvtne2ph_pbf8(__A, __B); +} + +__m256i test_mm256_mask_cvtne2ph_pbf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvtne2ph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8256( + // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} + // CHECK: ret <4 x i64> %{{.*}} + return _mm256_mask_cvtne2ph_pbf8(__W, __U, __A, __B); +} + +__m256i test_mm256_maskz_cvtne2ph_pbf8(__mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtne2ph_pbf8( + // CHECK: call <32 x i8> 
@llvm.x86.avx10.vcvtne2ph2bf8256( + // CHECK: zeroinitializer + // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} + return _mm256_maskz_cvtne2ph_pbf8(__U, __A, __B); +} + +__m128i test_mm_cvtnes2ph_pbf8(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_cvtnes2ph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s128( + return _mm_cvtnes2ph_pbf8(__A, __B); +} + +__m128i test_mm_mask_cvtnes2ph_pbf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_cvtnes2ph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s128( + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + // CHECK: ret <2 x i64> %{{.*}} + return _mm_mask_cvtnes2ph_pbf8(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_cvtnes2ph_pbf8(__mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtnes2ph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s128( + // CHECK: zeroinitializer + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm_maskz_cvtnes2ph_pbf8(__U, __A, __B); +} + +__m256i test_mm256_cvtnes2ph_pbf8(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvtnes2ph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s256( + return _mm256_cvtnes2ph_pbf8(__A, __B); +} + +__m256i test_mm256_mask_cvtnes2ph_pbf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvtnes2ph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s256( + // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} + // CHECK: ret <4 x i64> %{{.*}} + return _mm256_mask_cvtnes2ph_pbf8(__W, __U, __A, __B); +} + +__m256i test_mm256_maskz_cvtnes2ph_pbf8(__mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtnes2ph_pbf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s256( + // CHECK: zeroinitializer + // CHECK: select <32 x i1> %{{.*}}, <32 x i8> 
%{{.*}}, <32 x i8> %{{.*}} + return _mm256_maskz_cvtnes2ph_pbf8(__U, __A, __B); +} + +__m128i test_mm_cvtne2ph_phf8(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_cvtne2ph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8128( + return _mm_cvtne2ph_phf8(__A, __B); +} + +__m128i test_mm_mask_cvtne2ph_phf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_cvtne2ph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8128( + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + // CHECK: ret <2 x i64> %{{.*}} + return _mm_mask_cvtne2ph_phf8(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_cvtne2ph_phf8(__mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtne2ph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8128( + // CHECK: zeroinitializer + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm_maskz_cvtne2ph_phf8(__U, __A, __B); +} + +__m256i test_mm256_cvtne2ph_phf8(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvtne2ph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8256( + return _mm256_cvtne2ph_phf8(__A, __B); +} + +__m256i test_mm256_mask_cvtne2ph_phf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvtne2ph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8256( + // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} + // CHECK: ret <4 x i64> %{{.*}} + return _mm256_mask_cvtne2ph_phf8(__W, __U, __A, __B); +} + +__m256i test_mm256_maskz_cvtne2ph_phf8(__mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtne2ph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8256( + // CHECK: zeroinitializer + // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} + return _mm256_maskz_cvtne2ph_phf8(__U, __A, __B); +} + +__m128i test_mm_cvtnes2ph_phf8(__m128h __A, 
__m128h __B) { + // CHECK-LABEL: @test_mm_cvtnes2ph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s128( + return _mm_cvtnes2ph_phf8(__A, __B); +} + +__m128i test_mm_mask_cvtnes2ph_phf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_cvtnes2ph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s128( + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + // CHECK: ret <2 x i64> %{{.*}} + return _mm_mask_cvtnes2ph_phf8(__W, __U, __A, __B); +} + +__m128i test_mm_maskz_cvtnes2ph_phf8(__mmask16 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtnes2ph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s128( + // CHECK: zeroinitializer + // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + return _mm_maskz_cvtnes2ph_phf8(__U, __A, __B); +} + +__m256i test_mm256_cvtnes2ph_phf8(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_cvtnes2ph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s256( + return _mm256_cvtnes2ph_phf8(__A, __B); +} + +__m256i test_mm256_mask_cvtnes2ph_phf8(__m256i __W, __mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_cvtnes2ph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s256( + // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} + // CHECK: ret <4 x i64> %{{.*}} + return _mm256_mask_cvtnes2ph_phf8(__W, __U, __A, __B); +} + +__m256i test_mm256_maskz_cvtnes2ph_phf8(__mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtnes2ph_phf8( + // CHECK: call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s256( + // CHECK: zeroinitializer + // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} + return _mm256_maskz_cvtnes2ph_phf8(__U, __A, __B); +} + +__m128h test_mm_cvtnehf8_ph(__m128i __A) { + // CHECK-LABEL: @test_mm_cvtnehf8_ph( + // CHECK: call <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128( + return 
_mm_cvtnehf8_ph(__A); +} + +__m128h test_mm_mask_cvtnehf8_ph(__m128h __A, __mmask8 __B, __m128i __C) { + // CHECK-LABEL: @test_mm_mask_cvtnehf8_ph( + // CHECK: call <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128( + return _mm_mask_cvtnehf8_ph(__A, __B, __C); +} + +__m128h test_mm_maskz_cvtnehf8_ph(__mmask8 __A, __m128i __B) { + // CHECK-LABEL: @test_mm_maskz_cvtnehf8_ph( + // CHECK: call <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128( + return _mm_maskz_cvtnehf8_ph(__A, __B); +} + +__m256h test_mm256_cvtnehf8_ph(__m128i __A) { + // CHECK-LABEL: @test_mm256_cvtnehf8_ph( + // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256( + return _mm256_cvtnehf8_ph(__A); +} + +__m256h test_mm256_mask_cvtnehf8_ph(__m256h __A, __mmask16 __B, __m128i __C) { + // CHECK-LABEL: @test_mm256_mask_cvtnehf8_ph( + // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256( + return _mm256_mask_cvtnehf8_ph(__A, __B, __C); +} + +__m256h test_mm256_maskz_cvtnehf8_ph(__mmask16 __A, __m128i __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtnehf8_ph( + // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256( + return _mm256_maskz_cvtnehf8_ph(__A, __B); +} + +__m128i test_mm_cvtneph_pbf8(__m128h __A) { + // CHECK-LABEL: @test_mm_cvtneph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8128( + return _mm_cvtneph_pbf8(__A); +} + +__m128i test_mm_mask_cvtneph_pbf8(__m128i __A, __mmask8 __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_cvtneph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8128( + return _mm_mask_cvtneph_pbf8(__A, __B, __C); +} + +__m128i test_mm_maskz_cvtneph_pbf8(__mmask8 __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtneph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8128( + return _mm_maskz_cvtneph_pbf8(__A, __B); +} + +__m128i test_mm256_cvtneph_pbf8(__m256h __A) { + // CHECK-LABEL: @test_mm256_cvtneph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8256( + return 
_mm256_cvtneph_pbf8(__A); +} + +__m128i test_mm256_mask_cvtneph_pbf8(__m128i __A, __mmask16 __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_cvtneph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8256( + return _mm256_mask_cvtneph_pbf8(__A, __B, __C); +} + +__m128i test_mm256_maskz_cvtneph_pbf8(__mmask16 __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtneph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8256( + return _mm256_maskz_cvtneph_pbf8(__A, __B); +} + +__m128i test_mm_cvtnesph_pbf8(__m128h __A) { + // CHECK-LABEL: @test_mm_cvtnesph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s128( + return _mm_cvtnesph_pbf8(__A); +} + +__m128i test_mm_mask_cvtnesph_pbf8(__m128i __A, __mmask8 __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_cvtnesph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s128( + return _mm_mask_cvtnesph_pbf8(__A, __B, __C); +} + +__m128i test_mm_maskz_cvtnesph_pbf8(__mmask8 __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtnesph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s128( + return _mm_maskz_cvtnesph_pbf8(__A, __B); +} + +__m128i test_mm256_cvtnesph_pbf8(__m256h __A) { + // CHECK-LABEL: @test_mm256_cvtnesph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s256( + return _mm256_cvtnesph_pbf8(__A); +} + +__m128i test_mm256_mask_cvtnesph_pbf8(__m128i __A, __mmask16 __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_cvtnesph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s256( + return _mm256_mask_cvtnesph_pbf8(__A, __B, __C); +} + +__m128i test_mm256_maskz_cvtnesph_pbf8(__mmask16 __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtnesph_pbf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s256( + return _mm256_maskz_cvtnesph_pbf8(__A, __B); +} + +__m128i test_mm_cvtneph_phf8(__m128h __A) { + // CHECK-LABEL: @test_mm_cvtneph_phf8( + // CHECK: call <16 x i8> 
@llvm.x86.avx10.mask.vcvtneph2hf8128( + return _mm_cvtneph_phf8(__A); +} + +__m128i test_mm_mask_cvtneph_phf8(__m128i __A, __mmask8 __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_cvtneph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8128( + return _mm_mask_cvtneph_phf8(__A, __B, __C); +} + +__m128i test_mm_maskz_cvtneph_phf8(__mmask8 __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtneph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8128( + return _mm_maskz_cvtneph_phf8(__A, __B); +} + +__m128i test_mm256_cvtneph_phf8(__m256h __A) { + // CHECK-LABEL: @test_mm256_cvtneph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8256( + return _mm256_cvtneph_phf8(__A); +} + +__m128i test_mm256_mask_cvtneph_phf8(__m128i __A, __mmask16 __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_cvtneph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8256( + return _mm256_mask_cvtneph_phf8(__A, __B, __C); +} + +__m128i test_mm256_maskz_cvtneph_phf8(__mmask16 __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtneph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8256( + return _mm256_maskz_cvtneph_phf8(__A, __B); +} + +__m128i test_mm_cvtnesph_phf8(__m128h __A) { + // CHECK-LABEL: @test_mm_cvtnesph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s128( + return _mm_cvtnesph_phf8(__A); +} + +__m128i test_mm_mask_cvtnesph_phf8(__m128i __A, __mmask8 __B, __m128h __C) { + // CHECK-LABEL: @test_mm_mask_cvtnesph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s128( + return _mm_mask_cvtnesph_phf8(__A, __B, __C); +} + +__m128i test_mm_maskz_cvtnesph_phf8(__mmask8 __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_cvtnesph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s128( + return _mm_maskz_cvtnesph_phf8(__A, __B); +} + +__m128i test_mm256_cvtnesph_phf8(__m256h __A) { + // CHECK-LABEL: @test_mm256_cvtnesph_phf8( + // CHECK: call 
<16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s256( + return _mm256_cvtnesph_phf8(__A); +} + +__m128i test_mm256_mask_cvtnesph_phf8(__m128i __A, __mmask16 __B, __m256h __C) { + // CHECK-LABEL: @test_mm256_mask_cvtnesph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s256( + return _mm256_mask_cvtnesph_phf8(__A, __B, __C); +} + +__m128i test_mm256_maskz_cvtnesph_phf8(__mmask16 __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_cvtnesph_phf8( + // CHECK: call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s256( + return _mm256_maskz_cvtnesph_phf8(__A, __B); +} + +__m256h test_mm256_cvtpbf8_ph(__m128i A) { + // CHECK-LABEL: @test_mm256_cvtpbf8_ph + // CHECK: sext <16 x i8> %{{.*}} to <16 x i16> + // CHECK: @llvm.x86.avx2.pslli.w + // CHECK: ret <16 x half> %{{.*}} + return _mm256_cvtpbf8_ph(A); +} + +__m256h test_mm256_mask_cvtpbf8_ph(__m256h S, __mmask16 M, __m128i A) { + // CHECK-LABEL: @test_mm256_mask_cvtpbf8_ph + // CHECK: sext <16 x i8> %{{.*}} to <16 x i16> + // CHECK: @llvm.x86.avx2.pslli.w + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + // CHECK: ret <16 x half> %{{.*}} + return _mm256_mask_cvtpbf8_ph(S, M, A); +} + +__m256h test_mm256_maskz_cvtpbf8_ph(__mmask16 M, __m128i A) { + // CHECK-LABEL: @test_mm256_maskz_cvtpbf8_ph + // CHECK: sext <16 x i8> %{{.*}} to <16 x i16> + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + // CHECK: @llvm.x86.avx2.pslli.w + // CHECK: ret <16 x half> %{{.*}} + return _mm256_maskz_cvtpbf8_ph(M, A); +} + +__m128h test_mm_cvtpbf8_ph(__m128i A) { + // CHECK-LABEL: @test_mm_cvtpbf8_ph + // CHECK: sext <8 x i8> %{{.*}} to <8 x i16> + // CHECK: @llvm.x86.sse2.pslli.w + // CHECK: ret <8 x half> %{{.*}} + return _mm_cvtpbf8_ph(A); +} + +__m128h test_mm_mask_cvtpbf8_ph(__m128h S, __mmask8 M, __m128i A) { + // CHECK-LABEL: @test_mm_mask_cvtpbf8_ph + // CHECK: sext <8 x i8> %{{.*}} to <8 x i16> + // CHECK: @llvm.x86.sse2.pslli.w + // CHECK: select <8 x i1> %{{.*}}, 
<8 x i16> %{{.*}}, <8 x i16> %{{.*}} + // CHECK: ret <8 x half> %{{.*}} + return _mm_mask_cvtpbf8_ph(S, M, A); +} + +__m128h test_mm_maskz_cvtpbf8_ph(__mmask8 M, __m128i A) { + // CHECK-LABEL: @test_mm_maskz_cvtpbf8_ph + // CHECK: sext <8 x i8> %{{.*}} to <8 x i16> + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} + // CHECK: @llvm.x86.sse2.pslli.w + // CHECK: ret <8 x half> %{{.*}} + return _mm_maskz_cvtpbf8_ph(M, A); +} diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 1ab2002f7f6960..8d000ed1e4f859 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -7089,3 +7089,133 @@ def int_x86_avx10_mask_vcvttps2iubs512 : ClangBuiltin<"__builtin_ia32_vcvttps2iu DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16f32_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; } + +//===----------------------------------------------------------------------===// +let TargetPrefix = "x86" in { +def int_x86_avx10_mask_vcvt2ps2phx_128 : ClangBuiltin<"__builtin_ia32_vcvt2ps2phx128_mask">, + DefaultAttrsIntrinsic<[llvm_v8f16_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v8f16_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvt2ps2phx_256 : ClangBuiltin<"__builtin_ia32_vcvt2ps2phx256_mask">, + DefaultAttrsIntrinsic<[llvm_v16f16_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v16f16_ty, llvm_i16_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; +def int_x86_avx10_mask_vcvt2ps2phx_512 : ClangBuiltin<"__builtin_ia32_vcvt2ps2phx512_mask">, + DefaultAttrsIntrinsic<[llvm_v32f16_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v32f16_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; +def int_x86_avx10_mask_vcvtbiasph2bf8128 : ClangBuiltin<"__builtin_ia32_vcvtbiasph2bf8_128_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v8f16_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtbiasph2bf8256 : 
ClangBuiltin<"__builtin_ia32_vcvtbiasph2bf8_256_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v32i8_ty, llvm_v16f16_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtbiasph2bf8512 : ClangBuiltin<"__builtin_ia32_vcvtbiasph2bf8_512_mask">, + DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v64i8_ty, llvm_v32f16_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtbiasph2bf8s128 : ClangBuiltin<"__builtin_ia32_vcvtbiasph2bf8s_128_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v8f16_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtbiasph2bf8s256 : ClangBuiltin<"__builtin_ia32_vcvtbiasph2bf8s_256_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v32i8_ty, llvm_v16f16_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtbiasph2bf8s512 : ClangBuiltin<"__builtin_ia32_vcvtbiasph2bf8s_512_mask">, + DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v64i8_ty, llvm_v32f16_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtbiasph2hf8128 : ClangBuiltin<"__builtin_ia32_vcvtbiasph2hf8_128_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v8f16_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtbiasph2hf8256 : ClangBuiltin<"__builtin_ia32_vcvtbiasph2hf8_256_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v32i8_ty, llvm_v16f16_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtbiasph2hf8512 : ClangBuiltin<"__builtin_ia32_vcvtbiasph2hf8_512_mask">, + DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v64i8_ty, llvm_v32f16_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtbiasph2hf8s128 : ClangBuiltin<"__builtin_ia32_vcvtbiasph2hf8s_128_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v8f16_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtbiasph2hf8s256 : 
ClangBuiltin<"__builtin_ia32_vcvtbiasph2hf8s_256_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v32i8_ty, llvm_v16f16_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtbiasph2hf8s512 : ClangBuiltin<"__builtin_ia32_vcvtbiasph2hf8s_512_mask">, + DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v64i8_ty, llvm_v32f16_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_vcvtne2ph2bf8128 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2bf8_128">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v8f16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcvtne2ph2bf8256 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2bf8_256">, + DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v16f16_ty, llvm_v16f16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcvtne2ph2bf8512 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2bf8_512">, + DefaultAttrsIntrinsic<[llvm_v64i8_ty], [llvm_v32f16_ty, llvm_v32f16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcvtne2ph2bf8s128 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2bf8s_128">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v8f16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcvtne2ph2bf8s256 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2bf8s_256">, + DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v16f16_ty, llvm_v16f16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcvtne2ph2bf8s512 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2bf8s_512">, + DefaultAttrsIntrinsic<[llvm_v64i8_ty], [llvm_v32f16_ty, llvm_v32f16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcvtne2ph2hf8128 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2hf8_128">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v8f16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcvtne2ph2hf8256 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2hf8_256">, + DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v16f16_ty, llvm_v16f16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcvtne2ph2hf8512 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2hf8_512">, + DefaultAttrsIntrinsic<[llvm_v64i8_ty], [llvm_v32f16_ty, llvm_v32f16_ty], + 
[IntrNoMem]>; +def int_x86_avx10_vcvtne2ph2hf8s128 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2hf8s_128">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v8f16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcvtne2ph2hf8s256 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2hf8s_256">, + DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v16f16_ty, llvm_v16f16_ty], + [IntrNoMem]>; +def int_x86_avx10_vcvtne2ph2hf8s512 : ClangBuiltin<"__builtin_ia32_vcvtne2ph2hf8s_512">, + DefaultAttrsIntrinsic<[llvm_v64i8_ty], [llvm_v32f16_ty, llvm_v32f16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvthf82ph128 : ClangBuiltin<"__builtin_ia32_vcvthf8_2ph128_mask">, + DefaultAttrsIntrinsic<[llvm_v8f16_ty], [llvm_v16i8_ty, llvm_v8f16_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvthf82ph256 : ClangBuiltin<"__builtin_ia32_vcvthf8_2ph256_mask">, + DefaultAttrsIntrinsic<[llvm_v16f16_ty], [llvm_v16i8_ty, llvm_v16f16_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvthf82ph512 : ClangBuiltin<"__builtin_ia32_vcvthf8_2ph512_mask">, + DefaultAttrsIntrinsic<[llvm_v32f16_ty], [llvm_v32i8_ty, llvm_v32f16_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtneph2bf8128 : ClangBuiltin<"__builtin_ia32_vcvtneph2bf8_128_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtneph2bf8256 : ClangBuiltin<"__builtin_ia32_vcvtneph2bf8_256_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16f16_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtneph2bf8512 : ClangBuiltin<"__builtin_ia32_vcvtneph2bf8_512_mask">, + DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v32f16_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtneph2bf8s128 : ClangBuiltin<"__builtin_ia32_vcvtneph2bf8s_128_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtneph2bf8s256 : 
ClangBuiltin<"__builtin_ia32_vcvtneph2bf8s_256_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16f16_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtneph2bf8s512 : ClangBuiltin<"__builtin_ia32_vcvtneph2bf8s_512_mask">, + DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v32f16_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtneph2hf8128 : ClangBuiltin<"__builtin_ia32_vcvtneph2hf8_128_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtneph2hf8256 : ClangBuiltin<"__builtin_ia32_vcvtneph2hf8_256_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16f16_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtneph2hf8512 : ClangBuiltin<"__builtin_ia32_vcvtneph2hf8_512_mask">, + DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v32f16_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtneph2hf8s128 : ClangBuiltin<"__builtin_ia32_vcvtneph2hf8s_128_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v8f16_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtneph2hf8s256 : ClangBuiltin<"__builtin_ia32_vcvtneph2hf8s_256_mask">, + DefaultAttrsIntrinsic<[llvm_v16i8_ty], [llvm_v16f16_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; +def int_x86_avx10_mask_vcvtneph2hf8s512 : ClangBuiltin<"__builtin_ia32_vcvtneph2hf8s_512_mask">, + DefaultAttrsIntrinsic<[llvm_v32i8_ty], [llvm_v32f16_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; +} diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 169c955f0ba89f..da5ea50f80ce04 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -26121,6 +26121,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), {Src, PassThru, Mask}); } + case TRUNCATE2_TO_REG: { + SDValue 
Src = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue PassThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + + if (isAllOnesConstant(Mask)) + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2}); + + MVT Src2VT = Src2.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements()); + Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), + {Src, Src2, PassThru, Mask}); + } case CVTPS2PH_MASK: { SDValue Src = Op.getOperand(1); SDValue Rnd = Op.getOperand(2); @@ -33812,6 +33827,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VFPEXTS) NODE_NAME_CASE(VFPEXTS_SAE) NODE_NAME_CASE(VFPROUND) + NODE_NAME_CASE(VFPROUND2) + NODE_NAME_CASE(VFPROUND2_RND) NODE_NAME_CASE(STRICT_VFPROUND) NODE_NAME_CASE(VMFPROUND) NODE_NAME_CASE(VFPROUND_RND) @@ -34048,7 +34065,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVTS2UI) NODE_NAME_CASE(CVTS2SI_RND) NODE_NAME_CASE(CVTS2UI_RND) - NODE_NAME_CASE(CVTNE2PS2BF16) NODE_NAME_CASE(CVTNEPS2BF16) NODE_NAME_CASE(MCVTNEPS2BF16) NODE_NAME_CASE(DPBF16PS) @@ -34096,6 +34112,27 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVTTP2IUBS) NODE_NAME_CASE(CVTTP2IBS_SAE) NODE_NAME_CASE(CVTTP2IUBS_SAE) + NODE_NAME_CASE(VCVTNE2PH2BF8) + NODE_NAME_CASE(VCVTNE2PH2BF8S) + NODE_NAME_CASE(VCVTNE2PH2HF8) + NODE_NAME_CASE(VCVTNE2PH2HF8S) + NODE_NAME_CASE(VCVTBIASPH2BF8) + NODE_NAME_CASE(VCVTBIASPH2BF8S) + NODE_NAME_CASE(VCVTBIASPH2HF8) + NODE_NAME_CASE(VCVTBIASPH2HF8S) + NODE_NAME_CASE(VCVTNEPH2BF8) + NODE_NAME_CASE(VCVTNEPH2BF8S) + NODE_NAME_CASE(VCVTNEPH2HF8) + NODE_NAME_CASE(VCVTNEPH2HF8S) + NODE_NAME_CASE(VMCVTBIASPH2BF8) + NODE_NAME_CASE(VMCVTBIASPH2BF8S) + NODE_NAME_CASE(VMCVTBIASPH2HF8) + NODE_NAME_CASE(VMCVTBIASPH2HF8S) + NODE_NAME_CASE(VMCVTNEPH2BF8) + NODE_NAME_CASE(VMCVTNEPH2BF8S) + 
NODE_NAME_CASE(VMCVTNEPH2HF8) + NODE_NAME_CASE(VMCVTNEPH2HF8S) + NODE_NAME_CASE(VCVTHF82PH) NODE_NAME_CASE(AESENC128KL) NODE_NAME_CASE(AESDEC128KL) NODE_NAME_CASE(AESENC256KL) diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 2e7538cb3c1183..93d2b3e65742b2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -340,6 +340,9 @@ namespace llvm { // Vector FP round. VFPROUND, + // Convert TWO packed single data to one packed data + VFPROUND2, + VFPROUND2_RND, VFPROUND_RND, VFPROUNDS, VFPROUNDS_RND, @@ -618,6 +621,28 @@ namespace llvm { MPSADBW, + VCVTNE2PH2BF8, + VCVTNE2PH2BF8S, + VCVTNE2PH2HF8, + VCVTNE2PH2HF8S, + VCVTBIASPH2BF8, + VCVTBIASPH2BF8S, + VCVTBIASPH2HF8, + VCVTBIASPH2HF8S, + VCVTNEPH2BF8, + VCVTNEPH2BF8S, + VCVTNEPH2HF8, + VCVTNEPH2HF8S, + VMCVTBIASPH2BF8, + VMCVTBIASPH2BF8S, + VMCVTBIASPH2HF8, + VMCVTBIASPH2HF8S, + VMCVTNEPH2BF8, + VMCVTNEPH2BF8S, + VMCVTNEPH2HF8, + VMCVTNEPH2HF8S, + VCVTHF82PH, + // Compress and expand. COMPRESS, EXPAND, @@ -669,8 +694,6 @@ namespace llvm { MCVTUI2P, // Vector float to bfloat16. - // Convert TWO packed single data to one packed BF16 data - CVTNE2PS2BF16, // Convert packed single data to packed BF16 data CVTNEPS2BF16, // Masked version of above. 
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index fe381b37782629..a518347cfcd82e 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -624,3 +624,289 @@ defm VCVTTPS2IUBS : avx10_sat_cvt_base<0x6a, "vcvttps2iubs", SchedWriteVecIMul, avx512vl_i32_info, avx512vl_f32_info, X86vcvttp2iubsSAE>, AVX512PDIi8Base, T_MAP5, EVEX_CD8<32, CD8VF>; + +//------------------------------------------------- +// AVX10 CONVERT instructions +//------------------------------------------------- + +multiclass avx10_cvt2ps2ph_rc opc, string OpcodeStr, X86FoldableSchedWrite sched, + X86VectorVTInfo _Src, X86VectorVTInfo _, + SDNode OpNodeRnd> { + let Uses = [MXCSR] in + defm rrb : AVX512_maskable, + EVEX, VVVV, EVEX_B, EVEX_RC, PD, Sched<[sched]>; +} + +//TODO: Merge into avx512_binop_all, difference is rounding control added here. +multiclass avx10_cvt2ps2ph opc, string OpcodeStr, + X86SchedWriteWidths sched, + AVX512VLVectorVTInfo _SrcVTInfo, + AVX512VLVectorVTInfo _DstVTInfo, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in { + defm Z : avx512_binop_rm2, + avx10_cvt2ps2ph_rc, + EVEX_V512, EVEX_CD8<32, CD8VF>; + } + let Predicates = [HasAVX10_2] in { + defm Z256 : avx512_binop_rm2, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm Z128 : avx512_binop_rm2, + EVEX_V128, EVEX_CD8<32, CD8VF>; + } + + let Predicates = [HasAVX10_2], hasEVEX_U = 1 in { + defm Z256 : avx10_cvt2ps2ph_rc; + } +} + +defm VCVT2PS2PHX : avx10_cvt2ps2ph<0x67, "vcvt2ps2phx", + SchedWriteCvtPD2PS, + avx512vl_f32_info, avx512vl_f16_info, + X86vfpround2, X86vfpround2Rnd>, T8; + +defm VCVTNE2PH2BF8 : avx512_binop_all<0x74, "vcvtne2ph2bf8", SchedWriteCvtPD2PS, + avx512vl_f16_info, avx512vl_i8_info, + X86vcvtne2ph2bf8, [HasAVX10_2_512], [HasAVX10_2]>, + EVEX_CD8<16, CD8VF>, T8, XD; +defm VCVTNE2PH2BF8S : avx512_binop_all<0x74, "vcvtne2ph2bf8s", SchedWriteCvtPD2PS, + avx512vl_f16_info, avx512vl_i8_info, + 
X86vcvtne2ph2bf8s, [HasAVX10_2_512], [HasAVX10_2]>, + EVEX_CD8<16, CD8VF>, T_MAP5, XD; +defm VCVTNE2PH2HF8 : avx512_binop_all<0x18, "vcvtne2ph2hf8", SchedWriteCvtPD2PS, + avx512vl_f16_info, avx512vl_i8_info, + X86vcvtne2ph2hf8, [HasAVX10_2_512], [HasAVX10_2]>, + EVEX_CD8<16, CD8VF>, T_MAP5, XD; +defm VCVTNE2PH2HF8S : avx512_binop_all<0x1b, "vcvtne2ph2hf8s", SchedWriteCvtPD2PS, + avx512vl_f16_info, avx512vl_i8_info, + X86vcvtne2ph2hf8s, [HasAVX10_2_512], [HasAVX10_2]>, + EVEX_CD8<16, CD8VF>, T_MAP5, XD; + +//TODO: Merge into avx512_vcvt_fp, diffrence is one more source register here. +multiclass avx10_convert_3op_packed OpCode, string OpcodeStr, + X86VectorVTInfo vt_dst, X86VectorVTInfo vt_src1, + X86VectorVTInfo vt_src2, SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, X86FoldableSchedWrite sched, + string Broadcast = vt_src2.BroadcastStr, + X86MemOperand MemOp = vt_src2.MemOp, + RegisterClass MaskRC = vt_src2.KRCWM, + dag LdDAG = (vt_dst.VT (OpNode (vt_src1.VT vt_src1.RC:$src1), + (vt_src2.VT (vt_src2.LdFrag addr:$src2)))), + dag MaskLdDAG = (vt_dst.VT (MaskOpNode (vt_src1.VT vt_src1.RC:$src1), + (vt_src2.VT (vt_src2.LdFrag addr:$src2))))> { + defm rr : AVX512_maskable_cvt, + EVEX, VVVV, Sched<[sched]>; + let mayLoad = 1 in + defm rm : AVX512_maskable_cvt, + EVEX, VVVV, Sched<[sched]>; + + let mayLoad = 1 in + defm rmb : AVX512_maskable_cvt, + EVEX, VVVV, EVEX_B, Sched<[sched]>; +} + +//TODO: Merge into avx512_cvt_trunc +multiclass avx10_convert_3op OpCode, string OpcodeStr, + AVX512VLVectorVTInfo vt_dst, AVX512VLVectorVTInfo vt_src, + X86SchedWriteWidths sched, + SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, + PatFrag bcast128 = vt_src.info128.BroadcastLdFrag, + PatFrag loadVT128 = vt_src.info128.LdFrag, + RegisterClass maskRC128 = vt_src.info128.KRCWM> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx10_convert_3op_packed, + EVEX_V512, EVEX_CD8<16, CD8VF>; + let Predicates = [HasAVX10_2] in { + defm Z256 : avx10_convert_3op_packed, + 
EVEX_V256, EVEX_CD8<16, CD8VF>; + defm Z128 : avx10_convert_3op_packed, + EVEX_V128, EVEX_CD8<16, CD8VF>; + // Special patterns to allow use of MaskOpNode for masking 128 version. Instruction + // patterns have been disabled with null_frag. + def : Pat<(vt_dst.info128.VT (OpNode (vt_dst.info128.VT VR128X:$src1), + (vt_src.info128.VT VR128X:$src2))), + (!cast(NAME # "Z128rr") VR128X:$src1, VR128X:$src2)>; + def : Pat<(MaskOpNode (vt_dst.info128.VT VR128X:$src1), + (vt_src.info128.VT VR128X:$src2), + (vt_dst.info128.VT VR128X:$src0), maskRC128:$mask), + (!cast(NAME # "Z128rrk") VR128X:$src0, maskRC128:$mask, + VR128X:$src1, VR128X:$src2)>; + def : Pat<(MaskOpNode (vt_dst.info128.VT VR128X:$src1), + (vt_src.info128.VT VR128X:$src2), + vt_dst.info128.ImmAllZerosV, maskRC128:$mask), + (!cast(NAME # "Z128rrkz") maskRC128:$mask, + VR128X:$src1, VR128X:$src2)>; + + def : Pat<(vt_dst.info128.VT (OpNode (vt_dst.info128.VT VR128X:$src1), + (loadVT128 addr:$src2))), + (!cast(NAME # "Z128rm") VR128X:$src1, addr:$src2)>; + def : Pat<(MaskOpNode (vt_dst.info128.VT VR128X:$src1), + (loadVT128 addr:$src2), + (vt_dst.info128.VT VR128X:$src0), + maskRC128:$mask), + (!cast(NAME # "Z128rmk") VR128X:$src0, maskRC128:$mask, + VR128X:$src1, addr:$src2)>; + def : Pat<(MaskOpNode (vt_dst.info128.VT VR128X:$src1), + (loadVT128 addr:$src2), + vt_dst.info128.ImmAllZerosV, + maskRC128:$mask), + (!cast(NAME # "Z128rmkz") maskRC128:$mask, + VR128X:$src1, addr:$src2)>; + + def : Pat<(vt_dst.info128.VT (OpNode (vt_dst.info128.VT VR128X:$src1), + (vt_src.info128.VT (bcast128 addr:$src2)))), + (!cast(NAME # "Z128rmb") VR128X:$src1, addr:$src2)>; + def : Pat<(MaskOpNode (vt_dst.info128.VT VR128X:$src1), + (vt_src.info128.VT (bcast128 addr:$src2)), + (vt_dst.info128.VT VR128X:$src0), maskRC128:$mask), + (!cast(NAME # "Z128rmbk") VR128X:$src0, maskRC128:$mask, + VR128X:$src1, addr:$src2)>; + def : Pat<(MaskOpNode (vt_dst.info128.VT VR128X:$src1), + (vt_src.info128.VT (bcast128 addr:$src2)), + 
vt_dst.info128.ImmAllZerosV, maskRC128:$mask), + (!cast(NAME # "Z128rmbkz") maskRC128:$mask, + VR128X:$src1, addr:$src2)>; + } +} + +defm VCVTBIASPH2BF8 : avx10_convert_3op<0x74, "vcvtbiasph2bf8", + avx512vl_i8_info, avx512vl_f16_info, + SchedWriteCvtPD2PS, + X86vcvtbiasph2bf8, X86vmcvtbiasph2bf8>, + T8, PS; +defm VCVTBIASPH2BF8S : avx10_convert_3op<0x74, "vcvtbiasph2bf8s", + avx512vl_i8_info, avx512vl_f16_info, + SchedWriteCvtPD2PS, + X86vcvtbiasph2bf8s, X86vmcvtbiasph2bf8s>, + T_MAP5, PS; +defm VCVTBIASPH2HF8 : avx10_convert_3op<0x18, "vcvtbiasph2hf8", + avx512vl_i8_info, avx512vl_f16_info, + SchedWriteCvtPD2PS, + X86vcvtbiasph2hf8, X86vmcvtbiasph2hf8>, + T_MAP5, PS; +defm VCVTBIASPH2HF8S : avx10_convert_3op<0x1b, "vcvtbiasph2hf8s", + avx512vl_i8_info, avx512vl_f16_info, + SchedWriteCvtPD2PS, + X86vcvtbiasph2hf8s, X86vmcvtbiasph2hf8s>, + T_MAP5, PS; + +defm VCVTNEPH2BF8 : avx512_cvt_trunc_ne<0x74, "vcvtneph2bf8", avx512vl_i8_info, + avx512vl_f16_info, SchedWriteCvtPD2PS, + X86vcvtneph2bf8, X86vmcvtneph2bf8, + [HasAVX10_2], [HasAVX10_2_512]>, + T8, XS, EVEX_CD8<16, CD8VF>; + +defm VCVTNEPH2BF8S : avx512_cvt_trunc_ne<0x74, "vcvtneph2bf8s", avx512vl_i8_info, + avx512vl_f16_info, SchedWriteCvtPD2PS, + X86vcvtneph2bf8s, X86vmcvtneph2bf8s, + [HasAVX10_2], [HasAVX10_2_512]>, + T_MAP5, XS, EVEX_CD8<16, CD8VF>; + +defm VCVTNEPH2HF8 : avx512_cvt_trunc_ne<0x18, "vcvtneph2hf8", avx512vl_i8_info, + avx512vl_f16_info, SchedWriteCvtPD2PS, + X86vcvtneph2hf8, X86vmcvtneph2hf8, + [HasAVX10_2], [HasAVX10_2_512]>, + T_MAP5, XS, EVEX_CD8<16, CD8VF>; + +defm VCVTNEPH2HF8S : avx512_cvt_trunc_ne<0x1b, "vcvtneph2hf8s", avx512vl_i8_info, + avx512vl_f16_info, SchedWriteCvtPD2PS, + X86vcvtneph2hf8s, X86vmcvtneph2hf8s, + [HasAVX10_2], [HasAVX10_2_512]>, + T_MAP5, XS, EVEX_CD8<16, CD8VF>; + +multiclass avx10_convert_2op_nomb_packed opc, string OpcodeStr, + X86VectorVTInfo _dest, X86VectorVTInfo _src, + SDNode OpNode, X86MemOperand x86memop, + X86FoldableSchedWrite sched, + dag ld_dag = (load 
addr:$src)> { + let ExeDomain = _dest.ExeDomain in { + defm rr : AVX512_maskable_split, + Sched<[sched]>; + defm rm : AVX512_maskable_split, + Sched<[sched.Folded]>; + } +} + +multiclass avx10_convert_2op_nomb opc, SDNode OpNode> { + let Predicates = [HasAVX10_2_512] in + defm Z : avx10_convert_2op_nomb_packed, EVEX_V512; + let Predicates = [HasAVX10_2] in { + defm Z128 : avx10_convert_2op_nomb_packed, EVEX_V128; + defm Z256 : avx10_convert_2op_nomb_packed, EVEX_V256; + } +} + +defm VCVTHF82PH : avx10_convert_2op_nomb<"vcvthf82ph", avx512vl_f16_info, + avx512vl_i8_info, 0x1e, X86vcvthf82ph>, + AVX512XDIi8Base, T_MAP5, EVEX, EVEX_CD8<16, CD8VH>; diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index a606962a553809..88d1eb59862433 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -4855,15 +4855,16 @@ multiclass avx512_binop_all opc, string OpcodeStr, X86SchedWriteWidths sched, AVX512VLVectorVTInfo _SrcVTInfo, AVX512VLVectorVTInfo _DstVTInfo, - SDNode OpNode, Predicate prd, + SDNode OpNode, list prds512, + list prds, X86VectorVTInfo _VTInfo512 = _SrcVTInfo.info512, X86VectorVTInfo _VTInfo256 = _SrcVTInfo.info256, X86VectorVTInfo _VTInfo128 = _SrcVTInfo.info128> { - let Predicates = [prd] in + let Predicates = prds512 in defm NAME#Z : avx512_binop_rm2, EVEX_V512; - let Predicates = [HasVLX, prd] in { + let Predicates = prds in { defm NAME#Z256 : avx512_binop_rm2, EVEX_V256; @@ -4875,8 +4876,8 @@ multiclass avx512_binop_all opc, string OpcodeStr, defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SchedWriteVecALU, avx512vl_i8_info, avx512vl_i8_info, - X86multishift, HasVBMI, v8i64_info, - v4i64x_info, v2i64x_info>, T8, + X86multishift, [HasVBMI], [HasVLX, HasVBMI], + v8i64_info, v4i64x_info, v2i64x_info>, T8, EVEX_CD8<64, CD8VF>, REX_W; multiclass avx512_packs_rmb opc, string OpcodeStr, SDNode OpNode, @@ -12676,27 +12677,35 @@ let ExeDomain = SSEPackedSingle in defm 
VCVTNE2PS2BF16 : avx512_binop_all<0x72, "vcvtne2ps2bf16", SchedWriteCvtPD2PS, //FIXME: Should be SchedWriteCvtPS2BF avx512vl_f32_info, avx512vl_bf16_info, - X86cvtne2ps2bf16, HasBF16>, T8, XD, + X86vfpround2, [HasBF16], [HasVLX, HasBF16]>, T8, XD, EVEX_CD8<32, CD8VF>; -// Truncate Float to BFloat16 -multiclass avx512_cvtps2bf16 opc, string OpcodeStr, - X86SchedWriteWidths sched> { +// Truncate Float to BFloat16, Float16 to BF8/HF8[,S] +multiclass avx512_cvt_trunc_ne opc, string OpcodeStr, + AVX512VLVectorVTInfo vt_dst, + AVX512VLVectorVTInfo vt_src, + X86SchedWriteWidths sched, + SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, + list prds, list prds512, + PatFrag bcast128 = vt_src.info128.BroadcastLdFrag, + PatFrag loadVT128 = vt_src.info128.LdFrag, + RegisterClass maskRC128 = vt_src.info128.KRCWM> { let ExeDomain = SSEPackedSingle in { - let Predicates = [HasBF16], Uses = [], mayRaiseFPException = 0 in { - defm Z : avx512_vcvt_fp, EVEX_V512; + let Predicates = prds512, Uses = [], mayRaiseFPException = 0 in { + defm Z : avx512_vcvt_fp, EVEX_V512; } - let Predicates = [HasBF16, HasVLX] in { + let Predicates = prds in { let Uses = [], mayRaiseFPException = 0 in { - defm Z128 : avx512_vcvt_fp, EVEX_V128; - defm Z256 : avx512_vcvt_fp, EVEX_V256; + defm Z128 : avx512_vcvt_fp, EVEX_V128; + defm Z256 : avx512_vcvt_fp, EVEX_V256; } - } // Predicates = [HasBF16, HasVLX] + } // Predicates = prds } // ExeDomain = SSEPackedSingle def : InstAlias opc, string OpcodeStr, def : InstAlias(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0, "intel">; + + let Predicates = prds in { + // Special patterns to allow use of MaskOpNode for masking 128 version. Instruction + // patterns have been disabled with null_frag. 
+ def : Pat<(vt_dst.info128.VT (OpNode (vt_src.info128.VT VR128X:$src))), + (!cast(NAME # "Z128rr") VR128X:$src)>; + def : Pat<(MaskOpNode (vt_src.info128.VT VR128X:$src), (vt_dst.info128.VT VR128X:$src0), + maskRC128:$mask), + (!cast(NAME # "Z128rrk") VR128X:$src0, maskRC128:$mask, VR128X:$src)>; + def : Pat<(MaskOpNode (vt_src.info128.VT VR128X:$src), vt_dst.info128.ImmAllZerosV, + maskRC128:$mask), + (!cast(NAME # "Z128rrkz") maskRC128:$mask, VR128X:$src)>; + + def : Pat<(vt_dst.info128.VT (OpNode (loadVT128 addr:$src))), + (!cast(NAME # "Z128rm") addr:$src)>; + def : Pat<(MaskOpNode (loadVT128 addr:$src), (vt_dst.info128.VT VR128X:$src0), + maskRC128:$mask), + (!cast(NAME # "Z128rmk") VR128X:$src0, maskRC128:$mask, addr:$src)>; + def : Pat<(MaskOpNode (loadVT128 addr:$src), vt_dst.info128.ImmAllZerosV, + maskRC128:$mask), + (!cast(NAME # "Z128rmkz") maskRC128:$mask, addr:$src)>; + + def : Pat<(vt_dst.info128.VT (OpNode (vt_src.info128.VT (bcast128 addr:$src)))), + (!cast(NAME # "Z128rmb") addr:$src)>; + def : Pat<(MaskOpNode (vt_src.info128.VT (bcast128 addr:$src)), + (vt_dst.info128.VT VR128X:$src0), maskRC128:$mask), + (!cast(NAME # "Z128rmbk") VR128X:$src0, maskRC128:$mask, addr:$src)>; + def : Pat<(MaskOpNode (vt_src.info128.VT (bcast128 addr:$src)), + vt_dst.info128.ImmAllZerosV, maskRC128:$mask), + (!cast(NAME # "Z128rmbkz") maskRC128:$mask, addr:$src)>; + } } -defm VCVTNEPS2BF16 : avx512_cvtps2bf16<0x72, "vcvtneps2bf16", - SchedWriteCvtPD2PS>, T8, XS, - EVEX_CD8<32, CD8VF>; +defm VCVTNEPS2BF16 : avx512_cvt_trunc_ne<0x72, "vcvtneps2bf16", + avx512vl_bf16_info, avx512vl_f32_info, + SchedWriteCvtPD2PS, X86cvtneps2bf16, + X86mcvtneps2bf16, [HasBF16, HasVLX], + [HasBF16]>, T8, XS, EVEX_CD8<32, CD8VF>; let Predicates = [HasBF16, HasVLX] in { - // Special patterns to allow use of X86mcvtneps2bf16 for masking. Instruction - // patterns have been disabled with null_frag. 
- def : Pat<(v8bf16 (X86cvtneps2bf16 (v4f32 VR128X:$src))), - (VCVTNEPS2BF16Z128rr VR128X:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), (v8bf16 VR128X:$src0), - VK4WM:$mask), - (VCVTNEPS2BF16Z128rrk VR128X:$src0, VK4WM:$mask, VR128X:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), v8bf16x_info.ImmAllZerosV, - VK4WM:$mask), - (VCVTNEPS2BF16Z128rrkz VK4WM:$mask, VR128X:$src)>; - - def : Pat<(v8bf16 (X86cvtneps2bf16 (loadv4f32 addr:$src))), - (VCVTNEPS2BF16Z128rm addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), (v8bf16 VR128X:$src0), - VK4WM:$mask), - (VCVTNEPS2BF16Z128rmk VR128X:$src0, VK4WM:$mask, addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), v8bf16x_info.ImmAllZerosV, - VK4WM:$mask), - (VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>; - - def : Pat<(v8bf16 (X86cvtneps2bf16 (v4f32 - (X86VBroadcastld32 addr:$src)))), - (VCVTNEPS2BF16Z128rmb addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)), - (v8bf16 VR128X:$src0), VK4WM:$mask), - (VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)), - v8bf16x_info.ImmAllZerosV, VK4WM:$mask), - (VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>; - def : Pat<(v8bf16 (int_x86_vcvtneps2bf16128 (v4f32 VR128X:$src))), (VCVTNEPS2BF16Z128rr VR128X:$src)>; def : Pat<(v8bf16 (int_x86_vcvtneps2bf16128 (loadv4f32 addr:$src))), diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 6db1cf7c9ee1fd..e81f2a2fbb9512 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -146,6 +146,11 @@ def X86vfpround: SDNode<"X86ISD::VFPROUND", SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, SDTCisFP<1>, SDTCisVec<1>, SDTCisOpSmallerThanOp<0, 1>]>>; +def X86vfpround2 : SDNode<"X86ISD::VFPROUND2", + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>, + SDTCisFP<1>, SDTCisVec<1>, + 
SDTCisSameAs<1, 2>, + SDTCisOpSmallerThanOp<0, 1>]>>; def X86strict_vfpround: SDNode<"X86ISD::STRICT_VFPROUND", SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, @@ -783,11 +788,6 @@ def X86vminmaxsSae : SDNode<"X86ISD::VMINMAXS_SAE", SDTypeProfile<1, 3, [SDTCisS SDTCisSameAs<0,2>, SDTCisInt<3>]>>; // cvt fp to bfloat16 -def X86cvtne2ps2bf16 : SDNode<"X86ISD::CVTNE2PS2BF16", - SDTypeProfile<1, 2, [SDTCVecEltisVT<0, bf16>, - SDTCVecEltisVT<1, f32>, - SDTCisSameSizeAs<0,1>, - SDTCisSameAs<1,2>]>>; def X86mcvtneps2bf16 : SDNode<"X86ISD::MCVTNEPS2BF16", SDTypeProfile<1, 3, [SDTCVecEltisVT<0, bf16>, SDTCVecEltisVT<1, f32>, @@ -847,6 +847,86 @@ def X86vcvttp2iubs : SDNode<"X86ISD::CVTTP2IUBS", SDTFloatToInt>; def X86vcvttp2ibsSAE : SDNode<"X86ISD::CVTTP2IBS_SAE", SDTFloatToInt>; def X86vcvttp2iubsSAE : SDNode<"X86ISD::CVTTP2IUBS_SAE", SDTFloatToInt>; +def SDTAVX10CONVERT_I82F16 : SDTypeProfile<1, 2, [ + SDTCVecEltisVT<0, i8>, SDTCVecEltisVT<1, f16>, SDTCisSameAs<1, 2> +]>; + +def SDTAVX10CONVERT_F16I8 : SDTypeProfile<1, 1, [ + SDTCVecEltisVT<0, f16>, SDTCVecEltisVT<1, i8> +]>; + +def SDTAVX10CONVERT_I8F16 : SDTypeProfile<1, 1, [ + SDTCVecEltisVT<0, i8>, SDTCVecEltisVT<1, f16> +]>; + +def SDTAVX10CONVERT_I8F16_MASK : SDTypeProfile<1, 3, [ + SDTCVecEltisVT<0, i8>, SDTCVecEltisVT<1, f16>, + SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>, + SDTCisSameNumEltsAs<1, 3> +]>; + +def SDTAVX10CONVERT_2I8F16 : SDTypeProfile<1, 2, [ + SDTCVecEltisVT<0, i8>, SDTCVecEltisVT<1, i8>, SDTCVecEltisVT<2, f16> +]>; + +def SDTAVX10CONVERT_2I8F16_MASK : SDTypeProfile<1, 4, [ + SDTCVecEltisVT<0, i8>, SDTCisSameAs<0, 1>, + SDTCVecEltisVT<2, f16>, SDTCisSameAs<0, 3>, SDTCVecEltisVT<4, i1>, + SDTCisSameNumEltsAs<2, 4> +]>; + +def X86vfpround2Rnd : SDNode<"X86ISD::VFPROUND2_RND", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f16>, + SDTCVecEltisVT<1, f32>, + SDTCisSameAs<1, 2>, + SDTCisVT<3, i32>]>>; +// 3op +def X86vcvtne2ph2bf8 : SDNode<"X86ISD::VCVTNE2PH2BF8", + SDTAVX10CONVERT_I82F16>; +def 
X86vcvtne2ph2bf8s : SDNode<"X86ISD::VCVTNE2PH2BF8S", + SDTAVX10CONVERT_I82F16>; +def X86vcvtne2ph2hf8 : SDNode<"X86ISD::VCVTNE2PH2HF8", + SDTAVX10CONVERT_I82F16>; +def X86vcvtne2ph2hf8s : SDNode<"X86ISD::VCVTNE2PH2HF8S", + SDTAVX10CONVERT_I82F16>; +// 2op no broadcast +def X86vcvthf82ph : SDNode<"X86ISD::VCVTHF82PH", + SDTAVX10CONVERT_F16I8>; +// 2op +def X86vcvtbiasph2bf8 : SDNode<"X86ISD::VCVTBIASPH2BF8", + SDTAVX10CONVERT_2I8F16>; +def X86vcvtbiasph2bf8s : SDNode<"X86ISD::VCVTBIASPH2BF8S", + SDTAVX10CONVERT_2I8F16>; +def X86vcvtbiasph2hf8 : SDNode<"X86ISD::VCVTBIASPH2HF8", + SDTAVX10CONVERT_2I8F16>; +def X86vcvtbiasph2hf8s : SDNode<"X86ISD::VCVTBIASPH2HF8S", + SDTAVX10CONVERT_2I8F16>; +def X86vcvtneph2bf8 : SDNode<"X86ISD::VCVTNEPH2BF8", + SDTAVX10CONVERT_I8F16>; +def X86vcvtneph2bf8s : SDNode<"X86ISD::VCVTNEPH2BF8S", + SDTAVX10CONVERT_I8F16>; +def X86vcvtneph2hf8 : SDNode<"X86ISD::VCVTNEPH2HF8", + SDTAVX10CONVERT_I8F16>; +def X86vcvtneph2hf8s : SDNode<"X86ISD::VCVTNEPH2HF8S", + SDTAVX10CONVERT_I8F16>; + +def X86vmcvtbiasph2bf8 : SDNode<"X86ISD::VMCVTBIASPH2BF8", + SDTAVX10CONVERT_2I8F16_MASK>; +def X86vmcvtbiasph2bf8s : SDNode<"X86ISD::VMCVTBIASPH2BF8S", + SDTAVX10CONVERT_2I8F16_MASK>; +def X86vmcvtbiasph2hf8 : SDNode<"X86ISD::VMCVTBIASPH2HF8", + SDTAVX10CONVERT_2I8F16_MASK>; +def X86vmcvtbiasph2hf8s : SDNode<"X86ISD::VMCVTBIASPH2HF8S", + SDTAVX10CONVERT_2I8F16_MASK>; +def X86vmcvtneph2bf8 : SDNode<"X86ISD::VMCVTNEPH2BF8", + SDTAVX10CONVERT_I8F16_MASK>; +def X86vmcvtneph2bf8s : SDNode<"X86ISD::VMCVTNEPH2BF8S", + SDTAVX10CONVERT_I8F16_MASK>; +def X86vmcvtneph2hf8 : SDNode<"X86ISD::VMCVTNEPH2HF8", + SDTAVX10CONVERT_I8F16_MASK>; +def X86vmcvtneph2hf8s : SDNode<"X86ISD::VMCVTNEPH2HF8S", + SDTAVX10CONVERT_I8F16_MASK>; + //===----------------------------------------------------------------------===// // SSE pattern fragments //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h 
b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 47be08c8af3efe..68c1ce072549b9 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -62,6 +62,7 @@ enum IntrinsicType : uint16_t { INTR_TYPE_3OP_SCALAR_MASK_SAE, COMPRESS_EXPAND_IN_REG, TRUNCATE_TO_REG, + TRUNCATE2_TO_REG, CVTPS2PH_MASK, CVTPD2DQ_MASK, CVTQQ2PS_MASK, @@ -394,6 +395,66 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::CMPMM_SAE), X86_INTRINSIC_DATA(avx10_mask_vcmpps256, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE), + X86_INTRINSIC_DATA(avx10_mask_vcvt2ps2phx_128, INTR_TYPE_2OP_MASK, + X86ISD::VFPROUND2, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvt2ps2phx_256, INTR_TYPE_2OP_MASK, + X86ISD::VFPROUND2, X86ISD::VFPROUND2_RND), + X86_INTRINSIC_DATA(avx10_mask_vcvt2ps2phx_512, INTR_TYPE_2OP_MASK, + X86ISD::VFPROUND2, X86ISD::VFPROUND2_RND), + X86_INTRINSIC_DATA(avx10_mask_vcvtbiasph2bf8128, TRUNCATE2_TO_REG, + X86ISD::VCVTBIASPH2BF8, X86ISD::VMCVTBIASPH2BF8), + X86_INTRINSIC_DATA(avx10_mask_vcvtbiasph2bf8256, INTR_TYPE_2OP_MASK, + X86ISD::VCVTBIASPH2BF8, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtbiasph2bf8512, INTR_TYPE_2OP_MASK, + X86ISD::VCVTBIASPH2BF8, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtbiasph2bf8s128, TRUNCATE2_TO_REG, + X86ISD::VCVTBIASPH2BF8S, X86ISD::VMCVTBIASPH2BF8S), + X86_INTRINSIC_DATA(avx10_mask_vcvtbiasph2bf8s256, INTR_TYPE_2OP_MASK, + X86ISD::VCVTBIASPH2BF8S, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtbiasph2bf8s512, INTR_TYPE_2OP_MASK, + X86ISD::VCVTBIASPH2BF8S, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtbiasph2hf8128, TRUNCATE2_TO_REG, + X86ISD::VCVTBIASPH2HF8, X86ISD::VMCVTBIASPH2HF8), + X86_INTRINSIC_DATA(avx10_mask_vcvtbiasph2hf8256, INTR_TYPE_2OP_MASK, + X86ISD::VCVTBIASPH2HF8, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtbiasph2hf8512, INTR_TYPE_2OP_MASK, + X86ISD::VCVTBIASPH2HF8, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtbiasph2hf8s128, TRUNCATE2_TO_REG, + X86ISD::VCVTBIASPH2HF8S, X86ISD::VMCVTBIASPH2HF8S), + 
X86_INTRINSIC_DATA(avx10_mask_vcvtbiasph2hf8s256, INTR_TYPE_2OP_MASK, + X86ISD::VCVTBIASPH2HF8S, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtbiasph2hf8s512, INTR_TYPE_2OP_MASK, + X86ISD::VCVTBIASPH2HF8S, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvthf82ph128, INTR_TYPE_1OP_MASK, + X86ISD::VCVTHF82PH, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvthf82ph256, INTR_TYPE_1OP_MASK, + X86ISD::VCVTHF82PH, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvthf82ph512, INTR_TYPE_1OP_MASK, + X86ISD::VCVTHF82PH, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtneph2bf8128, TRUNCATE_TO_REG, + X86ISD::VCVTNEPH2BF8, X86ISD::VMCVTNEPH2BF8), + X86_INTRINSIC_DATA(avx10_mask_vcvtneph2bf8256, INTR_TYPE_1OP_MASK, + X86ISD::VCVTNEPH2BF8, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtneph2bf8512, INTR_TYPE_1OP_MASK, + X86ISD::VCVTNEPH2BF8, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtneph2bf8s128, TRUNCATE_TO_REG, + X86ISD::VCVTNEPH2BF8S, X86ISD::VMCVTNEPH2BF8S), + X86_INTRINSIC_DATA(avx10_mask_vcvtneph2bf8s256, INTR_TYPE_1OP_MASK, + X86ISD::VCVTNEPH2BF8S, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtneph2bf8s512, INTR_TYPE_1OP_MASK, + X86ISD::VCVTNEPH2BF8S, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtneph2hf8128, TRUNCATE_TO_REG, + X86ISD::VCVTNEPH2HF8, X86ISD::VMCVTNEPH2HF8), + X86_INTRINSIC_DATA(avx10_mask_vcvtneph2hf8256, INTR_TYPE_1OP_MASK, + X86ISD::VCVTNEPH2HF8, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtneph2hf8512, INTR_TYPE_1OP_MASK, + X86ISD::VCVTNEPH2HF8, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtneph2hf8s128, TRUNCATE_TO_REG, + X86ISD::VCVTNEPH2HF8S, X86ISD::VMCVTNEPH2HF8S), + X86_INTRINSIC_DATA(avx10_mask_vcvtneph2hf8s256, INTR_TYPE_1OP_MASK, + X86ISD::VCVTNEPH2HF8S, 0), + X86_INTRINSIC_DATA(avx10_mask_vcvtneph2hf8s512, INTR_TYPE_1OP_MASK, + X86ISD::VCVTNEPH2HF8S, 0), X86_INTRINSIC_DATA(avx10_mask_vcvtpd2dq256, INTR_TYPE_1OP_MASK, X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), X86_INTRINSIC_DATA(avx10_mask_vcvtpd2ph256, INTR_TYPE_1OP_MASK, @@ -594,6 +655,30 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FADD_RND), 
X86_INTRINSIC_DATA(avx10_vaddps256, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8128, INTR_TYPE_2OP, + X86ISD::VCVTNE2PH2BF8, 0), + X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8256, INTR_TYPE_2OP, + X86ISD::VCVTNE2PH2BF8, 0), + X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8512, INTR_TYPE_2OP, + X86ISD::VCVTNE2PH2BF8, 0), + X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8s128, INTR_TYPE_2OP, + X86ISD::VCVTNE2PH2BF8S, 0), + X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8s256, INTR_TYPE_2OP, + X86ISD::VCVTNE2PH2BF8S, 0), + X86_INTRINSIC_DATA(avx10_vcvtne2ph2bf8s512, INTR_TYPE_2OP, + X86ISD::VCVTNE2PH2BF8S, 0), + X86_INTRINSIC_DATA(avx10_vcvtne2ph2hf8128, INTR_TYPE_2OP, + X86ISD::VCVTNE2PH2HF8, 0), + X86_INTRINSIC_DATA(avx10_vcvtne2ph2hf8256, INTR_TYPE_2OP, + X86ISD::VCVTNE2PH2HF8, 0), + X86_INTRINSIC_DATA(avx10_vcvtne2ph2hf8512, INTR_TYPE_2OP, + X86ISD::VCVTNE2PH2HF8, 0), + X86_INTRINSIC_DATA(avx10_vcvtne2ph2hf8s128, INTR_TYPE_2OP, + X86ISD::VCVTNE2PH2HF8S, 0), + X86_INTRINSIC_DATA(avx10_vcvtne2ph2hf8s256, INTR_TYPE_2OP, + X86ISD::VCVTNE2PH2HF8S, 0), + X86_INTRINSIC_DATA(avx10_vcvtne2ph2hf8s512, INTR_TYPE_2OP, + X86ISD::VCVTNE2PH2HF8S, 0), X86_INTRINSIC_DATA(avx10_vcvtnebf162ibs128, INTR_TYPE_1OP, X86ISD::CVTP2IBS, 0), X86_INTRINSIC_DATA(avx10_vcvtnebf162ibs256, INTR_TYPE_1OP, X86ISD::CVTP2IBS, @@ -1473,11 +1558,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPSHUFBITQMB, 0), // bfloat16 X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_128, INTR_TYPE_2OP, - X86ISD::CVTNE2PS2BF16, 0), + X86ISD::VFPROUND2, 0), X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_256, INTR_TYPE_2OP, - X86ISD::CVTNE2PS2BF16, 0), + X86ISD::VFPROUND2, 0), X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_512, INTR_TYPE_2OP, - X86ISD::CVTNE2PS2BF16, 0), + X86ISD::VFPROUND2, 0), X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_256, INTR_TYPE_1OP, X86ISD::CVTNEPS2BF16, 0), X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_512, INTR_TYPE_1OP, diff --git 
a/llvm/test/CodeGen/X86/avx10_2_512convert-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512convert-intrinsics.ll new file mode 100644 index 00000000000000..e755b56f30d4c0 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10_2_512convert-intrinsics.ll @@ -0,0 +1,677 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86 + +define <32 x half> @test_int_x86_avx10_vcvt2ps2phx512(<16 x float> %A, <16 x float> %B) { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ps2phx512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvt2ps2phx %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x67,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <32 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.512(<16 x float> %A, <16 x float> %B, <32 x half> zeroinitializer, i32 -1, i32 4) + ret <32 x half> %ret +} + +define <32 x half> @test_int_x86_avx10_vcvt2ps2phx512_mask(<32 x half> %W, i32 %U, <16 x float> %A, <16 x float> %B) { +; X64-LABEL: test_int_x86_avx10_vcvt2ps2phx512_mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvt2ps2phx %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x67,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvt2ps2phx512_mask: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvt2ps2phx %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x67,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.512(<16 x float> %A, <16 x float> %B, <32 x half> %W, i32 %U, i32 4) + ret <32 x half> %ret +} + +define <32 x half> 
@test_int_x86_avx10_vcvt2ps2phx512_maskz(i32 %U, <16 x float> %A, <16 x float> %B) { +; X64-LABEL: test_int_x86_avx10_vcvt2ps2phx512_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvt2ps2phx %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x67,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvt2ps2phx512_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvt2ps2phx %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x67,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.512(<16 x float> %A, <16 x float> %B, <32 x half> zeroinitializer, i32 %U, i32 4) + ret <32 x half> %ret +} + +define <32 x half> @test_int_x86_avx10_vcvt2ps2phx512_round(<16 x float> %A, <16 x float> %B) { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ps2phx512_round: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvt2ps2phx {rz-sae}, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x78,0x67,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <32 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.512(<16 x float> %A, <16 x float> %B, <32 x half> zeroinitializer, i32 -1, i32 11) + ret <32 x half> %ret +} + +define <32 x half> @test_int_x86_avx10_vcvt2ps2phx512_round_mask(<32 x half> %W, i32 %U, <16 x float> %A, <16 x float> %B) { +; X64-LABEL: test_int_x86_avx10_vcvt2ps2phx512_round_mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvt2ps2phx {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x79,0x67,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvt2ps2phx512_round_mask: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvt2ps2phx {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x79,0x67,0xc2] 
+; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.512(<16 x float> %A, <16 x float> %B, <32 x half> %W, i32 %U, i32 11) + ret <32 x half> %ret +} + +define <32 x half> @test_int_x86_avx10_vcvt2ps2phx512_round_maskz(i32 %U, <16 x float> %A, <16 x float> %B) { +; X64-LABEL: test_int_x86_avx10_vcvt2ps2phx512_round_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvt2ps2phx {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xf9,0x67,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvt2ps2phx512_round_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvt2ps2phx {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xf9,0x67,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.512(<16 x float> %A, <16 x float> %B, <32 x half> zeroinitializer, i32 %U, i32 11) + ret <32 x half> %ret +} + +declare <32 x half> @llvm.x86.avx10.mask.vcvt2ps2phx512(<16 x float>, <16 x float>, i32, i32) + +define <32 x i8> @test_int_x86_avx10_vcvtbiasph2bf8512(<64 x i8> %A, <32 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtbiasph2bf8512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtbiasph2bf8 %zmm1, %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7c,0x48,0x74,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8512(<64 x i8> %A, <32 x half> %B, <32 x i8> undef, i32 -1) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_mask_vcvtbiasph2bf8512(<32 x i8> %W, i32 %U, <64 x i8> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtbiasph2bf8512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2bf8 %zmm2, %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x74,0xc2] +; 
X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtbiasph2bf8512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2bf8 %zmm2, %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x74,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8512(<64 x i8> %A, <32 x half> %B, <32 x i8> %W, i32 %U) + ret <32 x i8> %ret +} + +declare <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8512(<64 x i8> %A, <32 x half> %B, <32 x i8> %W, i32 %U) + +define <32 x i8> @test_int_x86_avx10_maskz_vcvtbiasph2bf8512(<64 x i8> %A, <32 x half> %B, i32 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2bf8512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2bf8 %zmm1, %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7c,0xc9,0x74,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2bf8512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2bf8 %zmm1, %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7c,0xc9,0x74,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8512(<64 x i8> %A, <32 x half> %B, <32 x i8> zeroinitializer, i32 %U) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_vcvtbiasph2bf8s512(<64 x i8> %A, <32 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtbiasph2bf8s512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtbiasph2bf8s %zmm1, %zmm0, %ymm0 # encoding: [0x62,0xf5,0x7c,0x48,0x74,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s512(<64 x i8> %A, <32 x half> %B, <32 x i8> undef, i32 -1) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_mask_vcvtbiasph2bf8s512(<32 x i8> %W, i32 %U, <64 x i8> %A, <32 
x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtbiasph2bf8s512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2bf8s %zmm2, %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x74,0x49,0x74,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtbiasph2bf8s512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2bf8s %zmm2, %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x74,0x49,0x74,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s512(<64 x i8> %A, <32 x half> %B, <32 x i8> %W, i32 %U) + ret <32 x i8> %ret +} + +declare <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s512(<64 x i8> %A, <32 x half> %B, <32 x i8> %W, i32 %U) + +define <32 x i8> @test_int_x86_avx10_maskz_vcvtbiasph2bf8s512(<64 x i8> %A, <32 x half> %B, i32 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2bf8s512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2bf8s %zmm1, %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xc9,0x74,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2bf8s512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2bf8s %zmm1, %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xc9,0x74,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s512(<64 x i8> %A, <32 x half> %B, <32 x i8> zeroinitializer, i32 %U) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_vcvtbiasph2hf8512(<64 x i8> %A, <32 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtbiasph2hf8512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtbiasph2hf8 %zmm1, %zmm0, %ymm0 # encoding: [0x62,0xf5,0x7c,0x48,0x18,0xc1] +; CHECK-NEXT: 
ret{{[l|q]}} # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8512(<64 x i8> %A, <32 x half> %B, <32 x i8> undef, i32 -1) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_mask_vcvtbiasph2hf8512(<32 x i8> %W, i32 %U, <64 x i8> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtbiasph2hf8512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2hf8 %zmm2, %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x74,0x49,0x18,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtbiasph2hf8512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2hf8 %zmm2, %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x74,0x49,0x18,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8512(<64 x i8> %A, <32 x half> %B, <32 x i8> %W, i32 %U) + ret <32 x i8> %ret +} + +declare <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8512(<64 x i8> %A, <32 x half> %B, <32 x i8> %W, i32 %U) + +define <32 x i8> @test_int_x86_avx10_maskz_vcvtbiasph2hf8512(<64 x i8> %A, <32 x half> %B, i32 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2hf8512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2hf8 %zmm1, %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xc9,0x18,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2hf8512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2hf8 %zmm1, %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xc9,0x18,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8512(<64 x i8> %A, <32 x half> %B, <32 x i8> zeroinitializer, i32 %U) + ret <32 x i8> %ret +} + +define <32 x i8> 
@test_int_x86_avx10_vcvtbiasph2hf8s512(<64 x i8> %A, <32 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtbiasph2hf8s512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtbiasph2hf8s %zmm1, %zmm0, %ymm0 # encoding: [0x62,0xf5,0x7c,0x48,0x1b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s512(<64 x i8> %A, <32 x half> %B, <32 x i8> undef, i32 -1) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_mask_vcvtbiasph2hf8s512(<32 x i8> %W, i32 %U, <64 x i8> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtbiasph2hf8s512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2hf8s %zmm2, %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x74,0x49,0x1b,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtbiasph2hf8s512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2hf8s %zmm2, %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x74,0x49,0x1b,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s512(<64 x i8> %A, <32 x half> %B, <32 x i8> %W, i32 %U) + ret <32 x i8> %ret +} + +declare <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s512(<64 x i8> %A, <32 x half> %B, <32 x i8> %W, i32 %U) + +define <32 x i8> @test_int_x86_avx10_maskz_vcvtbiasph2hf8s512(<64 x i8> %A, <32 x half> %B, i32 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2hf8s512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2hf8s %zmm1, %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xc9,0x1b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2hf8s512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2hf8s %zmm1, 
%zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xc9,0x1b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s512(<64 x i8> %A, <32 x half> %B, <32 x i8> zeroinitializer, i32 %U) + ret <32 x i8> %ret +} + +define <64 x i8> @test_int_x86_avx10_vcvtne2ph2bf8512(<32 x half> %A, <32 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2bf8512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtne2ph2bf8 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7f,0x48,0x74,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8512(<32 x half> %A, <32 x half> %B) + ret <64 x i8> %ret +} + +define <8 x i64> @test_int_x86_avx10_vcvtne2ph2bf8512_mask(<8 x i64> %C, i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8512_mask: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2bf8 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x74,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8512_mask: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2bf8 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x74,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8512(<32 x half> %A, <32 x half> %B) + %2 = bitcast <8 x i64> %C to <64 x i8> + %3 = bitcast i64 %U to <64 x i1> + %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> %2 + %5 = bitcast <64 x i8> %4 to <8 x i64> + ret <8 x i64> %5 +} + +define <8 x i64> @test_int_x86_avx10_vcvtne2ph2bf8512_maskz(i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8512_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2bf8 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: 
[0x62,0xf2,0x7f,0xc9,0x74,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8512_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2bf8 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xc9,0x74,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8512(<32 x half> %A, <32 x half> %B) + %3 = bitcast i64 %U to <64 x i1> + %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> zeroinitializer + %5 = bitcast <64 x i8> %4 to <8 x i64> + ret <8 x i64> %5 +} + +declare <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8512(<32 x half> %A, <32 x half> %B) + +define <64 x i8> @test_int_x86_avx10_vcvtne2ph2bf8s512(<32 x half> %A, <32 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtne2ph2bf8s %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7f,0x48,0x74,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s512(<32 x half> %A, <32 x half> %B) + ret <64 x i8> %ret +} + +declare <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s512(<32 x half> %A, <32 x half> %B) + +define <8 x i64> @test_int_x86_avx10_vcvtne2ph2bf8s512_mask(<8 x i64> %C, i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s512_mask: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2bf8s %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x74,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s512_mask: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2bf8s %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x74,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s512(<32 x 
half> %A, <32 x half> %B) + %2 = bitcast <8 x i64> %C to <64 x i8> + %3 = bitcast i64 %U to <64 x i1> + %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> %2 + %5 = bitcast <64 x i8> %4 to <8 x i64> + ret <8 x i64> %5 +} + +define <8 x i64> @test_int_x86_avx10_vcvtne2ph2bf8s512_maskz(i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s512_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2bf8s %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x74,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s512_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2bf8s %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x74,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s512(<32 x half> %A, <32 x half> %B) + %3 = bitcast i64 %U to <64 x i1> + %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> zeroinitializer + %5 = bitcast <64 x i8> %4 to <8 x i64> + ret <8 x i64> %5 +} + +define <64 x i8> @test_int_x86_avx10_vcvtne2ph2hf8512(<32 x half> %A, <32 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2hf8512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtne2ph2hf8 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7f,0x48,0x18,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8512(<32 x half> %A, <32 x half> %B) + ret <64 x i8> %ret +} + +define <8 x i64> @test_int_x86_avx10_vcvtne2ph2hf8512_mask(<8 x i64> %C, i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8512_mask: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2hf8 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x18,0xc2] +; X64-NEXT: retq # encoding: 
[0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8512_mask: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2hf8 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x18,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8512(<32 x half> %A, <32 x half> %B) + %2 = bitcast <8 x i64> %C to <64 x i8> + %3 = bitcast i64 %U to <64 x i1> + %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> %2 + %5 = bitcast <64 x i8> %4 to <8 x i64> + ret <8 x i64> %5 +} + +define <8 x i64> @test_int_x86_avx10_vcvtne2ph2hf8512_maskz(i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8512_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2hf8 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x18,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8512_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2hf8 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x18,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8512(<32 x half> %A, <32 x half> %B) + %3 = bitcast i64 %U to <64 x i1> + %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> zeroinitializer + %5 = bitcast <64 x i8> %4 to <8 x i64> + ret <8 x i64> %5 +} + +declare <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8512(<32 x half> %A, <32 x half> %B) + +define <64 x i8> @test_int_x86_avx10_vcvtne2ph2hf8s512(<32 x half> %A, <32 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtne2ph2hf8s %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7f,0x48,0x1b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <64 x i8> 
@llvm.x86.avx10.vcvtne2ph2hf8s512(<32 x half> %A, <32 x half> %B) + ret <64 x i8> %ret +} + +define <8 x i64> @test_int_x86_avx10_vcvtne2ph2hf8s512_mask(<8 x i64> %C, i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s512_mask: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2hf8s %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x1b,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s512_mask: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2hf8s %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x49,0x1b,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s512(<32 x half> %A, <32 x half> %B) + %2 = bitcast <8 x i64> %C to <64 x i8> + %3 = bitcast i64 %U to <64 x i1> + %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> %2 + %5 = bitcast <64 x i8> %4 to <8 x i64> + ret <8 x i64> %5 +} + + +define <8 x i64> @test_int_x86_avx10_vcvtne2ph2hf8s512_maskz(i64 %U, <32 x half> %A, <32 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s512_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2hf8s %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x1b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s512_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2hf8s %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x1b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s512(<32 x half> %A, <32 x half> %B) + %3 = bitcast i64 %U to <64 x i1> + %4 = select <64 x i1> %3, <64 x i8> %1, <64 x i8> zeroinitializer + %5 = bitcast <64 x 
i8> %4 to <8 x i64> + ret <8 x i64> %5 +} + +declare <64 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s512(<32 x half> %A, <32 x half> %B) + +define <32 x half> @test_int_x86_avx10_vcvthf82ph512(<32 x i8> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvthf82ph512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvthf82ph %ymm0, %zmm0 # encoding: [0x62,0xf5,0x7f,0x48,0x1e,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512(<32 x i8> %A, <32 x half> undef, i32 -1) + ret <32 x half> %ret +} + +define <32 x half> @test_int_x86_avx10_mask_vcvthf82ph512(<32 x i8> %A, <32 x half> %B, i32 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvthf82ph512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvthf82ph %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf5,0x7f,0x49,0x1e,0xc8] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvthf82ph512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvthf82ph %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf5,0x7f,0x49,0x1e,0xc8] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512(<32 x i8> %A, <32 x half> %B, i32 %C) + ret <32 x half> %ret +} + +declare <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512(<32 x i8> %A, <32 x half> %B, i32 %C) + +define <32 x half> @test_int_x86_avx10_maskz_vcvthf82ph512(<32 x i8> %A, i32 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvthf82ph512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvthf82ph %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x1e,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvthf82ph512: +; X86: # 
%bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvthf82ph %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x1e,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512(<32 x i8> %A, <32 x half> zeroinitializer, i32 %B) + ret <32 x half> %ret +} + +define <32 x i8> @test_int_x86_avx10_vcvtneph2bf8512(<32 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtneph2bf8512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtneph2bf8 %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x74,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8512(<32 x half> %A, <32 x i8> undef, i32 -1) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_mask_vcvtneph2bf8512(<32 x i8> %B, <32 x half> %A, i32 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2bf8 %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x74,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2bf8 %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x74,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8512(<32 x half> %A, <32 x i8> %B, i32 %C) + ret <32 x i8> %ret +} + +declare <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8512(<32 x half> %A, <32 x i8> %B, i32 %C) + +define <32 x i8> @test_int_x86_avx10_maskz_vcvtneph2bf8512(<32 x half> %A, i32 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2bf8 %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x74,0xc0] +; X64-NEXT: retq 
# encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2bf8 %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x74,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8512(<32 x half> %A, <32 x i8> zeroinitializer, i32 %B) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_vcvtneph2bf8s512(<32 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtneph2bf8s512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtneph2bf8s %zmm0, %ymm0 # encoding: [0x62,0xf5,0x7e,0x48,0x74,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s512(<32 x half> %A, <32 x i8> undef, i32 -1) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_mask_vcvtneph2bf8s512(<32 x i8> %B, <32 x half> %A, i32 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8s512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2bf8s %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x74,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8s512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2bf8s %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x74,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s512(<32 x half> %A, <32 x i8> %B, i32 %C) + ret <32 x i8> %ret +} + +declare <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s512(<32 x half> %A, <32 x i8> %B, i32 %C) + +define <32 x i8> @test_int_x86_avx10_maskz_vcvtneph2bf8s512(<32 x half> %A, i32 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8s512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; 
X64-NEXT: vcvtneph2bf8s %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x74,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8s512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2bf8s %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x74,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s512(<32 x half> %A, <32 x i8> zeroinitializer, i32 %B) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_vcvtneph2hf8512(<32 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtneph2hf8512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtneph2hf8 %zmm0, %ymm0 # encoding: [0x62,0xf5,0x7e,0x48,0x18,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8512(<32 x half> %A, <32 x i8> undef, i32 -1) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_mask_vcvtneph2hf8512(<32 x i8> %B, <32 x half> %A, i32 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2hf8 %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x18,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2hf8 %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x18,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8512(<32 x half> %A, <32 x i8> %B, i32 %C) + ret <32 x i8> %ret +} + +declare <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8512(<32 x half> %A, <32 x i8> %B, i32 %C) + +define <32 x i8> @test_int_x86_avx10_maskz_vcvtneph2hf8512(<32 x half> %A, i32 %B) nounwind { +; X64-LABEL: 
test_int_x86_avx10_maskz_vcvtneph2hf8512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2hf8 %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x18,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2hf8 %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x18,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8512(<32 x half> %A, <32 x i8> zeroinitializer, i32 %B) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_vcvtneph2hf8s512(<32 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtneph2hf8s512: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtneph2hf8s %zmm0, %ymm0 # encoding: [0x62,0xf5,0x7e,0x48,0x1b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s512(<32 x half> %A, <32 x i8> undef, i32 -1) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_mask_vcvtneph2hf8s512(<32 x i8> %B, <32 x half> %A, i32 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8s512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2hf8s %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x1b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8s512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2hf8s %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x49,0x1b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s512(<32 x half> %A, <32 x i8> %B, i32 %C) + ret <32 x i8> %ret +} + +declare <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s512(<32 x half> %A, <32 x i8> %B, i32 %C) + +define 
<32 x i8> @test_int_x86_avx10_maskz_vcvtneph2hf8s512(<32 x half> %A, i32 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8s512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2hf8s %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x1b,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8s512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2hf8s %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xc9,0x1b,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s512(<32 x half> %A, <32 x i8> zeroinitializer, i32 %B) + ret <32 x i8> %ret +} diff --git a/llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll new file mode 100644 index 00000000000000..fc74f0b490cd85 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll @@ -0,0 +1,1324 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86 + +define <8 x half> @test_int_x86_avx10_vcvt2ps2phx128(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ps2phx128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvt2ps2phx %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0x67,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.128(<4 x float> %A, <4 x float> %B, <8 x half> zeroinitializer, i8 -1) + ret <8 x half> %ret +} + +define <8 x half> @test_int_x86_avx10_vcvt2ps2phx128_mask(<8 x half> %W, i8 %U, <4 x float> %A, <4 x float> 
%B) { +; X64-LABEL: test_int_x86_avx10_vcvt2ps2phx128_mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvt2ps2phx %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x67,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvt2ps2phx128_mask: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvt2ps2phx %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x67,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <8 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.128(<4 x float> %A, <4 x float> %B, <8 x half> %W, i8 %U) + ret <8 x half> %ret +} + +define <8 x half> @test_int_x86_avx10_vcvt2ps2phx128_maskz(i8 %U, <4 x float> %A, <4 x float> %B) { +; X64-LABEL: test_int_x86_avx10_vcvt2ps2phx128_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvt2ps2phx %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x67,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvt2ps2phx128_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvt2ps2phx %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x67,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <8 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.128(<4 x float> %A, <4 x float> %B, <8 x half> zeroinitializer, i8 %U) + ret <8 x half> %ret +} + +declare <8 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.128(<4 x float>, <4 x float>, <8 x half>, i8) + +define <16 x half> @test_int_x86_avx10_vcvt2ps2phx256(<8 x float> %A, <8 x float> %B) { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ps2phx256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvt2ps2phx %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x67,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x 
float> %A, <8 x float> %B, <16 x half> zeroinitializer, i16 -1, i32 4) + ret <16 x half> %ret +} + +define <16 x half> @test_int_x86_avx10_vcvt2ps2phx256_mask(<16 x half> %W, i16 %U, <8 x float> %A, <8 x float> %B) { +; X64-LABEL: test_int_x86_avx10_vcvt2ps2phx256_mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvt2ps2phx %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x67,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvt2ps2phx256_mask: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvt2ps2phx %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x67,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float> %A, <8 x float> %B, <16 x half> %W, i16 %U, i32 4) + ret <16 x half> %ret +} + +define <16 x half> @test_int_x86_avx10_vcvt2ps2phx256_maskz(<16 x half> %W, i16 %U, <8 x float> %A, <8 x float> %B) { +; X64-LABEL: test_int_x86_avx10_vcvt2ps2phx256_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvt2ps2phx %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x67,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvt2ps2phx256_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvt2ps2phx %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x67,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float> %A, <8 x float> %B, <16 x half> zeroinitializer, i16 %U, i32 4) + ret <16 x half> %ret +} + +define <16 x half> @test_int_x86_avx10_vcvt2ps2phx256_round(<8 x float> %A, <8 x float> %B) { +; CHECK-LABEL: test_int_x86_avx10_vcvt2ps2phx256_round: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvt2ps2phx {rz-sae}, %ymm1, 
%ymm0, %ymm0 # encoding: [0x62,0xf2,0x79,0x78,0x67,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float> %A, <8 x float> %B, <16 x half> zeroinitializer, i16 -1, i32 11) + ret <16 x half> %ret +} + +define <16 x half> @test_int_x86_avx10_vcvt2ps2phx256_round_mask(<16 x half> %W, i16 %U, <8 x float> %A, <8 x float> %B) { +; X64-LABEL: test_int_x86_avx10_vcvt2ps2phx256_round_mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvt2ps2phx {rz-sae}, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x71,0x79,0x67,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvt2ps2phx256_round_mask: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvt2ps2phx {rz-sae}, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x71,0x79,0x67,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float> %A, <8 x float> %B, <16 x half> %W, i16 %U, i32 11) + ret <16 x half> %ret +} + +define <16 x half> @test_int_x86_avx10_vcvt2ps2phx256_round_maskz(i16 %U, <8 x float> %A, <8 x float> %B) { +; X64-LABEL: test_int_x86_avx10_vcvt2ps2phx256_round_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvt2ps2phx {rz-sae}, %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x79,0xf9,0x67,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvt2ps2phx256_round_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvt2ps2phx {rz-sae}, %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x79,0xf9,0x67,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float> %A, <8 x float> %B, <16 x half> zeroinitializer, i16 %U, i32 11) + ret <16 x 
half> %ret +} + +declare <16 x half> @llvm.x86.avx10.mask.vcvt2ps2phx.256(<8 x float>, <8 x float>, <16 x half>, i16, i32) + +define <16 x i8> @test_int_x86_avx10_vcvtbiasph2bf8128(<16 x i8> %A, <8 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtbiasph2bf8128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtbiasph2bf8 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7c,0x08,0x74,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8128(<16 x i8> %A, <8 x half> %B, <16 x i8> undef, i8 -1) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_mask_vcvtbiasph2bf8128(<16 x i8> %W, <16 x i8> %A, <8 x half> %B, i8 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtbiasph2bf8128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2bf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x74,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtbiasph2bf8128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2bf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x74,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8128(<16 x i8> %A, <8 x half> %B, <16 x i8> %W, i8 %U) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8128(<16 x i8> %A, <8 x half> %B, <16 x i8> %W, i8 %U) + +define <16 x i8> @test_int_x86_avx10_maskz_vcvtbiasph2bf8128(<16 x i8> %A, <8 x half> %B, i8 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2bf8128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2bf8 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7c,0x89,0x74,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2bf8128: +; X86: # %bb.0: +; X86-NEXT: kmovb 
{{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2bf8 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7c,0x89,0x74,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8128(<16 x i8> %A, <8 x half> %B, <16 x i8> zeroinitializer, i8 %U) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtbiasph2bf8256(<32 x i8> %A, <16 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtbiasph2bf8256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtbiasph2bf8 %ymm1, %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7c,0x28,0x74,0xc1] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8256(<32 x i8> %A, <16 x half> %B, <16 x i8> undef, i16 -1) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_mask_vcvtbiasph2bf8256(<16 x i8> %W, <32 x i8> %A, <16 x half> %B, i16 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtbiasph2bf8256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2bf8 %ymm2, %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x74,0xc2] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtbiasph2bf8256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2bf8 %ymm2, %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x74,0xc2] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8256(<32 x i8> %A, <16 x half> %B, <16 x i8> %W, i16 %U) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8256(<32 x i8> %A, <16 x half> %B, <16 x i8> %W, i16 %U) + +define <16 x i8> @test_int_x86_avx10_maskz_vcvtbiasph2bf8256(<32 x i8> %A, <16 
x half> %B, i16 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2bf8256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2bf8 %ymm1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7c,0xa9,0x74,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2bf8256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2bf8 %ymm1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7c,0xa9,0x74,0xc1] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8256(<32 x i8> %A, <16 x half> %B, <16 x i8> zeroinitializer, i16 %U) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtbiasph2bf8s128(<16 x i8> %A, <8 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtbiasph2bf8s128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtbiasph2bf8s %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x74,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s128(<16 x i8> %A, <8 x half> %B, <16 x i8> undef, i8 -1) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_mask_vcvtbiasph2bf8s128(<16 x i8> %W, <16 x i8> %A, <8 x half> %B, i8 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtbiasph2bf8s128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2bf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x74,0x09,0x74,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtbiasph2bf8s128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2bf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x74,0x09,0x74,0xc2] +; X86-NEXT: 
retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s128(<16 x i8> %A, <8 x half> %B, <16 x i8> %W, i8 %U) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s128(<16 x i8> %A, <8 x half> %B, <16 x i8> %W, i8 %U) + +define <16 x i8> @test_int_x86_avx10_maskz_vcvtbiasph2bf8s128(<16 x i8> %A, <8 x half> %B, i8 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2bf8s128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2bf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x89,0x74,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2bf8s128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2bf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x89,0x74,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s128(<16 x i8> %A, <8 x half> %B, <16 x i8> zeroinitializer, i8 %U) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtbiasph2bf8s256(<32 x i8> %A, <16 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtbiasph2bf8s256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtbiasph2bf8s %ymm1, %ymm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x28,0x74,0xc1] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s256(<32 x i8> %A, <16 x half> %B, <16 x i8> undef, i16 -1) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_mask_vcvtbiasph2bf8s256(<16 x i8> %W, <32 x i8> %A, <16 x half> %B, i16 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtbiasph2bf8s256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2bf8s %ymm2, %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x74,0x29,0x74,0xc2] +; 
X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtbiasph2bf8s256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2bf8s %ymm2, %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x74,0x29,0x74,0xc2] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s256(<32 x i8> %A, <16 x half> %B, <16 x i8> %W, i16 %U) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s256(<32 x i8> %A, <16 x half> %B, <16 x i8> %W, i16 %U) + +define <16 x i8> @test_int_x86_avx10_maskz_vcvtbiasph2bf8s256(<32 x i8> %A, <16 x half> %B, i16 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2bf8s256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2bf8s %ymm1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xa9,0x74,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2bf8s256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2bf8s %ymm1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xa9,0x74,0xc1] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2bf8s256(<32 x i8> %A, <16 x half> %B, <16 x i8> zeroinitializer, i16 %U) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtbiasph2hf8128(<16 x i8> %A, <8 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtbiasph2hf8128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtbiasph2hf8 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x18,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> 
@llvm.x86.avx10.mask.vcvtbiasph2hf8128(<16 x i8> %A, <8 x half> %B, <16 x i8> undef, i8 -1) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_mask_vcvtbiasph2hf8128(<16 x i8> %W, <16 x i8> %A, <8 x half> %B, i8 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtbiasph2hf8128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2hf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x74,0x09,0x18,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtbiasph2hf8128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2hf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x74,0x09,0x18,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8128(<16 x i8> %A, <8 x half> %B, <16 x i8> %W, i8 %U) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8128(<16 x i8> %A, <8 x half> %B, <16 x i8> %W, i8 %U) + +define <16 x i8> @test_int_x86_avx10_maskz_vcvtbiasph2hf8128(<16 x i8> %A, <8 x half> %B, i8 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2hf8128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2hf8 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x89,0x18,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2hf8128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2hf8 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x89,0x18,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8128(<16 x i8> %A, <8 x half> %B, <16 x i8> zeroinitializer, i8 %U) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtbiasph2hf8256(<32 x i8> %A, <16 x half> %B) nounwind { +; 
CHECK-LABEL: test_int_x86_avx10_vcvtbiasph2hf8256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtbiasph2hf8 %ymm1, %ymm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x28,0x18,0xc1] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8256(<32 x i8> %A, <16 x half> %B, <16 x i8> undef, i16 -1) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_mask_vcvtbiasph2hf8256(<16 x i8> %W, <32 x i8> %A, <16 x half> %B, i16 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtbiasph2hf8256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2hf8 %ymm2, %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x74,0x29,0x18,0xc2] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtbiasph2hf8256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2hf8 %ymm2, %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x74,0x29,0x18,0xc2] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8256(<32 x i8> %A, <16 x half> %B, <16 x i8> %W, i16 %U) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8256(<32 x i8> %A, <16 x half> %B, <16 x i8> %W, i16 %U) + +define <16 x i8> @test_int_x86_avx10_maskz_vcvtbiasph2hf8256(<32 x i8> %A, <16 x half> %B, i16 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2hf8256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2hf8 %ymm1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xa9,0x18,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2hf8256: +; X86: # %bb.0: +; X86-NEXT: kmovw 
{{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2hf8 %ymm1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xa9,0x18,0xc1] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8256(<32 x i8> %A, <16 x half> %B, <16 x i8> zeroinitializer, i16 %U) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtbiasph2hf8s128(<16 x i8> %A, <8 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtbiasph2hf8s128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtbiasph2hf8s %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x1b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s128(<16 x i8> %A, <8 x half> %B, <16 x i8> undef, i8 -1) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_mask_vcvtbiasph2hf8s128(<16 x i8> %W, <16 x i8> %A, <8 x half> %B, i8 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtbiasph2hf8s128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2hf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x74,0x09,0x1b,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtbiasph2hf8s128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2hf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x74,0x09,0x1b,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s128(<16 x i8> %A, <8 x half> %B, <16 x i8> %W, i8 %U) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s128(<16 x i8> %A, <8 x half> %B, <16 x i8> %W, i8 %U) + +define <16 x i8> @test_int_x86_avx10_maskz_vcvtbiasph2hf8s128(<16 x i8> %A, <8 x half> %B, i8 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2hf8s128: +; X64: # 
%bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2hf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x89,0x1b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2hf8s128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2hf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x89,0x1b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s128(<16 x i8> %A, <8 x half> %B, <16 x i8> zeroinitializer, i8 %U) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtbiasph2hf8s256(<32 x i8> %A, <16 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtbiasph2hf8s256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtbiasph2hf8s %ymm1, %ymm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x28,0x1b,0xc1] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s256(<32 x i8> %A, <16 x half> %B, <16 x i8> undef, i16 -1) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_mask_vcvtbiasph2hf8s256(<16 x i8> %W, <32 x i8> %A, <16 x half> %B, i16 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtbiasph2hf8s256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2hf8s %ymm2, %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x74,0x29,0x1b,0xc2] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtbiasph2hf8s256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2hf8s %ymm2, %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x74,0x29,0x1b,0xc2] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call 
<16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s256(<32 x i8> %A, <16 x half> %B, <16 x i8> %W, i16 %U) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s256(<32 x i8> %A, <16 x half> %B, <16 x i8> %W, i16 %U) + +define <16 x i8> @test_int_x86_avx10_maskz_vcvtbiasph2hf8s256(<32 x i8> %A, <16 x half> %B, i16 %U) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2hf8s256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtbiasph2hf8s %ymm1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xa9,0x1b,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtbiasph2hf8s256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtbiasph2hf8s %ymm1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xa9,0x1b,0xc1] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtbiasph2hf8s256(<32 x i8> %A, <16 x half> %B, <16 x i8> zeroinitializer, i16 %U) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtne2ph2bf8128(<8 x half> %A, <8 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2bf8128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtne2ph2bf8 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7f,0x08,0x74,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8128(<8 x half> %A, <8 x half> %B) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtne2ph2bf8128_mask(<16 x i8> %C, i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8128_mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2bf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x74,0xc2] +; X64-NEXT: retq # 
encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8128_mask: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2bf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x74,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8128(<8 x half> %A, <8 x half> %B) + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %C + ret <16 x i8> %3 +} + +define <16 x i8> @test_int_x86_avx10_vcvtne2ph2bf8128_maskz(<16 x i8> %C, i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8128_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2bf8 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0x89,0x74,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8128_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2bf8 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0x89,0x74,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8128(<8 x half> %A, <8 x half> %B) + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +declare <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8128(<8 x half> %A, <8 x half> %B) + +define <32 x i8> @test_int_x86_avx10_vcvtne2ph2bf8256(<16 x half> %A, <16 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2bf8256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtne2ph2bf8 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7f,0x28,0x74,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8256(<16 x half> %A, <16 x half> %B) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_vcvtne2ph2bf8256_mask(<32 x 
i8> %C, i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8256_mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2bf8 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x74,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8256_mask: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2bf8 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x74,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8256(<16 x half> %A, <16 x half> %B) + %2 = bitcast i32 %U to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %C + ret <32 x i8> %3 +} + +define <32 x i8> @test_int_x86_avx10_vcvtne2ph2bf8256_maskz(<32 x i8> %C, i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8256_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2bf8 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xa9,0x74,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8256_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2bf8 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xa9,0x74,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8256(<16 x half> %A, <16 x half> %B) + %2 = bitcast i32 %U to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +declare <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8256(<16 x half> %A, <16 x half> %B) + +define <16 x i8> @test_int_x86_avx10_vcvtne2ph2bf8s128(<8 x half> %A, <8 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s128: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vcvtne2ph2bf8s %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7f,0x08,0x74,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s128(<8 x half> %A, <8 x half> %B) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtne2ph2bf8s128_mask(<16 x i8> %C, i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s128_mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2bf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x74,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s128_mask: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2bf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x74,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s128(<8 x half> %A, <8 x half> %B) + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %C + ret <16 x i8> %3 +} + +define <16 x i8> @test_int_x86_avx10_vcvtne2ph2bf8s128_maskz(i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s128_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2bf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x74,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s128_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2bf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x74,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s128(<8 x half> %A, <8 x half> %B) + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x 
i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +declare <16 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s128(<8 x half> %A, <8 x half> %B) + +define <32 x i8> @test_int_x86_avx10_vcvtne2ph2bf8s256(<16 x half> %A, <16 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtne2ph2bf8s %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7f,0x28,0x74,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s256(<16 x half> %A, <16 x half> %B) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_vcvtne2ph2bf8s256_mask(<32 x i8> %C, i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s256_mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2bf8s %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x74,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s256_mask: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2bf8s %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x74,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s256(<16 x half> %A, <16 x half> %B) + %2 = bitcast i32 %U to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %C + ret <32 x i8> %3 +} + +define <32 x i8> @test_int_x86_avx10_vcvtne2ph2bf8s256_maskz(i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s256_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2bf8s %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x74,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2bf8s256_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: 
[0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2bf8s %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x74,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s256(<16 x half> %A, <16 x half> %B) + %2 = bitcast i32 %U to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +declare <32 x i8> @llvm.x86.avx10.vcvtne2ph2bf8s256(<16 x half> %A, <16 x half> %B) + +define <16 x i8> @test_int_x86_avx10_vcvtne2ph2hf8128(<8 x half> %A, <8 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2hf8128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtne2ph2hf8 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7f,0x08,0x18,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8128(<8 x half> %A, <8 x half> %B) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtne2ph2hf8128_mask(<16 x i8> %C, i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8128_mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2hf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x18,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8128_mask: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2hf8 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x18,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8128(<8 x half> %A, <8 x half> %B) + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %C + ret <16 x i8> %3 +} + +define <16 x i8> @test_int_x86_avx10_vcvtne2ph2hf8128_maskz(i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8128_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 
# encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2hf8 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x18,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8128_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2hf8 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x18,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8128(<8 x half> %A, <8 x half> %B) + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +declare <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8128(<8 x half> %A, <8 x half> %B) + +define <32 x i8> @test_int_x86_avx10_vcvtne2ph2hf8256(<16 x half> %A, <16 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2hf8256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtne2ph2hf8 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7f,0x28,0x18,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8256(<16 x half> %A, <16 x half> %B) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_vcvtne2ph2hf8256_mask(<32 x i8> %C, i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8256_mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2hf8 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x18,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8256_mask: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2hf8 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x18,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8256(<16 x half> %A, <16 x half> %B) + %2 = bitcast i32 %U to 
<32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %C + ret <32 x i8> %3 +} + +define <32 x i8> @test_int_x86_avx10_vcvtne2ph2hf8256_maskz(i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8256_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2hf8 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x18,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8256_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2hf8 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x18,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8256(<16 x half> %A, <16 x half> %B) + %2 = bitcast i32 %U to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +declare <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8256(<16 x half> %A, <16 x half> %B) + +define <16 x i8> @test_int_x86_avx10_vcvtne2ph2hf8s128(<8 x half> %A, <8 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtne2ph2hf8s %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7f,0x08,0x1b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s128(<8 x half> %A, <8 x half> %B) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtne2ph2hf8s128_mask(<16 x i8> %C, i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s128_mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2hf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x1b,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s128_mask: +; X86: # %bb.0: +; X86-NEXT: kmovw 
{{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2hf8s %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x77,0x09,0x1b,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s128(<8 x half> %A, <8 x half> %B) + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %C + ret <16 x i8> %3 +} + +define <16 x i8> @test_int_x86_avx10_vcvtne2ph2hf8s128_maskz(i16 %U, <8 x half> %A, <8 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s128_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2hf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x1b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s128_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2hf8s %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x1b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s128(<8 x half> %A, <8 x half> %B) + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +declare <16 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s128(<8 x half> %A, <8 x half> %B) + +define <32 x i8> @test_int_x86_avx10_vcvtne2ph2hf8s256(<16 x half> %A, <16 x half> %B) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtne2ph2hf8s %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7f,0x28,0x1b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s256(<16 x half> %A, <16 x half> %B) + ret <32 x i8> %ret +} + +define <32 x i8> @test_int_x86_avx10_vcvtne2ph2hf8s256_mask(<32 x i8> %C, i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: 
test_int_x86_avx10_vcvtne2ph2hf8s256_mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2hf8s %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x1b,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s256_mask: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2hf8s %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x77,0x29,0x1b,0xc2] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s256(<16 x half> %A, <16 x half> %B) + %2 = bitcast i32 %U to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %C + ret <32 x i8> %3 +} + +define <32 x i8> @test_int_x86_avx10_vcvtne2ph2hf8s256_maskz(i32 %U, <16 x half> %A, <16 x half> %B) nounwind { +; X64-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s256_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtne2ph2hf8s %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x1b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_vcvtne2ph2hf8s256_maskz: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtne2ph2hf8s %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x1b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s256(<16 x half> %A, <16 x half> %B) + %2 = bitcast i32 %U to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +declare <32 x i8> @llvm.x86.avx10.vcvtne2ph2hf8s256(<16 x half> %A, <16 x half> %B) + +define <8 x half> @test_int_x86_avx10_vcvthf82ph128(<16 x i8> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvthf82ph128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvthf82ph %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7f,0x08,0x1e,0xc0] +; 
CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128(<16 x i8> %A, <8 x half> undef, i8 -1) + ret <8 x half> %ret +} + +define <8 x half> @test_int_x86_avx10_mask_vcvthf82ph128(<16 x i8> %A, <8 x half> %B, i8 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvthf82ph128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvthf82ph %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0x7f,0x09,0x1e,0xc8] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvthf82ph128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvthf82ph %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0x7f,0x09,0x1e,0xc8] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128(<16 x i8> %A, <8 x half> %B, i8 %C) + ret <8 x half> %ret +} + +declare <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128(<16 x i8> %A, <8 x half> %B, i8 %C) + +define <8 x half> @test_int_x86_avx10_maskz_vcvthf82ph128(<16 x i8> %A, i8 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvthf82ph128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvthf82ph %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x1e,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvthf82ph128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvthf82ph %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x1e,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128(<16 x i8> %A, <8 x half> zeroinitializer, i8 %B) + ret <8 x half> %ret +} + +define 
<16 x half> @test_int_x86_avx10_vcvthf82ph256(<16 x i8> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvthf82ph256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvthf82ph %xmm0, %ymm0 # encoding: [0x62,0xf5,0x7f,0x28,0x1e,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256(<16 x i8> %A, <16 x half> undef, i16 -1) + ret <16 x half> %ret +} + +define <16 x half> @test_int_x86_avx10_mask_vcvthf82ph256(<16 x i8> %A, <16 x half> %B, i16 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvthf82ph256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvthf82ph %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf5,0x7f,0x29,0x1e,0xc8] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvthf82ph256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvthf82ph %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf5,0x7f,0x29,0x1e,0xc8] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256(<16 x i8> %A, <16 x half> %B, i16 %C) + ret <16 x half> %ret +} + +declare <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256(<16 x i8> %A, <16 x half> %B, i16 %C) + +define <16 x half> @test_int_x86_avx10_maskz_vcvthf82ph256(<16 x i8> %A, i16 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvthf82ph256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvthf82ph %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x1e,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvthf82ph256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvthf82ph %xmm0, 
%ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x1e,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256(<16 x i8> %A, <16 x half> zeroinitializer, i16 %B) + ret <16 x half> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtneph2bf8128(<8 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtneph2bf8128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtneph2bf8 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x74,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8128(<8 x half> %A, <16 x i8> undef, i8 -1) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_mask_vcvtneph2bf8128(<16 x i8> %B, <8 x half> %A, i8 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2bf8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x74,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2bf8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x74,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8128(<8 x half> %A, <16 x i8> %B, i8 %C) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8128(<8 x half> %A, <16 x i8> %B, i8 %C) + +define <16 x i8> @test_int_x86_avx10_maskz_vcvtneph2bf8128(<8 x half> %A, i8 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2bf8 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x74,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # 
encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2bf8 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x74,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8128(<8 x half> %A, <16 x i8> zeroinitializer, i8 %B) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtneph2bf8256(<16 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtneph2bf8256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtneph2bf8 %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x74,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8256(<16 x half> %A, <16 x i8> undef, i16 -1) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_mask_vcvtneph2bf8256(<16 x i8> %B, <16 x half> %A, i16 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2bf8 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x74,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2bf8 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x74,0xc1] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8256(<16 x half> %A, <16 x i8> %B, i16 %C) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8256(<16 x half> %A, <16 x i8> %B, i16 %C) + +define <16 x i8> @test_int_x86_avx10_maskz_vcvtneph2bf8256(<16 x half> %A, i16 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; 
X64-NEXT: vcvtneph2bf8 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x74,0xc0] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2bf8 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x74,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8256(<16 x half> %A, <16 x i8> zeroinitializer, i16 %B) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtneph2bf8s128(<8 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtneph2bf8s128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtneph2bf8s %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x74,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s128(<8 x half> %A, <16 x i8> undef, i8 -1) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_mask_vcvtneph2bf8s128(<16 x i8> %B, <8 x half> %A, i8 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8s128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2bf8s %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x74,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8s128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2bf8s %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x74,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s128(<8 x half> %A, <16 x i8> %B, i8 %C) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s128(<8 x half> %A, <16 x i8> %B, i8 %C) + +define <16 x i8> 
@test_int_x86_avx10_maskz_vcvtneph2bf8s128(<8 x half> %A, i8 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8s128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2bf8s %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x74,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8s128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2bf8s %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x74,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s128(<8 x half> %A, <16 x i8> zeroinitializer, i8 %B) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtneph2bf8s256(<16 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtneph2bf8s256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtneph2bf8s %ymm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x28,0x74,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s256(<16 x half> %A, <16 x i8> undef, i16 -1) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_mask_vcvtneph2bf8s256(<16 x i8> %B, <16 x half> %A, i16 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8s256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2bf8s %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x74,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2bf8s256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2bf8s %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x74,0xc1] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # 
encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s256(<16 x half> %A, <16 x i8> %B, i16 %C) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s256(<16 x half> %A, <16 x i8> %B, i16 %C) + +define <16 x i8> @test_int_x86_avx10_maskz_vcvtneph2bf8s256(<16 x half> %A, i16 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8s256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2bf8s %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x74,0xc0] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2bf8s256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2bf8s %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x74,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2bf8s256(<16 x half> %A, <16 x i8> zeroinitializer, i16 %B) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtneph2hf8128(<8 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtneph2hf8128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtneph2hf8 %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x18,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8128(<8 x half> %A, <16 x i8> undef, i8 -1) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_mask_vcvtneph2hf8128(<16 x i8> %B, <8 x half> %A, i8 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2hf8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x18,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8128: +; X86: # %bb.0: +; 
X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2hf8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x18,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8128(<8 x half> %A, <16 x i8> %B, i8 %C) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8128(<8 x half> %A, <16 x i8> %B, i8 %C) + +define <16 x i8> @test_int_x86_avx10_maskz_vcvtneph2hf8128(<8 x half> %A, i8 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2hf8 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x18,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2hf8 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x18,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8128(<8 x half> %A, <16 x i8> zeroinitializer, i8 %B) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtneph2hf8256(<16 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtneph2hf8256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtneph2hf8 %ymm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x28,0x18,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8256(<16 x half> %A, <16 x i8> undef, i16 -1) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_mask_vcvtneph2hf8256(<16 x i8> %B, <16 x half> %A, i16 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2hf8 %ymm1, %xmm0 {%k1} # encoding: 
[0x62,0xf5,0x7e,0x29,0x18,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2hf8 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x18,0xc1] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8256(<16 x half> %A, <16 x i8> %B, i16 %C) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8256(<16 x half> %A, <16 x i8> %B, i16 %C) + +define <16 x i8> @test_int_x86_avx10_maskz_vcvtneph2hf8256(<16 x half> %A, i16 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2hf8 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x18,0xc0] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2hf8 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x18,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8256(<16 x half> %A, <16 x i8> zeroinitializer, i16 %B) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtneph2hf8s128(<8 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtneph2hf8s128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtneph2hf8s %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x1b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s128(<8 x half> %A, <16 x i8> undef, i8 -1) + ret <16 x i8> %ret +} + +define 
<16 x i8> @test_int_x86_avx10_mask_vcvtneph2hf8s128(<16 x i8> %B, <8 x half> %A, i8 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8s128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2hf8s %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x1b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8s128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2hf8s %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x1b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s128(<8 x half> %A, <16 x i8> %B, i8 %C) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s128(<8 x half> %A, <16 x i8> %B, i8 %C) + +define <16 x i8> @test_int_x86_avx10_maskz_vcvtneph2hf8s128(<8 x half> %A, i8 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8s128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2hf8s %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x1b,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8s128: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2hf8s %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0x89,0x1b,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s128(<8 x half> %A, <16 x i8> zeroinitializer, i8 %B) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_vcvtneph2hf8s256(<16 x half> %A) nounwind { +; CHECK-LABEL: test_int_x86_avx10_vcvtneph2hf8s256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtneph2hf8s %ymm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x28,0x1b,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: 
[0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s256(<16 x half> %A, <16 x i8> undef, i16 -1) + ret <16 x i8> %ret +} + +define <16 x i8> @test_int_x86_avx10_mask_vcvtneph2hf8s256(<16 x i8> %B, <16 x half> %A, i16 %C) nounwind { +; X64-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8s256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2hf8s %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x1b,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_mask_vcvtneph2hf8s256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2hf8s %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x29,0x1b,0xc1] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s256(<16 x half> %A, <16 x i8> %B, i16 %C) + ret <16 x i8> %ret +} + +declare <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s256(<16 x half> %A, <16 x i8> %B, i16 %C) + +define <16 x i8> @test_int_x86_avx10_maskz_vcvtneph2hf8s256(<16 x half> %A, i16 %B) nounwind { +; X64-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8s256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtneph2hf8s %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x1b,0xc0] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_int_x86_avx10_maskz_vcvtneph2hf8s256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtneph2hf8s %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7e,0xa9,0x1b,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] + %ret = call <16 x i8> @llvm.x86.avx10.mask.vcvtneph2hf8s256(<16 x half> %A, <16 x i8> zeroinitializer, i16 
%B) + ret <16 x i8> %ret +} diff --git a/llvm/test/MC/Disassembler/X86/avx10.2convert-32.txt b/llvm/test/MC/Disassembler/X86/avx10.2convert-32.txt new file mode 100644 index 00000000000000..71506201cffe83 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx10.2convert-32.txt @@ -0,0 +1,1491 @@ +# RUN: llvm-mc --disassemble %s -triple=i386 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=i386 --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vcvt2ps2phx %ymm4, %ymm3, %ymm2 +# INTEL: vcvt2ps2phx ymm2, ymm3, ymm4 +0x62,0xf2,0x65,0x28,0x67,0xd4 + +# ATT: vcvt2ps2phx {rn-sae}, %ymm4, %ymm3, %ymm2 +# INTEL: vcvt2ps2phx ymm2, ymm3, ymm4, {rn-sae} +0x62,0xf2,0x61,0x18,0x67,0xd4 + +# ATT: vcvt2ps2phx %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vcvt2ps2phx ymm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x65,0x2f,0x67,0xd4 + +# ATT: vcvt2ps2phx {rz-sae}, %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvt2ps2phx ymm2 {k7} {z}, ymm3, ymm4, {rz-sae} +0x62,0xf2,0x61,0xff,0x67,0xd4 + +# ATT: vcvt2ps2phx %zmm4, %zmm3, %zmm2 +# INTEL: vcvt2ps2phx zmm2, zmm3, zmm4 +0x62,0xf2,0x65,0x48,0x67,0xd4 + +# ATT: vcvt2ps2phx {rn-sae}, %zmm4, %zmm3, %zmm2 +# INTEL: vcvt2ps2phx zmm2, zmm3, zmm4, {rn-sae} +0x62,0xf2,0x65,0x18,0x67,0xd4 + +# ATT: vcvt2ps2phx %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vcvt2ps2phx zmm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x65,0x4f,0x67,0xd4 + +# ATT: vcvt2ps2phx {rz-sae}, %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvt2ps2phx zmm2 {k7} {z}, zmm3, zmm4, {rz-sae} +0x62,0xf2,0x65,0xff,0x67,0xd4 + +# ATT: vcvt2ps2phx %xmm4, %xmm3, %xmm2 +# INTEL: vcvt2ps2phx xmm2, xmm3, xmm4 +0x62,0xf2,0x65,0x08,0x67,0xd4 + +# ATT: vcvt2ps2phx %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vcvt2ps2phx xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x65,0x0f,0x67,0xd4 + +# ATT: vcvt2ps2phx %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvt2ps2phx xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf2,0x65,0x8f,0x67,0xd4 + +# ATT: vcvt2ps2phx 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vcvt2ps2phx zmm2, zmm3, zmmword ptr [esp + 
8*esi + 268435456] +0x62,0xf2,0x65,0x48,0x67,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvt2ps2phx 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vcvt2ps2phx zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x65,0x4f,0x67,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvt2ps2phx (%eax){1to16}, %zmm3, %zmm2 +# INTEL: vcvt2ps2phx zmm2, zmm3, dword ptr [eax]{1to16} +0x62,0xf2,0x65,0x58,0x67,0x10 + +# ATT: vcvt2ps2phx -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vcvt2ps2phx zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x65,0x48,0x67,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvt2ps2phx 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvt2ps2phx zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x65,0xcf,0x67,0x51,0x7f + +# ATT: vcvt2ps2phx -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvt2ps2phx zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +0x62,0xf2,0x65,0xdf,0x67,0x52,0x80 + +# ATT: vcvt2ps2phx 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vcvt2ps2phx ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x65,0x28,0x67,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvt2ps2phx 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vcvt2ps2phx ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x65,0x2f,0x67,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvt2ps2phx (%eax){1to8}, %ymm3, %ymm2 +# INTEL: vcvt2ps2phx ymm2, ymm3, dword ptr [eax]{1to8} +0x62,0xf2,0x65,0x38,0x67,0x10 + +# ATT: vcvt2ps2phx -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vcvt2ps2phx ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf2,0x65,0x28,0x67,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvt2ps2phx 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvt2ps2phx ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x65,0xaf,0x67,0x51,0x7f + +# ATT: vcvt2ps2phx -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvt2ps2phx ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +0x62,0xf2,0x65,0xbf,0x67,0x52,0x80 + +# ATT: vcvt2ps2phx 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vcvt2ps2phx xmm2, 
xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x65,0x08,0x67,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvt2ps2phx 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vcvt2ps2phx xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x65,0x0f,0x67,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvt2ps2phx (%eax){1to4}, %xmm3, %xmm2 +# INTEL: vcvt2ps2phx xmm2, xmm3, dword ptr [eax]{1to4} +0x62,0xf2,0x65,0x18,0x67,0x10 + +# ATT: vcvt2ps2phx -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vcvt2ps2phx xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf2,0x65,0x08,0x67,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvt2ps2phx 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvt2ps2phx xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x65,0x8f,0x67,0x51,0x7f + +# ATT: vcvt2ps2phx -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvt2ps2phx xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +0x62,0xf2,0x65,0x9f,0x67,0x52,0x80 + +# ATT: vcvtbiasph2bf8 %zmm4, %zmm3, %ymm2 +# INTEL: vcvtbiasph2bf8 ymm2, zmm3, zmm4 +0x62,0xf2,0x64,0x48,0x74,0xd4 + +# ATT: vcvtbiasph2bf8 %zmm4, %zmm3, %ymm2 {%k7} +# INTEL: vcvtbiasph2bf8 ymm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x64,0x4f,0x74,0xd4 + +# ATT: vcvtbiasph2bf8 %zmm4, %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8 ymm2 {k7} {z}, zmm3, zmm4 +0x62,0xf2,0x64,0xcf,0x74,0xd4 + +# ATT: vcvtbiasph2bf8 %xmm4, %xmm3, %xmm2 +# INTEL: vcvtbiasph2bf8 xmm2, xmm3, xmm4 +0x62,0xf2,0x64,0x08,0x74,0xd4 + +# ATT: vcvtbiasph2bf8 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2bf8 xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x64,0x0f,0x74,0xd4 + +# ATT: vcvtbiasph2bf8 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf2,0x64,0x8f,0x74,0xd4 + +# ATT: vcvtbiasph2bf8 %ymm4, %ymm3, %xmm2 +# INTEL: vcvtbiasph2bf8 xmm2, ymm3, ymm4 +0x62,0xf2,0x64,0x28,0x74,0xd4 + +# ATT: vcvtbiasph2bf8 %ymm4, %ymm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2bf8 xmm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x64,0x2f,0x74,0xd4 + +# ATT: vcvtbiasph2bf8 %ymm4, %ymm3, %xmm2 {%k7} {z} +# INTEL: 
vcvtbiasph2bf8 xmm2 {k7} {z}, ymm3, ymm4 +0x62,0xf2,0x64,0xaf,0x74,0xd4 + +# ATT: vcvtbiasph2bf8 268435456(%esp,%esi,8), %ymm3, %xmm2 +# INTEL: vcvtbiasph2bf8 xmm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x64,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2bf8 291(%edi,%eax,4), %ymm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2bf8 xmm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x64,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2bf8 (%eax){1to16}, %ymm3, %xmm2 +# INTEL: vcvtbiasph2bf8 xmm2, ymm3, word ptr [eax]{1to16} +0x62,0xf2,0x64,0x38,0x74,0x10 + +# ATT: vcvtbiasph2bf8 -1024(,%ebp,2), %ymm3, %xmm2 +# INTEL: vcvtbiasph2bf8 xmm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf2,0x64,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtbiasph2bf8 4064(%ecx), %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8 xmm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x64,0xaf,0x74,0x51,0x7f + +# ATT: vcvtbiasph2bf8 -256(%edx){1to16}, %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8 xmm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf2,0x64,0xbf,0x74,0x52,0x80 + +# ATT: vcvtbiasph2bf8 268435456(%esp,%esi,8), %zmm3, %ymm2 +# INTEL: vcvtbiasph2bf8 ymm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x64,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2bf8 291(%edi,%eax,4), %zmm3, %ymm2 {%k7} +# INTEL: vcvtbiasph2bf8 ymm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x64,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2bf8 (%eax){1to32}, %zmm3, %ymm2 +# INTEL: vcvtbiasph2bf8 ymm2, zmm3, word ptr [eax]{1to32} +0x62,0xf2,0x64,0x58,0x74,0x10 + +# ATT: vcvtbiasph2bf8 -2048(,%ebp,2), %zmm3, %ymm2 +# INTEL: vcvtbiasph2bf8 ymm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x64,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtbiasph2bf8 8128(%ecx), %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8 ymm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x64,0xcf,0x74,0x51,0x7f + +# ATT: 
vcvtbiasph2bf8 -256(%edx){1to32}, %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8 ymm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf2,0x64,0xdf,0x74,0x52,0x80 + +# ATT: vcvtbiasph2bf8 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vcvtbiasph2bf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x64,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2bf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2bf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x64,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2bf8 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vcvtbiasph2bf8 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf2,0x64,0x18,0x74,0x10 + +# ATT: vcvtbiasph2bf8 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vcvtbiasph2bf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf2,0x64,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtbiasph2bf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x64,0x8f,0x74,0x51,0x7f + +# ATT: vcvtbiasph2bf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf2,0x64,0x9f,0x74,0x52,0x80 + +# ATT: vcvtbiasph2bf8s %zmm4, %zmm3, %ymm2 +# INTEL: vcvtbiasph2bf8s ymm2, zmm3, zmm4 +0x62,0xf5,0x64,0x48,0x74,0xd4 + +# ATT: vcvtbiasph2bf8s %zmm4, %zmm3, %ymm2 {%k7} +# INTEL: vcvtbiasph2bf8s ymm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x64,0x4f,0x74,0xd4 + +# ATT: vcvtbiasph2bf8s %zmm4, %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8s ymm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x64,0xcf,0x74,0xd4 + +# ATT: vcvtbiasph2bf8s %xmm4, %xmm3, %xmm2 +# INTEL: vcvtbiasph2bf8s xmm2, xmm3, xmm4 +0x62,0xf5,0x64,0x08,0x74,0xd4 + +# ATT: vcvtbiasph2bf8s %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2bf8s xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x64,0x0f,0x74,0xd4 + +# ATT: vcvtbiasph2bf8s %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8s xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x64,0x8f,0x74,0xd4 + +# ATT: 
vcvtbiasph2bf8s %ymm4, %ymm3, %xmm2 +# INTEL: vcvtbiasph2bf8s xmm2, ymm3, ymm4 +0x62,0xf5,0x64,0x28,0x74,0xd4 + +# ATT: vcvtbiasph2bf8s %ymm4, %ymm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2bf8s xmm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x64,0x2f,0x74,0xd4 + +# ATT: vcvtbiasph2bf8s %ymm4, %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8s xmm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x64,0xaf,0x74,0xd4 + +# ATT: vcvtbiasph2bf8s 268435456(%esp,%esi,8), %ymm3, %xmm2 +# INTEL: vcvtbiasph2bf8s xmm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x64,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2bf8s 291(%edi,%eax,4), %ymm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2bf8s xmm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x64,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2bf8s (%eax){1to16}, %ymm3, %xmm2 +# INTEL: vcvtbiasph2bf8s xmm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x64,0x38,0x74,0x10 + +# ATT: vcvtbiasph2bf8s -1024(,%ebp,2), %ymm3, %xmm2 +# INTEL: vcvtbiasph2bf8s xmm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x64,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtbiasph2bf8s 4064(%ecx), %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8s xmm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x64,0xaf,0x74,0x51,0x7f + +# ATT: vcvtbiasph2bf8s -256(%edx){1to16}, %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8s xmm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x64,0xbf,0x74,0x52,0x80 + +# ATT: vcvtbiasph2bf8s 268435456(%esp,%esi,8), %zmm3, %ymm2 +# INTEL: vcvtbiasph2bf8s ymm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x64,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2bf8s 291(%edi,%eax,4), %zmm3, %ymm2 {%k7} +# INTEL: vcvtbiasph2bf8s ymm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x64,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2bf8s (%eax){1to32}, %zmm3, %ymm2 +# INTEL: vcvtbiasph2bf8s ymm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x64,0x58,0x74,0x10 + +# ATT: vcvtbiasph2bf8s 
-2048(,%ebp,2), %zmm3, %ymm2 +# INTEL: vcvtbiasph2bf8s ymm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x64,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtbiasph2bf8s 8128(%ecx), %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8s ymm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x64,0xcf,0x74,0x51,0x7f + +# ATT: vcvtbiasph2bf8s -256(%edx){1to32}, %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8s ymm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x64,0xdf,0x74,0x52,0x80 + +# ATT: vcvtbiasph2bf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vcvtbiasph2bf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x64,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2bf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2bf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x64,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2bf8s (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vcvtbiasph2bf8s xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x64,0x18,0x74,0x10 + +# ATT: vcvtbiasph2bf8s -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vcvtbiasph2bf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x64,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtbiasph2bf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x64,0x8f,0x74,0x51,0x7f + +# ATT: vcvtbiasph2bf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2bf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x64,0x9f,0x74,0x52,0x80 + +# ATT: vcvtbiasph2hf8 %zmm4, %zmm3, %ymm2 +# INTEL: vcvtbiasph2hf8 ymm2, zmm3, zmm4 +0x62,0xf5,0x64,0x48,0x18,0xd4 + +# ATT: vcvtbiasph2hf8 %zmm4, %zmm3, %ymm2 {%k7} +# INTEL: vcvtbiasph2hf8 ymm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x64,0x4f,0x18,0xd4 + +# ATT: vcvtbiasph2hf8 %zmm4, %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8 ymm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x64,0xcf,0x18,0xd4 + +# ATT: vcvtbiasph2hf8 %xmm4, %xmm3, %xmm2 +# INTEL: 
vcvtbiasph2hf8 xmm2, xmm3, xmm4 +0x62,0xf5,0x64,0x08,0x18,0xd4 + +# ATT: vcvtbiasph2hf8 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2hf8 xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x64,0x0f,0x18,0xd4 + +# ATT: vcvtbiasph2hf8 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x64,0x8f,0x18,0xd4 + +# ATT: vcvtbiasph2hf8 %ymm4, %ymm3, %xmm2 +# INTEL: vcvtbiasph2hf8 xmm2, ymm3, ymm4 +0x62,0xf5,0x64,0x28,0x18,0xd4 + +# ATT: vcvtbiasph2hf8 %ymm4, %ymm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2hf8 xmm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x64,0x2f,0x18,0xd4 + +# ATT: vcvtbiasph2hf8 %ymm4, %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8 xmm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x64,0xaf,0x18,0xd4 + +# ATT: vcvtbiasph2hf8 268435456(%esp,%esi,8), %ymm3, %xmm2 +# INTEL: vcvtbiasph2hf8 xmm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x64,0x28,0x18,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2hf8 291(%edi,%eax,4), %ymm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2hf8 xmm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x64,0x2f,0x18,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2hf8 (%eax){1to16}, %ymm3, %xmm2 +# INTEL: vcvtbiasph2hf8 xmm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x64,0x38,0x18,0x10 + +# ATT: vcvtbiasph2hf8 -1024(,%ebp,2), %ymm3, %xmm2 +# INTEL: vcvtbiasph2hf8 xmm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x64,0x28,0x18,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtbiasph2hf8 4064(%ecx), %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8 xmm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x64,0xaf,0x18,0x51,0x7f + +# ATT: vcvtbiasph2hf8 -256(%edx){1to16}, %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8 xmm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x64,0xbf,0x18,0x52,0x80 + +# ATT: vcvtbiasph2hf8 268435456(%esp,%esi,8), %zmm3, %ymm2 +# INTEL: vcvtbiasph2hf8 ymm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x64,0x48,0x18,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2hf8 291(%edi,%eax,4), 
%zmm3, %ymm2 {%k7} +# INTEL: vcvtbiasph2hf8 ymm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x64,0x4f,0x18,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2hf8 (%eax){1to32}, %zmm3, %ymm2 +# INTEL: vcvtbiasph2hf8 ymm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x64,0x58,0x18,0x10 + +# ATT: vcvtbiasph2hf8 -2048(,%ebp,2), %zmm3, %ymm2 +# INTEL: vcvtbiasph2hf8 ymm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x64,0x48,0x18,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtbiasph2hf8 8128(%ecx), %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8 ymm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x64,0xcf,0x18,0x51,0x7f + +# ATT: vcvtbiasph2hf8 -256(%edx){1to32}, %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8 ymm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x64,0xdf,0x18,0x52,0x80 + +# ATT: vcvtbiasph2hf8 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vcvtbiasph2hf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x64,0x08,0x18,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2hf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2hf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x64,0x0f,0x18,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2hf8 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vcvtbiasph2hf8 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x64,0x18,0x18,0x10 + +# ATT: vcvtbiasph2hf8 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vcvtbiasph2hf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x64,0x08,0x18,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtbiasph2hf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x64,0x8f,0x18,0x51,0x7f + +# ATT: vcvtbiasph2hf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x64,0x9f,0x18,0x52,0x80 + +# ATT: vcvtbiasph2hf8s %zmm4, %zmm3, %ymm2 +# INTEL: vcvtbiasph2hf8s ymm2, zmm3, zmm4 +0x62,0xf5,0x64,0x48,0x1b,0xd4 + +# ATT: vcvtbiasph2hf8s 
%zmm4, %zmm3, %ymm2 {%k7} +# INTEL: vcvtbiasph2hf8s ymm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x64,0x4f,0x1b,0xd4 + +# ATT: vcvtbiasph2hf8s %zmm4, %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8s ymm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x64,0xcf,0x1b,0xd4 + +# ATT: vcvtbiasph2hf8s %xmm4, %xmm3, %xmm2 +# INTEL: vcvtbiasph2hf8s xmm2, xmm3, xmm4 +0x62,0xf5,0x64,0x08,0x1b,0xd4 + +# ATT: vcvtbiasph2hf8s %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2hf8s xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x64,0x0f,0x1b,0xd4 + +# ATT: vcvtbiasph2hf8s %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8s xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x64,0x8f,0x1b,0xd4 + +# ATT: vcvtbiasph2hf8s %ymm4, %ymm3, %xmm2 +# INTEL: vcvtbiasph2hf8s xmm2, ymm3, ymm4 +0x62,0xf5,0x64,0x28,0x1b,0xd4 + +# ATT: vcvtbiasph2hf8s %ymm4, %ymm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2hf8s xmm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x64,0x2f,0x1b,0xd4 + +# ATT: vcvtbiasph2hf8s %ymm4, %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8s xmm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x64,0xaf,0x1b,0xd4 + +# ATT: vcvtbiasph2hf8s 268435456(%esp,%esi,8), %ymm3, %xmm2 +# INTEL: vcvtbiasph2hf8s xmm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x64,0x28,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2hf8s 291(%edi,%eax,4), %ymm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2hf8s xmm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x64,0x2f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2hf8s (%eax){1to16}, %ymm3, %xmm2 +# INTEL: vcvtbiasph2hf8s xmm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x64,0x38,0x1b,0x10 + +# ATT: vcvtbiasph2hf8s -1024(,%ebp,2), %ymm3, %xmm2 +# INTEL: vcvtbiasph2hf8s xmm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x64,0x28,0x1b,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtbiasph2hf8s 4064(%ecx), %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8s xmm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x64,0xaf,0x1b,0x51,0x7f + +# ATT: vcvtbiasph2hf8s -256(%edx){1to16}, %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8s 
xmm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x64,0xbf,0x1b,0x52,0x80 + +# ATT: vcvtbiasph2hf8s 268435456(%esp,%esi,8), %zmm3, %ymm2 +# INTEL: vcvtbiasph2hf8s ymm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x64,0x48,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2hf8s 291(%edi,%eax,4), %zmm3, %ymm2 {%k7} +# INTEL: vcvtbiasph2hf8s ymm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x64,0x4f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2hf8s (%eax){1to32}, %zmm3, %ymm2 +# INTEL: vcvtbiasph2hf8s ymm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x64,0x58,0x1b,0x10 + +# ATT: vcvtbiasph2hf8s -2048(,%ebp,2), %zmm3, %ymm2 +# INTEL: vcvtbiasph2hf8s ymm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x64,0x48,0x1b,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtbiasph2hf8s 8128(%ecx), %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8s ymm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x64,0xcf,0x1b,0x51,0x7f + +# ATT: vcvtbiasph2hf8s -256(%edx){1to32}, %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8s ymm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x64,0xdf,0x1b,0x52,0x80 + +# ATT: vcvtbiasph2hf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vcvtbiasph2hf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x64,0x08,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2hf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vcvtbiasph2hf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x64,0x0f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2hf8s (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vcvtbiasph2hf8s xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x64,0x18,0x1b,0x10 + +# ATT: vcvtbiasph2hf8s -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vcvtbiasph2hf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x64,0x08,0x1b,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtbiasph2hf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] 
+0x62,0xf5,0x64,0x8f,0x1b,0x51,0x7f + +# ATT: vcvtbiasph2hf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtbiasph2hf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x64,0x9f,0x1b,0x52,0x80 + +# ATT: vcvthf82ph %xmm3, %xmm2 +# INTEL: vcvthf82ph xmm2, xmm3 +0x62,0xf5,0x7f,0x08,0x1e,0xd3 + +# ATT: vcvthf82ph %xmm3, %xmm2 {%k7} +# INTEL: vcvthf82ph xmm2 {k7}, xmm3 +0x62,0xf5,0x7f,0x0f,0x1e,0xd3 + +# ATT: vcvthf82ph %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvthf82ph xmm2 {k7} {z}, xmm3 +0x62,0xf5,0x7f,0x8f,0x1e,0xd3 + +# ATT: vcvthf82ph %xmm3, %ymm2 +# INTEL: vcvthf82ph ymm2, xmm3 +0x62,0xf5,0x7f,0x28,0x1e,0xd3 + +# ATT: vcvthf82ph %xmm3, %ymm2 {%k7} +# INTEL: vcvthf82ph ymm2 {k7}, xmm3 +0x62,0xf5,0x7f,0x2f,0x1e,0xd3 + +# ATT: vcvthf82ph %xmm3, %ymm2 {%k7} {z} +# INTEL: vcvthf82ph ymm2 {k7} {z}, xmm3 +0x62,0xf5,0x7f,0xaf,0x1e,0xd3 + +# ATT: vcvthf82ph %ymm3, %zmm2 +# INTEL: vcvthf82ph zmm2, ymm3 +0x62,0xf5,0x7f,0x48,0x1e,0xd3 + +# ATT: vcvthf82ph %ymm3, %zmm2 {%k7} +# INTEL: vcvthf82ph zmm2 {k7}, ymm3 +0x62,0xf5,0x7f,0x4f,0x1e,0xd3 + +# ATT: vcvthf82ph %ymm3, %zmm2 {%k7} {z} +# INTEL: vcvthf82ph zmm2 {k7} {z}, ymm3 +0x62,0xf5,0x7f,0xcf,0x1e,0xd3 + +# ATT: vcvthf82ph 268435456(%esp,%esi,8), %xmm2 +# INTEL: vcvthf82ph xmm2, qword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7f,0x08,0x1e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvthf82ph 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vcvthf82ph xmm2 {k7}, qword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7f,0x0f,0x1e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvthf82ph (%eax), %xmm2 +# INTEL: vcvthf82ph xmm2, qword ptr [eax] +0x62,0xf5,0x7f,0x08,0x1e,0x10 + +# ATT: vcvthf82ph -256(,%ebp,2), %xmm2 +# INTEL: vcvthf82ph xmm2, qword ptr [2*ebp - 256] +0x62,0xf5,0x7f,0x08,0x1e,0x14,0x6d,0x00,0xff,0xff,0xff + +# ATT: vcvthf82ph 1016(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvthf82ph xmm2 {k7} {z}, qword ptr [ecx + 1016] +0x62,0xf5,0x7f,0x8f,0x1e,0x51,0x7f + +# ATT: vcvthf82ph -1024(%edx), %xmm2 {%k7} {z} +# INTEL: vcvthf82ph xmm2 {k7} 
{z}, qword ptr [edx - 1024] +0x62,0xf5,0x7f,0x8f,0x1e,0x52,0x80 + +# ATT: vcvthf82ph 268435456(%esp,%esi,8), %ymm2 +# INTEL: vcvthf82ph ymm2, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7f,0x28,0x1e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvthf82ph 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vcvthf82ph ymm2 {k7}, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7f,0x2f,0x1e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvthf82ph (%eax), %ymm2 +# INTEL: vcvthf82ph ymm2, xmmword ptr [eax] +0x62,0xf5,0x7f,0x28,0x1e,0x10 + +# ATT: vcvthf82ph -512(,%ebp,2), %ymm2 +# INTEL: vcvthf82ph ymm2, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x7f,0x28,0x1e,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvthf82ph 2032(%ecx), %ymm2 {%k7} {z} +# INTEL: vcvthf82ph ymm2 {k7} {z}, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7f,0xaf,0x1e,0x51,0x7f + +# ATT: vcvthf82ph -2048(%edx), %ymm2 {%k7} {z} +# INTEL: vcvthf82ph ymm2 {k7} {z}, xmmword ptr [edx - 2048] +0x62,0xf5,0x7f,0xaf,0x1e,0x52,0x80 + +# ATT: vcvthf82ph 268435456(%esp,%esi,8), %zmm2 +# INTEL: vcvthf82ph zmm2, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7f,0x48,0x1e,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvthf82ph 291(%edi,%eax,4), %zmm2 {%k7} +# INTEL: vcvthf82ph zmm2 {k7}, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7f,0x4f,0x1e,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvthf82ph (%eax), %zmm2 +# INTEL: vcvthf82ph zmm2, ymmword ptr [eax] +0x62,0xf5,0x7f,0x48,0x1e,0x10 + +# ATT: vcvthf82ph -1024(,%ebp,2), %zmm2 +# INTEL: vcvthf82ph zmm2, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x7f,0x48,0x1e,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvthf82ph 4064(%ecx), %zmm2 {%k7} {z} +# INTEL: vcvthf82ph zmm2 {k7} {z}, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7f,0xcf,0x1e,0x51,0x7f + +# ATT: vcvthf82ph -4096(%edx), %zmm2 {%k7} {z} +# INTEL: vcvthf82ph zmm2 {k7} {z}, ymmword ptr [edx - 4096] +0x62,0xf5,0x7f,0xcf,0x1e,0x52,0x80 + +# ATT: vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 +# INTEL: vcvtne2ph2bf8 ymm2, ymm3, ymm4 +0x62,0xf2,0x67,0x28,0x74,0xd4 + +# ATT: 
vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vcvtne2ph2bf8 ymm2 {k7}, ymm3, ymm4 +0x62,0xf2,0x67,0x2f,0x74,0xd4 + +# ATT: vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf2,0x67,0xaf,0x74,0xd4 + +# ATT: vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 +# INTEL: vcvtne2ph2bf8 zmm2, zmm3, zmm4 +0x62,0xf2,0x67,0x48,0x74,0xd4 + +# ATT: vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vcvtne2ph2bf8 zmm2 {k7}, zmm3, zmm4 +0x62,0xf2,0x67,0x4f,0x74,0xd4 + +# ATT: vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf2,0x67,0xcf,0x74,0xd4 + +# ATT: vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 +# INTEL: vcvtne2ph2bf8 xmm2, xmm3, xmm4 +0x62,0xf2,0x67,0x08,0x74,0xd4 + +# ATT: vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vcvtne2ph2bf8 xmm2 {k7}, xmm3, xmm4 +0x62,0xf2,0x67,0x0f,0x74,0xd4 + +# ATT: vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf2,0x67,0x8f,0x74,0xd4 + +# ATT: vcvtne2ph2bf8 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vcvtne2ph2bf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x67,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2bf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vcvtne2ph2bf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x67,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2bf8 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vcvtne2ph2bf8 zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf2,0x67,0x58,0x74,0x10 + +# ATT: vcvtne2ph2bf8 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vcvtne2ph2bf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x67,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtne2ph2bf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf2,0x67,0xcf,0x74,0x51,0x7f + +# ATT: vcvtne2ph2bf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 
256]{1to32} +0x62,0xf2,0x67,0xdf,0x74,0x52,0x80 + +# ATT: vcvtne2ph2bf8 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vcvtne2ph2bf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x67,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2bf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vcvtne2ph2bf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x67,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2bf8 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vcvtne2ph2bf8 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf2,0x67,0x38,0x74,0x10 + +# ATT: vcvtne2ph2bf8 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vcvtne2ph2bf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf2,0x67,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtne2ph2bf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf2,0x67,0xaf,0x74,0x51,0x7f + +# ATT: vcvtne2ph2bf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf2,0x67,0xbf,0x74,0x52,0x80 + +# ATT: vcvtne2ph2bf8 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vcvtne2ph2bf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x67,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2bf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vcvtne2ph2bf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x67,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2bf8 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vcvtne2ph2bf8 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf2,0x67,0x18,0x74,0x10 + +# ATT: vcvtne2ph2bf8 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vcvtne2ph2bf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf2,0x67,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtne2ph2bf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf2,0x67,0x8f,0x74,0x51,0x7f + +# ATT: vcvtne2ph2bf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} 
{z} +# INTEL: vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf2,0x67,0x9f,0x74,0x52,0x80 + +# ATT: vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 +# INTEL: vcvtne2ph2bf8s ymm2, ymm3, ymm4 +0x62,0xf5,0x67,0x28,0x74,0xd4 + +# ATT: vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vcvtne2ph2bf8s ymm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x67,0x2f,0x74,0xd4 + +# ATT: vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x67,0xaf,0x74,0xd4 + +# ATT: vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 +# INTEL: vcvtne2ph2bf8s zmm2, zmm3, zmm4 +0x62,0xf5,0x67,0x48,0x74,0xd4 + +# ATT: vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vcvtne2ph2bf8s zmm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x67,0x4f,0x74,0xd4 + +# ATT: vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x67,0xcf,0x74,0xd4 + +# ATT: vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 +# INTEL: vcvtne2ph2bf8s xmm2, xmm3, xmm4 +0x62,0xf5,0x67,0x08,0x74,0xd4 + +# ATT: vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vcvtne2ph2bf8s xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x67,0x0f,0x74,0xd4 + +# ATT: vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x67,0x8f,0x74,0xd4 + +# ATT: vcvtne2ph2bf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vcvtne2ph2bf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x67,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2bf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vcvtne2ph2bf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x67,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2bf8s (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vcvtne2ph2bf8s zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x67,0x58,0x74,0x10 + +# ATT: vcvtne2ph2bf8s -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vcvtne2ph2bf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x67,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtne2ph2bf8s 
8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x67,0xcf,0x74,0x51,0x7f + +# ATT: vcvtne2ph2bf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x67,0xdf,0x74,0x52,0x80 + +# ATT: vcvtne2ph2bf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vcvtne2ph2bf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x67,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2bf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vcvtne2ph2bf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x67,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2bf8s (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vcvtne2ph2bf8s ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x67,0x38,0x74,0x10 + +# ATT: vcvtne2ph2bf8s -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vcvtne2ph2bf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x67,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtne2ph2bf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x67,0xaf,0x74,0x51,0x7f + +# ATT: vcvtne2ph2bf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x67,0xbf,0x74,0x52,0x80 + +# ATT: vcvtne2ph2bf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vcvtne2ph2bf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x67,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2bf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vcvtne2ph2bf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x67,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2bf8s (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vcvtne2ph2bf8s xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x67,0x18,0x74,0x10 + +# ATT: vcvtne2ph2bf8s -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vcvtne2ph2bf8s xmm2, xmm3, xmmword ptr [2*ebp 
- 512] +0x62,0xf5,0x67,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtne2ph2bf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x67,0x8f,0x74,0x51,0x7f + +# ATT: vcvtne2ph2bf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x67,0x9f,0x74,0x52,0x80 + +# ATT: vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 +# INTEL: vcvtne2ph2hf8 ymm2, ymm3, ymm4 +0x62,0xf5,0x67,0x28,0x18,0xd4 + +# ATT: vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vcvtne2ph2hf8 ymm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x67,0x2f,0x18,0xd4 + +# ATT: vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x67,0xaf,0x18,0xd4 + +# ATT: vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 +# INTEL: vcvtne2ph2hf8 zmm2, zmm3, zmm4 +0x62,0xf5,0x67,0x48,0x18,0xd4 + +# ATT: vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vcvtne2ph2hf8 zmm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x67,0x4f,0x18,0xd4 + +# ATT: vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x67,0xcf,0x18,0xd4 + +# ATT: vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 +# INTEL: vcvtne2ph2hf8 xmm2, xmm3, xmm4 +0x62,0xf5,0x67,0x08,0x18,0xd4 + +# ATT: vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vcvtne2ph2hf8 xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x67,0x0f,0x18,0xd4 + +# ATT: vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x67,0x8f,0x18,0xd4 + +# ATT: vcvtne2ph2hf8 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vcvtne2ph2hf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x67,0x48,0x18,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2hf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vcvtne2ph2hf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x67,0x4f,0x18,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2hf8 (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vcvtne2ph2hf8 
zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x67,0x58,0x18,0x10 + +# ATT: vcvtne2ph2hf8 -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vcvtne2ph2hf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x67,0x48,0x18,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtne2ph2hf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x67,0xcf,0x18,0x51,0x7f + +# ATT: vcvtne2ph2hf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x67,0xdf,0x18,0x52,0x80 + +# ATT: vcvtne2ph2hf8 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vcvtne2ph2hf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x67,0x28,0x18,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2hf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vcvtne2ph2hf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x67,0x2f,0x18,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2hf8 (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vcvtne2ph2hf8 ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x67,0x38,0x18,0x10 + +# ATT: vcvtne2ph2hf8 -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vcvtne2ph2hf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x67,0x28,0x18,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtne2ph2hf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x67,0xaf,0x18,0x51,0x7f + +# ATT: vcvtne2ph2hf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x67,0xbf,0x18,0x52,0x80 + +# ATT: vcvtne2ph2hf8 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vcvtne2ph2hf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x67,0x08,0x18,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2hf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vcvtne2ph2hf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x67,0x0f,0x18,0x94,0x87,0x23,0x01,0x00,0x00 + +# 
ATT: vcvtne2ph2hf8 (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vcvtne2ph2hf8 xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x67,0x18,0x18,0x10 + +# ATT: vcvtne2ph2hf8 -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vcvtne2ph2hf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x67,0x08,0x18,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtne2ph2hf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x67,0x8f,0x18,0x51,0x7f + +# ATT: vcvtne2ph2hf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x67,0x9f,0x18,0x52,0x80 + +# ATT: vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 +# INTEL: vcvtne2ph2hf8s ymm2, ymm3, ymm4 +0x62,0xf5,0x67,0x28,0x1b,0xd4 + +# ATT: vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} +# INTEL: vcvtne2ph2hf8s ymm2 {k7}, ymm3, ymm4 +0x62,0xf5,0x67,0x2f,0x1b,0xd4 + +# ATT: vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, ymm4 +0x62,0xf5,0x67,0xaf,0x1b,0xd4 + +# ATT: vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 +# INTEL: vcvtne2ph2hf8s zmm2, zmm3, zmm4 +0x62,0xf5,0x67,0x48,0x1b,0xd4 + +# ATT: vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} +# INTEL: vcvtne2ph2hf8s zmm2 {k7}, zmm3, zmm4 +0x62,0xf5,0x67,0x4f,0x1b,0xd4 + +# ATT: vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, zmm4 +0x62,0xf5,0x67,0xcf,0x1b,0xd4 + +# ATT: vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 +# INTEL: vcvtne2ph2hf8s xmm2, xmm3, xmm4 +0x62,0xf5,0x67,0x08,0x1b,0xd4 + +# ATT: vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} +# INTEL: vcvtne2ph2hf8s xmm2 {k7}, xmm3, xmm4 +0x62,0xf5,0x67,0x0f,0x1b,0xd4 + +# ATT: vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, xmm4 +0x62,0xf5,0x67,0x8f,0x1b,0xd4 + +# ATT: vcvtne2ph2hf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 +# INTEL: vcvtne2ph2hf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x67,0x48,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: 
vcvtne2ph2hf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +# INTEL: vcvtne2ph2hf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x67,0x4f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2hf8s (%eax){1to32}, %zmm3, %zmm2 +# INTEL: vcvtne2ph2hf8s zmm2, zmm3, word ptr [eax]{1to32} +0x62,0xf5,0x67,0x58,0x1b,0x10 + +# ATT: vcvtne2ph2hf8s -2048(,%ebp,2), %zmm3, %zmm2 +# INTEL: vcvtne2ph2hf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x67,0x48,0x1b,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtne2ph2hf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +0x62,0xf5,0x67,0xcf,0x1b,0x51,0x7f + +# ATT: vcvtne2ph2hf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +0x62,0xf5,0x67,0xdf,0x1b,0x52,0x80 + +# ATT: vcvtne2ph2hf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 +# INTEL: vcvtne2ph2hf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x67,0x28,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2hf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +# INTEL: vcvtne2ph2hf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x67,0x2f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2hf8s (%eax){1to16}, %ymm3, %ymm2 +# INTEL: vcvtne2ph2hf8s ymm2, ymm3, word ptr [eax]{1to16} +0x62,0xf5,0x67,0x38,0x1b,0x10 + +# ATT: vcvtne2ph2hf8s -1024(,%ebp,2), %ymm3, %ymm2 +# INTEL: vcvtne2ph2hf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x67,0x28,0x1b,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtne2ph2hf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +0x62,0xf5,0x67,0xaf,0x1b,0x51,0x7f + +# ATT: vcvtne2ph2hf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +0x62,0xf5,0x67,0xbf,0x1b,0x52,0x80 + +# ATT: vcvtne2ph2hf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 +# INTEL: vcvtne2ph2hf8s xmm2, xmm3, xmmword ptr 
[esp + 8*esi + 268435456] +0x62,0xf5,0x67,0x08,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2hf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +# INTEL: vcvtne2ph2hf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x67,0x0f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2hf8s (%eax){1to8}, %xmm3, %xmm2 +# INTEL: vcvtne2ph2hf8s xmm2, xmm3, word ptr [eax]{1to8} +0x62,0xf5,0x67,0x18,0x1b,0x10 + +# ATT: vcvtne2ph2hf8s -512(,%ebp,2), %xmm3, %xmm2 +# INTEL: vcvtne2ph2hf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x67,0x08,0x1b,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtne2ph2hf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +0x62,0xf5,0x67,0x8f,0x1b,0x51,0x7f + +# ATT: vcvtne2ph2hf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +0x62,0xf5,0x67,0x9f,0x1b,0x52,0x80 + +# ATT: vcvtneph2bf8 %xmm3, %xmm2 +# INTEL: vcvtneph2bf8 xmm2, xmm3 +0x62,0xf2,0x7e,0x08,0x74,0xd3 + +# ATT: vcvtneph2bf8 %xmm3, %xmm2 {%k7} +# INTEL: vcvtneph2bf8 xmm2 {k7}, xmm3 +0x62,0xf2,0x7e,0x0f,0x74,0xd3 + +# ATT: vcvtneph2bf8 %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtneph2bf8 xmm2 {k7} {z}, xmm3 +0x62,0xf2,0x7e,0x8f,0x74,0xd3 + +# ATT: vcvtneph2bf8 %zmm3, %ymm2 +# INTEL: vcvtneph2bf8 ymm2, zmm3 +0x62,0xf2,0x7e,0x48,0x74,0xd3 + +# ATT: vcvtneph2bf8 %zmm3, %ymm2 {%k7} +# INTEL: vcvtneph2bf8 ymm2 {k7}, zmm3 +0x62,0xf2,0x7e,0x4f,0x74,0xd3 + +# ATT: vcvtneph2bf8 %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtneph2bf8 ymm2 {k7} {z}, zmm3 +0x62,0xf2,0x7e,0xcf,0x74,0xd3 + +# ATT: vcvtneph2bf8 %ymm3, %xmm2 +# INTEL: vcvtneph2bf8 xmm2, ymm3 +0x62,0xf2,0x7e,0x28,0x74,0xd3 + +# ATT: vcvtneph2bf8 %ymm3, %xmm2 {%k7} +# INTEL: vcvtneph2bf8 xmm2 {k7}, ymm3 +0x62,0xf2,0x7e,0x2f,0x74,0xd3 + +# ATT: vcvtneph2bf8 %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtneph2bf8 xmm2 {k7} {z}, ymm3 +0x62,0xf2,0x7e,0xaf,0x74,0xd3 + +# ATT: vcvtneph2bf8x 268435456(%esp,%esi,8), %xmm2 +# INTEL: 
vcvtneph2bf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x7e,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2bf8x 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vcvtneph2bf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x7e,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2bf8 (%eax){1to8}, %xmm2 +# INTEL: vcvtneph2bf8 xmm2, word ptr [eax]{1to8} +0x62,0xf2,0x7e,0x18,0x74,0x10 + +# ATT: vcvtneph2bf8x -512(,%ebp,2), %xmm2 +# INTEL: vcvtneph2bf8 xmm2, xmmword ptr [2*ebp - 512] +0x62,0xf2,0x7e,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtneph2bf8x 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtneph2bf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +0x62,0xf2,0x7e,0x8f,0x74,0x51,0x7f + +# ATT: vcvtneph2bf8 -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vcvtneph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf2,0x7e,0x9f,0x74,0x52,0x80 + +# ATT: vcvtneph2bf8 (%eax){1to16}, %xmm2 +# INTEL: vcvtneph2bf8 xmm2, word ptr [eax]{1to16} +0x62,0xf2,0x7e,0x38,0x74,0x10 + +# ATT: vcvtneph2bf8y -1024(,%ebp,2), %xmm2 +# INTEL: vcvtneph2bf8 xmm2, ymmword ptr [2*ebp - 1024] +0x62,0xf2,0x7e,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtneph2bf8y 4064(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtneph2bf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] +0x62,0xf2,0x7e,0xaf,0x74,0x51,0x7f + +# ATT: vcvtneph2bf8 -256(%edx){1to16}, %xmm2 {%k7} {z} +# INTEL: vcvtneph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf2,0x7e,0xbf,0x74,0x52,0x80 + +# ATT: vcvtneph2bf8 268435456(%esp,%esi,8), %ymm2 +# INTEL: vcvtneph2bf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf2,0x7e,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2bf8 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vcvtneph2bf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +0x62,0xf2,0x7e,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2bf8 (%eax){1to32}, %ymm2 +# INTEL: vcvtneph2bf8 ymm2, word ptr [eax]{1to32} +0x62,0xf2,0x7e,0x58,0x74,0x10 + +# ATT: vcvtneph2bf8 -2048(,%ebp,2), %ymm2 +# 
INTEL: vcvtneph2bf8 ymm2, zmmword ptr [2*ebp - 2048] +0x62,0xf2,0x7e,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtneph2bf8 8128(%ecx), %ymm2 {%k7} {z} +# INTEL: vcvtneph2bf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +0x62,0xf2,0x7e,0xcf,0x74,0x51,0x7f + +# ATT: vcvtneph2bf8 -256(%edx){1to32}, %ymm2 {%k7} {z} +# INTEL: vcvtneph2bf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} +0x62,0xf2,0x7e,0xdf,0x74,0x52,0x80 + +# ATT: vcvtneph2bf8s %xmm3, %xmm2 +# INTEL: vcvtneph2bf8s xmm2, xmm3 +0x62,0xf5,0x7e,0x08,0x74,0xd3 + +# ATT: vcvtneph2bf8s %xmm3, %xmm2 {%k7} +# INTEL: vcvtneph2bf8s xmm2 {k7}, xmm3 +0x62,0xf5,0x7e,0x0f,0x74,0xd3 + +# ATT: vcvtneph2bf8s %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtneph2bf8s xmm2 {k7} {z}, xmm3 +0x62,0xf5,0x7e,0x8f,0x74,0xd3 + +# ATT: vcvtneph2bf8s %zmm3, %ymm2 +# INTEL: vcvtneph2bf8s ymm2, zmm3 +0x62,0xf5,0x7e,0x48,0x74,0xd3 + +# ATT: vcvtneph2bf8s %zmm3, %ymm2 {%k7} +# INTEL: vcvtneph2bf8s ymm2 {k7}, zmm3 +0x62,0xf5,0x7e,0x4f,0x74,0xd3 + +# ATT: vcvtneph2bf8s %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtneph2bf8s ymm2 {k7} {z}, zmm3 +0x62,0xf5,0x7e,0xcf,0x74,0xd3 + +# ATT: vcvtneph2bf8s %ymm3, %xmm2 +# INTEL: vcvtneph2bf8s xmm2, ymm3 +0x62,0xf5,0x7e,0x28,0x74,0xd3 + +# ATT: vcvtneph2bf8s %ymm3, %xmm2 {%k7} +# INTEL: vcvtneph2bf8s xmm2 {k7}, ymm3 +0x62,0xf5,0x7e,0x2f,0x74,0xd3 + +# ATT: vcvtneph2bf8s %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtneph2bf8s xmm2 {k7} {z}, ymm3 +0x62,0xf5,0x7e,0xaf,0x74,0xd3 + +# ATT: vcvtneph2bf8sx 268435456(%esp,%esi,8), %xmm2 +# INTEL: vcvtneph2bf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7e,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2bf8sx 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vcvtneph2bf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7e,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2bf8s (%eax){1to8}, %xmm2 +# INTEL: vcvtneph2bf8s xmm2, word ptr [eax]{1to8} +0x62,0xf5,0x7e,0x18,0x74,0x10 + +# ATT: vcvtneph2bf8sx -512(,%ebp,2), %xmm2 +# INTEL: vcvtneph2bf8s xmm2, 
xmmword ptr [2*ebp - 512] +0x62,0xf5,0x7e,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtneph2bf8sx 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtneph2bf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7e,0x8f,0x74,0x51,0x7f + +# ATT: vcvtneph2bf8s -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vcvtneph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7e,0x9f,0x74,0x52,0x80 + +# ATT: vcvtneph2bf8s (%eax){1to16}, %xmm2 +# INTEL: vcvtneph2bf8s xmm2, word ptr [eax]{1to16} +0x62,0xf5,0x7e,0x38,0x74,0x10 + +# ATT: vcvtneph2bf8sy -1024(,%ebp,2), %xmm2 +# INTEL: vcvtneph2bf8s xmm2, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x7e,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtneph2bf8sy 4064(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtneph2bf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7e,0xaf,0x74,0x51,0x7f + +# ATT: vcvtneph2bf8s -256(%edx){1to16}, %xmm2 {%k7} {z} +# INTEL: vcvtneph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf5,0x7e,0xbf,0x74,0x52,0x80 + +# ATT: vcvtneph2bf8s 268435456(%esp,%esi,8), %ymm2 +# INTEL: vcvtneph2bf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7e,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2bf8s 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vcvtneph2bf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7e,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2bf8s (%eax){1to32}, %ymm2 +# INTEL: vcvtneph2bf8s ymm2, word ptr [eax]{1to32} +0x62,0xf5,0x7e,0x58,0x74,0x10 + +# ATT: vcvtneph2bf8s -2048(,%ebp,2), %ymm2 +# INTEL: vcvtneph2bf8s ymm2, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x7e,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtneph2bf8s 8128(%ecx), %ymm2 {%k7} {z} +# INTEL: vcvtneph2bf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +0x62,0xf5,0x7e,0xcf,0x74,0x51,0x7f + +# ATT: vcvtneph2bf8s -256(%edx){1to32}, %ymm2 {%k7} {z} +# INTEL: vcvtneph2bf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} +0x62,0xf5,0x7e,0xdf,0x74,0x52,0x80 + +# ATT: vcvtneph2hf8 %xmm3, %xmm2 +# INTEL: 
vcvtneph2hf8 xmm2, xmm3 +0x62,0xf5,0x7e,0x08,0x18,0xd3 + +# ATT: vcvtneph2hf8 %xmm3, %xmm2 {%k7} +# INTEL: vcvtneph2hf8 xmm2 {k7}, xmm3 +0x62,0xf5,0x7e,0x0f,0x18,0xd3 + +# ATT: vcvtneph2hf8 %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtneph2hf8 xmm2 {k7} {z}, xmm3 +0x62,0xf5,0x7e,0x8f,0x18,0xd3 + +# ATT: vcvtneph2hf8 %zmm3, %ymm2 +# INTEL: vcvtneph2hf8 ymm2, zmm3 +0x62,0xf5,0x7e,0x48,0x18,0xd3 + +# ATT: vcvtneph2hf8 %zmm3, %ymm2 {%k7} +# INTEL: vcvtneph2hf8 ymm2 {k7}, zmm3 +0x62,0xf5,0x7e,0x4f,0x18,0xd3 + +# ATT: vcvtneph2hf8 %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtneph2hf8 ymm2 {k7} {z}, zmm3 +0x62,0xf5,0x7e,0xcf,0x18,0xd3 + +# ATT: vcvtneph2hf8 %ymm3, %xmm2 +# INTEL: vcvtneph2hf8 xmm2, ymm3 +0x62,0xf5,0x7e,0x28,0x18,0xd3 + +# ATT: vcvtneph2hf8 %ymm3, %xmm2 {%k7} +# INTEL: vcvtneph2hf8 xmm2 {k7}, ymm3 +0x62,0xf5,0x7e,0x2f,0x18,0xd3 + +# ATT: vcvtneph2hf8 %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtneph2hf8 xmm2 {k7} {z}, ymm3 +0x62,0xf5,0x7e,0xaf,0x18,0xd3 + +# ATT: vcvtneph2hf8x 268435456(%esp,%esi,8), %xmm2 +# INTEL: vcvtneph2hf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7e,0x08,0x18,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2hf8x 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vcvtneph2hf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7e,0x0f,0x18,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2hf8 (%eax){1to8}, %xmm2 +# INTEL: vcvtneph2hf8 xmm2, word ptr [eax]{1to8} +0x62,0xf5,0x7e,0x18,0x18,0x10 + +# ATT: vcvtneph2hf8x -512(,%ebp,2), %xmm2 +# INTEL: vcvtneph2hf8 xmm2, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x7e,0x08,0x18,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtneph2hf8x 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtneph2hf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7e,0x8f,0x18,0x51,0x7f + +# ATT: vcvtneph2hf8 -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vcvtneph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7e,0x9f,0x18,0x52,0x80 + +# ATT: vcvtneph2hf8 (%eax){1to16}, %xmm2 +# INTEL: vcvtneph2hf8 xmm2, word ptr [eax]{1to16} 
+0x62,0xf5,0x7e,0x38,0x18,0x10 + +# ATT: vcvtneph2hf8y -1024(,%ebp,2), %xmm2 +# INTEL: vcvtneph2hf8 xmm2, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x7e,0x28,0x18,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtneph2hf8y 4064(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtneph2hf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7e,0xaf,0x18,0x51,0x7f + +# ATT: vcvtneph2hf8 -256(%edx){1to16}, %xmm2 {%k7} {z} +# INTEL: vcvtneph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf5,0x7e,0xbf,0x18,0x52,0x80 + +# ATT: vcvtneph2hf8 268435456(%esp,%esi,8), %ymm2 +# INTEL: vcvtneph2hf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7e,0x48,0x18,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2hf8 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vcvtneph2hf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7e,0x4f,0x18,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2hf8 (%eax){1to32}, %ymm2 +# INTEL: vcvtneph2hf8 ymm2, word ptr [eax]{1to32} +0x62,0xf5,0x7e,0x58,0x18,0x10 + +# ATT: vcvtneph2hf8 -2048(,%ebp,2), %ymm2 +# INTEL: vcvtneph2hf8 ymm2, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x7e,0x48,0x18,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtneph2hf8 8128(%ecx), %ymm2 {%k7} {z} +# INTEL: vcvtneph2hf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +0x62,0xf5,0x7e,0xcf,0x18,0x51,0x7f + +# ATT: vcvtneph2hf8 -256(%edx){1to32}, %ymm2 {%k7} {z} +# INTEL: vcvtneph2hf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} +0x62,0xf5,0x7e,0xdf,0x18,0x52,0x80 + +# ATT: vcvtneph2hf8s %xmm3, %xmm2 +# INTEL: vcvtneph2hf8s xmm2, xmm3 +0x62,0xf5,0x7e,0x08,0x1b,0xd3 + +# ATT: vcvtneph2hf8s %xmm3, %xmm2 {%k7} +# INTEL: vcvtneph2hf8s xmm2 {k7}, xmm3 +0x62,0xf5,0x7e,0x0f,0x1b,0xd3 + +# ATT: vcvtneph2hf8s %xmm3, %xmm2 {%k7} {z} +# INTEL: vcvtneph2hf8s xmm2 {k7} {z}, xmm3 +0x62,0xf5,0x7e,0x8f,0x1b,0xd3 + +# ATT: vcvtneph2hf8s %zmm3, %ymm2 +# INTEL: vcvtneph2hf8s ymm2, zmm3 +0x62,0xf5,0x7e,0x48,0x1b,0xd3 + +# ATT: vcvtneph2hf8s %zmm3, %ymm2 {%k7} +# INTEL: vcvtneph2hf8s ymm2 {k7}, zmm3 +0x62,0xf5,0x7e,0x4f,0x1b,0xd3 
+ +# ATT: vcvtneph2hf8s %zmm3, %ymm2 {%k7} {z} +# INTEL: vcvtneph2hf8s ymm2 {k7} {z}, zmm3 +0x62,0xf5,0x7e,0xcf,0x1b,0xd3 + +# ATT: vcvtneph2hf8s %ymm3, %xmm2 +# INTEL: vcvtneph2hf8s xmm2, ymm3 +0x62,0xf5,0x7e,0x28,0x1b,0xd3 + +# ATT: vcvtneph2hf8s %ymm3, %xmm2 {%k7} +# INTEL: vcvtneph2hf8s xmm2 {k7}, ymm3 +0x62,0xf5,0x7e,0x2f,0x1b,0xd3 + +# ATT: vcvtneph2hf8s %ymm3, %xmm2 {%k7} {z} +# INTEL: vcvtneph2hf8s xmm2 {k7} {z}, ymm3 +0x62,0xf5,0x7e,0xaf,0x1b,0xd3 + +# ATT: vcvtneph2hf8sx 268435456(%esp,%esi,8), %xmm2 +# INTEL: vcvtneph2hf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7e,0x08,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2hf8sx 291(%edi,%eax,4), %xmm2 {%k7} +# INTEL: vcvtneph2hf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7e,0x0f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2hf8s (%eax){1to8}, %xmm2 +# INTEL: vcvtneph2hf8s xmm2, word ptr [eax]{1to8} +0x62,0xf5,0x7e,0x18,0x1b,0x10 + +# ATT: vcvtneph2hf8sx -512(,%ebp,2), %xmm2 +# INTEL: vcvtneph2hf8s xmm2, xmmword ptr [2*ebp - 512] +0x62,0xf5,0x7e,0x08,0x1b,0x14,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtneph2hf8sx 2032(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtneph2hf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7e,0x8f,0x1b,0x51,0x7f + +# ATT: vcvtneph2hf8s -256(%edx){1to8}, %xmm2 {%k7} {z} +# INTEL: vcvtneph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7e,0x9f,0x1b,0x52,0x80 + +# ATT: vcvtneph2hf8s (%eax){1to16}, %xmm2 +# INTEL: vcvtneph2hf8s xmm2, word ptr [eax]{1to16} +0x62,0xf5,0x7e,0x38,0x1b,0x10 + +# ATT: vcvtneph2hf8sy -1024(,%ebp,2), %xmm2 +# INTEL: vcvtneph2hf8s xmm2, ymmword ptr [2*ebp - 1024] +0x62,0xf5,0x7e,0x28,0x1b,0x14,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtneph2hf8sy 4064(%ecx), %xmm2 {%k7} {z} +# INTEL: vcvtneph2hf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7e,0xaf,0x1b,0x51,0x7f + +# ATT: vcvtneph2hf8s -256(%edx){1to16}, %xmm2 {%k7} {z} +# INTEL: vcvtneph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} 
+0x62,0xf5,0x7e,0xbf,0x1b,0x52,0x80 + +# ATT: vcvtneph2hf8s 268435456(%esp,%esi,8), %ymm2 +# INTEL: vcvtneph2hf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7e,0x48,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2hf8s 291(%edi,%eax,4), %ymm2 {%k7} +# INTEL: vcvtneph2hf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +0x62,0xf5,0x7e,0x4f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2hf8s (%eax){1to32}, %ymm2 +# INTEL: vcvtneph2hf8s ymm2, word ptr [eax]{1to32} +0x62,0xf5,0x7e,0x58,0x1b,0x10 + +# ATT: vcvtneph2hf8s -2048(,%ebp,2), %ymm2 +# INTEL: vcvtneph2hf8s ymm2, zmmword ptr [2*ebp - 2048] +0x62,0xf5,0x7e,0x48,0x1b,0x14,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtneph2hf8s 8128(%ecx), %ymm2 {%k7} {z} +# INTEL: vcvtneph2hf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +0x62,0xf5,0x7e,0xcf,0x1b,0x51,0x7f + +# ATT: vcvtneph2hf8s -256(%edx){1to32}, %ymm2 {%k7} {z} +# INTEL: vcvtneph2hf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} +0x62,0xf5,0x7e,0xdf,0x1b,0x52,0x80 + diff --git a/llvm/test/MC/Disassembler/X86/avx10.2convert-64.txt b/llvm/test/MC/Disassembler/X86/avx10.2convert-64.txt new file mode 100644 index 00000000000000..82bf09c49e9260 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx10.2convert-64.txt @@ -0,0 +1,1491 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vcvt2ps2phx %ymm24, %ymm23, %ymm22 +# INTEL: vcvt2ps2phx ymm22, ymm23, ymm24 +0x62,0x82,0x45,0x20,0x67,0xf0 + +# ATT: vcvt2ps2phx {rn-sae}, %ymm24, %ymm23, %ymm22 +# INTEL: vcvt2ps2phx ymm22, ymm23, ymm24, {rn-sae} +0x62,0x82,0x41,0x10,0x67,0xf0 + +# ATT: vcvt2ps2phx %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vcvt2ps2phx ymm22 {k7}, ymm23, ymm24 +0x62,0x82,0x45,0x27,0x67,0xf0 + +# ATT: vcvt2ps2phx {rz-sae}, %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvt2ps2phx ymm22 {k7} {z}, ymm23, ymm24, {rz-sae} +0x62,0x82,0x41,0xf7,0x67,0xf0 + 
+# ATT: vcvt2ps2phx %zmm24, %zmm23, %zmm22 +# INTEL: vcvt2ps2phx zmm22, zmm23, zmm24 +0x62,0x82,0x45,0x40,0x67,0xf0 + +# ATT: vcvt2ps2phx {rn-sae}, %zmm24, %zmm23, %zmm22 +# INTEL: vcvt2ps2phx zmm22, zmm23, zmm24, {rn-sae} +0x62,0x82,0x45,0x10,0x67,0xf0 + +# ATT: vcvt2ps2phx %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vcvt2ps2phx zmm22 {k7}, zmm23, zmm24 +0x62,0x82,0x45,0x47,0x67,0xf0 + +# ATT: vcvt2ps2phx {rz-sae}, %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvt2ps2phx zmm22 {k7} {z}, zmm23, zmm24, {rz-sae} +0x62,0x82,0x45,0xf7,0x67,0xf0 + +# ATT: vcvt2ps2phx %xmm24, %xmm23, %xmm22 +# INTEL: vcvt2ps2phx xmm22, xmm23, xmm24 +0x62,0x82,0x45,0x00,0x67,0xf0 + +# ATT: vcvt2ps2phx %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vcvt2ps2phx xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x45,0x07,0x67,0xf0 + +# ATT: vcvt2ps2phx %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvt2ps2phx xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x82,0x45,0x87,0x67,0xf0 + +# ATT: vcvt2ps2phx 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vcvt2ps2phx zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x45,0x40,0x67,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvt2ps2phx 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vcvt2ps2phx zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x45,0x47,0x67,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvt2ps2phx (%rip){1to16}, %zmm23, %zmm22 +# INTEL: vcvt2ps2phx zmm22, zmm23, dword ptr [rip]{1to16} +0x62,0xe2,0x45,0x50,0x67,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvt2ps2phx -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vcvt2ps2phx zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x45,0x40,0x67,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvt2ps2phx 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvt2ps2phx zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x45,0xc7,0x67,0x71,0x7f + +# ATT: vcvt2ps2phx -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvt2ps2phx zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} +0x62,0xe2,0x45,0xd7,0x67,0x72,0x80 + +# ATT: vcvt2ps2phx 
268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vcvt2ps2phx ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x45,0x20,0x67,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvt2ps2phx 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vcvt2ps2phx ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x45,0x27,0x67,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvt2ps2phx (%rip){1to8}, %ymm23, %ymm22 +# INTEL: vcvt2ps2phx ymm22, ymm23, dword ptr [rip]{1to8} +0x62,0xe2,0x45,0x30,0x67,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvt2ps2phx -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vcvt2ps2phx ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x45,0x20,0x67,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvt2ps2phx 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvt2ps2phx ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x45,0xa7,0x67,0x71,0x7f + +# ATT: vcvt2ps2phx -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvt2ps2phx ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} +0x62,0xe2,0x45,0xb7,0x67,0x72,0x80 + +# ATT: vcvt2ps2phx 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vcvt2ps2phx xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x45,0x00,0x67,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvt2ps2phx 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vcvt2ps2phx xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x45,0x07,0x67,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvt2ps2phx (%rip){1to4}, %xmm23, %xmm22 +# INTEL: vcvt2ps2phx xmm22, xmm23, dword ptr [rip]{1to4} +0x62,0xe2,0x45,0x10,0x67,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvt2ps2phx -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vcvt2ps2phx xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe2,0x45,0x00,0x67,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvt2ps2phx 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvt2ps2phx xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe2,0x45,0x87,0x67,0x71,0x7f + +# ATT: vcvt2ps2phx -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvt2ps2phx xmm22 {k7} 
{z}, xmm23, dword ptr [rdx - 512]{1to4} +0x62,0xe2,0x45,0x97,0x67,0x72,0x80 + +# ATT: vcvtbiasph2bf8 %zmm24, %zmm23, %ymm22 +# INTEL: vcvtbiasph2bf8 ymm22, zmm23, zmm24 +0x62,0x82,0x44,0x40,0x74,0xf0 + +# ATT: vcvtbiasph2bf8 %zmm24, %zmm23, %ymm22 {%k7} +# INTEL: vcvtbiasph2bf8 ymm22 {k7}, zmm23, zmm24 +0x62,0x82,0x44,0x47,0x74,0xf0 + +# ATT: vcvtbiasph2bf8 %zmm24, %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8 ymm22 {k7} {z}, zmm23, zmm24 +0x62,0x82,0x44,0xc7,0x74,0xf0 + +# ATT: vcvtbiasph2bf8 %xmm24, %xmm23, %xmm22 +# INTEL: vcvtbiasph2bf8 xmm22, xmm23, xmm24 +0x62,0x82,0x44,0x00,0x74,0xf0 + +# ATT: vcvtbiasph2bf8 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2bf8 xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x44,0x07,0x74,0xf0 + +# ATT: vcvtbiasph2bf8 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x82,0x44,0x87,0x74,0xf0 + +# ATT: vcvtbiasph2bf8 %ymm24, %ymm23, %xmm22 +# INTEL: vcvtbiasph2bf8 xmm22, ymm23, ymm24 +0x62,0x82,0x44,0x20,0x74,0xf0 + +# ATT: vcvtbiasph2bf8 %ymm24, %ymm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2bf8 xmm22 {k7}, ymm23, ymm24 +0x62,0x82,0x44,0x27,0x74,0xf0 + +# ATT: vcvtbiasph2bf8 %ymm24, %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8 xmm22 {k7} {z}, ymm23, ymm24 +0x62,0x82,0x44,0xa7,0x74,0xf0 + +# ATT: vcvtbiasph2bf8 268435456(%rbp,%r14,8), %ymm23, %xmm22 +# INTEL: vcvtbiasph2bf8 xmm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2bf8 291(%r8,%rax,4), %ymm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2bf8 xmm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2bf8 (%rip){1to16}, %ymm23, %xmm22 +# INTEL: vcvtbiasph2bf8 xmm22, ymm23, word ptr [rip]{1to16} +0x62,0xe2,0x44,0x30,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtbiasph2bf8 -1024(,%rbp,2), %ymm23, %xmm22 +# INTEL: vcvtbiasph2bf8 xmm22, ymm23, ymmword ptr [2*rbp - 1024] 
+0x62,0xe2,0x44,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtbiasph2bf8 4064(%rcx), %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8 xmm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x44,0xa7,0x74,0x71,0x7f + +# ATT: vcvtbiasph2bf8 -256(%rdx){1to16}, %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8 xmm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe2,0x44,0xb7,0x74,0x72,0x80 + +# ATT: vcvtbiasph2bf8 268435456(%rbp,%r14,8), %zmm23, %ymm22 +# INTEL: vcvtbiasph2bf8 ymm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2bf8 291(%r8,%rax,4), %zmm23, %ymm22 {%k7} +# INTEL: vcvtbiasph2bf8 ymm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2bf8 (%rip){1to32}, %zmm23, %ymm22 +# INTEL: vcvtbiasph2bf8 ymm22, zmm23, word ptr [rip]{1to32} +0x62,0xe2,0x44,0x50,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtbiasph2bf8 -2048(,%rbp,2), %zmm23, %ymm22 +# INTEL: vcvtbiasph2bf8 ymm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x44,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtbiasph2bf8 8128(%rcx), %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8 ymm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x44,0xc7,0x74,0x71,0x7f + +# ATT: vcvtbiasph2bf8 -256(%rdx){1to32}, %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8 ymm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe2,0x44,0xd7,0x74,0x72,0x80 + +# ATT: vcvtbiasph2bf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vcvtbiasph2bf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x44,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2bf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2bf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x44,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2bf8 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vcvtbiasph2bf8 xmm22, xmm23, word ptr 
[rip]{1to8} +0x62,0xe2,0x44,0x10,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtbiasph2bf8 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vcvtbiasph2bf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe2,0x44,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtbiasph2bf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe2,0x44,0x87,0x74,0x71,0x7f + +# ATT: vcvtbiasph2bf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe2,0x44,0x97,0x74,0x72,0x80 + +# ATT: vcvtbiasph2bf8s %zmm24, %zmm23, %ymm22 +# INTEL: vcvtbiasph2bf8s ymm22, zmm23, zmm24 +0x62,0x85,0x44,0x40,0x74,0xf0 + +# ATT: vcvtbiasph2bf8s %zmm24, %zmm23, %ymm22 {%k7} +# INTEL: vcvtbiasph2bf8s ymm22 {k7}, zmm23, zmm24 +0x62,0x85,0x44,0x47,0x74,0xf0 + +# ATT: vcvtbiasph2bf8s %zmm24, %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8s ymm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x44,0xc7,0x74,0xf0 + +# ATT: vcvtbiasph2bf8s %xmm24, %xmm23, %xmm22 +# INTEL: vcvtbiasph2bf8s xmm22, xmm23, xmm24 +0x62,0x85,0x44,0x00,0x74,0xf0 + +# ATT: vcvtbiasph2bf8s %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2bf8s xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x44,0x07,0x74,0xf0 + +# ATT: vcvtbiasph2bf8s %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8s xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x44,0x87,0x74,0xf0 + +# ATT: vcvtbiasph2bf8s %ymm24, %ymm23, %xmm22 +# INTEL: vcvtbiasph2bf8s xmm22, ymm23, ymm24 +0x62,0x85,0x44,0x20,0x74,0xf0 + +# ATT: vcvtbiasph2bf8s %ymm24, %ymm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2bf8s xmm22 {k7}, ymm23, ymm24 +0x62,0x85,0x44,0x27,0x74,0xf0 + +# ATT: vcvtbiasph2bf8s %ymm24, %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8s xmm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x44,0xa7,0x74,0xf0 + +# ATT: vcvtbiasph2bf8s 268435456(%rbp,%r14,8), %ymm23, %xmm22 +# INTEL: vcvtbiasph2bf8s xmm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] 
+0x62,0xa5,0x44,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2bf8s 291(%r8,%rax,4), %ymm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2bf8s xmm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x44,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2bf8s (%rip){1to16}, %ymm23, %xmm22 +# INTEL: vcvtbiasph2bf8s xmm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x44,0x30,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtbiasph2bf8s -1024(,%rbp,2), %ymm23, %xmm22 +# INTEL: vcvtbiasph2bf8s xmm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x44,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtbiasph2bf8s 4064(%rcx), %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8s xmm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x44,0xa7,0x74,0x71,0x7f + +# ATT: vcvtbiasph2bf8s -256(%rdx){1to16}, %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8s xmm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x44,0xb7,0x74,0x72,0x80 + +# ATT: vcvtbiasph2bf8s 268435456(%rbp,%r14,8), %zmm23, %ymm22 +# INTEL: vcvtbiasph2bf8s ymm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x44,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2bf8s 291(%r8,%rax,4), %zmm23, %ymm22 {%k7} +# INTEL: vcvtbiasph2bf8s ymm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x44,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2bf8s (%rip){1to32}, %zmm23, %ymm22 +# INTEL: vcvtbiasph2bf8s ymm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x44,0x50,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtbiasph2bf8s -2048(,%rbp,2), %zmm23, %ymm22 +# INTEL: vcvtbiasph2bf8s ymm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x44,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtbiasph2bf8s 8128(%rcx), %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8s ymm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x44,0xc7,0x74,0x71,0x7f + +# ATT: vcvtbiasph2bf8s -256(%rdx){1to32}, %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8s ymm22 {k7} {z}, zmm23, word 
ptr [rdx - 256]{1to32} +0x62,0xe5,0x44,0xd7,0x74,0x72,0x80 + +# ATT: vcvtbiasph2bf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vcvtbiasph2bf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x44,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2bf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2bf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x44,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2bf8s (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vcvtbiasph2bf8s xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x44,0x10,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtbiasph2bf8s -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vcvtbiasph2bf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x44,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtbiasph2bf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x44,0x87,0x74,0x71,0x7f + +# ATT: vcvtbiasph2bf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2bf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x44,0x97,0x74,0x72,0x80 + +# ATT: vcvtbiasph2hf8 %zmm24, %zmm23, %ymm22 +# INTEL: vcvtbiasph2hf8 ymm22, zmm23, zmm24 +0x62,0x85,0x44,0x40,0x18,0xf0 + +# ATT: vcvtbiasph2hf8 %zmm24, %zmm23, %ymm22 {%k7} +# INTEL: vcvtbiasph2hf8 ymm22 {k7}, zmm23, zmm24 +0x62,0x85,0x44,0x47,0x18,0xf0 + +# ATT: vcvtbiasph2hf8 %zmm24, %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8 ymm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x44,0xc7,0x18,0xf0 + +# ATT: vcvtbiasph2hf8 %xmm24, %xmm23, %xmm22 +# INTEL: vcvtbiasph2hf8 xmm22, xmm23, xmm24 +0x62,0x85,0x44,0x00,0x18,0xf0 + +# ATT: vcvtbiasph2hf8 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2hf8 xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x44,0x07,0x18,0xf0 + +# ATT: vcvtbiasph2hf8 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x44,0x87,0x18,0xf0 + +# ATT: vcvtbiasph2hf8 %ymm24, %ymm23, 
%xmm22 +# INTEL: vcvtbiasph2hf8 xmm22, ymm23, ymm24 +0x62,0x85,0x44,0x20,0x18,0xf0 + +# ATT: vcvtbiasph2hf8 %ymm24, %ymm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2hf8 xmm22 {k7}, ymm23, ymm24 +0x62,0x85,0x44,0x27,0x18,0xf0 + +# ATT: vcvtbiasph2hf8 %ymm24, %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8 xmm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x44,0xa7,0x18,0xf0 + +# ATT: vcvtbiasph2hf8 268435456(%rbp,%r14,8), %ymm23, %xmm22 +# INTEL: vcvtbiasph2hf8 xmm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x44,0x20,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2hf8 291(%r8,%rax,4), %ymm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2hf8 xmm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x44,0x27,0x18,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2hf8 (%rip){1to16}, %ymm23, %xmm22 +# INTEL: vcvtbiasph2hf8 xmm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x44,0x30,0x18,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtbiasph2hf8 -1024(,%rbp,2), %ymm23, %xmm22 +# INTEL: vcvtbiasph2hf8 xmm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x44,0x20,0x18,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtbiasph2hf8 4064(%rcx), %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8 xmm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x44,0xa7,0x18,0x71,0x7f + +# ATT: vcvtbiasph2hf8 -256(%rdx){1to16}, %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8 xmm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x44,0xb7,0x18,0x72,0x80 + +# ATT: vcvtbiasph2hf8 268435456(%rbp,%r14,8), %zmm23, %ymm22 +# INTEL: vcvtbiasph2hf8 ymm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x44,0x40,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2hf8 291(%r8,%rax,4), %zmm23, %ymm22 {%k7} +# INTEL: vcvtbiasph2hf8 ymm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x44,0x47,0x18,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2hf8 (%rip){1to32}, %zmm23, %ymm22 +# INTEL: vcvtbiasph2hf8 ymm22, zmm23, word ptr [rip]{1to32} 
+0x62,0xe5,0x44,0x50,0x18,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtbiasph2hf8 -2048(,%rbp,2), %zmm23, %ymm22 +# INTEL: vcvtbiasph2hf8 ymm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x44,0x40,0x18,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtbiasph2hf8 8128(%rcx), %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8 ymm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x44,0xc7,0x18,0x71,0x7f + +# ATT: vcvtbiasph2hf8 -256(%rdx){1to32}, %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8 ymm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x44,0xd7,0x18,0x72,0x80 + +# ATT: vcvtbiasph2hf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vcvtbiasph2hf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x44,0x00,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2hf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2hf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x44,0x07,0x18,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2hf8 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vcvtbiasph2hf8 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x44,0x10,0x18,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtbiasph2hf8 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vcvtbiasph2hf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x44,0x00,0x18,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtbiasph2hf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x44,0x87,0x18,0x71,0x7f + +# ATT: vcvtbiasph2hf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x44,0x97,0x18,0x72,0x80 + +# ATT: vcvtbiasph2hf8s %zmm24, %zmm23, %ymm22 +# INTEL: vcvtbiasph2hf8s ymm22, zmm23, zmm24 +0x62,0x85,0x44,0x40,0x1b,0xf0 + +# ATT: vcvtbiasph2hf8s %zmm24, %zmm23, %ymm22 {%k7} +# INTEL: vcvtbiasph2hf8s ymm22 {k7}, zmm23, zmm24 +0x62,0x85,0x44,0x47,0x1b,0xf0 + +# ATT: vcvtbiasph2hf8s %zmm24, %zmm23, %ymm22 {%k7} {z} +# INTEL: 
vcvtbiasph2hf8s ymm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x44,0xc7,0x1b,0xf0 + +# ATT: vcvtbiasph2hf8s %xmm24, %xmm23, %xmm22 +# INTEL: vcvtbiasph2hf8s xmm22, xmm23, xmm24 +0x62,0x85,0x44,0x00,0x1b,0xf0 + +# ATT: vcvtbiasph2hf8s %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2hf8s xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x44,0x07,0x1b,0xf0 + +# ATT: vcvtbiasph2hf8s %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8s xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x44,0x87,0x1b,0xf0 + +# ATT: vcvtbiasph2hf8s %ymm24, %ymm23, %xmm22 +# INTEL: vcvtbiasph2hf8s xmm22, ymm23, ymm24 +0x62,0x85,0x44,0x20,0x1b,0xf0 + +# ATT: vcvtbiasph2hf8s %ymm24, %ymm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2hf8s xmm22 {k7}, ymm23, ymm24 +0x62,0x85,0x44,0x27,0x1b,0xf0 + +# ATT: vcvtbiasph2hf8s %ymm24, %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8s xmm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x44,0xa7,0x1b,0xf0 + +# ATT: vcvtbiasph2hf8s 268435456(%rbp,%r14,8), %ymm23, %xmm22 +# INTEL: vcvtbiasph2hf8s xmm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x44,0x20,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2hf8s 291(%r8,%rax,4), %ymm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2hf8s xmm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x44,0x27,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2hf8s (%rip){1to16}, %ymm23, %xmm22 +# INTEL: vcvtbiasph2hf8s xmm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x44,0x30,0x1b,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtbiasph2hf8s -1024(,%rbp,2), %ymm23, %xmm22 +# INTEL: vcvtbiasph2hf8s xmm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x44,0x20,0x1b,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtbiasph2hf8s 4064(%rcx), %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8s xmm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x44,0xa7,0x1b,0x71,0x7f + +# ATT: vcvtbiasph2hf8s -256(%rdx){1to16}, %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8s xmm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x44,0xb7,0x1b,0x72,0x80 + +# 
ATT: vcvtbiasph2hf8s 268435456(%rbp,%r14,8), %zmm23, %ymm22 +# INTEL: vcvtbiasph2hf8s ymm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x44,0x40,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2hf8s 291(%r8,%rax,4), %zmm23, %ymm22 {%k7} +# INTEL: vcvtbiasph2hf8s ymm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x44,0x47,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2hf8s (%rip){1to32}, %zmm23, %ymm22 +# INTEL: vcvtbiasph2hf8s ymm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x44,0x50,0x1b,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtbiasph2hf8s -2048(,%rbp,2), %zmm23, %ymm22 +# INTEL: vcvtbiasph2hf8s ymm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x44,0x40,0x1b,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtbiasph2hf8s 8128(%rcx), %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8s ymm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x44,0xc7,0x1b,0x71,0x7f + +# ATT: vcvtbiasph2hf8s -256(%rdx){1to32}, %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8s ymm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x44,0xd7,0x1b,0x72,0x80 + +# ATT: vcvtbiasph2hf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vcvtbiasph2hf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x44,0x00,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtbiasph2hf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vcvtbiasph2hf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x44,0x07,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtbiasph2hf8s (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vcvtbiasph2hf8s xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x44,0x10,0x1b,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtbiasph2hf8s -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vcvtbiasph2hf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x44,0x00,0x1b,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtbiasph2hf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] 
+0x62,0xe5,0x44,0x87,0x1b,0x71,0x7f + +# ATT: vcvtbiasph2hf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtbiasph2hf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x44,0x97,0x1b,0x72,0x80 + +# ATT: vcvthf82ph %xmm23, %xmm22 +# INTEL: vcvthf82ph xmm22, xmm23 +0x62,0xa5,0x7f,0x08,0x1e,0xf7 + +# ATT: vcvthf82ph %xmm23, %xmm22 {%k7} +# INTEL: vcvthf82ph xmm22 {k7}, xmm23 +0x62,0xa5,0x7f,0x0f,0x1e,0xf7 + +# ATT: vcvthf82ph %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvthf82ph xmm22 {k7} {z}, xmm23 +0x62,0xa5,0x7f,0x8f,0x1e,0xf7 + +# ATT: vcvthf82ph %xmm23, %ymm22 +# INTEL: vcvthf82ph ymm22, xmm23 +0x62,0xa5,0x7f,0x28,0x1e,0xf7 + +# ATT: vcvthf82ph %xmm23, %ymm22 {%k7} +# INTEL: vcvthf82ph ymm22 {k7}, xmm23 +0x62,0xa5,0x7f,0x2f,0x1e,0xf7 + +# ATT: vcvthf82ph %xmm23, %ymm22 {%k7} {z} +# INTEL: vcvthf82ph ymm22 {k7} {z}, xmm23 +0x62,0xa5,0x7f,0xaf,0x1e,0xf7 + +# ATT: vcvthf82ph %ymm23, %zmm22 +# INTEL: vcvthf82ph zmm22, ymm23 +0x62,0xa5,0x7f,0x48,0x1e,0xf7 + +# ATT: vcvthf82ph %ymm23, %zmm22 {%k7} +# INTEL: vcvthf82ph zmm22 {k7}, ymm23 +0x62,0xa5,0x7f,0x4f,0x1e,0xf7 + +# ATT: vcvthf82ph %ymm23, %zmm22 {%k7} {z} +# INTEL: vcvthf82ph zmm22 {k7} {z}, ymm23 +0x62,0xa5,0x7f,0xcf,0x1e,0xf7 + +# ATT: vcvthf82ph 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vcvthf82ph xmm22, qword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7f,0x08,0x1e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvthf82ph 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vcvthf82ph xmm22 {k7}, qword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7f,0x0f,0x1e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvthf82ph (%rip), %xmm22 +# INTEL: vcvthf82ph xmm22, qword ptr [rip] +0x62,0xe5,0x7f,0x08,0x1e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvthf82ph -256(,%rbp,2), %xmm22 +# INTEL: vcvthf82ph xmm22, qword ptr [2*rbp - 256] +0x62,0xe5,0x7f,0x08,0x1e,0x34,0x6d,0x00,0xff,0xff,0xff + +# ATT: vcvthf82ph 1016(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvthf82ph xmm22 {k7} {z}, qword ptr [rcx + 1016] +0x62,0xe5,0x7f,0x8f,0x1e,0x71,0x7f + +# ATT: 
vcvthf82ph -1024(%rdx), %xmm22 {%k7} {z} +# INTEL: vcvthf82ph xmm22 {k7} {z}, qword ptr [rdx - 1024] +0x62,0xe5,0x7f,0x8f,0x1e,0x72,0x80 + +# ATT: vcvthf82ph 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vcvthf82ph ymm22, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7f,0x28,0x1e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvthf82ph 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vcvthf82ph ymm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7f,0x2f,0x1e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvthf82ph (%rip), %ymm22 +# INTEL: vcvthf82ph ymm22, xmmword ptr [rip] +0x62,0xe5,0x7f,0x28,0x1e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvthf82ph -512(,%rbp,2), %ymm22 +# INTEL: vcvthf82ph ymm22, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x7f,0x28,0x1e,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvthf82ph 2032(%rcx), %ymm22 {%k7} {z} +# INTEL: vcvthf82ph ymm22 {k7} {z}, xmmword ptr [rcx + 2032] +0x62,0xe5,0x7f,0xaf,0x1e,0x71,0x7f + +# ATT: vcvthf82ph -2048(%rdx), %ymm22 {%k7} {z} +# INTEL: vcvthf82ph ymm22 {k7} {z}, xmmword ptr [rdx - 2048] +0x62,0xe5,0x7f,0xaf,0x1e,0x72,0x80 + +# ATT: vcvthf82ph 268435456(%rbp,%r14,8), %zmm22 +# INTEL: vcvthf82ph zmm22, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7f,0x48,0x1e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvthf82ph 291(%r8,%rax,4), %zmm22 {%k7} +# INTEL: vcvthf82ph zmm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7f,0x4f,0x1e,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvthf82ph (%rip), %zmm22 +# INTEL: vcvthf82ph zmm22, ymmword ptr [rip] +0x62,0xe5,0x7f,0x48,0x1e,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvthf82ph -1024(,%rbp,2), %zmm22 +# INTEL: vcvthf82ph zmm22, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x7f,0x48,0x1e,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvthf82ph 4064(%rcx), %zmm22 {%k7} {z} +# INTEL: vcvthf82ph zmm22 {k7} {z}, ymmword ptr [rcx + 4064] +0x62,0xe5,0x7f,0xcf,0x1e,0x71,0x7f + +# ATT: vcvthf82ph -4096(%rdx), %zmm22 {%k7} {z} +# INTEL: vcvthf82ph zmm22 {k7} {z}, ymmword ptr [rdx - 4096] +0x62,0xe5,0x7f,0xcf,0x1e,0x72,0x80 + +# 
ATT: vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 +# INTEL: vcvtne2ph2bf8 ymm22, ymm23, ymm24 +0x62,0x82,0x47,0x20,0x74,0xf0 + +# ATT: vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vcvtne2ph2bf8 ymm22 {k7}, ymm23, ymm24 +0x62,0x82,0x47,0x27,0x74,0xf0 + +# ATT: vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x82,0x47,0xa7,0x74,0xf0 + +# ATT: vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 +# INTEL: vcvtne2ph2bf8 zmm22, zmm23, zmm24 +0x62,0x82,0x47,0x40,0x74,0xf0 + +# ATT: vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vcvtne2ph2bf8 zmm22 {k7}, zmm23, zmm24 +0x62,0x82,0x47,0x47,0x74,0xf0 + +# ATT: vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x82,0x47,0xc7,0x74,0xf0 + +# ATT: vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 +# INTEL: vcvtne2ph2bf8 xmm22, xmm23, xmm24 +0x62,0x82,0x47,0x00,0x74,0xf0 + +# ATT: vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vcvtne2ph2bf8 xmm22 {k7}, xmm23, xmm24 +0x62,0x82,0x47,0x07,0x74,0xf0 + +# ATT: vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x82,0x47,0x87,0x74,0xf0 + +# ATT: vcvtne2ph2bf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vcvtne2ph2bf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x47,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2bf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vcvtne2ph2bf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x47,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2bf8 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vcvtne2ph2bf8 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe2,0x47,0x50,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtne2ph2bf8 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vcvtne2ph2bf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x47,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtne2ph2bf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8 
zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe2,0x47,0xc7,0x74,0x71,0x7f + +# ATT: vcvtne2ph2bf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe2,0x47,0xd7,0x74,0x72,0x80 + +# ATT: vcvtne2ph2bf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vcvtne2ph2bf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x47,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2bf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vcvtne2ph2bf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x47,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2bf8 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vcvtne2ph2bf8 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe2,0x47,0x30,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtne2ph2bf8 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vcvtne2ph2bf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x47,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtne2ph2bf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe2,0x47,0xa7,0x74,0x71,0x7f + +# ATT: vcvtne2ph2bf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe2,0x47,0xb7,0x74,0x72,0x80 + +# ATT: vcvtne2ph2bf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vcvtne2ph2bf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x47,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2bf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vcvtne2ph2bf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x47,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2bf8 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vcvtne2ph2bf8 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe2,0x47,0x10,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtne2ph2bf8 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vcvtne2ph2bf8 xmm22, xmm23, xmmword ptr [2*rbp 
- 512] +0x62,0xe2,0x47,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtne2ph2bf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe2,0x47,0x87,0x74,0x71,0x7f + +# ATT: vcvtne2ph2bf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe2,0x47,0x97,0x74,0x72,0x80 + +# ATT: vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 +# INTEL: vcvtne2ph2bf8s ymm22, ymm23, ymm24 +0x62,0x85,0x47,0x20,0x74,0xf0 + +# ATT: vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vcvtne2ph2bf8s ymm22 {k7}, ymm23, ymm24 +0x62,0x85,0x47,0x27,0x74,0xf0 + +# ATT: vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x47,0xa7,0x74,0xf0 + +# ATT: vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 +# INTEL: vcvtne2ph2bf8s zmm22, zmm23, zmm24 +0x62,0x85,0x47,0x40,0x74,0xf0 + +# ATT: vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vcvtne2ph2bf8s zmm22 {k7}, zmm23, zmm24 +0x62,0x85,0x47,0x47,0x74,0xf0 + +# ATT: vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x47,0xc7,0x74,0xf0 + +# ATT: vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 +# INTEL: vcvtne2ph2bf8s xmm22, xmm23, xmm24 +0x62,0x85,0x47,0x00,0x74,0xf0 + +# ATT: vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vcvtne2ph2bf8s xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x47,0x07,0x74,0xf0 + +# ATT: vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x47,0x87,0x74,0xf0 + +# ATT: vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vcvtne2ph2bf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x47,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2bf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vcvtne2ph2bf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] 
+0x62,0xc5,0x47,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2bf8s (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vcvtne2ph2bf8s zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x47,0x50,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtne2ph2bf8s -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vcvtne2ph2bf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x47,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtne2ph2bf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x47,0xc7,0x74,0x71,0x7f + +# ATT: vcvtne2ph2bf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x47,0xd7,0x74,0x72,0x80 + +# ATT: vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vcvtne2ph2bf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x47,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2bf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vcvtne2ph2bf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x47,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2bf8s (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vcvtne2ph2bf8s ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x47,0x30,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtne2ph2bf8s -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vcvtne2ph2bf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x47,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtne2ph2bf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x47,0xa7,0x74,0x71,0x7f + +# ATT: vcvtne2ph2bf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x47,0xb7,0x74,0x72,0x80 + +# ATT: vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vcvtne2ph2bf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] 
+0x62,0xa5,0x47,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2bf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vcvtne2ph2bf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x47,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2bf8s (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vcvtne2ph2bf8s xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x47,0x10,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtne2ph2bf8s -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vcvtne2ph2bf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x47,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtne2ph2bf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x47,0x87,0x74,0x71,0x7f + +# ATT: vcvtne2ph2bf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x47,0x97,0x74,0x72,0x80 + +# ATT: vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 +# INTEL: vcvtne2ph2hf8 ymm22, ymm23, ymm24 +0x62,0x85,0x47,0x20,0x18,0xf0 + +# ATT: vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vcvtne2ph2hf8 ymm22 {k7}, ymm23, ymm24 +0x62,0x85,0x47,0x27,0x18,0xf0 + +# ATT: vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x47,0xa7,0x18,0xf0 + +# ATT: vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 +# INTEL: vcvtne2ph2hf8 zmm22, zmm23, zmm24 +0x62,0x85,0x47,0x40,0x18,0xf0 + +# ATT: vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vcvtne2ph2hf8 zmm22 {k7}, zmm23, zmm24 +0x62,0x85,0x47,0x47,0x18,0xf0 + +# ATT: vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x47,0xc7,0x18,0xf0 + +# ATT: vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 +# INTEL: vcvtne2ph2hf8 xmm22, xmm23, xmm24 +0x62,0x85,0x47,0x00,0x18,0xf0 + +# ATT: vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vcvtne2ph2hf8 xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x47,0x07,0x18,0xf0 + +# ATT: 
vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x47,0x87,0x18,0xf0 + +# ATT: vcvtne2ph2hf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vcvtne2ph2hf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x47,0x40,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2hf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vcvtne2ph2hf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x47,0x47,0x18,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2hf8 (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vcvtne2ph2hf8 zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x47,0x50,0x18,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtne2ph2hf8 -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vcvtne2ph2hf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x47,0x40,0x18,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtne2ph2hf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x47,0xc7,0x18,0x71,0x7f + +# ATT: vcvtne2ph2hf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x47,0xd7,0x18,0x72,0x80 + +# ATT: vcvtne2ph2hf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vcvtne2ph2hf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x47,0x20,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2hf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vcvtne2ph2hf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x47,0x27,0x18,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2hf8 (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vcvtne2ph2hf8 ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x47,0x30,0x18,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtne2ph2hf8 -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vcvtne2ph2hf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x47,0x20,0x18,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtne2ph2hf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# 
INTEL: vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x47,0xa7,0x18,0x71,0x7f + +# ATT: vcvtne2ph2hf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x47,0xb7,0x18,0x72,0x80 + +# ATT: vcvtne2ph2hf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vcvtne2ph2hf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x47,0x00,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2hf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vcvtne2ph2hf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x47,0x07,0x18,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2hf8 (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vcvtne2ph2hf8 xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x47,0x10,0x18,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtne2ph2hf8 -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vcvtne2ph2hf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x47,0x00,0x18,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtne2ph2hf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x47,0x87,0x18,0x71,0x7f + +# ATT: vcvtne2ph2hf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x47,0x97,0x18,0x72,0x80 + +# ATT: vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 +# INTEL: vcvtne2ph2hf8s ymm22, ymm23, ymm24 +0x62,0x85,0x47,0x20,0x1b,0xf0 + +# ATT: vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} +# INTEL: vcvtne2ph2hf8s ymm22 {k7}, ymm23, ymm24 +0x62,0x85,0x47,0x27,0x1b,0xf0 + +# ATT: vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, ymm24 +0x62,0x85,0x47,0xa7,0x1b,0xf0 + +# ATT: vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 +# INTEL: vcvtne2ph2hf8s zmm22, zmm23, zmm24 +0x62,0x85,0x47,0x40,0x1b,0xf0 + +# ATT: vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} +# INTEL: vcvtne2ph2hf8s zmm22 {k7}, zmm23, zmm24 
+0x62,0x85,0x47,0x47,0x1b,0xf0 + +# ATT: vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, zmm24 +0x62,0x85,0x47,0xc7,0x1b,0xf0 + +# ATT: vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 +# INTEL: vcvtne2ph2hf8s xmm22, xmm23, xmm24 +0x62,0x85,0x47,0x00,0x1b,0xf0 + +# ATT: vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} +# INTEL: vcvtne2ph2hf8s xmm22 {k7}, xmm23, xmm24 +0x62,0x85,0x47,0x07,0x1b,0xf0 + +# ATT: vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, xmm24 +0x62,0x85,0x47,0x87,0x1b,0xf0 + +# ATT: vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 +# INTEL: vcvtne2ph2hf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x47,0x40,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2hf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +# INTEL: vcvtne2ph2hf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x47,0x47,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2hf8s (%rip){1to32}, %zmm23, %zmm22 +# INTEL: vcvtne2ph2hf8s zmm22, zmm23, word ptr [rip]{1to32} +0x62,0xe5,0x47,0x50,0x1b,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtne2ph2hf8s -2048(,%rbp,2), %zmm23, %zmm22 +# INTEL: vcvtne2ph2hf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x47,0x40,0x1b,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtne2ph2hf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +0x62,0xe5,0x47,0xc7,0x1b,0x71,0x7f + +# ATT: vcvtne2ph2hf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x47,0xd7,0x1b,0x72,0x80 + +# ATT: vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 +# INTEL: vcvtne2ph2hf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x47,0x20,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2hf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +# INTEL: vcvtne2ph2hf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] 
+0x62,0xc5,0x47,0x27,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2hf8s (%rip){1to16}, %ymm23, %ymm22 +# INTEL: vcvtne2ph2hf8s ymm22, ymm23, word ptr [rip]{1to16} +0x62,0xe5,0x47,0x30,0x1b,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtne2ph2hf8s -1024(,%rbp,2), %ymm23, %ymm22 +# INTEL: vcvtne2ph2hf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x47,0x20,0x1b,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtne2ph2hf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +0x62,0xe5,0x47,0xa7,0x1b,0x71,0x7f + +# ATT: vcvtne2ph2hf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x47,0xb7,0x1b,0x72,0x80 + +# ATT: vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 +# INTEL: vcvtne2ph2hf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x47,0x00,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtne2ph2hf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +# INTEL: vcvtne2ph2hf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x47,0x07,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtne2ph2hf8s (%rip){1to8}, %xmm23, %xmm22 +# INTEL: vcvtne2ph2hf8s xmm22, xmm23, word ptr [rip]{1to8} +0x62,0xe5,0x47,0x10,0x1b,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtne2ph2hf8s -512(,%rbp,2), %xmm23, %xmm22 +# INTEL: vcvtne2ph2hf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x47,0x00,0x1b,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtne2ph2hf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +0x62,0xe5,0x47,0x87,0x1b,0x71,0x7f + +# ATT: vcvtne2ph2hf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x47,0x97,0x1b,0x72,0x80 + +# ATT: vcvtneph2bf8 %xmm23, %xmm22 +# INTEL: vcvtneph2bf8 xmm22, xmm23 +0x62,0xa2,0x7e,0x08,0x74,0xf7 + +# ATT: vcvtneph2bf8 %xmm23, %xmm22 {%k7} +# INTEL: 
vcvtneph2bf8 xmm22 {k7}, xmm23 +0x62,0xa2,0x7e,0x0f,0x74,0xf7 + +# ATT: vcvtneph2bf8 %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtneph2bf8 xmm22 {k7} {z}, xmm23 +0x62,0xa2,0x7e,0x8f,0x74,0xf7 + +# ATT: vcvtneph2bf8 %zmm23, %ymm22 +# INTEL: vcvtneph2bf8 ymm22, zmm23 +0x62,0xa2,0x7e,0x48,0x74,0xf7 + +# ATT: vcvtneph2bf8 %zmm23, %ymm22 {%k7} +# INTEL: vcvtneph2bf8 ymm22 {k7}, zmm23 +0x62,0xa2,0x7e,0x4f,0x74,0xf7 + +# ATT: vcvtneph2bf8 %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtneph2bf8 ymm22 {k7} {z}, zmm23 +0x62,0xa2,0x7e,0xcf,0x74,0xf7 + +# ATT: vcvtneph2bf8 %ymm23, %xmm22 +# INTEL: vcvtneph2bf8 xmm22, ymm23 +0x62,0xa2,0x7e,0x28,0x74,0xf7 + +# ATT: vcvtneph2bf8 %ymm23, %xmm22 {%k7} +# INTEL: vcvtneph2bf8 xmm22 {k7}, ymm23 +0x62,0xa2,0x7e,0x2f,0x74,0xf7 + +# ATT: vcvtneph2bf8 %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtneph2bf8 xmm22 {k7} {z}, ymm23 +0x62,0xa2,0x7e,0xaf,0x74,0xf7 + +# ATT: vcvtneph2bf8x 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vcvtneph2bf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x7e,0x08,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2bf8x 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vcvtneph2bf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x7e,0x0f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2bf8 (%rip){1to8}, %xmm22 +# INTEL: vcvtneph2bf8 xmm22, word ptr [rip]{1to8} +0x62,0xe2,0x7e,0x18,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtneph2bf8x -512(,%rbp,2), %xmm22 +# INTEL: vcvtneph2bf8 xmm22, xmmword ptr [2*rbp - 512] +0x62,0xe2,0x7e,0x08,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtneph2bf8x 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvtneph2bf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +0x62,0xe2,0x7e,0x8f,0x74,0x71,0x7f + +# ATT: vcvtneph2bf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vcvtneph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0xe2,0x7e,0x9f,0x74,0x72,0x80 + +# ATT: vcvtneph2bf8 (%rip){1to16}, %xmm22 +# INTEL: vcvtneph2bf8 xmm22, word ptr [rip]{1to16} +0x62,0xe2,0x7e,0x38,0x74,0x35,0x00,0x00,0x00,0x00 + 
+# ATT: vcvtneph2bf8y -1024(,%rbp,2), %xmm22 +# INTEL: vcvtneph2bf8 xmm22, ymmword ptr [2*rbp - 1024] +0x62,0xe2,0x7e,0x28,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtneph2bf8y 4064(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvtneph2bf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] +0x62,0xe2,0x7e,0xaf,0x74,0x71,0x7f + +# ATT: vcvtneph2bf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} +# INTEL: vcvtneph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0xe2,0x7e,0xbf,0x74,0x72,0x80 + +# ATT: vcvtneph2bf8 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vcvtneph2bf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa2,0x7e,0x48,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2bf8 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vcvtneph2bf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc2,0x7e,0x4f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2bf8 (%rip){1to32}, %ymm22 +# INTEL: vcvtneph2bf8 ymm22, word ptr [rip]{1to32} +0x62,0xe2,0x7e,0x58,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtneph2bf8 -2048(,%rbp,2), %ymm22 +# INTEL: vcvtneph2bf8 ymm22, zmmword ptr [2*rbp - 2048] +0x62,0xe2,0x7e,0x48,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtneph2bf8 8128(%rcx), %ymm22 {%k7} {z} +# INTEL: vcvtneph2bf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +0x62,0xe2,0x7e,0xcf,0x74,0x71,0x7f + +# ATT: vcvtneph2bf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} +# INTEL: vcvtneph2bf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0xe2,0x7e,0xdf,0x74,0x72,0x80 + +# ATT: vcvtneph2bf8s %xmm23, %xmm22 +# INTEL: vcvtneph2bf8s xmm22, xmm23 +0x62,0xa5,0x7e,0x08,0x74,0xf7 + +# ATT: vcvtneph2bf8s %xmm23, %xmm22 {%k7} +# INTEL: vcvtneph2bf8s xmm22 {k7}, xmm23 +0x62,0xa5,0x7e,0x0f,0x74,0xf7 + +# ATT: vcvtneph2bf8s %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtneph2bf8s xmm22 {k7} {z}, xmm23 +0x62,0xa5,0x7e,0x8f,0x74,0xf7 + +# ATT: vcvtneph2bf8s %zmm23, %ymm22 +# INTEL: vcvtneph2bf8s ymm22, zmm23 +0x62,0xa5,0x7e,0x48,0x74,0xf7 + +# ATT: vcvtneph2bf8s %zmm23, %ymm22 {%k7} +# INTEL: vcvtneph2bf8s ymm22 {k7}, zmm23 
+0x62,0xa5,0x7e,0x4f,0x74,0xf7 + +# ATT: vcvtneph2bf8s %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtneph2bf8s ymm22 {k7} {z}, zmm23 +0x62,0xa5,0x7e,0xcf,0x74,0xf7 + +# ATT: vcvtneph2bf8s %ymm23, %xmm22 +# INTEL: vcvtneph2bf8s xmm22, ymm23 +0x62,0xa5,0x7e,0x28,0x74,0xf7 + +# ATT: vcvtneph2bf8s %ymm23, %xmm22 {%k7} +# INTEL: vcvtneph2bf8s xmm22 {k7}, ymm23 +0x62,0xa5,0x7e,0x2f,0x74,0xf7 + +# ATT: vcvtneph2bf8s %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtneph2bf8s xmm22 {k7} {z}, ymm23 +0x62,0xa5,0x7e,0xaf,0x74,0xf7 + +# ATT: vcvtneph2bf8sx 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vcvtneph2bf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7e,0x08,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2bf8sx 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vcvtneph2bf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7e,0x0f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2bf8s (%rip){1to8}, %xmm22 +# INTEL: vcvtneph2bf8s xmm22, word ptr [rip]{1to8} +0x62,0xe5,0x7e,0x18,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtneph2bf8sx -512(,%rbp,2), %xmm22 +# INTEL: vcvtneph2bf8s xmm22, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x7e,0x08,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtneph2bf8sx 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvtneph2bf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +0x62,0xe5,0x7e,0x8f,0x74,0x71,0x7f + +# ATT: vcvtneph2bf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vcvtneph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x7e,0x9f,0x74,0x72,0x80 + +# ATT: vcvtneph2bf8s (%rip){1to16}, %xmm22 +# INTEL: vcvtneph2bf8s xmm22, word ptr [rip]{1to16} +0x62,0xe5,0x7e,0x38,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtneph2bf8sy -1024(,%rbp,2), %xmm22 +# INTEL: vcvtneph2bf8s xmm22, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x7e,0x28,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtneph2bf8sy 4064(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvtneph2bf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] +0x62,0xe5,0x7e,0xaf,0x74,0x71,0x7f + +# ATT: vcvtneph2bf8s 
-256(%rdx){1to16}, %xmm22 {%k7} {z} +# INTEL: vcvtneph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x7e,0xbf,0x74,0x72,0x80 + +# ATT: vcvtneph2bf8s 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vcvtneph2bf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7e,0x48,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2bf8s 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vcvtneph2bf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7e,0x4f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2bf8s (%rip){1to32}, %ymm22 +# INTEL: vcvtneph2bf8s ymm22, word ptr [rip]{1to32} +0x62,0xe5,0x7e,0x58,0x74,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtneph2bf8s -2048(,%rbp,2), %ymm22 +# INTEL: vcvtneph2bf8s ymm22, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x7e,0x48,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtneph2bf8s 8128(%rcx), %ymm22 {%k7} {z} +# INTEL: vcvtneph2bf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +0x62,0xe5,0x7e,0xcf,0x74,0x71,0x7f + +# ATT: vcvtneph2bf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} +# INTEL: vcvtneph2bf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x7e,0xdf,0x74,0x72,0x80 + +# ATT: vcvtneph2hf8 %xmm23, %xmm22 +# INTEL: vcvtneph2hf8 xmm22, xmm23 +0x62,0xa5,0x7e,0x08,0x18,0xf7 + +# ATT: vcvtneph2hf8 %xmm23, %xmm22 {%k7} +# INTEL: vcvtneph2hf8 xmm22 {k7}, xmm23 +0x62,0xa5,0x7e,0x0f,0x18,0xf7 + +# ATT: vcvtneph2hf8 %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtneph2hf8 xmm22 {k7} {z}, xmm23 +0x62,0xa5,0x7e,0x8f,0x18,0xf7 + +# ATT: vcvtneph2hf8 %zmm23, %ymm22 +# INTEL: vcvtneph2hf8 ymm22, zmm23 +0x62,0xa5,0x7e,0x48,0x18,0xf7 + +# ATT: vcvtneph2hf8 %zmm23, %ymm22 {%k7} +# INTEL: vcvtneph2hf8 ymm22 {k7}, zmm23 +0x62,0xa5,0x7e,0x4f,0x18,0xf7 + +# ATT: vcvtneph2hf8 %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtneph2hf8 ymm22 {k7} {z}, zmm23 +0x62,0xa5,0x7e,0xcf,0x18,0xf7 + +# ATT: vcvtneph2hf8 %ymm23, %xmm22 +# INTEL: vcvtneph2hf8 xmm22, ymm23 +0x62,0xa5,0x7e,0x28,0x18,0xf7 + +# ATT: vcvtneph2hf8 %ymm23, %xmm22 {%k7} +# INTEL: vcvtneph2hf8 xmm22 {k7}, 
ymm23 +0x62,0xa5,0x7e,0x2f,0x18,0xf7 + +# ATT: vcvtneph2hf8 %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtneph2hf8 xmm22 {k7} {z}, ymm23 +0x62,0xa5,0x7e,0xaf,0x18,0xf7 + +# ATT: vcvtneph2hf8x 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vcvtneph2hf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7e,0x08,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2hf8x 291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vcvtneph2hf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7e,0x0f,0x18,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2hf8 (%rip){1to8}, %xmm22 +# INTEL: vcvtneph2hf8 xmm22, word ptr [rip]{1to8} +0x62,0xe5,0x7e,0x18,0x18,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtneph2hf8x -512(,%rbp,2), %xmm22 +# INTEL: vcvtneph2hf8 xmm22, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x7e,0x08,0x18,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtneph2hf8x 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvtneph2hf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +0x62,0xe5,0x7e,0x8f,0x18,0x71,0x7f + +# ATT: vcvtneph2hf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vcvtneph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x7e,0x9f,0x18,0x72,0x80 + +# ATT: vcvtneph2hf8 (%rip){1to16}, %xmm22 +# INTEL: vcvtneph2hf8 xmm22, word ptr [rip]{1to16} +0x62,0xe5,0x7e,0x38,0x18,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtneph2hf8y -1024(,%rbp,2), %xmm22 +# INTEL: vcvtneph2hf8 xmm22, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x7e,0x28,0x18,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtneph2hf8y 4064(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvtneph2hf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] +0x62,0xe5,0x7e,0xaf,0x18,0x71,0x7f + +# ATT: vcvtneph2hf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} +# INTEL: vcvtneph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x7e,0xbf,0x18,0x72,0x80 + +# ATT: vcvtneph2hf8 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vcvtneph2hf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7e,0x48,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2hf8 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: 
vcvtneph2hf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7e,0x4f,0x18,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2hf8 (%rip){1to32}, %ymm22 +# INTEL: vcvtneph2hf8 ymm22, word ptr [rip]{1to32} +0x62,0xe5,0x7e,0x58,0x18,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtneph2hf8 -2048(,%rbp,2), %ymm22 +# INTEL: vcvtneph2hf8 ymm22, zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x7e,0x48,0x18,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtneph2hf8 8128(%rcx), %ymm22 {%k7} {z} +# INTEL: vcvtneph2hf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +0x62,0xe5,0x7e,0xcf,0x18,0x71,0x7f + +# ATT: vcvtneph2hf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} +# INTEL: vcvtneph2hf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x7e,0xdf,0x18,0x72,0x80 + +# ATT: vcvtneph2hf8s %xmm23, %xmm22 +# INTEL: vcvtneph2hf8s xmm22, xmm23 +0x62,0xa5,0x7e,0x08,0x1b,0xf7 + +# ATT: vcvtneph2hf8s %xmm23, %xmm22 {%k7} +# INTEL: vcvtneph2hf8s xmm22 {k7}, xmm23 +0x62,0xa5,0x7e,0x0f,0x1b,0xf7 + +# ATT: vcvtneph2hf8s %xmm23, %xmm22 {%k7} {z} +# INTEL: vcvtneph2hf8s xmm22 {k7} {z}, xmm23 +0x62,0xa5,0x7e,0x8f,0x1b,0xf7 + +# ATT: vcvtneph2hf8s %zmm23, %ymm22 +# INTEL: vcvtneph2hf8s ymm22, zmm23 +0x62,0xa5,0x7e,0x48,0x1b,0xf7 + +# ATT: vcvtneph2hf8s %zmm23, %ymm22 {%k7} +# INTEL: vcvtneph2hf8s ymm22 {k7}, zmm23 +0x62,0xa5,0x7e,0x4f,0x1b,0xf7 + +# ATT: vcvtneph2hf8s %zmm23, %ymm22 {%k7} {z} +# INTEL: vcvtneph2hf8s ymm22 {k7} {z}, zmm23 +0x62,0xa5,0x7e,0xcf,0x1b,0xf7 + +# ATT: vcvtneph2hf8s %ymm23, %xmm22 +# INTEL: vcvtneph2hf8s xmm22, ymm23 +0x62,0xa5,0x7e,0x28,0x1b,0xf7 + +# ATT: vcvtneph2hf8s %ymm23, %xmm22 {%k7} +# INTEL: vcvtneph2hf8s xmm22 {k7}, ymm23 +0x62,0xa5,0x7e,0x2f,0x1b,0xf7 + +# ATT: vcvtneph2hf8s %ymm23, %xmm22 {%k7} {z} +# INTEL: vcvtneph2hf8s xmm22 {k7} {z}, ymm23 +0x62,0xa5,0x7e,0xaf,0x1b,0xf7 + +# ATT: vcvtneph2hf8sx 268435456(%rbp,%r14,8), %xmm22 +# INTEL: vcvtneph2hf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7e,0x08,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2hf8sx 
291(%r8,%rax,4), %xmm22 {%k7} +# INTEL: vcvtneph2hf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7e,0x0f,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2hf8s (%rip){1to8}, %xmm22 +# INTEL: vcvtneph2hf8s xmm22, word ptr [rip]{1to8} +0x62,0xe5,0x7e,0x18,0x1b,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtneph2hf8sx -512(,%rbp,2), %xmm22 +# INTEL: vcvtneph2hf8s xmm22, xmmword ptr [2*rbp - 512] +0x62,0xe5,0x7e,0x08,0x1b,0x34,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vcvtneph2hf8sx 2032(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvtneph2hf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +0x62,0xe5,0x7e,0x8f,0x1b,0x71,0x7f + +# ATT: vcvtneph2hf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} +# INTEL: vcvtneph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +0x62,0xe5,0x7e,0x9f,0x1b,0x72,0x80 + +# ATT: vcvtneph2hf8s (%rip){1to16}, %xmm22 +# INTEL: vcvtneph2hf8s xmm22, word ptr [rip]{1to16} +0x62,0xe5,0x7e,0x38,0x1b,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtneph2hf8sy -1024(,%rbp,2), %xmm22 +# INTEL: vcvtneph2hf8s xmm22, ymmword ptr [2*rbp - 1024] +0x62,0xe5,0x7e,0x28,0x1b,0x34,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vcvtneph2hf8sy 4064(%rcx), %xmm22 {%k7} {z} +# INTEL: vcvtneph2hf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] +0x62,0xe5,0x7e,0xaf,0x1b,0x71,0x7f + +# ATT: vcvtneph2hf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} +# INTEL: vcvtneph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +0x62,0xe5,0x7e,0xbf,0x1b,0x72,0x80 + +# ATT: vcvtneph2hf8s 268435456(%rbp,%r14,8), %ymm22 +# INTEL: vcvtneph2hf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0xa5,0x7e,0x48,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcvtneph2hf8s 291(%r8,%rax,4), %ymm22 {%k7} +# INTEL: vcvtneph2hf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +0x62,0xc5,0x7e,0x4f,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vcvtneph2hf8s (%rip){1to32}, %ymm22 +# INTEL: vcvtneph2hf8s ymm22, word ptr [rip]{1to32} +0x62,0xe5,0x7e,0x58,0x1b,0x35,0x00,0x00,0x00,0x00 + +# ATT: vcvtneph2hf8s -2048(,%rbp,2), %ymm22 +# INTEL: vcvtneph2hf8s ymm22, 
zmmword ptr [2*rbp - 2048] +0x62,0xe5,0x7e,0x48,0x1b,0x34,0x6d,0x00,0xf8,0xff,0xff + +# ATT: vcvtneph2hf8s 8128(%rcx), %ymm22 {%k7} {z} +# INTEL: vcvtneph2hf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +0x62,0xe5,0x7e,0xcf,0x1b,0x71,0x7f + +# ATT: vcvtneph2hf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} +# INTEL: vcvtneph2hf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0xe5,0x7e,0xdf,0x1b,0x72,0x80 + diff --git a/llvm/test/MC/X86/avx10.2convert-32-att.s b/llvm/test/MC/X86/avx10.2convert-32-att.s new file mode 100644 index 00000000000000..beb48245578010 --- /dev/null +++ b/llvm/test/MC/X86/avx10.2convert-32-att.s @@ -0,0 +1,1490 @@ +// RUN: llvm-mc -triple i386 --show-encoding %s | FileCheck %s + +// CHECK: vcvt2ps2phx %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf2,0x65,0x28,0x67,0xd4] + vcvt2ps2phx %ymm4, %ymm3, %ymm2 + +// CHECK: vcvt2ps2phx {rn-sae}, %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf2,0x61,0x18,0x67,0xd4] + vcvt2ps2phx {rn-sae}, %ymm4, %ymm3, %ymm2 + +// CHECK: vcvt2ps2phx %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0x67,0xd4] + vcvt2ps2phx %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vcvt2ps2phx {rz-sae}, %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x61,0xff,0x67,0xd4] + vcvt2ps2phx {rz-sae}, %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vcvt2ps2phx %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf2,0x65,0x48,0x67,0xd4] + vcvt2ps2phx %zmm4, %zmm3, %zmm2 + +// CHECK: vcvt2ps2phx {rn-sae}, %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf2,0x65,0x18,0x67,0xd4] + vcvt2ps2phx {rn-sae}, %zmm4, %zmm3, %zmm2 + +// CHECK: vcvt2ps2phx %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x65,0x4f,0x67,0xd4] + vcvt2ps2phx %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vcvt2ps2phx {rz-sae}, %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x65,0xff,0x67,0xd4] + vcvt2ps2phx {rz-sae}, %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vcvt2ps2phx %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x65,0x08,0x67,0xd4] 
+ vcvt2ps2phx %xmm4, %xmm3, %xmm2 + +// CHECK: vcvt2ps2phx %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0x67,0xd4] + vcvt2ps2phx %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vcvt2ps2phx %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0x67,0xd4] + vcvt2ps2phx %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvt2ps2phx 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf2,0x65,0x48,0x67,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvt2ps2phx 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vcvt2ps2phx 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x65,0x4f,0x67,0x94,0x87,0x23,0x01,0x00,0x00] + vcvt2ps2phx 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vcvt2ps2phx (%eax){1to16}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf2,0x65,0x58,0x67,0x10] + vcvt2ps2phx (%eax){1to16}, %zmm3, %zmm2 + +// CHECK: vcvt2ps2phx -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf2,0x65,0x48,0x67,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvt2ps2phx -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vcvt2ps2phx 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x65,0xcf,0x67,0x51,0x7f] + vcvt2ps2phx 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vcvt2ps2phx -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x65,0xdf,0x67,0x52,0x80] + vcvt2ps2phx -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vcvt2ps2phx 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf2,0x65,0x28,0x67,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvt2ps2phx 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vcvt2ps2phx 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0x67,0x94,0x87,0x23,0x01,0x00,0x00] + vcvt2ps2phx 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vcvt2ps2phx (%eax){1to8}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf2,0x65,0x38,0x67,0x10] + vcvt2ps2phx (%eax){1to8}, %ymm3, %ymm2 + +// CHECK: vcvt2ps2phx -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: 
[0x62,0xf2,0x65,0x28,0x67,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvt2ps2phx -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vcvt2ps2phx 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x65,0xaf,0x67,0x51,0x7f] + vcvt2ps2phx 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vcvt2ps2phx -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x65,0xbf,0x67,0x52,0x80] + vcvt2ps2phx -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vcvt2ps2phx 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x65,0x08,0x67,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvt2ps2phx 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vcvt2ps2phx 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0x67,0x94,0x87,0x23,0x01,0x00,0x00] + vcvt2ps2phx 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vcvt2ps2phx (%eax){1to4}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x65,0x18,0x67,0x10] + vcvt2ps2phx (%eax){1to4}, %xmm3, %xmm2 + +// CHECK: vcvt2ps2phx -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x65,0x08,0x67,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvt2ps2phx -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vcvt2ps2phx 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0x67,0x51,0x7f] + vcvt2ps2phx 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvt2ps2phx -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x65,0x9f,0x67,0x52,0x80] + vcvt2ps2phx -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 %zmm4, %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x74,0xd4] + vcvtbiasph2bf8 %zmm4, %zmm3, %ymm2 + +// CHECK: vcvtbiasph2bf8 %zmm4, %zmm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x74,0xd4] + vcvtbiasph2bf8 %zmm4, %zmm3, %ymm2 {%k7} + +// CHECK: vcvtbiasph2bf8 %zmm4, %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x74,0xd4] + vcvtbiasph2bf8 %zmm4, %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 %xmm4, %xmm3, %xmm2 +// CHECK: 
encoding: [0x62,0xf2,0x64,0x08,0x74,0xd4] + vcvtbiasph2bf8 %xmm4, %xmm3, %xmm2 + +// CHECK: vcvtbiasph2bf8 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x74,0xd4] + vcvtbiasph2bf8 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vcvtbiasph2bf8 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x74,0xd4] + vcvtbiasph2bf8 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 %ymm4, %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x64,0x28,0x74,0xd4] + vcvtbiasph2bf8 %ymm4, %ymm3, %xmm2 + +// CHECK: vcvtbiasph2bf8 %ymm4, %ymm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x74,0xd4] + vcvtbiasph2bf8 %ymm4, %ymm3, %xmm2 {%k7} + +// CHECK: vcvtbiasph2bf8 %ymm4, %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x74,0xd4] + vcvtbiasph2bf8 %ymm4, %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 268435456(%esp,%esi,8), %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x64,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8 268435456(%esp,%esi,8), %ymm3, %xmm2 + +// CHECK: vcvtbiasph2bf8 291(%edi,%eax,4), %ymm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8 291(%edi,%eax,4), %ymm3, %xmm2 {%k7} + +// CHECK: vcvtbiasph2bf8 (%eax){1to16}, %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x64,0x38,0x74,0x10] + vcvtbiasph2bf8 (%eax){1to16}, %ymm3, %xmm2 + +// CHECK: vcvtbiasph2bf8 -1024(,%ebp,2), %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x64,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2bf8 -1024(,%ebp,2), %ymm3, %xmm2 + +// CHECK: vcvtbiasph2bf8 4064(%ecx), %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x74,0x51,0x7f] + vcvtbiasph2bf8 4064(%ecx), %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 -256(%edx){1to16}, %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0x74,0x52,0x80] + vcvtbiasph2bf8 -256(%edx){1to16}, %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 268435456(%esp,%esi,8), %zmm3, %ymm2 +// CHECK: 
encoding: [0x62,0xf2,0x64,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8 268435456(%esp,%esi,8), %zmm3, %ymm2 + +// CHECK: vcvtbiasph2bf8 291(%edi,%eax,4), %zmm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8 291(%edi,%eax,4), %zmm3, %ymm2 {%k7} + +// CHECK: vcvtbiasph2bf8 (%eax){1to32}, %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf2,0x64,0x58,0x74,0x10] + vcvtbiasph2bf8 (%eax){1to32}, %zmm3, %ymm2 + +// CHECK: vcvtbiasph2bf8 -2048(,%ebp,2), %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2bf8 -2048(,%ebp,2), %zmm3, %ymm2 + +// CHECK: vcvtbiasph2bf8 8128(%ecx), %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x74,0x51,0x7f] + vcvtbiasph2bf8 8128(%ecx), %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 -256(%edx){1to32}, %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0x74,0x52,0x80] + vcvtbiasph2bf8 -256(%edx){1to32}, %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x64,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vcvtbiasph2bf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vcvtbiasph2bf8 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x64,0x18,0x74,0x10] + vcvtbiasph2bf8 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vcvtbiasph2bf8 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x64,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2bf8 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vcvtbiasph2bf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x74,0x51,0x7f] + vcvtbiasph2bf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: 
encoding: [0x62,0xf2,0x64,0x9f,0x74,0x52,0x80] + vcvtbiasph2bf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s %zmm4, %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x74,0xd4] + vcvtbiasph2bf8s %zmm4, %zmm3, %ymm2 + +// CHECK: vcvtbiasph2bf8s %zmm4, %zmm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x4f,0x74,0xd4] + vcvtbiasph2bf8s %zmm4, %zmm3, %ymm2 {%k7} + +// CHECK: vcvtbiasph2bf8s %zmm4, %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xcf,0x74,0xd4] + vcvtbiasph2bf8s %zmm4, %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x74,0xd4] + vcvtbiasph2bf8s %xmm4, %xmm3, %xmm2 + +// CHECK: vcvtbiasph2bf8s %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x0f,0x74,0xd4] + vcvtbiasph2bf8s %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vcvtbiasph2bf8s %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0x8f,0x74,0xd4] + vcvtbiasph2bf8s %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s %ymm4, %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x74,0xd4] + vcvtbiasph2bf8s %ymm4, %ymm3, %xmm2 + +// CHECK: vcvtbiasph2bf8s %ymm4, %ymm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x2f,0x74,0xd4] + vcvtbiasph2bf8s %ymm4, %ymm3, %xmm2 {%k7} + +// CHECK: vcvtbiasph2bf8s %ymm4, %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xaf,0x74,0xd4] + vcvtbiasph2bf8s %ymm4, %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s 268435456(%esp,%esi,8), %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8s 268435456(%esp,%esi,8), %ymm3, %xmm2 + +// CHECK: vcvtbiasph2bf8s 291(%edi,%eax,4), %ymm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8s 291(%edi,%eax,4), %ymm3, %xmm2 {%k7} + +// CHECK: vcvtbiasph2bf8s (%eax){1to16}, %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x38,0x74,0x10] + vcvtbiasph2bf8s 
(%eax){1to16}, %ymm3, %xmm2 + +// CHECK: vcvtbiasph2bf8s -1024(,%ebp,2), %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2bf8s -1024(,%ebp,2), %ymm3, %xmm2 + +// CHECK: vcvtbiasph2bf8s 4064(%ecx), %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xaf,0x74,0x51,0x7f] + vcvtbiasph2bf8s 4064(%ecx), %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s -256(%edx){1to16}, %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xbf,0x74,0x52,0x80] + vcvtbiasph2bf8s -256(%edx){1to16}, %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s 268435456(%esp,%esi,8), %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8s 268435456(%esp,%esi,8), %zmm3, %ymm2 + +// CHECK: vcvtbiasph2bf8s 291(%edi,%eax,4), %zmm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8s 291(%edi,%eax,4), %zmm3, %ymm2 {%k7} + +// CHECK: vcvtbiasph2bf8s (%eax){1to32}, %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x58,0x74,0x10] + vcvtbiasph2bf8s (%eax){1to32}, %zmm3, %ymm2 + +// CHECK: vcvtbiasph2bf8s -2048(,%ebp,2), %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2bf8s -2048(,%ebp,2), %zmm3, %ymm2 + +// CHECK: vcvtbiasph2bf8s 8128(%ecx), %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xcf,0x74,0x51,0x7f] + vcvtbiasph2bf8s 8128(%ecx), %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s -256(%edx){1to32}, %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xdf,0x74,0x52,0x80] + vcvtbiasph2bf8s -256(%edx){1to32}, %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vcvtbiasph2bf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: 
[0x62,0xf5,0x64,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vcvtbiasph2bf8s (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x18,0x74,0x10] + vcvtbiasph2bf8s (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vcvtbiasph2bf8s -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2bf8s -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vcvtbiasph2bf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0x8f,0x74,0x51,0x7f] + vcvtbiasph2bf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0x9f,0x74,0x52,0x80] + vcvtbiasph2bf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 %zmm4, %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x18,0xd4] + vcvtbiasph2hf8 %zmm4, %zmm3, %ymm2 + +// CHECK: vcvtbiasph2hf8 %zmm4, %zmm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x4f,0x18,0xd4] + vcvtbiasph2hf8 %zmm4, %zmm3, %ymm2 {%k7} + +// CHECK: vcvtbiasph2hf8 %zmm4, %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xcf,0x18,0xd4] + vcvtbiasph2hf8 %zmm4, %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x18,0xd4] + vcvtbiasph2hf8 %xmm4, %xmm3, %xmm2 + +// CHECK: vcvtbiasph2hf8 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x0f,0x18,0xd4] + vcvtbiasph2hf8 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vcvtbiasph2hf8 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0x8f,0x18,0xd4] + vcvtbiasph2hf8 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 %ymm4, %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x18,0xd4] + vcvtbiasph2hf8 %ymm4, %ymm3, %xmm2 + +// CHECK: vcvtbiasph2hf8 %ymm4, %ymm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x2f,0x18,0xd4] + vcvtbiasph2hf8 %ymm4, %ymm3, %xmm2 {%k7} + +// 
CHECK: vcvtbiasph2hf8 %ymm4, %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xaf,0x18,0xd4] + vcvtbiasph2hf8 %ymm4, %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 268435456(%esp,%esi,8), %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8 268435456(%esp,%esi,8), %ymm3, %xmm2 + +// CHECK: vcvtbiasph2hf8 291(%edi,%eax,4), %ymm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x2f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8 291(%edi,%eax,4), %ymm3, %xmm2 {%k7} + +// CHECK: vcvtbiasph2hf8 (%eax){1to16}, %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x38,0x18,0x10] + vcvtbiasph2hf8 (%eax){1to16}, %ymm3, %xmm2 + +// CHECK: vcvtbiasph2hf8 -1024(,%ebp,2), %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x18,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2hf8 -1024(,%ebp,2), %ymm3, %xmm2 + +// CHECK: vcvtbiasph2hf8 4064(%ecx), %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xaf,0x18,0x51,0x7f] + vcvtbiasph2hf8 4064(%ecx), %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 -256(%edx){1to16}, %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xbf,0x18,0x52,0x80] + vcvtbiasph2hf8 -256(%edx){1to16}, %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 268435456(%esp,%esi,8), %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8 268435456(%esp,%esi,8), %zmm3, %ymm2 + +// CHECK: vcvtbiasph2hf8 291(%edi,%eax,4), %zmm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x4f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8 291(%edi,%eax,4), %zmm3, %ymm2 {%k7} + +// CHECK: vcvtbiasph2hf8 (%eax){1to32}, %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x58,0x18,0x10] + vcvtbiasph2hf8 (%eax){1to32}, %zmm3, %ymm2 + +// CHECK: vcvtbiasph2hf8 -2048(,%ebp,2), %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x18,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2hf8 -2048(,%ebp,2), %zmm3, %ymm2 + +// CHECK: vcvtbiasph2hf8 
8128(%ecx), %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xcf,0x18,0x51,0x7f] + vcvtbiasph2hf8 8128(%ecx), %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 -256(%edx){1to32}, %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xdf,0x18,0x52,0x80] + vcvtbiasph2hf8 -256(%edx){1to32}, %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vcvtbiasph2hf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x0f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vcvtbiasph2hf8 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x18,0x18,0x10] + vcvtbiasph2hf8 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vcvtbiasph2hf8 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x18,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2hf8 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vcvtbiasph2hf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0x8f,0x18,0x51,0x7f] + vcvtbiasph2hf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0x9f,0x18,0x52,0x80] + vcvtbiasph2hf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s %zmm4, %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x1b,0xd4] + vcvtbiasph2hf8s %zmm4, %zmm3, %ymm2 + +// CHECK: vcvtbiasph2hf8s %zmm4, %zmm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x4f,0x1b,0xd4] + vcvtbiasph2hf8s %zmm4, %zmm3, %ymm2 {%k7} + +// CHECK: vcvtbiasph2hf8s %zmm4, %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xcf,0x1b,0xd4] + vcvtbiasph2hf8s %zmm4, %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x1b,0xd4] + vcvtbiasph2hf8s %xmm4, 
%xmm3, %xmm2 + +// CHECK: vcvtbiasph2hf8s %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x0f,0x1b,0xd4] + vcvtbiasph2hf8s %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vcvtbiasph2hf8s %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0x8f,0x1b,0xd4] + vcvtbiasph2hf8s %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s %ymm4, %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x1b,0xd4] + vcvtbiasph2hf8s %ymm4, %ymm3, %xmm2 + +// CHECK: vcvtbiasph2hf8s %ymm4, %ymm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x2f,0x1b,0xd4] + vcvtbiasph2hf8s %ymm4, %ymm3, %xmm2 {%k7} + +// CHECK: vcvtbiasph2hf8s %ymm4, %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xaf,0x1b,0xd4] + vcvtbiasph2hf8s %ymm4, %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s 268435456(%esp,%esi,8), %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8s 268435456(%esp,%esi,8), %ymm3, %xmm2 + +// CHECK: vcvtbiasph2hf8s 291(%edi,%eax,4), %ymm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x2f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8s 291(%edi,%eax,4), %ymm3, %xmm2 {%k7} + +// CHECK: vcvtbiasph2hf8s (%eax){1to16}, %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x38,0x1b,0x10] + vcvtbiasph2hf8s (%eax){1to16}, %ymm3, %xmm2 + +// CHECK: vcvtbiasph2hf8s -1024(,%ebp,2), %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x1b,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2hf8s -1024(,%ebp,2), %ymm3, %xmm2 + +// CHECK: vcvtbiasph2hf8s 4064(%ecx), %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xaf,0x1b,0x51,0x7f] + vcvtbiasph2hf8s 4064(%ecx), %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s -256(%edx){1to16}, %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xbf,0x1b,0x52,0x80] + vcvtbiasph2hf8s -256(%edx){1to16}, %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s 268435456(%esp,%esi,8), %zmm3, %ymm2 +// CHECK: encoding: 
[0x62,0xf5,0x64,0x48,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8s 268435456(%esp,%esi,8), %zmm3, %ymm2 + +// CHECK: vcvtbiasph2hf8s 291(%edi,%eax,4), %zmm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x4f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8s 291(%edi,%eax,4), %zmm3, %ymm2 {%k7} + +// CHECK: vcvtbiasph2hf8s (%eax){1to32}, %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x58,0x1b,0x10] + vcvtbiasph2hf8s (%eax){1to32}, %zmm3, %ymm2 + +// CHECK: vcvtbiasph2hf8s -2048(,%ebp,2), %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x1b,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2hf8s -2048(,%ebp,2), %zmm3, %ymm2 + +// CHECK: vcvtbiasph2hf8s 8128(%ecx), %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xcf,0x1b,0x51,0x7f] + vcvtbiasph2hf8s 8128(%ecx), %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s -256(%edx){1to32}, %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0xdf,0x1b,0x52,0x80] + vcvtbiasph2hf8s -256(%edx){1to32}, %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vcvtbiasph2hf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x64,0x0f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vcvtbiasph2hf8s (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x18,0x1b,0x10] + vcvtbiasph2hf8s (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vcvtbiasph2hf8s -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x1b,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2hf8s -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vcvtbiasph2hf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x64,0x8f,0x1b,0x51,0x7f] + vcvtbiasph2hf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// 
CHECK: encoding: [0x62,0xf5,0x64,0x9f,0x1b,0x52,0x80] + vcvtbiasph2hf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvthf82ph %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x1e,0xd3] + vcvthf82ph %xmm3, %xmm2 + +// CHECK: vcvthf82ph %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7f,0x0f,0x1e,0xd3] + vcvthf82ph %xmm3, %xmm2 {%k7} + +// CHECK: vcvthf82ph %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7f,0x8f,0x1e,0xd3] + vcvthf82ph %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvthf82ph %xmm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x28,0x1e,0xd3] + vcvthf82ph %xmm3, %ymm2 + +// CHECK: vcvthf82ph %xmm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7f,0x2f,0x1e,0xd3] + vcvthf82ph %xmm3, %ymm2 {%k7} + +// CHECK: vcvthf82ph %xmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7f,0xaf,0x1e,0xd3] + vcvthf82ph %xmm3, %ymm2 {%k7} {z} + +// CHECK: vcvthf82ph %ymm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x48,0x1e,0xd3] + vcvthf82ph %ymm3, %zmm2 + +// CHECK: vcvthf82ph %ymm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7f,0x4f,0x1e,0xd3] + vcvthf82ph %ymm3, %zmm2 {%k7} + +// CHECK: vcvthf82ph %ymm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7f,0xcf,0x1e,0xd3] + vcvthf82ph %ymm3, %zmm2 {%k7} {z} + +// CHECK: vcvthf82ph 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x1e,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvthf82ph 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vcvthf82ph 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7f,0x0f,0x1e,0x94,0x87,0x23,0x01,0x00,0x00] + vcvthf82ph 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vcvthf82ph (%eax), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x1e,0x10] + vcvthf82ph (%eax), %xmm2 + +// CHECK: vcvthf82ph -256(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x1e,0x14,0x6d,0x00,0xff,0xff,0xff] + vcvthf82ph -256(,%ebp,2), %xmm2 + +// CHECK: vcvthf82ph 1016(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7f,0x8f,0x1e,0x51,0x7f] + 
vcvthf82ph 1016(%ecx), %xmm2 {%k7} {z} + +// CHECK: vcvthf82ph -1024(%edx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7f,0x8f,0x1e,0x52,0x80] + vcvthf82ph -1024(%edx), %xmm2 {%k7} {z} + +// CHECK: vcvthf82ph 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x28,0x1e,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvthf82ph 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vcvthf82ph 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7f,0x2f,0x1e,0x94,0x87,0x23,0x01,0x00,0x00] + vcvthf82ph 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vcvthf82ph (%eax), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x28,0x1e,0x10] + vcvthf82ph (%eax), %ymm2 + +// CHECK: vcvthf82ph -512(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x28,0x1e,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvthf82ph -512(,%ebp,2), %ymm2 + +// CHECK: vcvthf82ph 2032(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7f,0xaf,0x1e,0x51,0x7f] + vcvthf82ph 2032(%ecx), %ymm2 {%k7} {z} + +// CHECK: vcvthf82ph -2048(%edx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7f,0xaf,0x1e,0x52,0x80] + vcvthf82ph -2048(%edx), %ymm2 {%k7} {z} + +// CHECK: vcvthf82ph 268435456(%esp,%esi,8), %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x48,0x1e,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvthf82ph 268435456(%esp,%esi,8), %zmm2 + +// CHECK: vcvthf82ph 291(%edi,%eax,4), %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7f,0x4f,0x1e,0x94,0x87,0x23,0x01,0x00,0x00] + vcvthf82ph 291(%edi,%eax,4), %zmm2 {%k7} + +// CHECK: vcvthf82ph (%eax), %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x48,0x1e,0x10] + vcvthf82ph (%eax), %zmm2 + +// CHECK: vcvthf82ph -1024(,%ebp,2), %zmm2 +// CHECK: encoding: [0x62,0xf5,0x7f,0x48,0x1e,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvthf82ph -1024(,%ebp,2), %zmm2 + +// CHECK: vcvthf82ph 4064(%ecx), %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7f,0xcf,0x1e,0x51,0x7f] + vcvthf82ph 4064(%ecx), %zmm2 {%k7} {z} + +// CHECK: vcvthf82ph -4096(%edx), %zmm2 {%k7} {z} +// CHECK: encoding: 
[0x62,0xf5,0x7f,0xcf,0x1e,0x52,0x80] + vcvthf82ph -4096(%edx), %zmm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf2,0x67,0x28,0x74,0xd4] + vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 + +// CHECK: vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x74,0xd4] + vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x74,0xd4] + vcvtne2ph2bf8 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x74,0xd4] + vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 + +// CHECK: vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x74,0xd4] + vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x74,0xd4] + vcvtne2ph2bf8 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x67,0x08,0x74,0xd4] + vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 + +// CHECK: vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x74,0xd4] + vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x74,0xd4] + vcvtne2ph2bf8 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vcvtne2ph2bf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vcvtne2ph2bf8 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf2,0x67,0x58,0x74,0x10] + vcvtne2ph2bf8 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vcvtne2ph2bf8 -2048(,%ebp,2), %zmm3, %zmm2 
+// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2bf8 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vcvtne2ph2bf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x74,0x51,0x7f] + vcvtne2ph2bf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x67,0xdf,0x74,0x52,0x80] + vcvtne2ph2bf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf2,0x67,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vcvtne2ph2bf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vcvtne2ph2bf8 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf2,0x67,0x38,0x74,0x10] + vcvtne2ph2bf8 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vcvtne2ph2bf8 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf2,0x67,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2bf8 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vcvtne2ph2bf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x74,0x51,0x7f] + vcvtne2ph2bf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x67,0xbf,0x74,0x52,0x80] + vcvtne2ph2bf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x67,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vcvtne2ph2bf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vcvtne2ph2bf8 (%eax){1to8}, %xmm3, %xmm2 
+// CHECK: encoding: [0x62,0xf2,0x67,0x18,0x74,0x10] + vcvtne2ph2bf8 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vcvtne2ph2bf8 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x67,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2bf8 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vcvtne2ph2bf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x74,0x51,0x7f] + vcvtne2ph2bf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x67,0x9f,0x74,0x52,0x80] + vcvtne2ph2bf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x74,0xd4] + vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 + +// CHECK: vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x74,0xd4] + vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x74,0xd4] + vcvtne2ph2bf8s %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x74,0xd4] + vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 + +// CHECK: vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x74,0xd4] + vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x74,0xd4] + vcvtne2ph2bf8s %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x74,0xd4] + vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 + +// CHECK: vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x74,0xd4] + vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x74,0xd4] + vcvtne2ph2bf8s %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s 
268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vcvtne2ph2bf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vcvtne2ph2bf8s (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x58,0x74,0x10] + vcvtne2ph2bf8s (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vcvtne2ph2bf8s -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2bf8s -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vcvtne2ph2bf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x74,0x51,0x7f] + vcvtne2ph2bf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xdf,0x74,0x52,0x80] + vcvtne2ph2bf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vcvtne2ph2bf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vcvtne2ph2bf8s (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x38,0x74,0x10] + vcvtne2ph2bf8s (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vcvtne2ph2bf8s -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2bf8s -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vcvtne2ph2bf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x74,0x51,0x7f] + vcvtne2ph2bf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s 
-256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xbf,0x74,0x52,0x80] + vcvtne2ph2bf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vcvtne2ph2bf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vcvtne2ph2bf8s (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x18,0x74,0x10] + vcvtne2ph2bf8s (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vcvtne2ph2bf8s -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2bf8s -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vcvtne2ph2bf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x74,0x51,0x7f] + vcvtne2ph2bf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0x9f,0x74,0x52,0x80] + vcvtne2ph2bf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x18,0xd4] + vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 + +// CHECK: vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x18,0xd4] + vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x18,0xd4] + vcvtne2ph2hf8 %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x18,0xd4] + vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 + +// CHECK: vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x18,0xd4] + vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vcvtne2ph2hf8 %zmm4, 
%zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x18,0xd4] + vcvtne2ph2hf8 %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x18,0xd4] + vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 + +// CHECK: vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x18,0xd4] + vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x18,0xd4] + vcvtne2ph2hf8 %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vcvtne2ph2hf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vcvtne2ph2hf8 (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x58,0x18,0x10] + vcvtne2ph2hf8 (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vcvtne2ph2hf8 -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x18,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2hf8 -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vcvtne2ph2hf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x18,0x51,0x7f] + vcvtne2ph2hf8 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xdf,0x18,0x52,0x80] + vcvtne2ph2hf8 -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vcvtne2ph2hf8 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8 
291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vcvtne2ph2hf8 (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x38,0x18,0x10] + vcvtne2ph2hf8 (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vcvtne2ph2hf8 -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x18,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2hf8 -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vcvtne2ph2hf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x18,0x51,0x7f] + vcvtne2ph2hf8 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xbf,0x18,0x52,0x80] + vcvtne2ph2hf8 -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vcvtne2ph2hf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vcvtne2ph2hf8 (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x18,0x18,0x10] + vcvtne2ph2hf8 (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vcvtne2ph2hf8 -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x18,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2hf8 -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vcvtne2ph2hf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x18,0x51,0x7f] + vcvtne2ph2hf8 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0x9f,0x18,0x52,0x80] + vcvtne2ph2hf8 -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x1b,0xd4] + vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 + +// CHECK: vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} +// CHECK: 
encoding: [0x62,0xf5,0x67,0x2f,0x1b,0xd4] + vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} + +// CHECK: vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x1b,0xd4] + vcvtne2ph2hf8s %ymm4, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x1b,0xd4] + vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 + +// CHECK: vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x1b,0xd4] + vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} + +// CHECK: vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x1b,0xd4] + vcvtne2ph2hf8s %zmm4, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x1b,0xd4] + vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 + +// CHECK: vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x1b,0xd4] + vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} + +// CHECK: vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x1b,0xd4] + vcvtne2ph2hf8s %xmm4, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8s 268435456(%esp,%esi,8), %zmm3, %zmm2 + +// CHECK: vcvtne2ph2hf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8s 291(%edi,%eax,4), %zmm3, %zmm2 {%k7} + +// CHECK: vcvtne2ph2hf8s (%eax){1to32}, %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x58,0x1b,0x10] + vcvtne2ph2hf8s (%eax){1to32}, %zmm3, %zmm2 + +// CHECK: vcvtne2ph2hf8s -2048(,%ebp,2), %zmm3, %zmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x1b,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2hf8s -2048(,%ebp,2), %zmm3, %zmm2 + +// CHECK: vcvtne2ph2hf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x1b,0x51,0x7f] + 
vcvtne2ph2hf8s 8128(%ecx), %zmm3, %zmm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xdf,0x1b,0x52,0x80] + vcvtne2ph2hf8s -256(%edx){1to32}, %zmm3, %zmm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8s 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vcvtne2ph2hf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8s 291(%edi,%eax,4), %ymm3, %ymm2 {%k7} + +// CHECK: vcvtne2ph2hf8s (%eax){1to16}, %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x38,0x1b,0x10] + vcvtne2ph2hf8s (%eax){1to16}, %ymm3, %ymm2 + +// CHECK: vcvtne2ph2hf8s -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x1b,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2hf8s -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vcvtne2ph2hf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x1b,0x51,0x7f] + vcvtne2ph2hf8s 4064(%ecx), %ymm3, %ymm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0xbf,0x1b,0x52,0x80] + vcvtne2ph2hf8s -256(%edx){1to16}, %ymm3, %ymm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8s 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vcvtne2ph2hf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8s 291(%edi,%eax,4), %xmm3, %xmm2 {%k7} + +// CHECK: vcvtne2ph2hf8s (%eax){1to8}, %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x67,0x18,0x1b,0x10] + vcvtne2ph2hf8s (%eax){1to8}, %xmm3, %xmm2 + +// CHECK: vcvtne2ph2hf8s -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: 
[0x62,0xf5,0x67,0x08,0x1b,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2hf8s -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vcvtne2ph2hf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x1b,0x51,0x7f] + vcvtne2ph2hf8s 2032(%ecx), %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x67,0x9f,0x1b,0x52,0x80] + vcvtne2ph2hf8s -256(%edx){1to8}, %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2bf8 %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x7e,0x08,0x74,0xd3] + vcvtneph2bf8 %xmm3, %xmm2 + +// CHECK: vcvtneph2bf8 %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x7e,0x0f,0x74,0xd3] + vcvtneph2bf8 %xmm3, %xmm2 {%k7} + +// CHECK: vcvtneph2bf8 %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x7e,0x8f,0x74,0xd3] + vcvtneph2bf8 %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2bf8 %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf2,0x7e,0x48,0x74,0xd3] + vcvtneph2bf8 %zmm3, %ymm2 + +// CHECK: vcvtneph2bf8 %zmm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x7e,0x4f,0x74,0xd3] + vcvtneph2bf8 %zmm3, %ymm2 {%k7} + +// CHECK: vcvtneph2bf8 %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x7e,0xcf,0x74,0xd3] + vcvtneph2bf8 %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtneph2bf8 %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x7e,0x28,0x74,0xd3] + vcvtneph2bf8 %ymm3, %xmm2 + +// CHECK: vcvtneph2bf8 %ymm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x7e,0x2f,0x74,0xd3] + vcvtneph2bf8 %ymm3, %xmm2 {%k7} + +// CHECK: vcvtneph2bf8 %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x7e,0xaf,0x74,0xd3] + vcvtneph2bf8 %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2bf8x 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf2,0x7e,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2bf8x 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vcvtneph2bf8x 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x7e,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2bf8x 291(%edi,%eax,4), %xmm2 {%k7} + 
+// CHECK: vcvtneph2bf8 (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x7e,0x18,0x74,0x10] + vcvtneph2bf8 (%eax){1to8}, %xmm2 + +// CHECK: vcvtneph2bf8x -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf2,0x7e,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2bf8x -512(,%ebp,2), %xmm2 + +// CHECK: vcvtneph2bf8x 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x7e,0x8f,0x74,0x51,0x7f] + vcvtneph2bf8x 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vcvtneph2bf8 -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x7e,0x9f,0x74,0x52,0x80] + vcvtneph2bf8 -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2bf8 (%eax){1to16}, %xmm2 +// CHECK: encoding: [0x62,0xf2,0x7e,0x38,0x74,0x10] + vcvtneph2bf8 (%eax){1to16}, %xmm2 + +// CHECK: vcvtneph2bf8y -1024(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf2,0x7e,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2bf8y -1024(,%ebp,2), %xmm2 + +// CHECK: vcvtneph2bf8y 4064(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x7e,0xaf,0x74,0x51,0x7f] + vcvtneph2bf8y 4064(%ecx), %xmm2 {%k7} {z} + +// CHECK: vcvtneph2bf8 -256(%edx){1to16}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x7e,0xbf,0x74,0x52,0x80] + vcvtneph2bf8 -256(%edx){1to16}, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2bf8 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf2,0x7e,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2bf8 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vcvtneph2bf8 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf2,0x7e,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2bf8 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vcvtneph2bf8 (%eax){1to32}, %ymm2 +// CHECK: encoding: [0x62,0xf2,0x7e,0x58,0x74,0x10] + vcvtneph2bf8 (%eax){1to32}, %ymm2 + +// CHECK: vcvtneph2bf8 -2048(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf2,0x7e,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2bf8 -2048(,%ebp,2), %ymm2 + +// CHECK: vcvtneph2bf8 8128(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: 
[0x62,0xf2,0x7e,0xcf,0x74,0x51,0x7f] + vcvtneph2bf8 8128(%ecx), %ymm2 {%k7} {z} + +// CHECK: vcvtneph2bf8 -256(%edx){1to32}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf2,0x7e,0xdf,0x74,0x52,0x80] + vcvtneph2bf8 -256(%edx){1to32}, %ymm2 {%k7} {z} + +// CHECK: vcvtneph2bf8s %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x74,0xd3] + vcvtneph2bf8s %xmm3, %xmm2 + +// CHECK: vcvtneph2bf8s %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x74,0xd3] + vcvtneph2bf8s %xmm3, %xmm2 {%k7} + +// CHECK: vcvtneph2bf8s %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x74,0xd3] + vcvtneph2bf8s %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2bf8s %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x74,0xd3] + vcvtneph2bf8s %zmm3, %ymm2 + +// CHECK: vcvtneph2bf8s %zmm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x74,0xd3] + vcvtneph2bf8s %zmm3, %ymm2 {%k7} + +// CHECK: vcvtneph2bf8s %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x74,0xd3] + vcvtneph2bf8s %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtneph2bf8s %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x74,0xd3] + vcvtneph2bf8s %ymm3, %xmm2 + +// CHECK: vcvtneph2bf8s %ymm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x2f,0x74,0xd3] + vcvtneph2bf8s %ymm3, %xmm2 {%k7} + +// CHECK: vcvtneph2bf8s %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x74,0xd3] + vcvtneph2bf8s %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2bf8sx 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2bf8sx 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vcvtneph2bf8sx 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2bf8sx 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vcvtneph2bf8s (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x74,0x10] + vcvtneph2bf8s (%eax){1to8}, %xmm2 + +// CHECK: vcvtneph2bf8sx -512(,%ebp,2), %xmm2 +// 
CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2bf8sx -512(,%ebp,2), %xmm2 + +// CHECK: vcvtneph2bf8sx 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x74,0x51,0x7f] + vcvtneph2bf8sx 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vcvtneph2bf8s -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0x9f,0x74,0x52,0x80] + vcvtneph2bf8s -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2bf8s (%eax){1to16}, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x38,0x74,0x10] + vcvtneph2bf8s (%eax){1to16}, %xmm2 + +// CHECK: vcvtneph2bf8sy -1024(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2bf8sy -1024(,%ebp,2), %xmm2 + +// CHECK: vcvtneph2bf8sy 4064(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x74,0x51,0x7f] + vcvtneph2bf8sy 4064(%ecx), %xmm2 {%k7} {z} + +// CHECK: vcvtneph2bf8s -256(%edx){1to16}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x74,0x52,0x80] + vcvtneph2bf8s -256(%edx){1to16}, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2bf8s 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2bf8s 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vcvtneph2bf8s 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2bf8s 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vcvtneph2bf8s (%eax){1to32}, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x58,0x74,0x10] + vcvtneph2bf8s (%eax){1to32}, %ymm2 + +// CHECK: vcvtneph2bf8s -2048(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2bf8s -2048(,%ebp,2), %ymm2 + +// CHECK: vcvtneph2bf8s 8128(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x74,0x51,0x7f] + vcvtneph2bf8s 8128(%ecx), %ymm2 {%k7} {z} + +// CHECK: vcvtneph2bf8s -256(%edx){1to32}, %ymm2 {%k7} {z} +// CHECK: encoding: 
[0x62,0xf5,0x7e,0xdf,0x74,0x52,0x80] + vcvtneph2bf8s -256(%edx){1to32}, %ymm2 {%k7} {z} + +// CHECK: vcvtneph2hf8 %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x18,0xd3] + vcvtneph2hf8 %xmm3, %xmm2 + +// CHECK: vcvtneph2hf8 %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x18,0xd3] + vcvtneph2hf8 %xmm3, %xmm2 {%k7} + +// CHECK: vcvtneph2hf8 %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x18,0xd3] + vcvtneph2hf8 %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2hf8 %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x18,0xd3] + vcvtneph2hf8 %zmm3, %ymm2 + +// CHECK: vcvtneph2hf8 %zmm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x18,0xd3] + vcvtneph2hf8 %zmm3, %ymm2 {%k7} + +// CHECK: vcvtneph2hf8 %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x18,0xd3] + vcvtneph2hf8 %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtneph2hf8 %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x18,0xd3] + vcvtneph2hf8 %ymm3, %xmm2 + +// CHECK: vcvtneph2hf8 %ymm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x2f,0x18,0xd3] + vcvtneph2hf8 %ymm3, %xmm2 {%k7} + +// CHECK: vcvtneph2hf8 %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x18,0xd3] + vcvtneph2hf8 %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2hf8x 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2hf8x 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vcvtneph2hf8x 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2hf8x 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vcvtneph2hf8 (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x18,0x10] + vcvtneph2hf8 (%eax){1to8}, %xmm2 + +// CHECK: vcvtneph2hf8x -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x18,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2hf8x -512(,%ebp,2), %xmm2 + +// CHECK: vcvtneph2hf8x 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: 
[0x62,0xf5,0x7e,0x8f,0x18,0x51,0x7f] + vcvtneph2hf8x 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vcvtneph2hf8 -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0x9f,0x18,0x52,0x80] + vcvtneph2hf8 -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2hf8 (%eax){1to16}, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x38,0x18,0x10] + vcvtneph2hf8 (%eax){1to16}, %xmm2 + +// CHECK: vcvtneph2hf8y -1024(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x18,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2hf8y -1024(,%ebp,2), %xmm2 + +// CHECK: vcvtneph2hf8y 4064(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x18,0x51,0x7f] + vcvtneph2hf8y 4064(%ecx), %xmm2 {%k7} {z} + +// CHECK: vcvtneph2hf8 -256(%edx){1to16}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x18,0x52,0x80] + vcvtneph2hf8 -256(%edx){1to16}, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2hf8 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2hf8 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vcvtneph2hf8 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2hf8 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vcvtneph2hf8 (%eax){1to32}, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x58,0x18,0x10] + vcvtneph2hf8 (%eax){1to32}, %ymm2 + +// CHECK: vcvtneph2hf8 -2048(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x18,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2hf8 -2048(,%ebp,2), %ymm2 + +// CHECK: vcvtneph2hf8 8128(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x18,0x51,0x7f] + vcvtneph2hf8 8128(%ecx), %ymm2 {%k7} {z} + +// CHECK: vcvtneph2hf8 -256(%edx){1to32}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xdf,0x18,0x52,0x80] + vcvtneph2hf8 -256(%edx){1to32}, %ymm2 {%k7} {z} + +// CHECK: vcvtneph2hf8s %xmm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x1b,0xd3] + vcvtneph2hf8s %xmm3, %xmm2 + +// CHECK: 
vcvtneph2hf8s %xmm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x1b,0xd3] + vcvtneph2hf8s %xmm3, %xmm2 {%k7} + +// CHECK: vcvtneph2hf8s %xmm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x1b,0xd3] + vcvtneph2hf8s %xmm3, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2hf8s %zmm3, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x1b,0xd3] + vcvtneph2hf8s %zmm3, %ymm2 + +// CHECK: vcvtneph2hf8s %zmm3, %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x1b,0xd3] + vcvtneph2hf8s %zmm3, %ymm2 {%k7} + +// CHECK: vcvtneph2hf8s %zmm3, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x1b,0xd3] + vcvtneph2hf8s %zmm3, %ymm2 {%k7} {z} + +// CHECK: vcvtneph2hf8s %ymm3, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x1b,0xd3] + vcvtneph2hf8s %ymm3, %xmm2 + +// CHECK: vcvtneph2hf8s %ymm3, %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x2f,0x1b,0xd3] + vcvtneph2hf8s %ymm3, %xmm2 {%k7} + +// CHECK: vcvtneph2hf8s %ymm3, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x1b,0xd3] + vcvtneph2hf8s %ymm3, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2hf8sx 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2hf8sx 268435456(%esp,%esi,8), %xmm2 + +// CHECK: vcvtneph2hf8sx 291(%edi,%eax,4), %xmm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2hf8sx 291(%edi,%eax,4), %xmm2 {%k7} + +// CHECK: vcvtneph2hf8s (%eax){1to8}, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x1b,0x10] + vcvtneph2hf8s (%eax){1to8}, %xmm2 + +// CHECK: vcvtneph2hf8sx -512(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x1b,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2hf8sx -512(,%ebp,2), %xmm2 + +// CHECK: vcvtneph2hf8sx 2032(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x1b,0x51,0x7f] + vcvtneph2hf8sx 2032(%ecx), %xmm2 {%k7} {z} + +// CHECK: vcvtneph2hf8s -256(%edx){1to8}, %xmm2 {%k7} {z} +// CHECK: encoding: 
[0x62,0xf5,0x7e,0x9f,0x1b,0x52,0x80] + vcvtneph2hf8s -256(%edx){1to8}, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2hf8s (%eax){1to16}, %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x38,0x1b,0x10] + vcvtneph2hf8s (%eax){1to16}, %xmm2 + +// CHECK: vcvtneph2hf8sy -1024(,%ebp,2), %xmm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x1b,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2hf8sy -1024(,%ebp,2), %xmm2 + +// CHECK: vcvtneph2hf8sy 4064(%ecx), %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x1b,0x51,0x7f] + vcvtneph2hf8sy 4064(%ecx), %xmm2 {%k7} {z} + +// CHECK: vcvtneph2hf8s -256(%edx){1to16}, %xmm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x1b,0x52,0x80] + vcvtneph2hf8s -256(%edx){1to16}, %xmm2 {%k7} {z} + +// CHECK: vcvtneph2hf8s 268435456(%esp,%esi,8), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2hf8s 268435456(%esp,%esi,8), %ymm2 + +// CHECK: vcvtneph2hf8s 291(%edi,%eax,4), %ymm2 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2hf8s 291(%edi,%eax,4), %ymm2 {%k7} + +// CHECK: vcvtneph2hf8s (%eax){1to32}, %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x58,0x1b,0x10] + vcvtneph2hf8s (%eax){1to32}, %ymm2 + +// CHECK: vcvtneph2hf8s -2048(,%ebp,2), %ymm2 +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x1b,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2hf8s -2048(,%ebp,2), %ymm2 + +// CHECK: vcvtneph2hf8s 8128(%ecx), %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x1b,0x51,0x7f] + vcvtneph2hf8s 8128(%ecx), %ymm2 {%k7} {z} + +// CHECK: vcvtneph2hf8s -256(%edx){1to32}, %ymm2 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7e,0xdf,0x1b,0x52,0x80] + vcvtneph2hf8s -256(%edx){1to32}, %ymm2 {%k7} {z} + diff --git a/llvm/test/MC/X86/avx10.2convert-32-intel.s b/llvm/test/MC/X86/avx10.2convert-32-intel.s new file mode 100644 index 00000000000000..493cdae7a64259 --- /dev/null +++ b/llvm/test/MC/X86/avx10.2convert-32-intel.s @@ -0,0 +1,1490 @@ +// RUN: llvm-mc -triple i386 
-x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vcvt2ps2phx ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x28,0x67,0xd4] + vcvt2ps2phx ymm2, ymm3, ymm4 + +// CHECK: vcvt2ps2phx ymm2, ymm3, ymm4, {rn-sae} +// CHECK: encoding: [0x62,0xf2,0x61,0x18,0x67,0xd4] + vcvt2ps2phx ymm2, ymm3, ymm4, {rn-sae} + +// CHECK: vcvt2ps2phx ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0x67,0xd4] + vcvt2ps2phx ymm2 {k7}, ymm3, ymm4 + +// CHECK: vcvt2ps2phx ymm2 {k7} {z}, ymm3, ymm4, {rz-sae} +// CHECK: encoding: [0x62,0xf2,0x61,0xff,0x67,0xd4] + vcvt2ps2phx ymm2 {k7} {z}, ymm3, ymm4, {rz-sae} + +// CHECK: vcvt2ps2phx zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x48,0x67,0xd4] + vcvt2ps2phx zmm2, zmm3, zmm4 + +// CHECK: vcvt2ps2phx zmm2, zmm3, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf2,0x65,0x18,0x67,0xd4] + vcvt2ps2phx zmm2, zmm3, zmm4, {rn-sae} + +// CHECK: vcvt2ps2phx zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x4f,0x67,0xd4] + vcvt2ps2phx zmm2 {k7}, zmm3, zmm4 + +// CHECK: vcvt2ps2phx zmm2 {k7} {z}, zmm3, zmm4, {rz-sae} +// CHECK: encoding: [0x62,0xf2,0x65,0xff,0x67,0xd4] + vcvt2ps2phx zmm2 {k7} {z}, zmm3, zmm4, {rz-sae} + +// CHECK: vcvt2ps2phx xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x08,0x67,0xd4] + vcvt2ps2phx xmm2, xmm3, xmm4 + +// CHECK: vcvt2ps2phx xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0x67,0xd4] + vcvt2ps2phx xmm2 {k7}, xmm3, xmm4 + +// CHECK: vcvt2ps2phx xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0x67,0xd4] + vcvt2ps2phx xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vcvt2ps2phx zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x65,0x48,0x67,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvt2ps2phx zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvt2ps2phx zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x65,0x4f,0x67,0x94,0x87,0x23,0x01,0x00,0x00] + 
vcvt2ps2phx zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvt2ps2phx zmm2, zmm3, dword ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x65,0x58,0x67,0x10] + vcvt2ps2phx zmm2, zmm3, dword ptr [eax]{1to16} + +// CHECK: vcvt2ps2phx zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x65,0x48,0x67,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvt2ps2phx zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vcvt2ps2phx zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x65,0xcf,0x67,0x51,0x7f] + vcvt2ps2phx zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vcvt2ps2phx zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} +// CHECK: encoding: [0x62,0xf2,0x65,0xdf,0x67,0x52,0x80] + vcvt2ps2phx zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16} + +// CHECK: vcvt2ps2phx ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x65,0x28,0x67,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvt2ps2phx ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvt2ps2phx ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0x67,0x94,0x87,0x23,0x01,0x00,0x00] + vcvt2ps2phx ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vcvt2ps2phx ymm2, ymm3, dword ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x65,0x38,0x67,0x10] + vcvt2ps2phx ymm2, ymm3, dword ptr [eax]{1to8} + +// CHECK: vcvt2ps2phx ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf2,0x65,0x28,0x67,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvt2ps2phx ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vcvt2ps2phx ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x65,0xaf,0x67,0x51,0x7f] + vcvt2ps2phx ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vcvt2ps2phx ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} +// CHECK: encoding: [0x62,0xf2,0x65,0xbf,0x67,0x52,0x80] + vcvt2ps2phx ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8} + +// CHECK: vcvt2ps2phx 
xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x65,0x08,0x67,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvt2ps2phx xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvt2ps2phx xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0x67,0x94,0x87,0x23,0x01,0x00,0x00] + vcvt2ps2phx xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvt2ps2phx xmm2, xmm3, dword ptr [eax]{1to4} +// CHECK: encoding: [0x62,0xf2,0x65,0x18,0x67,0x10] + vcvt2ps2phx xmm2, xmm3, dword ptr [eax]{1to4} + +// CHECK: vcvt2ps2phx xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf2,0x65,0x08,0x67,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvt2ps2phx xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vcvt2ps2phx xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0x67,0x51,0x7f] + vcvt2ps2phx xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vcvt2ps2phx xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} +// CHECK: encoding: [0x62,0xf2,0x65,0x9f,0x67,0x52,0x80] + vcvt2ps2phx xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4} + +// CHECK: vcvtbiasph2bf8 ymm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x74,0xd4] + vcvtbiasph2bf8 ymm2, zmm3, zmm4 + +// CHECK: vcvtbiasph2bf8 ymm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x74,0xd4] + vcvtbiasph2bf8 ymm2 {k7}, zmm3, zmm4 + +// CHECK: vcvtbiasph2bf8 ymm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x74,0xd4] + vcvtbiasph2bf8 ymm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vcvtbiasph2bf8 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x08,0x74,0xd4] + vcvtbiasph2bf8 xmm2, xmm3, xmm4 + +// CHECK: vcvtbiasph2bf8 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x74,0xd4] + vcvtbiasph2bf8 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vcvtbiasph2bf8 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x74,0xd4] + vcvtbiasph2bf8 xmm2 {k7} {z}, xmm3, xmm4 + +// 
CHECK: vcvtbiasph2bf8 xmm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x28,0x74,0xd4] + vcvtbiasph2bf8 xmm2, ymm3, ymm4 + +// CHECK: vcvtbiasph2bf8 xmm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x74,0xd4] + vcvtbiasph2bf8 xmm2 {k7}, ymm3, ymm4 + +// CHECK: vcvtbiasph2bf8 xmm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x74,0xd4] + vcvtbiasph2bf8 xmm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vcvtbiasph2bf8 xmm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x64,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8 xmm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtbiasph2bf8 xmm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8 xmm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtbiasph2bf8 xmm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x64,0x38,0x74,0x10] + vcvtbiasph2bf8 xmm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vcvtbiasph2bf8 xmm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf2,0x64,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2bf8 xmm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vcvtbiasph2bf8 xmm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x74,0x51,0x7f] + vcvtbiasph2bf8 xmm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vcvtbiasph2bf8 xmm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0x74,0x52,0x80] + vcvtbiasph2bf8 xmm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vcvtbiasph2bf8 ymm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8 ymm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtbiasph2bf8 ymm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: 
[0x62,0xf2,0x64,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8 ymm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtbiasph2bf8 ymm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf2,0x64,0x58,0x74,0x10] + vcvtbiasph2bf8 ymm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vcvtbiasph2bf8 ymm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2bf8 ymm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vcvtbiasph2bf8 ymm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x74,0x51,0x7f] + vcvtbiasph2bf8 ymm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vcvtbiasph2bf8 ymm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0x74,0x52,0x80] + vcvtbiasph2bf8 ymm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vcvtbiasph2bf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x64,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtbiasph2bf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtbiasph2bf8 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x64,0x18,0x74,0x10] + vcvtbiasph2bf8 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vcvtbiasph2bf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf2,0x64,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2bf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vcvtbiasph2bf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x74,0x51,0x7f] + vcvtbiasph2bf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vcvtbiasph2bf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: 
[0x62,0xf2,0x64,0x9f,0x74,0x52,0x80] + vcvtbiasph2bf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vcvtbiasph2bf8s ymm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x74,0xd4] + vcvtbiasph2bf8s ymm2, zmm3, zmm4 + +// CHECK: vcvtbiasph2bf8s ymm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x4f,0x74,0xd4] + vcvtbiasph2bf8s ymm2 {k7}, zmm3, zmm4 + +// CHECK: vcvtbiasph2bf8s ymm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0xcf,0x74,0xd4] + vcvtbiasph2bf8s ymm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vcvtbiasph2bf8s xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x74,0xd4] + vcvtbiasph2bf8s xmm2, xmm3, xmm4 + +// CHECK: vcvtbiasph2bf8s xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x0f,0x74,0xd4] + vcvtbiasph2bf8s xmm2 {k7}, xmm3, xmm4 + +// CHECK: vcvtbiasph2bf8s xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x8f,0x74,0xd4] + vcvtbiasph2bf8s xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vcvtbiasph2bf8s xmm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x74,0xd4] + vcvtbiasph2bf8s xmm2, ymm3, ymm4 + +// CHECK: vcvtbiasph2bf8s xmm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x2f,0x74,0xd4] + vcvtbiasph2bf8s xmm2 {k7}, ymm3, ymm4 + +// CHECK: vcvtbiasph2bf8s xmm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x64,0xaf,0x74,0xd4] + vcvtbiasph2bf8s xmm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vcvtbiasph2bf8s xmm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8s xmm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtbiasph2bf8s xmm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x64,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8s xmm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtbiasph2bf8s xmm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x64,0x38,0x74,0x10] + vcvtbiasph2bf8s xmm2, ymm3, word ptr 
[eax]{1to16} + +// CHECK: vcvtbiasph2bf8s xmm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2bf8s xmm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vcvtbiasph2bf8s xmm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x64,0xaf,0x74,0x51,0x7f] + vcvtbiasph2bf8s xmm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vcvtbiasph2bf8s xmm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x64,0xbf,0x74,0x52,0x80] + vcvtbiasph2bf8s xmm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vcvtbiasph2bf8s ymm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8s ymm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtbiasph2bf8s ymm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x64,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8s ymm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtbiasph2bf8s ymm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x64,0x58,0x74,0x10] + vcvtbiasph2bf8s ymm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vcvtbiasph2bf8s ymm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2bf8s ymm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vcvtbiasph2bf8s ymm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x64,0xcf,0x74,0x51,0x7f] + vcvtbiasph2bf8s ymm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vcvtbiasph2bf8s ymm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x64,0xdf,0x74,0x52,0x80] + vcvtbiasph2bf8s ymm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vcvtbiasph2bf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + 
vcvtbiasph2bf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtbiasph2bf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x64,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtbiasph2bf8s xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x64,0x18,0x74,0x10] + vcvtbiasph2bf8s xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vcvtbiasph2bf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2bf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vcvtbiasph2bf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x64,0x8f,0x74,0x51,0x7f] + vcvtbiasph2bf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vcvtbiasph2bf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x64,0x9f,0x74,0x52,0x80] + vcvtbiasph2bf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vcvtbiasph2hf8 ymm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x18,0xd4] + vcvtbiasph2hf8 ymm2, zmm3, zmm4 + +// CHECK: vcvtbiasph2hf8 ymm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x4f,0x18,0xd4] + vcvtbiasph2hf8 ymm2 {k7}, zmm3, zmm4 + +// CHECK: vcvtbiasph2hf8 ymm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0xcf,0x18,0xd4] + vcvtbiasph2hf8 ymm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vcvtbiasph2hf8 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x18,0xd4] + vcvtbiasph2hf8 xmm2, xmm3, xmm4 + +// CHECK: vcvtbiasph2hf8 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x0f,0x18,0xd4] + vcvtbiasph2hf8 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vcvtbiasph2hf8 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x8f,0x18,0xd4] + vcvtbiasph2hf8 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vcvtbiasph2hf8 xmm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x18,0xd4] 
+ vcvtbiasph2hf8 xmm2, ymm3, ymm4 + +// CHECK: vcvtbiasph2hf8 xmm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x2f,0x18,0xd4] + vcvtbiasph2hf8 xmm2 {k7}, ymm3, ymm4 + +// CHECK: vcvtbiasph2hf8 xmm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x64,0xaf,0x18,0xd4] + vcvtbiasph2hf8 xmm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vcvtbiasph2hf8 xmm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8 xmm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtbiasph2hf8 xmm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x64,0x2f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8 xmm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtbiasph2hf8 xmm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x64,0x38,0x18,0x10] + vcvtbiasph2hf8 xmm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vcvtbiasph2hf8 xmm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x18,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2hf8 xmm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vcvtbiasph2hf8 xmm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x64,0xaf,0x18,0x51,0x7f] + vcvtbiasph2hf8 xmm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vcvtbiasph2hf8 xmm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x64,0xbf,0x18,0x52,0x80] + vcvtbiasph2hf8 xmm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vcvtbiasph2hf8 ymm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8 ymm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtbiasph2hf8 ymm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x64,0x4f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8 ymm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: 
vcvtbiasph2hf8 ymm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x64,0x58,0x18,0x10] + vcvtbiasph2hf8 ymm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vcvtbiasph2hf8 ymm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x18,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2hf8 ymm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vcvtbiasph2hf8 ymm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x64,0xcf,0x18,0x51,0x7f] + vcvtbiasph2hf8 ymm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vcvtbiasph2hf8 ymm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x64,0xdf,0x18,0x52,0x80] + vcvtbiasph2hf8 ymm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vcvtbiasph2hf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtbiasph2hf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x64,0x0f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtbiasph2hf8 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x64,0x18,0x18,0x10] + vcvtbiasph2hf8 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vcvtbiasph2hf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x18,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2hf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vcvtbiasph2hf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x64,0x8f,0x18,0x51,0x7f] + vcvtbiasph2hf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vcvtbiasph2hf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x64,0x9f,0x18,0x52,0x80] + vcvtbiasph2hf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vcvtbiasph2hf8s ymm2, zmm3, zmm4 
+// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x1b,0xd4] + vcvtbiasph2hf8s ymm2, zmm3, zmm4 + +// CHECK: vcvtbiasph2hf8s ymm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x4f,0x1b,0xd4] + vcvtbiasph2hf8s ymm2 {k7}, zmm3, zmm4 + +// CHECK: vcvtbiasph2hf8s ymm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0xcf,0x1b,0xd4] + vcvtbiasph2hf8s ymm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vcvtbiasph2hf8s xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x1b,0xd4] + vcvtbiasph2hf8s xmm2, xmm3, xmm4 + +// CHECK: vcvtbiasph2hf8s xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x0f,0x1b,0xd4] + vcvtbiasph2hf8s xmm2 {k7}, xmm3, xmm4 + +// CHECK: vcvtbiasph2hf8s xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x8f,0x1b,0xd4] + vcvtbiasph2hf8s xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vcvtbiasph2hf8s xmm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x1b,0xd4] + vcvtbiasph2hf8s xmm2, ymm3, ymm4 + +// CHECK: vcvtbiasph2hf8s xmm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x64,0x2f,0x1b,0xd4] + vcvtbiasph2hf8s xmm2 {k7}, ymm3, ymm4 + +// CHECK: vcvtbiasph2hf8s xmm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x64,0xaf,0x1b,0xd4] + vcvtbiasph2hf8s xmm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vcvtbiasph2hf8s xmm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x64,0x28,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8s xmm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtbiasph2hf8s xmm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x64,0x2f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8s xmm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtbiasph2hf8s xmm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x64,0x38,0x1b,0x10] + vcvtbiasph2hf8s xmm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vcvtbiasph2hf8s xmm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: 
[0x62,0xf5,0x64,0x28,0x1b,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2hf8s xmm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vcvtbiasph2hf8s xmm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x64,0xaf,0x1b,0x51,0x7f] + vcvtbiasph2hf8s xmm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vcvtbiasph2hf8s xmm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x64,0xbf,0x1b,0x52,0x80] + vcvtbiasph2hf8s xmm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vcvtbiasph2hf8s ymm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8s ymm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtbiasph2hf8s ymm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x64,0x4f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8s ymm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtbiasph2hf8s ymm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x64,0x58,0x1b,0x10] + vcvtbiasph2hf8s ymm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vcvtbiasph2hf8s ymm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x64,0x48,0x1b,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2hf8s ymm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vcvtbiasph2hf8s ymm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x64,0xcf,0x1b,0x51,0x7f] + vcvtbiasph2hf8s ymm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vcvtbiasph2hf8s ymm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x64,0xdf,0x1b,0x52,0x80] + vcvtbiasph2hf8s ymm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vcvtbiasph2hf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtbiasph2hf8s xmm2 {k7}, 
xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x64,0x0f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtbiasph2hf8s xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x64,0x18,0x1b,0x10] + vcvtbiasph2hf8s xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vcvtbiasph2hf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x64,0x08,0x1b,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2hf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vcvtbiasph2hf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x64,0x8f,0x1b,0x51,0x7f] + vcvtbiasph2hf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vcvtbiasph2hf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x64,0x9f,0x1b,0x52,0x80] + vcvtbiasph2hf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vcvthf82ph xmm2, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x1e,0xd3] + vcvthf82ph xmm2, xmm3 + +// CHECK: vcvthf82ph xmm2 {k7}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7f,0x0f,0x1e,0xd3] + vcvthf82ph xmm2 {k7}, xmm3 + +// CHECK: vcvthf82ph xmm2 {k7} {z}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7f,0x8f,0x1e,0xd3] + vcvthf82ph xmm2 {k7} {z}, xmm3 + +// CHECK: vcvthf82ph ymm2, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7f,0x28,0x1e,0xd3] + vcvthf82ph ymm2, xmm3 + +// CHECK: vcvthf82ph ymm2 {k7}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7f,0x2f,0x1e,0xd3] + vcvthf82ph ymm2 {k7}, xmm3 + +// CHECK: vcvthf82ph ymm2 {k7} {z}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7f,0xaf,0x1e,0xd3] + vcvthf82ph ymm2 {k7} {z}, xmm3 + +// CHECK: vcvthf82ph zmm2, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7f,0x48,0x1e,0xd3] + vcvthf82ph zmm2, ymm3 + +// CHECK: vcvthf82ph zmm2 {k7}, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7f,0x4f,0x1e,0xd3] + vcvthf82ph zmm2 {k7}, ymm3 + +// CHECK: vcvthf82ph zmm2 {k7} {z}, ymm3 +// CHECK: encoding: 
[0x62,0xf5,0x7f,0xcf,0x1e,0xd3] + vcvthf82ph zmm2 {k7} {z}, ymm3 + +// CHECK: vcvthf82ph xmm2, qword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x1e,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvthf82ph xmm2, qword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvthf82ph xmm2 {k7}, qword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7f,0x0f,0x1e,0x94,0x87,0x23,0x01,0x00,0x00] + vcvthf82ph xmm2 {k7}, qword ptr [edi + 4*eax + 291] + +// CHECK: vcvthf82ph xmm2, qword ptr [eax] +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x1e,0x10] + vcvthf82ph xmm2, qword ptr [eax] + +// CHECK: vcvthf82ph xmm2, qword ptr [2*ebp - 256] +// CHECK: encoding: [0x62,0xf5,0x7f,0x08,0x1e,0x14,0x6d,0x00,0xff,0xff,0xff] + vcvthf82ph xmm2, qword ptr [2*ebp - 256] + +// CHECK: vcvthf82ph xmm2 {k7} {z}, qword ptr [ecx + 1016] +// CHECK: encoding: [0x62,0xf5,0x7f,0x8f,0x1e,0x51,0x7f] + vcvthf82ph xmm2 {k7} {z}, qword ptr [ecx + 1016] + +// CHECK: vcvthf82ph xmm2 {k7} {z}, qword ptr [edx - 1024] +// CHECK: encoding: [0x62,0xf5,0x7f,0x8f,0x1e,0x52,0x80] + vcvthf82ph xmm2 {k7} {z}, qword ptr [edx - 1024] + +// CHECK: vcvthf82ph ymm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7f,0x28,0x1e,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvthf82ph ymm2, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvthf82ph ymm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7f,0x2f,0x1e,0x94,0x87,0x23,0x01,0x00,0x00] + vcvthf82ph ymm2 {k7}, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvthf82ph ymm2, xmmword ptr [eax] +// CHECK: encoding: [0x62,0xf5,0x7f,0x28,0x1e,0x10] + vcvthf82ph ymm2, xmmword ptr [eax] + +// CHECK: vcvthf82ph ymm2, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x7f,0x28,0x1e,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvthf82ph ymm2, xmmword ptr [2*ebp - 512] + +// CHECK: vcvthf82ph ymm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x7f,0xaf,0x1e,0x51,0x7f] + vcvthf82ph ymm2 {k7} {z}, xmmword ptr [ecx 
+ 2032] + +// CHECK: vcvthf82ph ymm2 {k7} {z}, xmmword ptr [edx - 2048] +// CHECK: encoding: [0x62,0xf5,0x7f,0xaf,0x1e,0x52,0x80] + vcvthf82ph ymm2 {k7} {z}, xmmword ptr [edx - 2048] + +// CHECK: vcvthf82ph zmm2, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7f,0x48,0x1e,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvthf82ph zmm2, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvthf82ph zmm2 {k7}, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7f,0x4f,0x1e,0x94,0x87,0x23,0x01,0x00,0x00] + vcvthf82ph zmm2 {k7}, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vcvthf82ph zmm2, ymmword ptr [eax] +// CHECK: encoding: [0x62,0xf5,0x7f,0x48,0x1e,0x10] + vcvthf82ph zmm2, ymmword ptr [eax] + +// CHECK: vcvthf82ph zmm2, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x7f,0x48,0x1e,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvthf82ph zmm2, ymmword ptr [2*ebp - 1024] + +// CHECK: vcvthf82ph zmm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x7f,0xcf,0x1e,0x51,0x7f] + vcvthf82ph zmm2 {k7} {z}, ymmword ptr [ecx + 4064] + +// CHECK: vcvthf82ph zmm2 {k7} {z}, ymmword ptr [edx - 4096] +// CHECK: encoding: [0x62,0xf5,0x7f,0xcf,0x1e,0x52,0x80] + vcvthf82ph zmm2 {k7} {z}, ymmword ptr [edx - 4096] + +// CHECK: vcvtne2ph2bf8 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x28,0x74,0xd4] + vcvtne2ph2bf8 ymm2, ymm3, ymm4 + +// CHECK: vcvtne2ph2bf8 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x74,0xd4] + vcvtne2ph2bf8 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x74,0xd4] + vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vcvtne2ph2bf8 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x74,0xd4] + vcvtne2ph2bf8 zmm2, zmm3, zmm4 + +// CHECK: vcvtne2ph2bf8 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x74,0xd4] + vcvtne2ph2bf8 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, zmm4 
+// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x74,0xd4] + vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vcvtne2ph2bf8 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x08,0x74,0xd4] + vcvtne2ph2bf8 xmm2, xmm3, xmm4 + +// CHECK: vcvtne2ph2bf8 xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x74,0xd4] + vcvtne2ph2bf8 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x74,0xd4] + vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vcvtne2ph2bf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtne2ph2bf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtne2ph2bf8 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf2,0x67,0x58,0x74,0x10] + vcvtne2ph2bf8 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vcvtne2ph2bf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2bf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x74,0x51,0x7f] + vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf2,0x67,0xdf,0x74,0x52,0x80] + vcvtne2ph2bf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vcvtne2ph2bf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x67,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtne2ph2bf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 
291] +// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtne2ph2bf8 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x67,0x38,0x74,0x10] + vcvtne2ph2bf8 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vcvtne2ph2bf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf2,0x67,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2bf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x74,0x51,0x7f] + vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf2,0x67,0xbf,0x74,0x52,0x80] + vcvtne2ph2bf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vcvtne2ph2bf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x67,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtne2ph2bf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtne2ph2bf8 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x67,0x18,0x74,0x10] + vcvtne2ph2bf8 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vcvtne2ph2bf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf2,0x67,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2bf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x74,0x51,0x7f] + vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: 
[0x62,0xf2,0x67,0x9f,0x74,0x52,0x80] + vcvtne2ph2bf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vcvtne2ph2bf8s ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x74,0xd4] + vcvtne2ph2bf8s ymm2, ymm3, ymm4 + +// CHECK: vcvtne2ph2bf8s ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x74,0xd4] + vcvtne2ph2bf8s ymm2 {k7}, ymm3, ymm4 + +// CHECK: vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x74,0xd4] + vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vcvtne2ph2bf8s zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x74,0xd4] + vcvtne2ph2bf8s zmm2, zmm3, zmm4 + +// CHECK: vcvtne2ph2bf8s zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x74,0xd4] + vcvtne2ph2bf8s zmm2 {k7}, zmm3, zmm4 + +// CHECK: vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x74,0xd4] + vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vcvtne2ph2bf8s xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x74,0xd4] + vcvtne2ph2bf8s xmm2, xmm3, xmm4 + +// CHECK: vcvtne2ph2bf8s xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x74,0xd4] + vcvtne2ph2bf8s xmm2 {k7}, xmm3, xmm4 + +// CHECK: vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x74,0xd4] + vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vcvtne2ph2bf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtne2ph2bf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtne2ph2bf8s zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x67,0x58,0x74,0x10] + vcvtne2ph2bf8s zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: 
vcvtne2ph2bf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2bf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x74,0x51,0x7f] + vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x67,0xdf,0x74,0x52,0x80] + vcvtne2ph2bf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vcvtne2ph2bf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtne2ph2bf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtne2ph2bf8s ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x67,0x38,0x74,0x10] + vcvtne2ph2bf8s ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vcvtne2ph2bf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2bf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x74,0x51,0x7f] + vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x67,0xbf,0x74,0x52,0x80] + vcvtne2ph2bf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vcvtne2ph2bf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8s xmm2, xmm3, xmmword ptr [esp + 
8*esi + 268435456] + +// CHECK: vcvtne2ph2bf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtne2ph2bf8s xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x67,0x18,0x74,0x10] + vcvtne2ph2bf8s xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vcvtne2ph2bf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2bf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x74,0x51,0x7f] + vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x67,0x9f,0x74,0x52,0x80] + vcvtne2ph2bf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vcvtne2ph2hf8 ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x18,0xd4] + vcvtne2ph2hf8 ymm2, ymm3, ymm4 + +// CHECK: vcvtne2ph2hf8 ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x18,0xd4] + vcvtne2ph2hf8 ymm2 {k7}, ymm3, ymm4 + +// CHECK: vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x18,0xd4] + vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vcvtne2ph2hf8 zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x18,0xd4] + vcvtne2ph2hf8 zmm2, zmm3, zmm4 + +// CHECK: vcvtne2ph2hf8 zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x18,0xd4] + vcvtne2ph2hf8 zmm2 {k7}, zmm3, zmm4 + +// CHECK: vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x18,0xd4] + vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vcvtne2ph2hf8 xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x18,0xd4] + vcvtne2ph2hf8 xmm2, xmm3, xmm4 + +// CHECK: vcvtne2ph2hf8 xmm2 
{k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x18,0xd4] + vcvtne2ph2hf8 xmm2 {k7}, xmm3, xmm4 + +// CHECK: vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x18,0xd4] + vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vcvtne2ph2hf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8 zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtne2ph2hf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8 zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtne2ph2hf8 zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x67,0x58,0x18,0x10] + vcvtne2ph2hf8 zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vcvtne2ph2hf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x18,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2hf8 zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x18,0x51,0x7f] + vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x67,0xdf,0x18,0x52,0x80] + vcvtne2ph2hf8 zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vcvtne2ph2hf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8 ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtne2ph2hf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8 ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtne2ph2hf8 ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: 
[0x62,0xf5,0x67,0x38,0x18,0x10] + vcvtne2ph2hf8 ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vcvtne2ph2hf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x18,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2hf8 ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x18,0x51,0x7f] + vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x67,0xbf,0x18,0x52,0x80] + vcvtne2ph2hf8 ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vcvtne2ph2hf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8 xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtne2ph2hf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8 xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtne2ph2hf8 xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x67,0x18,0x18,0x10] + vcvtne2ph2hf8 xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vcvtne2ph2hf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x18,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2hf8 xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x18,0x51,0x7f] + vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x67,0x9f,0x18,0x52,0x80] + vcvtne2ph2hf8 xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vcvtne2ph2hf8s ymm2, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x1b,0xd4] + vcvtne2ph2hf8s ymm2, ymm3, ymm4 + +// 
CHECK: vcvtne2ph2hf8s ymm2 {k7}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x1b,0xd4] + vcvtne2ph2hf8s ymm2 {k7}, ymm3, ymm4 + +// CHECK: vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, ymm4 +// CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x1b,0xd4] + vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, ymm4 + +// CHECK: vcvtne2ph2hf8s zmm2, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x1b,0xd4] + vcvtne2ph2hf8s zmm2, zmm3, zmm4 + +// CHECK: vcvtne2ph2hf8s zmm2 {k7}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x1b,0xd4] + vcvtne2ph2hf8s zmm2 {k7}, zmm3, zmm4 + +// CHECK: vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, zmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x1b,0xd4] + vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, zmm4 + +// CHECK: vcvtne2ph2hf8s xmm2, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x1b,0xd4] + vcvtne2ph2hf8s xmm2, xmm3, xmm4 + +// CHECK: vcvtne2ph2hf8s xmm2 {k7}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x1b,0xd4] + vcvtne2ph2hf8s xmm2 {k7}, xmm3, xmm4 + +// CHECK: vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, xmm4 +// CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x1b,0xd4] + vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, xmm4 + +// CHECK: vcvtne2ph2hf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8s zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtne2ph2hf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x67,0x4f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8s zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtne2ph2hf8s zmm2, zmm3, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x67,0x58,0x1b,0x10] + vcvtne2ph2hf8s zmm2, zmm3, word ptr [eax]{1to32} + +// CHECK: vcvtne2ph2hf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x67,0x48,0x1b,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2hf8s zmm2, zmm3, zmmword ptr [2*ebp - 2048] + +// CHECK: vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, 
zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x67,0xcf,0x1b,0x51,0x7f] + vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128] + +// CHECK: vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x67,0xdf,0x1b,0x52,0x80] + vcvtne2ph2hf8s zmm2 {k7} {z}, zmm3, word ptr [edx - 256]{1to32} + +// CHECK: vcvtne2ph2hf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8s ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtne2ph2hf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x67,0x2f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8s ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtne2ph2hf8s ymm2, ymm3, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x67,0x38,0x1b,0x10] + vcvtne2ph2hf8s ymm2, ymm3, word ptr [eax]{1to16} + +// CHECK: vcvtne2ph2hf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x67,0x28,0x1b,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2hf8s ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x67,0xaf,0x1b,0x51,0x7f] + vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064] + +// CHECK: vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x67,0xbf,0x1b,0x52,0x80] + vcvtne2ph2hf8s ymm2 {k7} {z}, ymm3, word ptr [edx - 256]{1to16} + +// CHECK: vcvtne2ph2hf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8s xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtne2ph2hf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x67,0x0f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8s xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291] + 
+// CHECK: vcvtne2ph2hf8s xmm2, xmm3, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x67,0x18,0x1b,0x10] + vcvtne2ph2hf8s xmm2, xmm3, word ptr [eax]{1to8} + +// CHECK: vcvtne2ph2hf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x67,0x08,0x1b,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2hf8s xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x67,0x8f,0x1b,0x51,0x7f] + vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032] + +// CHECK: vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x67,0x9f,0x1b,0x52,0x80] + vcvtne2ph2hf8s xmm2 {k7} {z}, xmm3, word ptr [edx - 256]{1to8} + +// CHECK: vcvtneph2bf8 xmm2, xmm3 +// CHECK: encoding: [0x62,0xf2,0x7e,0x08,0x74,0xd3] + vcvtneph2bf8 xmm2, xmm3 + +// CHECK: vcvtneph2bf8 xmm2 {k7}, xmm3 +// CHECK: encoding: [0x62,0xf2,0x7e,0x0f,0x74,0xd3] + vcvtneph2bf8 xmm2 {k7}, xmm3 + +// CHECK: vcvtneph2bf8 xmm2 {k7} {z}, xmm3 +// CHECK: encoding: [0x62,0xf2,0x7e,0x8f,0x74,0xd3] + vcvtneph2bf8 xmm2 {k7} {z}, xmm3 + +// CHECK: vcvtneph2bf8 ymm2, zmm3 +// CHECK: encoding: [0x62,0xf2,0x7e,0x48,0x74,0xd3] + vcvtneph2bf8 ymm2, zmm3 + +// CHECK: vcvtneph2bf8 ymm2 {k7}, zmm3 +// CHECK: encoding: [0x62,0xf2,0x7e,0x4f,0x74,0xd3] + vcvtneph2bf8 ymm2 {k7}, zmm3 + +// CHECK: vcvtneph2bf8 ymm2 {k7} {z}, zmm3 +// CHECK: encoding: [0x62,0xf2,0x7e,0xcf,0x74,0xd3] + vcvtneph2bf8 ymm2 {k7} {z}, zmm3 + +// CHECK: vcvtneph2bf8 xmm2, ymm3 +// CHECK: encoding: [0x62,0xf2,0x7e,0x28,0x74,0xd3] + vcvtneph2bf8 xmm2, ymm3 + +// CHECK: vcvtneph2bf8 xmm2 {k7}, ymm3 +// CHECK: encoding: [0x62,0xf2,0x7e,0x2f,0x74,0xd3] + vcvtneph2bf8 xmm2 {k7}, ymm3 + +// CHECK: vcvtneph2bf8 xmm2 {k7} {z}, ymm3 +// CHECK: encoding: [0x62,0xf2,0x7e,0xaf,0x74,0xd3] + vcvtneph2bf8 xmm2 {k7} {z}, ymm3 + +// CHECK: vcvtneph2bf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: 
[0x62,0xf2,0x7e,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2bf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtneph2bf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf2,0x7e,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2bf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtneph2bf8 xmm2, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf2,0x7e,0x18,0x74,0x10] + vcvtneph2bf8 xmm2, word ptr [eax]{1to8} + +// CHECK: vcvtneph2bf8 xmm2, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf2,0x7e,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2bf8 xmm2, xmmword ptr [2*ebp - 512] + +// CHECK: vcvtneph2bf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf2,0x7e,0x8f,0x74,0x51,0x7f] + vcvtneph2bf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + +// CHECK: vcvtneph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf2,0x7e,0x9f,0x74,0x52,0x80] + vcvtneph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vcvtneph2bf8 xmm2, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf2,0x7e,0x38,0x74,0x10] + vcvtneph2bf8 xmm2, word ptr [eax]{1to16} + +// CHECK: vcvtneph2bf8 xmm2, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf2,0x7e,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2bf8 xmm2, ymmword ptr [2*ebp - 1024] + +// CHECK: vcvtneph2bf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf2,0x7e,0xaf,0x74,0x51,0x7f] + vcvtneph2bf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] + +// CHECK: vcvtneph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf2,0x7e,0xbf,0x74,0x52,0x80] + vcvtneph2bf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vcvtneph2bf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf2,0x7e,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2bf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtneph2bf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: 
encoding: [0x62,0xf2,0x7e,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2bf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtneph2bf8 ymm2, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf2,0x7e,0x58,0x74,0x10] + vcvtneph2bf8 ymm2, word ptr [eax]{1to32} + +// CHECK: vcvtneph2bf8 ymm2, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf2,0x7e,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2bf8 ymm2, zmmword ptr [2*ebp - 2048] + +// CHECK: vcvtneph2bf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf2,0x7e,0xcf,0x74,0x51,0x7f] + vcvtneph2bf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] + +// CHECK: vcvtneph2bf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf2,0x7e,0xdf,0x74,0x52,0x80] + vcvtneph2bf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vcvtneph2bf8s xmm2, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x74,0xd3] + vcvtneph2bf8s xmm2, xmm3 + +// CHECK: vcvtneph2bf8s xmm2 {k7}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x74,0xd3] + vcvtneph2bf8s xmm2 {k7}, xmm3 + +// CHECK: vcvtneph2bf8s xmm2 {k7} {z}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x74,0xd3] + vcvtneph2bf8s xmm2 {k7} {z}, xmm3 + +// CHECK: vcvtneph2bf8s ymm2, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x74,0xd3] + vcvtneph2bf8s ymm2, zmm3 + +// CHECK: vcvtneph2bf8s ymm2 {k7}, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x74,0xd3] + vcvtneph2bf8s ymm2 {k7}, zmm3 + +// CHECK: vcvtneph2bf8s ymm2 {k7} {z}, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x74,0xd3] + vcvtneph2bf8s ymm2 {k7} {z}, zmm3 + +// CHECK: vcvtneph2bf8s xmm2, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x74,0xd3] + vcvtneph2bf8s xmm2, ymm3 + +// CHECK: vcvtneph2bf8s xmm2 {k7}, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x2f,0x74,0xd3] + vcvtneph2bf8s xmm2 {k7}, ymm3 + +// CHECK: vcvtneph2bf8s xmm2 {k7} {z}, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x74,0xd3] + vcvtneph2bf8s xmm2 {k7} {z}, ymm3 + +// CHECK: vcvtneph2bf8s 
xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2bf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtneph2bf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2bf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtneph2bf8s xmm2, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x74,0x10] + vcvtneph2bf8s xmm2, word ptr [eax]{1to8} + +// CHECK: vcvtneph2bf8s xmm2, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x74,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2bf8s xmm2, xmmword ptr [2*ebp - 512] + +// CHECK: vcvtneph2bf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x74,0x51,0x7f] + vcvtneph2bf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + +// CHECK: vcvtneph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7e,0x9f,0x74,0x52,0x80] + vcvtneph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vcvtneph2bf8s xmm2, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7e,0x38,0x74,0x10] + vcvtneph2bf8s xmm2, word ptr [eax]{1to16} + +// CHECK: vcvtneph2bf8s xmm2, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x74,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2bf8s xmm2, ymmword ptr [2*ebp - 1024] + +// CHECK: vcvtneph2bf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x74,0x51,0x7f] + vcvtneph2bf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] + +// CHECK: vcvtneph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x74,0x52,0x80] + vcvtneph2bf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vcvtneph2bf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x74,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2bf8s ymm2, zmmword ptr [esp + 8*esi + 
268435456] + +// CHECK: vcvtneph2bf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x74,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2bf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtneph2bf8s ymm2, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7e,0x58,0x74,0x10] + vcvtneph2bf8s ymm2, word ptr [eax]{1to32} + +// CHECK: vcvtneph2bf8s ymm2, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x74,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2bf8s ymm2, zmmword ptr [2*ebp - 2048] + +// CHECK: vcvtneph2bf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x74,0x51,0x7f] + vcvtneph2bf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] + +// CHECK: vcvtneph2bf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7e,0xdf,0x74,0x52,0x80] + vcvtneph2bf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vcvtneph2hf8 xmm2, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x18,0xd3] + vcvtneph2hf8 xmm2, xmm3 + +// CHECK: vcvtneph2hf8 xmm2 {k7}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x18,0xd3] + vcvtneph2hf8 xmm2 {k7}, xmm3 + +// CHECK: vcvtneph2hf8 xmm2 {k7} {z}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x18,0xd3] + vcvtneph2hf8 xmm2 {k7} {z}, xmm3 + +// CHECK: vcvtneph2hf8 ymm2, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x18,0xd3] + vcvtneph2hf8 ymm2, zmm3 + +// CHECK: vcvtneph2hf8 ymm2 {k7}, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x18,0xd3] + vcvtneph2hf8 ymm2 {k7}, zmm3 + +// CHECK: vcvtneph2hf8 ymm2 {k7} {z}, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x18,0xd3] + vcvtneph2hf8 ymm2 {k7} {z}, zmm3 + +// CHECK: vcvtneph2hf8 xmm2, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x18,0xd3] + vcvtneph2hf8 xmm2, ymm3 + +// CHECK: vcvtneph2hf8 xmm2 {k7}, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x2f,0x18,0xd3] + vcvtneph2hf8 xmm2 {k7}, ymm3 + +// CHECK: vcvtneph2hf8 xmm2 {k7} {z}, ymm3 +// CHECK: encoding: 
[0x62,0xf5,0x7e,0xaf,0x18,0xd3] + vcvtneph2hf8 xmm2 {k7} {z}, ymm3 + +// CHECK: vcvtneph2hf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2hf8 xmm2, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtneph2hf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2hf8 xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtneph2hf8 xmm2, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x18,0x10] + vcvtneph2hf8 xmm2, word ptr [eax]{1to8} + +// CHECK: vcvtneph2hf8 xmm2, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x18,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2hf8 xmm2, xmmword ptr [2*ebp - 512] + +// CHECK: vcvtneph2hf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x18,0x51,0x7f] + vcvtneph2hf8 xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + +// CHECK: vcvtneph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7e,0x9f,0x18,0x52,0x80] + vcvtneph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vcvtneph2hf8 xmm2, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7e,0x38,0x18,0x10] + vcvtneph2hf8 xmm2, word ptr [eax]{1to16} + +// CHECK: vcvtneph2hf8 xmm2, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x18,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2hf8 xmm2, ymmword ptr [2*ebp - 1024] + +// CHECK: vcvtneph2hf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x18,0x51,0x7f] + vcvtneph2hf8 xmm2 {k7} {z}, ymmword ptr [ecx + 4064] + +// CHECK: vcvtneph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x18,0x52,0x80] + vcvtneph2hf8 xmm2 {k7} {z}, word ptr [edx - 256]{1to16} + +// CHECK: vcvtneph2hf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: 
[0x62,0xf5,0x7e,0x48,0x18,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2hf8 ymm2, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtneph2hf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x18,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2hf8 ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtneph2hf8 ymm2, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7e,0x58,0x18,0x10] + vcvtneph2hf8 ymm2, word ptr [eax]{1to32} + +// CHECK: vcvtneph2hf8 ymm2, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x18,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2hf8 ymm2, zmmword ptr [2*ebp - 2048] + +// CHECK: vcvtneph2hf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x18,0x51,0x7f] + vcvtneph2hf8 ymm2 {k7} {z}, zmmword ptr [ecx + 8128] + +// CHECK: vcvtneph2hf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7e,0xdf,0x18,0x52,0x80] + vcvtneph2hf8 ymm2 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vcvtneph2hf8s xmm2, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x1b,0xd3] + vcvtneph2hf8s xmm2, xmm3 + +// CHECK: vcvtneph2hf8s xmm2 {k7}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x1b,0xd3] + vcvtneph2hf8s xmm2 {k7}, xmm3 + +// CHECK: vcvtneph2hf8s xmm2 {k7} {z}, xmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x1b,0xd3] + vcvtneph2hf8s xmm2 {k7} {z}, xmm3 + +// CHECK: vcvtneph2hf8s ymm2, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x1b,0xd3] + vcvtneph2hf8s ymm2, zmm3 + +// CHECK: vcvtneph2hf8s ymm2 {k7}, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x1b,0xd3] + vcvtneph2hf8s ymm2 {k7}, zmm3 + +// CHECK: vcvtneph2hf8s ymm2 {k7} {z}, zmm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x1b,0xd3] + vcvtneph2hf8s ymm2 {k7} {z}, zmm3 + +// CHECK: vcvtneph2hf8s xmm2, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x1b,0xd3] + vcvtneph2hf8s xmm2, ymm3 + +// CHECK: vcvtneph2hf8s xmm2 {k7}, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0x2f,0x1b,0xd3] + 
vcvtneph2hf8s xmm2 {k7}, ymm3 + +// CHECK: vcvtneph2hf8s xmm2 {k7} {z}, ymm3 +// CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x1b,0xd3] + vcvtneph2hf8s xmm2 {k7} {z}, ymm3 + +// CHECK: vcvtneph2hf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2hf8s xmm2, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtneph2hf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2hf8s xmm2 {k7}, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtneph2hf8s xmm2, word ptr [eax]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7e,0x18,0x1b,0x10] + vcvtneph2hf8s xmm2, word ptr [eax]{1to8} + +// CHECK: vcvtneph2hf8s xmm2, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x1b,0x14,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2hf8s xmm2, xmmword ptr [2*ebp - 512] + +// CHECK: vcvtneph2hf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] +// CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x1b,0x51,0x7f] + vcvtneph2hf8s xmm2 {k7} {z}, xmmword ptr [ecx + 2032] + +// CHECK: vcvtneph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} +// CHECK: encoding: [0x62,0xf5,0x7e,0x9f,0x1b,0x52,0x80] + vcvtneph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to8} + +// CHECK: vcvtneph2hf8s xmm2, word ptr [eax]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7e,0x38,0x1b,0x10] + vcvtneph2hf8s xmm2, word ptr [eax]{1to16} + +// CHECK: vcvtneph2hf8s xmm2, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0x62,0xf5,0x7e,0x28,0x1b,0x14,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2hf8s xmm2, ymmword ptr [2*ebp - 1024] + +// CHECK: vcvtneph2hf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] +// CHECK: encoding: [0x62,0xf5,0x7e,0xaf,0x1b,0x51,0x7f] + vcvtneph2hf8s xmm2 {k7} {z}, ymmword ptr [ecx + 4064] + +// CHECK: vcvtneph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} +// CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x1b,0x52,0x80] + vcvtneph2hf8s xmm2 {k7} {z}, word ptr [edx - 256]{1to16} + +// 
CHECK: vcvtneph2hf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x1b,0x94,0xf4,0x00,0x00,0x00,0x10] + vcvtneph2hf8s ymm2, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vcvtneph2hf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0x62,0xf5,0x7e,0x4f,0x1b,0x94,0x87,0x23,0x01,0x00,0x00] + vcvtneph2hf8s ymm2 {k7}, zmmword ptr [edi + 4*eax + 291] + +// CHECK: vcvtneph2hf8s ymm2, word ptr [eax]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7e,0x58,0x1b,0x10] + vcvtneph2hf8s ymm2, word ptr [eax]{1to32} + +// CHECK: vcvtneph2hf8s ymm2, zmmword ptr [2*ebp - 2048] +// CHECK: encoding: [0x62,0xf5,0x7e,0x48,0x1b,0x14,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2hf8s ymm2, zmmword ptr [2*ebp - 2048] + +// CHECK: vcvtneph2hf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7e,0xcf,0x1b,0x51,0x7f] + vcvtneph2hf8s ymm2 {k7} {z}, zmmword ptr [ecx + 8128] + +// CHECK: vcvtneph2hf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7e,0xdf,0x1b,0x52,0x80] + vcvtneph2hf8s ymm2 {k7} {z}, word ptr [edx - 256]{1to32} + diff --git a/llvm/test/MC/X86/avx10.2convert-64-att.s b/llvm/test/MC/X86/avx10.2convert-64-att.s new file mode 100644 index 00000000000000..ccf1e004c07f25 --- /dev/null +++ b/llvm/test/MC/X86/avx10.2convert-64-att.s @@ -0,0 +1,1490 @@ +// RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s + +// CHECK: vcvt2ps2phx %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x45,0x20,0x67,0xf0] + vcvt2ps2phx %ymm24, %ymm23, %ymm22 + +// CHECK: vcvt2ps2phx {rn-sae}, %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x41,0x10,0x67,0xf0] + vcvt2ps2phx {rn-sae}, %ymm24, %ymm23, %ymm22 + +// CHECK: vcvt2ps2phx %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x45,0x27,0x67,0xf0] + vcvt2ps2phx %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vcvt2ps2phx {rz-sae}, %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x41,0xf7,0x67,0xf0] + 
vcvt2ps2phx {rz-sae}, %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vcvt2ps2phx %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x45,0x40,0x67,0xf0] + vcvt2ps2phx %zmm24, %zmm23, %zmm22 + +// CHECK: vcvt2ps2phx {rn-sae}, %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x45,0x10,0x67,0xf0] + vcvt2ps2phx {rn-sae}, %zmm24, %zmm23, %zmm22 + +// CHECK: vcvt2ps2phx %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x45,0x47,0x67,0xf0] + vcvt2ps2phx %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vcvt2ps2phx {rz-sae}, %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x45,0xf7,0x67,0xf0] + vcvt2ps2phx {rz-sae}, %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vcvt2ps2phx %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x45,0x00,0x67,0xf0] + vcvt2ps2phx %xmm24, %xmm23, %xmm22 + +// CHECK: vcvt2ps2phx %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x45,0x07,0x67,0xf0] + vcvt2ps2phx %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vcvt2ps2phx %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x45,0x87,0x67,0xf0] + vcvt2ps2phx %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvt2ps2phx 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa2,0x45,0x40,0x67,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvt2ps2phx 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vcvt2ps2phx 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x45,0x47,0x67,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvt2ps2phx 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vcvt2ps2phx (%rip){1to16}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x50,0x67,0x35,0x00,0x00,0x00,0x00] + vcvt2ps2phx (%rip){1to16}, %zmm23, %zmm22 + +// CHECK: vcvt2ps2phx -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x40,0x67,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvt2ps2phx -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vcvt2ps2phx 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0xc7,0x67,0x71,0x7f] + vcvt2ps2phx 
8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vcvt2ps2phx -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0xd7,0x67,0x72,0x80] + vcvt2ps2phx -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vcvt2ps2phx 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x45,0x20,0x67,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvt2ps2phx 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vcvt2ps2phx 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x45,0x27,0x67,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvt2ps2phx 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vcvt2ps2phx (%rip){1to8}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x30,0x67,0x35,0x00,0x00,0x00,0x00] + vcvt2ps2phx (%rip){1to8}, %ymm23, %ymm22 + +// CHECK: vcvt2ps2phx -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x20,0x67,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvt2ps2phx -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vcvt2ps2phx 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0xa7,0x67,0x71,0x7f] + vcvt2ps2phx 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vcvt2ps2phx -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0xb7,0x67,0x72,0x80] + vcvt2ps2phx -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vcvt2ps2phx 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x45,0x00,0x67,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvt2ps2phx 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vcvt2ps2phx 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x45,0x07,0x67,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvt2ps2phx 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vcvt2ps2phx (%rip){1to4}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x45,0x10,0x67,0x35,0x00,0x00,0x00,0x00] + vcvt2ps2phx (%rip){1to4}, %xmm23, %xmm22 + +// CHECK: vcvt2ps2phx -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: 
[0x62,0xe2,0x45,0x00,0x67,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvt2ps2phx -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vcvt2ps2phx 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0x87,0x67,0x71,0x7f] + vcvt2ps2phx 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvt2ps2phx -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x45,0x97,0x67,0x72,0x80] + vcvt2ps2phx -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 %zmm24, %zmm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x44,0x40,0x74,0xf0] + vcvtbiasph2bf8 %zmm24, %zmm23, %ymm22 + +// CHECK: vcvtbiasph2bf8 %zmm24, %zmm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x44,0x47,0x74,0xf0] + vcvtbiasph2bf8 %zmm24, %zmm23, %ymm22 {%k7} + +// CHECK: vcvtbiasph2bf8 %zmm24, %zmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0xc7,0x74,0xf0] + vcvtbiasph2bf8 %zmm24, %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x44,0x00,0x74,0xf0] + vcvtbiasph2bf8 %xmm24, %xmm23, %xmm22 + +// CHECK: vcvtbiasph2bf8 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x44,0x07,0x74,0xf0] + vcvtbiasph2bf8 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2bf8 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0x87,0x74,0xf0] + vcvtbiasph2bf8 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 %ymm24, %ymm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x44,0x20,0x74,0xf0] + vcvtbiasph2bf8 %ymm24, %ymm23, %xmm22 + +// CHECK: vcvtbiasph2bf8 %ymm24, %ymm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x44,0x27,0x74,0xf0] + vcvtbiasph2bf8 %ymm24, %ymm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2bf8 %ymm24, %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x44,0xa7,0x74,0xf0] + vcvtbiasph2bf8 %ymm24, %ymm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 268435456(%rbp,%r14,8), %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + 
vcvtbiasph2bf8 268435456(%rbp,%r14,8), %ymm23, %xmm22 + +// CHECK: vcvtbiasph2bf8 291(%r8,%rax,4), %ymm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8 291(%r8,%rax,4), %ymm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2bf8 (%rip){1to16}, %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x30,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2bf8 (%rip){1to16}, %ymm23, %xmm22 + +// CHECK: vcvtbiasph2bf8 -1024(,%rbp,2), %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2bf8 -1024(,%rbp,2), %ymm23, %xmm22 + +// CHECK: vcvtbiasph2bf8 4064(%rcx), %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0x74,0x71,0x7f] + vcvtbiasph2bf8 4064(%rcx), %ymm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 -256(%rdx){1to16}, %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0x74,0x72,0x80] + vcvtbiasph2bf8 -256(%rdx){1to16}, %ymm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 268435456(%rbp,%r14,8), %zmm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8 268435456(%rbp,%r14,8), %zmm23, %ymm22 + +// CHECK: vcvtbiasph2bf8 291(%r8,%rax,4), %zmm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8 291(%r8,%rax,4), %zmm23, %ymm22 {%k7} + +// CHECK: vcvtbiasph2bf8 (%rip){1to32}, %zmm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x50,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2bf8 (%rip){1to32}, %zmm23, %ymm22 + +// CHECK: vcvtbiasph2bf8 -2048(,%rbp,2), %zmm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2bf8 -2048(,%rbp,2), %zmm23, %ymm22 + +// CHECK: vcvtbiasph2bf8 8128(%rcx), %zmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0x74,0x71,0x7f] + vcvtbiasph2bf8 8128(%rcx), %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 -256(%rdx){1to32}, %zmm23, %ymm22 {%k7} {z} +// 
CHECK: encoding: [0x62,0xe2,0x44,0xd7,0x74,0x72,0x80] + vcvtbiasph2bf8 -256(%rdx){1to32}, %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x44,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vcvtbiasph2bf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x44,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2bf8 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x10,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2bf8 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vcvtbiasph2bf8 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x44,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2bf8 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vcvtbiasph2bf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0x87,0x74,0x71,0x7f] + vcvtbiasph2bf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x44,0x97,0x74,0x72,0x80] + vcvtbiasph2bf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s %zmm24, %zmm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x44,0x40,0x74,0xf0] + vcvtbiasph2bf8s %zmm24, %zmm23, %ymm22 + +// CHECK: vcvtbiasph2bf8s %zmm24, %zmm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x44,0x47,0x74,0xf0] + vcvtbiasph2bf8s %zmm24, %zmm23, %ymm22 {%k7} + +// CHECK: vcvtbiasph2bf8s %zmm24, %zmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x44,0xc7,0x74,0xf0] + vcvtbiasph2bf8s %zmm24, %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x44,0x00,0x74,0xf0] + vcvtbiasph2bf8s %xmm24, %xmm23, %xmm22 + +// CHECK: vcvtbiasph2bf8s %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x44,0x07,0x74,0xf0] + vcvtbiasph2bf8s %xmm24, 
%xmm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2bf8s %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x44,0x87,0x74,0xf0] + vcvtbiasph2bf8s %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s %ymm24, %ymm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x44,0x20,0x74,0xf0] + vcvtbiasph2bf8s %ymm24, %ymm23, %xmm22 + +// CHECK: vcvtbiasph2bf8s %ymm24, %ymm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x44,0x27,0x74,0xf0] + vcvtbiasph2bf8s %ymm24, %ymm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2bf8s %ymm24, %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x44,0xa7,0x74,0xf0] + vcvtbiasph2bf8s %ymm24, %ymm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s 268435456(%rbp,%r14,8), %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x44,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8s 268435456(%rbp,%r14,8), %ymm23, %xmm22 + +// CHECK: vcvtbiasph2bf8s 291(%r8,%rax,4), %ymm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x44,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8s 291(%r8,%rax,4), %ymm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2bf8s (%rip){1to16}, %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x30,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2bf8s (%rip){1to16}, %ymm23, %xmm22 + +// CHECK: vcvtbiasph2bf8s -1024(,%rbp,2), %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2bf8s -1024(,%rbp,2), %ymm23, %xmm22 + +// CHECK: vcvtbiasph2bf8s 4064(%rcx), %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0xa7,0x74,0x71,0x7f] + vcvtbiasph2bf8s 4064(%rcx), %ymm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s -256(%rdx){1to16}, %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0xb7,0x74,0x72,0x80] + vcvtbiasph2bf8s -256(%rdx){1to16}, %ymm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s 268435456(%rbp,%r14,8), %zmm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x44,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8s 268435456(%rbp,%r14,8), %zmm23, 
%ymm22 + +// CHECK: vcvtbiasph2bf8s 291(%r8,%rax,4), %zmm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x44,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8s 291(%r8,%rax,4), %zmm23, %ymm22 {%k7} + +// CHECK: vcvtbiasph2bf8s (%rip){1to32}, %zmm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x50,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2bf8s (%rip){1to32}, %zmm23, %ymm22 + +// CHECK: vcvtbiasph2bf8s -2048(,%rbp,2), %zmm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2bf8s -2048(,%rbp,2), %zmm23, %ymm22 + +// CHECK: vcvtbiasph2bf8s 8128(%rcx), %zmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0xc7,0x74,0x71,0x7f] + vcvtbiasph2bf8s 8128(%rcx), %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s -256(%rdx){1to32}, %zmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0xd7,0x74,0x72,0x80] + vcvtbiasph2bf8s -256(%rdx){1to32}, %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x44,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vcvtbiasph2bf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x44,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2bf8s (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x10,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2bf8s (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vcvtbiasph2bf8s -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2bf8s -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vcvtbiasph2bf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0x87,0x74,0x71,0x7f] + vcvtbiasph2bf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2bf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: 
[0x62,0xe5,0x44,0x97,0x74,0x72,0x80] + vcvtbiasph2bf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 %zmm24, %zmm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x44,0x40,0x18,0xf0] + vcvtbiasph2hf8 %zmm24, %zmm23, %ymm22 + +// CHECK: vcvtbiasph2hf8 %zmm24, %zmm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x44,0x47,0x18,0xf0] + vcvtbiasph2hf8 %zmm24, %zmm23, %ymm22 {%k7} + +// CHECK: vcvtbiasph2hf8 %zmm24, %zmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x44,0xc7,0x18,0xf0] + vcvtbiasph2hf8 %zmm24, %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x44,0x00,0x18,0xf0] + vcvtbiasph2hf8 %xmm24, %xmm23, %xmm22 + +// CHECK: vcvtbiasph2hf8 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x44,0x07,0x18,0xf0] + vcvtbiasph2hf8 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2hf8 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x44,0x87,0x18,0xf0] + vcvtbiasph2hf8 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 %ymm24, %ymm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x44,0x20,0x18,0xf0] + vcvtbiasph2hf8 %ymm24, %ymm23, %xmm22 + +// CHECK: vcvtbiasph2hf8 %ymm24, %ymm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x44,0x27,0x18,0xf0] + vcvtbiasph2hf8 %ymm24, %ymm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2hf8 %ymm24, %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x44,0xa7,0x18,0xf0] + vcvtbiasph2hf8 %ymm24, %ymm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 268435456(%rbp,%r14,8), %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x44,0x20,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8 268435456(%rbp,%r14,8), %ymm23, %xmm22 + +// CHECK: vcvtbiasph2hf8 291(%r8,%rax,4), %ymm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x44,0x27,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8 291(%r8,%rax,4), %ymm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2hf8 (%rip){1to16}, %ymm23, %xmm22 +// CHECK: encoding: 
[0x62,0xe5,0x44,0x30,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2hf8 (%rip){1to16}, %ymm23, %xmm22 + +// CHECK: vcvtbiasph2hf8 -1024(,%rbp,2), %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x20,0x18,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2hf8 -1024(,%rbp,2), %ymm23, %xmm22 + +// CHECK: vcvtbiasph2hf8 4064(%rcx), %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0xa7,0x18,0x71,0x7f] + vcvtbiasph2hf8 4064(%rcx), %ymm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 -256(%rdx){1to16}, %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0xb7,0x18,0x72,0x80] + vcvtbiasph2hf8 -256(%rdx){1to16}, %ymm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 268435456(%rbp,%r14,8), %zmm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x44,0x40,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8 268435456(%rbp,%r14,8), %zmm23, %ymm22 + +// CHECK: vcvtbiasph2hf8 291(%r8,%rax,4), %zmm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x44,0x47,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8 291(%r8,%rax,4), %zmm23, %ymm22 {%k7} + +// CHECK: vcvtbiasph2hf8 (%rip){1to32}, %zmm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x50,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2hf8 (%rip){1to32}, %zmm23, %ymm22 + +// CHECK: vcvtbiasph2hf8 -2048(,%rbp,2), %zmm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x40,0x18,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2hf8 -2048(,%rbp,2), %zmm23, %ymm22 + +// CHECK: vcvtbiasph2hf8 8128(%rcx), %zmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0xc7,0x18,0x71,0x7f] + vcvtbiasph2hf8 8128(%rcx), %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 -256(%rdx){1to32}, %zmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0xd7,0x18,0x72,0x80] + vcvtbiasph2hf8 -256(%rdx){1to32}, %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x44,0x00,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: 
vcvtbiasph2hf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x44,0x07,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2hf8 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x10,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2hf8 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vcvtbiasph2hf8 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x00,0x18,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2hf8 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vcvtbiasph2hf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0x87,0x18,0x71,0x7f] + vcvtbiasph2hf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0x97,0x18,0x72,0x80] + vcvtbiasph2hf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s %zmm24, %zmm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x44,0x40,0x1b,0xf0] + vcvtbiasph2hf8s %zmm24, %zmm23, %ymm22 + +// CHECK: vcvtbiasph2hf8s %zmm24, %zmm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x44,0x47,0x1b,0xf0] + vcvtbiasph2hf8s %zmm24, %zmm23, %ymm22 {%k7} + +// CHECK: vcvtbiasph2hf8s %zmm24, %zmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x44,0xc7,0x1b,0xf0] + vcvtbiasph2hf8s %zmm24, %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x44,0x00,0x1b,0xf0] + vcvtbiasph2hf8s %xmm24, %xmm23, %xmm22 + +// CHECK: vcvtbiasph2hf8s %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x44,0x07,0x1b,0xf0] + vcvtbiasph2hf8s %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2hf8s %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x44,0x87,0x1b,0xf0] + vcvtbiasph2hf8s %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s %ymm24, %ymm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x44,0x20,0x1b,0xf0] + vcvtbiasph2hf8s %ymm24, %ymm23, %xmm22 + 
+// CHECK: vcvtbiasph2hf8s %ymm24, %ymm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x44,0x27,0x1b,0xf0] + vcvtbiasph2hf8s %ymm24, %ymm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2hf8s %ymm24, %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x44,0xa7,0x1b,0xf0] + vcvtbiasph2hf8s %ymm24, %ymm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s 268435456(%rbp,%r14,8), %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x44,0x20,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8s 268435456(%rbp,%r14,8), %ymm23, %xmm22 + +// CHECK: vcvtbiasph2hf8s 291(%r8,%rax,4), %ymm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x44,0x27,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8s 291(%r8,%rax,4), %ymm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2hf8s (%rip){1to16}, %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x30,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2hf8s (%rip){1to16}, %ymm23, %xmm22 + +// CHECK: vcvtbiasph2hf8s -1024(,%rbp,2), %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x20,0x1b,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2hf8s -1024(,%rbp,2), %ymm23, %xmm22 + +// CHECK: vcvtbiasph2hf8s 4064(%rcx), %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0xa7,0x1b,0x71,0x7f] + vcvtbiasph2hf8s 4064(%rcx), %ymm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s -256(%rdx){1to16}, %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0xb7,0x1b,0x72,0x80] + vcvtbiasph2hf8s -256(%rdx){1to16}, %ymm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s 268435456(%rbp,%r14,8), %zmm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x44,0x40,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8s 268435456(%rbp,%r14,8), %zmm23, %ymm22 + +// CHECK: vcvtbiasph2hf8s 291(%r8,%rax,4), %zmm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x44,0x47,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8s 291(%r8,%rax,4), %zmm23, %ymm22 {%k7} + +// CHECK: vcvtbiasph2hf8s (%rip){1to32}, %zmm23, %ymm22 +// CHECK: encoding: 
[0x62,0xe5,0x44,0x50,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2hf8s (%rip){1to32}, %zmm23, %ymm22 + +// CHECK: vcvtbiasph2hf8s -2048(,%rbp,2), %zmm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x40,0x1b,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2hf8s -2048(,%rbp,2), %zmm23, %ymm22 + +// CHECK: vcvtbiasph2hf8s 8128(%rcx), %zmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0xc7,0x1b,0x71,0x7f] + vcvtbiasph2hf8s 8128(%rcx), %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s -256(%rdx){1to32}, %zmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0xd7,0x1b,0x72,0x80] + vcvtbiasph2hf8s -256(%rdx){1to32}, %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x44,0x00,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vcvtbiasph2hf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x44,0x07,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vcvtbiasph2hf8s (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x10,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2hf8s (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vcvtbiasph2hf8s -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x44,0x00,0x1b,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2hf8s -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vcvtbiasph2hf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0x87,0x1b,0x71,0x7f] + vcvtbiasph2hf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtbiasph2hf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x44,0x97,0x1b,0x72,0x80] + vcvtbiasph2hf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvthf82ph %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x1e,0xf7] + vcvthf82ph %xmm23, %xmm22 + +// CHECK: vcvthf82ph %xmm23, %xmm22 {%k7} +// CHECK: encoding: 
[0x62,0xa5,0x7f,0x0f,0x1e,0xf7] + vcvthf82ph %xmm23, %xmm22 {%k7} + +// CHECK: vcvthf82ph %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7f,0x8f,0x1e,0xf7] + vcvthf82ph %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvthf82ph %xmm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x7f,0x28,0x1e,0xf7] + vcvthf82ph %xmm23, %ymm22 + +// CHECK: vcvthf82ph %xmm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7f,0x2f,0x1e,0xf7] + vcvthf82ph %xmm23, %ymm22 {%k7} + +// CHECK: vcvthf82ph %xmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7f,0xaf,0x1e,0xf7] + vcvthf82ph %xmm23, %ymm22 {%k7} {z} + +// CHECK: vcvthf82ph %ymm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x7f,0x48,0x1e,0xf7] + vcvthf82ph %ymm23, %zmm22 + +// CHECK: vcvthf82ph %ymm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7f,0x4f,0x1e,0xf7] + vcvthf82ph %ymm23, %zmm22 {%k7} + +// CHECK: vcvthf82ph %ymm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7f,0xcf,0x1e,0xf7] + vcvthf82ph %ymm23, %zmm22 {%k7} {z} + +// CHECK: vcvthf82ph 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x1e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvthf82ph 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vcvthf82ph 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7f,0x0f,0x1e,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvthf82ph 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vcvthf82ph (%rip), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x1e,0x35,0x00,0x00,0x00,0x00] + vcvthf82ph (%rip), %xmm22 + +// CHECK: vcvthf82ph -256(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x1e,0x34,0x6d,0x00,0xff,0xff,0xff] + vcvthf82ph -256(,%rbp,2), %xmm22 + +// CHECK: vcvthf82ph 1016(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7f,0x8f,0x1e,0x71,0x7f] + vcvthf82ph 1016(%rcx), %xmm22 {%k7} {z} + +// CHECK: vcvthf82ph -1024(%rdx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7f,0x8f,0x1e,0x72,0x80] + vcvthf82ph -1024(%rdx), %xmm22 {%k7} {z} + +// CHECK: vcvthf82ph 268435456(%rbp,%r14,8), 
%ymm22 +// CHECK: encoding: [0x62,0xa5,0x7f,0x28,0x1e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvthf82ph 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vcvthf82ph 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7f,0x2f,0x1e,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvthf82ph 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vcvthf82ph (%rip), %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7f,0x28,0x1e,0x35,0x00,0x00,0x00,0x00] + vcvthf82ph (%rip), %ymm22 + +// CHECK: vcvthf82ph -512(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7f,0x28,0x1e,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvthf82ph -512(,%rbp,2), %ymm22 + +// CHECK: vcvthf82ph 2032(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7f,0xaf,0x1e,0x71,0x7f] + vcvthf82ph 2032(%rcx), %ymm22 {%k7} {z} + +// CHECK: vcvthf82ph -2048(%rdx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7f,0xaf,0x1e,0x72,0x80] + vcvthf82ph -2048(%rdx), %ymm22 {%k7} {z} + +// CHECK: vcvthf82ph 268435456(%rbp,%r14,8), %zmm22 +// CHECK: encoding: [0x62,0xa5,0x7f,0x48,0x1e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvthf82ph 268435456(%rbp,%r14,8), %zmm22 + +// CHECK: vcvthf82ph 291(%r8,%rax,4), %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7f,0x4f,0x1e,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvthf82ph 291(%r8,%rax,4), %zmm22 {%k7} + +// CHECK: vcvthf82ph (%rip), %zmm22 +// CHECK: encoding: [0x62,0xe5,0x7f,0x48,0x1e,0x35,0x00,0x00,0x00,0x00] + vcvthf82ph (%rip), %zmm22 + +// CHECK: vcvthf82ph -1024(,%rbp,2), %zmm22 +// CHECK: encoding: [0x62,0xe5,0x7f,0x48,0x1e,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvthf82ph -1024(,%rbp,2), %zmm22 + +// CHECK: vcvthf82ph 4064(%rcx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7f,0xcf,0x1e,0x71,0x7f] + vcvthf82ph 4064(%rcx), %zmm22 {%k7} {z} + +// CHECK: vcvthf82ph -4096(%rdx), %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7f,0xcf,0x1e,0x72,0x80] + vcvthf82ph -4096(%rdx), %zmm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x82,0x47,0x20,0x74,0xf0] + vcvtne2ph2bf8 %ymm24, 
%ymm23, %ymm22 + +// CHECK: vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x47,0x27,0x74,0xf0] + vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x47,0xa7,0x74,0xf0] + vcvtne2ph2bf8 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x82,0x47,0x40,0x74,0xf0] + vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 + +// CHECK: vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x47,0x47,0x74,0xf0] + vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x47,0xc7,0x74,0xf0] + vcvtne2ph2bf8 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x82,0x47,0x00,0x74,0xf0] + vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 + +// CHECK: vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x82,0x47,0x07,0x74,0xf0] + vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x47,0x87,0x74,0xf0] + vcvtne2ph2bf8 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa2,0x47,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vcvtne2ph2bf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x47,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vcvtne2ph2bf8 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x50,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2bf8 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vcvtne2ph2bf8 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2bf8 -2048(,%rbp,2), %zmm23, 
%zmm22 + +// CHECK: vcvtne2ph2bf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0xc7,0x74,0x71,0x7f] + vcvtne2ph2bf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0xd7,0x74,0x72,0x80] + vcvtne2ph2bf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x47,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vcvtne2ph2bf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x47,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vcvtne2ph2bf8 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x30,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2bf8 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vcvtne2ph2bf8 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2bf8 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vcvtne2ph2bf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0xa7,0x74,0x71,0x7f] + vcvtne2ph2bf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0xb7,0x74,0x72,0x80] + vcvtne2ph2bf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x47,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vcvtne2ph2bf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x47,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vcvtne2ph2bf8 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: 
[0x62,0xe2,0x47,0x10,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2bf8 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vcvtne2ph2bf8 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x47,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2bf8 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vcvtne2ph2bf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0x87,0x74,0x71,0x7f] + vcvtne2ph2bf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x47,0x97,0x74,0x72,0x80] + vcvtne2ph2bf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x47,0x20,0x74,0xf0] + vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 + +// CHECK: vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x47,0x27,0x74,0xf0] + vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x47,0xa7,0x74,0xf0] + vcvtne2ph2bf8s %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x85,0x47,0x40,0x74,0xf0] + vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 + +// CHECK: vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x47,0x47,0x74,0xf0] + vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x47,0xc7,0x74,0xf0] + vcvtne2ph2bf8s %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x47,0x00,0x74,0xf0] + vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 + +// CHECK: vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x47,0x07,0x74,0xf0] + vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vcvtne2ph2bf8s %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x47,0x87,0x74,0xf0] + vcvtne2ph2bf8s 
%xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x47,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vcvtne2ph2bf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x47,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vcvtne2ph2bf8s (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x50,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2bf8s (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vcvtne2ph2bf8s -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2bf8s -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vcvtne2ph2bf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0xc7,0x74,0x71,0x7f] + vcvtne2ph2bf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0xd7,0x74,0x72,0x80] + vcvtne2ph2bf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x47,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vcvtne2ph2bf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x47,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vcvtne2ph2bf8s (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x30,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2bf8s (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vcvtne2ph2bf8s -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2bf8s -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vcvtne2ph2bf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// 
CHECK: encoding: [0x62,0xe5,0x47,0xa7,0x74,0x71,0x7f] + vcvtne2ph2bf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0xb7,0x74,0x72,0x80] + vcvtne2ph2bf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x47,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vcvtne2ph2bf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x47,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vcvtne2ph2bf8s (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x10,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2bf8s (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vcvtne2ph2bf8s -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2bf8s -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vcvtne2ph2bf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0x87,0x74,0x71,0x7f] + vcvtne2ph2bf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtne2ph2bf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0x97,0x74,0x72,0x80] + vcvtne2ph2bf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x47,0x20,0x18,0xf0] + vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 + +// CHECK: vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x47,0x27,0x18,0xf0] + vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x47,0xa7,0x18,0xf0] + vcvtne2ph2hf8 %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x85,0x47,0x40,0x18,0xf0] + 
vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 + +// CHECK: vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x47,0x47,0x18,0xf0] + vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x47,0xc7,0x18,0xf0] + vcvtne2ph2hf8 %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x47,0x00,0x18,0xf0] + vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 + +// CHECK: vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x47,0x07,0x18,0xf0] + vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x47,0x87,0x18,0xf0] + vcvtne2ph2hf8 %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x47,0x40,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vcvtne2ph2hf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x47,0x47,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vcvtne2ph2hf8 (%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x50,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2hf8 (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vcvtne2ph2hf8 -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x40,0x18,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2hf8 -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vcvtne2ph2hf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0xc7,0x18,0x71,0x7f] + vcvtne2ph2hf8 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0xd7,0x18,0x72,0x80] + vcvtne2ph2hf8 -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// 
CHECK: encoding: [0x62,0xa5,0x47,0x20,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vcvtne2ph2hf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x47,0x27,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vcvtne2ph2hf8 (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x30,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2hf8 (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vcvtne2ph2hf8 -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x20,0x18,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2hf8 -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vcvtne2ph2hf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0xa7,0x18,0x71,0x7f] + vcvtne2ph2hf8 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0xb7,0x18,0x72,0x80] + vcvtne2ph2hf8 -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x47,0x00,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8 268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vcvtne2ph2hf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x47,0x07,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vcvtne2ph2hf8 (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x10,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2hf8 (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vcvtne2ph2hf8 -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x00,0x18,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2hf8 -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vcvtne2ph2hf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0x87,0x18,0x71,0x7f] + vcvtne2ph2hf8 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: 
vcvtne2ph2hf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0x97,0x18,0x72,0x80] + vcvtne2ph2hf8 -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0x85,0x47,0x20,0x1b,0xf0] + vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 + +// CHECK: vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x47,0x27,0x1b,0xf0] + vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} + +// CHECK: vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x47,0xa7,0x1b,0xf0] + vcvtne2ph2hf8s %ymm24, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0x85,0x47,0x40,0x1b,0xf0] + vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 + +// CHECK: vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x47,0x47,0x1b,0xf0] + vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} + +// CHECK: vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x47,0xc7,0x1b,0xf0] + vcvtne2ph2hf8s %zmm24, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0x85,0x47,0x00,0x1b,0xf0] + vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 + +// CHECK: vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0x85,0x47,0x07,0x1b,0xf0] + vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} + +// CHECK: vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0x85,0x47,0x87,0x1b,0xf0] + vcvtne2ph2hf8s %xmm24, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xa5,0x47,0x40,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %zmm23, %zmm22 + +// CHECK: vcvtne2ph2hf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x47,0x47,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8s 291(%r8,%rax,4), %zmm23, %zmm22 {%k7} + +// CHECK: vcvtne2ph2hf8s 
(%rip){1to32}, %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x50,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2hf8s (%rip){1to32}, %zmm23, %zmm22 + +// CHECK: vcvtne2ph2hf8s -2048(,%rbp,2), %zmm23, %zmm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x40,0x1b,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2hf8s -2048(,%rbp,2), %zmm23, %zmm22 + +// CHECK: vcvtne2ph2hf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0xc7,0x1b,0x71,0x7f] + vcvtne2ph2hf8s 8128(%rcx), %zmm23, %zmm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0xd7,0x1b,0x72,0x80] + vcvtne2ph2hf8s -256(%rdx){1to32}, %zmm23, %zmm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x47,0x20,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %ymm23, %ymm22 + +// CHECK: vcvtne2ph2hf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x47,0x27,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8s 291(%r8,%rax,4), %ymm23, %ymm22 {%k7} + +// CHECK: vcvtne2ph2hf8s (%rip){1to16}, %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x30,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2hf8s (%rip){1to16}, %ymm23, %ymm22 + +// CHECK: vcvtne2ph2hf8s -1024(,%rbp,2), %ymm23, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x20,0x1b,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2hf8s -1024(,%rbp,2), %ymm23, %ymm22 + +// CHECK: vcvtne2ph2hf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0xa7,0x1b,0x71,0x7f] + vcvtne2ph2hf8s 4064(%rcx), %ymm23, %ymm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0xb7,0x1b,0x72,0x80] + vcvtne2ph2hf8s -256(%rdx){1to16}, %ymm23, %ymm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s 268435456(%rbp,%r14,8), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x47,0x00,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8s 
268435456(%rbp,%r14,8), %xmm23, %xmm22 + +// CHECK: vcvtne2ph2hf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x47,0x07,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8s 291(%r8,%rax,4), %xmm23, %xmm22 {%k7} + +// CHECK: vcvtne2ph2hf8s (%rip){1to8}, %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x10,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2hf8s (%rip){1to8}, %xmm23, %xmm22 + +// CHECK: vcvtne2ph2hf8s -512(,%rbp,2), %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x47,0x00,0x1b,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2hf8s -512(,%rbp,2), %xmm23, %xmm22 + +// CHECK: vcvtne2ph2hf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0x87,0x1b,0x71,0x7f] + vcvtne2ph2hf8s 2032(%rcx), %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtne2ph2hf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x47,0x97,0x1b,0x72,0x80] + vcvtne2ph2hf8s -256(%rdx){1to8}, %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2bf8 %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x74,0xf7] + vcvtneph2bf8 %xmm23, %xmm22 + +// CHECK: vcvtneph2bf8 %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa2,0x7e,0x0f,0x74,0xf7] + vcvtneph2bf8 %xmm23, %xmm22 {%k7} + +// CHECK: vcvtneph2bf8 %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0x8f,0x74,0xf7] + vcvtneph2bf8 %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2bf8 %zmm23, %ymm22 +// CHECK: encoding: [0x62,0xa2,0x7e,0x48,0x74,0xf7] + vcvtneph2bf8 %zmm23, %ymm22 + +// CHECK: vcvtneph2bf8 %zmm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa2,0x7e,0x4f,0x74,0xf7] + vcvtneph2bf8 %zmm23, %ymm22 {%k7} + +// CHECK: vcvtneph2bf8 %zmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xcf,0x74,0xf7] + vcvtneph2bf8 %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtneph2bf8 %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x74,0xf7] + vcvtneph2bf8 %ymm23, %xmm22 + +// CHECK: vcvtneph2bf8 %ymm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2f,0x74,0xf7] + 
vcvtneph2bf8 %ymm23, %xmm22 {%k7} + +// CHECK: vcvtneph2bf8 %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xaf,0x74,0xf7] + vcvtneph2bf8 %ymm23, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2bf8x 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2bf8x 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vcvtneph2bf8x 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x7e,0x0f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtneph2bf8x 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vcvtneph2bf8 (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x7e,0x18,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtneph2bf8 (%rip){1to8}, %xmm22 + +// CHECK: vcvtneph2bf8x -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2bf8x -512(,%rbp,2), %xmm22 + +// CHECK: vcvtneph2bf8x 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x7e,0x8f,0x74,0x71,0x7f] + vcvtneph2bf8x 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vcvtneph2bf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x7e,0x9f,0x74,0x72,0x80] + vcvtneph2bf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2bf8 (%rip){1to16}, %xmm22 +// CHECK: encoding: [0x62,0xe2,0x7e,0x38,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtneph2bf8 (%rip){1to16}, %xmm22 + +// CHECK: vcvtneph2bf8y -1024(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2bf8y -1024(,%rbp,2), %xmm22 + +// CHECK: vcvtneph2bf8y 4064(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x7e,0xaf,0x74,0x71,0x7f] + vcvtneph2bf8y 4064(%rcx), %xmm22 {%k7} {z} + +// CHECK: vcvtneph2bf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x7e,0xbf,0x74,0x72,0x80] + vcvtneph2bf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2bf8 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa2,0x7e,0x48,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2bf8 
268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vcvtneph2bf8 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc2,0x7e,0x4f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtneph2bf8 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vcvtneph2bf8 (%rip){1to32}, %ymm22 +// CHECK: encoding: [0x62,0xe2,0x7e,0x58,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtneph2bf8 (%rip){1to32}, %ymm22 + +// CHECK: vcvtneph2bf8 -2048(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe2,0x7e,0x48,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2bf8 -2048(,%rbp,2), %ymm22 + +// CHECK: vcvtneph2bf8 8128(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x7e,0xcf,0x74,0x71,0x7f] + vcvtneph2bf8 8128(%rcx), %ymm22 {%k7} {z} + +// CHECK: vcvtneph2bf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe2,0x7e,0xdf,0x74,0x72,0x80] + vcvtneph2bf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} + +// CHECK: vcvtneph2bf8s %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x74,0xf7] + vcvtneph2bf8s %xmm23, %xmm22 + +// CHECK: vcvtneph2bf8s %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7e,0x0f,0x74,0xf7] + vcvtneph2bf8s %xmm23, %xmm22 {%k7} + +// CHECK: vcvtneph2bf8s %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7e,0x8f,0x74,0xf7] + vcvtneph2bf8s %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2bf8s %zmm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x74,0xf7] + vcvtneph2bf8s %zmm23, %ymm22 + +// CHECK: vcvtneph2bf8s %zmm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7e,0x4f,0x74,0xf7] + vcvtneph2bf8s %zmm23, %ymm22 {%k7} + +// CHECK: vcvtneph2bf8s %zmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7e,0xcf,0x74,0xf7] + vcvtneph2bf8s %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtneph2bf8s %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7e,0x28,0x74,0xf7] + vcvtneph2bf8s %ymm23, %xmm22 + +// CHECK: vcvtneph2bf8s %ymm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7e,0x2f,0x74,0xf7] + vcvtneph2bf8s %ymm23, %xmm22 {%k7} + +// CHECK: vcvtneph2bf8s %ymm23, %xmm22 {%k7} {z} 
+// CHECK: encoding: [0x62,0xa5,0x7e,0xaf,0x74,0xf7] + vcvtneph2bf8s %ymm23, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2bf8sx 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2bf8sx 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vcvtneph2bf8sx 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7e,0x0f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtneph2bf8sx 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vcvtneph2bf8s (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x18,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtneph2bf8s (%rip){1to8}, %xmm22 + +// CHECK: vcvtneph2bf8sx -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2bf8sx -512(,%rbp,2), %xmm22 + +// CHECK: vcvtneph2bf8sx 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0x8f,0x74,0x71,0x7f] + vcvtneph2bf8sx 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vcvtneph2bf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0x9f,0x74,0x72,0x80] + vcvtneph2bf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2bf8s (%rip){1to16}, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x38,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtneph2bf8s (%rip){1to16}, %xmm22 + +// CHECK: vcvtneph2bf8sy -1024(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x28,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2bf8sy -1024(,%rbp,2), %xmm22 + +// CHECK: vcvtneph2bf8sy 4064(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0xaf,0x74,0x71,0x7f] + vcvtneph2bf8sy 4064(%rcx), %xmm22 {%k7} {z} + +// CHECK: vcvtneph2bf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0xbf,0x74,0x72,0x80] + vcvtneph2bf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2bf8s 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2bf8s 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vcvtneph2bf8s 
291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7e,0x4f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtneph2bf8s 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vcvtneph2bf8s (%rip){1to32}, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x58,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtneph2bf8s (%rip){1to32}, %ymm22 + +// CHECK: vcvtneph2bf8s -2048(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x48,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2bf8s -2048(,%rbp,2), %ymm22 + +// CHECK: vcvtneph2bf8s 8128(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0xcf,0x74,0x71,0x7f] + vcvtneph2bf8s 8128(%rcx), %ymm22 {%k7} {z} + +// CHECK: vcvtneph2bf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0xdf,0x74,0x72,0x80] + vcvtneph2bf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} + +// CHECK: vcvtneph2hf8 %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x18,0xf7] + vcvtneph2hf8 %xmm23, %xmm22 + +// CHECK: vcvtneph2hf8 %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7e,0x0f,0x18,0xf7] + vcvtneph2hf8 %xmm23, %xmm22 {%k7} + +// CHECK: vcvtneph2hf8 %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7e,0x8f,0x18,0xf7] + vcvtneph2hf8 %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2hf8 %zmm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x18,0xf7] + vcvtneph2hf8 %zmm23, %ymm22 + +// CHECK: vcvtneph2hf8 %zmm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7e,0x4f,0x18,0xf7] + vcvtneph2hf8 %zmm23, %ymm22 {%k7} + +// CHECK: vcvtneph2hf8 %zmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7e,0xcf,0x18,0xf7] + vcvtneph2hf8 %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtneph2hf8 %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7e,0x28,0x18,0xf7] + vcvtneph2hf8 %ymm23, %xmm22 + +// CHECK: vcvtneph2hf8 %ymm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7e,0x2f,0x18,0xf7] + vcvtneph2hf8 %ymm23, %xmm22 {%k7} + +// CHECK: vcvtneph2hf8 %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7e,0xaf,0x18,0xf7] + vcvtneph2hf8 
%ymm23, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2hf8x 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2hf8x 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vcvtneph2hf8x 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7e,0x0f,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtneph2hf8x 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vcvtneph2hf8 (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x18,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtneph2hf8 (%rip){1to8}, %xmm22 + +// CHECK: vcvtneph2hf8x -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x18,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2hf8x -512(,%rbp,2), %xmm22 + +// CHECK: vcvtneph2hf8x 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0x8f,0x18,0x71,0x7f] + vcvtneph2hf8x 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vcvtneph2hf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0x9f,0x18,0x72,0x80] + vcvtneph2hf8 -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2hf8 (%rip){1to16}, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x38,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtneph2hf8 (%rip){1to16}, %xmm22 + +// CHECK: vcvtneph2hf8y -1024(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x28,0x18,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2hf8y -1024(,%rbp,2), %xmm22 + +// CHECK: vcvtneph2hf8y 4064(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0xaf,0x18,0x71,0x7f] + vcvtneph2hf8y 4064(%rcx), %xmm22 {%k7} {z} + +// CHECK: vcvtneph2hf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0xbf,0x18,0x72,0x80] + vcvtneph2hf8 -256(%rdx){1to16}, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2hf8 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2hf8 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vcvtneph2hf8 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: 
[0x62,0xc5,0x7e,0x4f,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtneph2hf8 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vcvtneph2hf8 (%rip){1to32}, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x58,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtneph2hf8 (%rip){1to32}, %ymm22 + +// CHECK: vcvtneph2hf8 -2048(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x48,0x18,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2hf8 -2048(,%rbp,2), %ymm22 + +// CHECK: vcvtneph2hf8 8128(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0xcf,0x18,0x71,0x7f] + vcvtneph2hf8 8128(%rcx), %ymm22 {%k7} {z} + +// CHECK: vcvtneph2hf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0xdf,0x18,0x72,0x80] + vcvtneph2hf8 -256(%rdx){1to32}, %ymm22 {%k7} {z} + +// CHECK: vcvtneph2hf8s %xmm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x1b,0xf7] + vcvtneph2hf8s %xmm23, %xmm22 + +// CHECK: vcvtneph2hf8s %xmm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7e,0x0f,0x1b,0xf7] + vcvtneph2hf8s %xmm23, %xmm22 {%k7} + +// CHECK: vcvtneph2hf8s %xmm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7e,0x8f,0x1b,0xf7] + vcvtneph2hf8s %xmm23, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2hf8s %zmm23, %ymm22 +// CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x1b,0xf7] + vcvtneph2hf8s %zmm23, %ymm22 + +// CHECK: vcvtneph2hf8s %zmm23, %ymm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7e,0x4f,0x1b,0xf7] + vcvtneph2hf8s %zmm23, %ymm22 {%k7} + +// CHECK: vcvtneph2hf8s %zmm23, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7e,0xcf,0x1b,0xf7] + vcvtneph2hf8s %zmm23, %ymm22 {%k7} {z} + +// CHECK: vcvtneph2hf8s %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7e,0x28,0x1b,0xf7] + vcvtneph2hf8s %ymm23, %xmm22 + +// CHECK: vcvtneph2hf8s %ymm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa5,0x7e,0x2f,0x1b,0xf7] + vcvtneph2hf8s %ymm23, %xmm22 {%k7} + +// CHECK: vcvtneph2hf8s %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa5,0x7e,0xaf,0x1b,0xf7] + vcvtneph2hf8s %ymm23, %xmm22 {%k7} {z} + +// CHECK: 
vcvtneph2hf8sx 268435456(%rbp,%r14,8), %xmm22 +// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2hf8sx 268435456(%rbp,%r14,8), %xmm22 + +// CHECK: vcvtneph2hf8sx 291(%r8,%rax,4), %xmm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7e,0x0f,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtneph2hf8sx 291(%r8,%rax,4), %xmm22 {%k7} + +// CHECK: vcvtneph2hf8s (%rip){1to8}, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x18,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtneph2hf8s (%rip){1to8}, %xmm22 + +// CHECK: vcvtneph2hf8sx -512(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x1b,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2hf8sx -512(,%rbp,2), %xmm22 + +// CHECK: vcvtneph2hf8sx 2032(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0x8f,0x1b,0x71,0x7f] + vcvtneph2hf8sx 2032(%rcx), %xmm22 {%k7} {z} + +// CHECK: vcvtneph2hf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0x9f,0x1b,0x72,0x80] + vcvtneph2hf8s -256(%rdx){1to8}, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2hf8s (%rip){1to16}, %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x38,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtneph2hf8s (%rip){1to16}, %xmm22 + +// CHECK: vcvtneph2hf8sy -1024(,%rbp,2), %xmm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x28,0x1b,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2hf8sy -1024(,%rbp,2), %xmm22 + +// CHECK: vcvtneph2hf8sy 4064(%rcx), %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0xaf,0x1b,0x71,0x7f] + vcvtneph2hf8sy 4064(%rcx), %xmm22 {%k7} {z} + +// CHECK: vcvtneph2hf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0xbf,0x1b,0x72,0x80] + vcvtneph2hf8s -256(%rdx){1to16}, %xmm22 {%k7} {z} + +// CHECK: vcvtneph2hf8s 268435456(%rbp,%r14,8), %ymm22 +// CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2hf8s 268435456(%rbp,%r14,8), %ymm22 + +// CHECK: vcvtneph2hf8s 291(%r8,%rax,4), %ymm22 {%k7} +// CHECK: encoding: [0x62,0xc5,0x7e,0x4f,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + 
vcvtneph2hf8s 291(%r8,%rax,4), %ymm22 {%k7} + +// CHECK: vcvtneph2hf8s (%rip){1to32}, %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x58,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtneph2hf8s (%rip){1to32}, %ymm22 + +// CHECK: vcvtneph2hf8s -2048(,%rbp,2), %ymm22 +// CHECK: encoding: [0x62,0xe5,0x7e,0x48,0x1b,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2hf8s -2048(,%rbp,2), %ymm22 + +// CHECK: vcvtneph2hf8s 8128(%rcx), %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0xcf,0x1b,0x71,0x7f] + vcvtneph2hf8s 8128(%rcx), %ymm22 {%k7} {z} + +// CHECK: vcvtneph2hf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} +// CHECK: encoding: [0x62,0xe5,0x7e,0xdf,0x1b,0x72,0x80] + vcvtneph2hf8s -256(%rdx){1to32}, %ymm22 {%k7} {z} + diff --git a/llvm/test/MC/X86/avx10.2convert-64-intel.s b/llvm/test/MC/X86/avx10.2convert-64-intel.s new file mode 100644 index 00000000000000..2f0cd1b2809357 --- /dev/null +++ b/llvm/test/MC/X86/avx10.2convert-64-intel.s @@ -0,0 +1,1490 @@ +// RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vcvt2ps2phx ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x82,0x45,0x20,0x67,0xf0] + vcvt2ps2phx ymm22, ymm23, ymm24 + +// CHECK: vcvt2ps2phx ymm22, ymm23, ymm24, {rn-sae} +// CHECK: encoding: [0x62,0x82,0x41,0x10,0x67,0xf0] + vcvt2ps2phx ymm22, ymm23, ymm24, {rn-sae} + +// CHECK: vcvt2ps2phx ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x82,0x45,0x27,0x67,0xf0] + vcvt2ps2phx ymm22 {k7}, ymm23, ymm24 + +// CHECK: vcvt2ps2phx ymm22 {k7} {z}, ymm23, ymm24, {rz-sae} +// CHECK: encoding: [0x62,0x82,0x41,0xf7,0x67,0xf0] + vcvt2ps2phx ymm22 {k7} {z}, ymm23, ymm24, {rz-sae} + +// CHECK: vcvt2ps2phx zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x82,0x45,0x40,0x67,0xf0] + vcvt2ps2phx zmm22, zmm23, zmm24 + +// CHECK: vcvt2ps2phx zmm22, zmm23, zmm24, {rn-sae} +// CHECK: encoding: [0x62,0x82,0x45,0x10,0x67,0xf0] + vcvt2ps2phx zmm22, zmm23, zmm24, {rn-sae} + +// CHECK: vcvt2ps2phx zmm22 {k7}, zmm23, zmm24 +// CHECK: 
encoding: [0x62,0x82,0x45,0x47,0x67,0xf0] + vcvt2ps2phx zmm22 {k7}, zmm23, zmm24 + +// CHECK: vcvt2ps2phx zmm22 {k7} {z}, zmm23, zmm24, {rz-sae} +// CHECK: encoding: [0x62,0x82,0x45,0xf7,0x67,0xf0] + vcvt2ps2phx zmm22 {k7} {z}, zmm23, zmm24, {rz-sae} + +// CHECK: vcvt2ps2phx xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x82,0x45,0x00,0x67,0xf0] + vcvt2ps2phx xmm22, xmm23, xmm24 + +// CHECK: vcvt2ps2phx xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x82,0x45,0x07,0x67,0xf0] + vcvt2ps2phx xmm22 {k7}, xmm23, xmm24 + +// CHECK: vcvt2ps2phx xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x82,0x45,0x87,0x67,0xf0] + vcvt2ps2phx xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vcvt2ps2phx zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa2,0x45,0x40,0x67,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvt2ps2phx zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvt2ps2phx zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc2,0x45,0x47,0x67,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvt2ps2phx zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvt2ps2phx zmm22, zmm23, dword ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe2,0x45,0x50,0x67,0x35,0x00,0x00,0x00,0x00] + vcvt2ps2phx zmm22, zmm23, dword ptr [rip]{1to16} + +// CHECK: vcvt2ps2phx zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe2,0x45,0x40,0x67,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvt2ps2phx zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vcvt2ps2phx zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe2,0x45,0xc7,0x67,0x71,0x7f] + vcvt2ps2phx zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vcvt2ps2phx zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} +// CHECK: encoding: [0x62,0xe2,0x45,0xd7,0x67,0x72,0x80] + vcvt2ps2phx zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16} + +// CHECK: vcvt2ps2phx ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: 
[0x62,0xa2,0x45,0x20,0x67,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvt2ps2phx ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvt2ps2phx ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc2,0x45,0x27,0x67,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvt2ps2phx ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvt2ps2phx ymm22, ymm23, dword ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe2,0x45,0x30,0x67,0x35,0x00,0x00,0x00,0x00] + vcvt2ps2phx ymm22, ymm23, dword ptr [rip]{1to8} + +// CHECK: vcvt2ps2phx ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe2,0x45,0x20,0x67,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvt2ps2phx ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vcvt2ps2phx ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe2,0x45,0xa7,0x67,0x71,0x7f] + vcvt2ps2phx ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vcvt2ps2phx ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} +// CHECK: encoding: [0x62,0xe2,0x45,0xb7,0x67,0x72,0x80] + vcvt2ps2phx ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8} + +// CHECK: vcvt2ps2phx xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa2,0x45,0x00,0x67,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvt2ps2phx xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvt2ps2phx xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc2,0x45,0x07,0x67,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvt2ps2phx xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvt2ps2phx xmm22, xmm23, dword ptr [rip]{1to4} +// CHECK: encoding: [0x62,0xe2,0x45,0x10,0x67,0x35,0x00,0x00,0x00,0x00] + vcvt2ps2phx xmm22, xmm23, dword ptr [rip]{1to4} + +// CHECK: vcvt2ps2phx xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x67,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvt2ps2phx xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vcvt2ps2phx xmm22 {k7} {z}, xmm23, xmmword ptr [rcx 
+ 2032] +// CHECK: encoding: [0x62,0xe2,0x45,0x87,0x67,0x71,0x7f] + vcvt2ps2phx xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vcvt2ps2phx xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4} +// CHECK: encoding: [0x62,0xe2,0x45,0x97,0x67,0x72,0x80] + vcvt2ps2phx xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4} + +// CHECK: vcvtbiasph2bf8 ymm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x82,0x44,0x40,0x74,0xf0] + vcvtbiasph2bf8 ymm22, zmm23, zmm24 + +// CHECK: vcvtbiasph2bf8 ymm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x82,0x44,0x47,0x74,0xf0] + vcvtbiasph2bf8 ymm22 {k7}, zmm23, zmm24 + +// CHECK: vcvtbiasph2bf8 ymm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x82,0x44,0xc7,0x74,0xf0] + vcvtbiasph2bf8 ymm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vcvtbiasph2bf8 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x82,0x44,0x00,0x74,0xf0] + vcvtbiasph2bf8 xmm22, xmm23, xmm24 + +// CHECK: vcvtbiasph2bf8 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x82,0x44,0x07,0x74,0xf0] + vcvtbiasph2bf8 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vcvtbiasph2bf8 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x82,0x44,0x87,0x74,0xf0] + vcvtbiasph2bf8 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vcvtbiasph2bf8 xmm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x82,0x44,0x20,0x74,0xf0] + vcvtbiasph2bf8 xmm22, ymm23, ymm24 + +// CHECK: vcvtbiasph2bf8 xmm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x82,0x44,0x27,0x74,0xf0] + vcvtbiasph2bf8 xmm22 {k7}, ymm23, ymm24 + +// CHECK: vcvtbiasph2bf8 xmm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x82,0x44,0xa7,0x74,0xf0] + vcvtbiasph2bf8 xmm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vcvtbiasph2bf8 xmm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa2,0x44,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8 xmm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtbiasph2bf8 xmm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: 
[0x62,0xc2,0x44,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8 xmm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtbiasph2bf8 xmm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe2,0x44,0x30,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2bf8 xmm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vcvtbiasph2bf8 xmm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe2,0x44,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2bf8 xmm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vcvtbiasph2bf8 xmm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0x74,0x71,0x7f] + vcvtbiasph2bf8 xmm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vcvtbiasph2bf8 xmm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0x74,0x72,0x80] + vcvtbiasph2bf8 xmm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vcvtbiasph2bf8 ymm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa2,0x44,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8 ymm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtbiasph2bf8 ymm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc2,0x44,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8 ymm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtbiasph2bf8 ymm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe2,0x44,0x50,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2bf8 ymm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vcvtbiasph2bf8 ymm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe2,0x44,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2bf8 ymm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vcvtbiasph2bf8 ymm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0x74,0x71,0x7f] + vcvtbiasph2bf8 ymm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vcvtbiasph2bf8 
ymm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0x74,0x72,0x80] + vcvtbiasph2bf8 ymm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vcvtbiasph2bf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa2,0x44,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtbiasph2bf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc2,0x44,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtbiasph2bf8 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe2,0x44,0x10,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2bf8 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vcvtbiasph2bf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe2,0x44,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2bf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vcvtbiasph2bf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe2,0x44,0x87,0x74,0x71,0x7f] + vcvtbiasph2bf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vcvtbiasph2bf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe2,0x44,0x97,0x74,0x72,0x80] + vcvtbiasph2bf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vcvtbiasph2bf8s ymm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x44,0x40,0x74,0xf0] + vcvtbiasph2bf8s ymm22, zmm23, zmm24 + +// CHECK: vcvtbiasph2bf8s ymm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x44,0x47,0x74,0xf0] + vcvtbiasph2bf8s ymm22 {k7}, zmm23, zmm24 + +// CHECK: vcvtbiasph2bf8s ymm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x44,0xc7,0x74,0xf0] + vcvtbiasph2bf8s ymm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vcvtbiasph2bf8s xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x44,0x00,0x74,0xf0] + vcvtbiasph2bf8s xmm22, 
xmm23, xmm24 + +// CHECK: vcvtbiasph2bf8s xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x44,0x07,0x74,0xf0] + vcvtbiasph2bf8s xmm22 {k7}, xmm23, xmm24 + +// CHECK: vcvtbiasph2bf8s xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x44,0x87,0x74,0xf0] + vcvtbiasph2bf8s xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vcvtbiasph2bf8s xmm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x44,0x20,0x74,0xf0] + vcvtbiasph2bf8s xmm22, ymm23, ymm24 + +// CHECK: vcvtbiasph2bf8s xmm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x44,0x27,0x74,0xf0] + vcvtbiasph2bf8s xmm22 {k7}, ymm23, ymm24 + +// CHECK: vcvtbiasph2bf8s xmm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x44,0xa7,0x74,0xf0] + vcvtbiasph2bf8s xmm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vcvtbiasph2bf8s xmm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x44,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8s xmm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtbiasph2bf8s xmm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x44,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8s xmm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtbiasph2bf8s xmm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x44,0x30,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2bf8s xmm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vcvtbiasph2bf8s xmm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x44,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2bf8s xmm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vcvtbiasph2bf8s xmm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x44,0xa7,0x74,0x71,0x7f] + vcvtbiasph2bf8s xmm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vcvtbiasph2bf8s xmm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x44,0xb7,0x74,0x72,0x80] + vcvtbiasph2bf8s xmm22 {k7} {z}, 
ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vcvtbiasph2bf8s ymm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x44,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8s ymm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtbiasph2bf8s ymm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x44,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8s ymm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtbiasph2bf8s ymm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x44,0x50,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2bf8s ymm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vcvtbiasph2bf8s ymm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x44,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2bf8s ymm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vcvtbiasph2bf8s ymm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x44,0xc7,0x74,0x71,0x7f] + vcvtbiasph2bf8s ymm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vcvtbiasph2bf8s ymm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x44,0xd7,0x74,0x72,0x80] + vcvtbiasph2bf8s ymm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vcvtbiasph2bf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x44,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2bf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtbiasph2bf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x44,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2bf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtbiasph2bf8s xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x44,0x10,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2bf8s xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vcvtbiasph2bf8s xmm22, xmm23, xmmword ptr 
[2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x44,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2bf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vcvtbiasph2bf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x44,0x87,0x74,0x71,0x7f] + vcvtbiasph2bf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vcvtbiasph2bf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x44,0x97,0x74,0x72,0x80] + vcvtbiasph2bf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vcvtbiasph2hf8 ymm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x44,0x40,0x18,0xf0] + vcvtbiasph2hf8 ymm22, zmm23, zmm24 + +// CHECK: vcvtbiasph2hf8 ymm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x44,0x47,0x18,0xf0] + vcvtbiasph2hf8 ymm22 {k7}, zmm23, zmm24 + +// CHECK: vcvtbiasph2hf8 ymm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x44,0xc7,0x18,0xf0] + vcvtbiasph2hf8 ymm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vcvtbiasph2hf8 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x44,0x00,0x18,0xf0] + vcvtbiasph2hf8 xmm22, xmm23, xmm24 + +// CHECK: vcvtbiasph2hf8 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x44,0x07,0x18,0xf0] + vcvtbiasph2hf8 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vcvtbiasph2hf8 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x44,0x87,0x18,0xf0] + vcvtbiasph2hf8 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vcvtbiasph2hf8 xmm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x44,0x20,0x18,0xf0] + vcvtbiasph2hf8 xmm22, ymm23, ymm24 + +// CHECK: vcvtbiasph2hf8 xmm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x44,0x27,0x18,0xf0] + vcvtbiasph2hf8 xmm22 {k7}, ymm23, ymm24 + +// CHECK: vcvtbiasph2hf8 xmm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x44,0xa7,0x18,0xf0] + vcvtbiasph2hf8 xmm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vcvtbiasph2hf8 xmm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: 
[0x62,0xa5,0x44,0x20,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8 xmm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtbiasph2hf8 xmm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x44,0x27,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8 xmm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtbiasph2hf8 xmm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x44,0x30,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2hf8 xmm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vcvtbiasph2hf8 xmm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x44,0x20,0x18,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2hf8 xmm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vcvtbiasph2hf8 xmm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x44,0xa7,0x18,0x71,0x7f] + vcvtbiasph2hf8 xmm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vcvtbiasph2hf8 xmm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x44,0xb7,0x18,0x72,0x80] + vcvtbiasph2hf8 xmm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vcvtbiasph2hf8 ymm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x44,0x40,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8 ymm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtbiasph2hf8 ymm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x44,0x47,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8 ymm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtbiasph2hf8 ymm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x44,0x50,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2hf8 ymm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vcvtbiasph2hf8 ymm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x44,0x40,0x18,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2hf8 ymm22, zmm23, zmmword ptr [2*rbp - 2048] + +// 
CHECK: vcvtbiasph2hf8 ymm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x44,0xc7,0x18,0x71,0x7f] + vcvtbiasph2hf8 ymm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vcvtbiasph2hf8 ymm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x44,0xd7,0x18,0x72,0x80] + vcvtbiasph2hf8 ymm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vcvtbiasph2hf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x44,0x00,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtbiasph2hf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x44,0x07,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtbiasph2hf8 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x44,0x10,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2hf8 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vcvtbiasph2hf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x44,0x00,0x18,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2hf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vcvtbiasph2hf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x44,0x87,0x18,0x71,0x7f] + vcvtbiasph2hf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vcvtbiasph2hf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x44,0x97,0x18,0x72,0x80] + vcvtbiasph2hf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vcvtbiasph2hf8s ymm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x44,0x40,0x1b,0xf0] + vcvtbiasph2hf8s ymm22, zmm23, zmm24 + +// CHECK: vcvtbiasph2hf8s ymm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x44,0x47,0x1b,0xf0] + vcvtbiasph2hf8s ymm22 {k7}, zmm23, zmm24 + +// CHECK: vcvtbiasph2hf8s ymm22 {k7} {z}, zmm23, zmm24 +// CHECK: 
encoding: [0x62,0x85,0x44,0xc7,0x1b,0xf0] + vcvtbiasph2hf8s ymm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vcvtbiasph2hf8s xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x44,0x00,0x1b,0xf0] + vcvtbiasph2hf8s xmm22, xmm23, xmm24 + +// CHECK: vcvtbiasph2hf8s xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x44,0x07,0x1b,0xf0] + vcvtbiasph2hf8s xmm22 {k7}, xmm23, xmm24 + +// CHECK: vcvtbiasph2hf8s xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x44,0x87,0x1b,0xf0] + vcvtbiasph2hf8s xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vcvtbiasph2hf8s xmm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x44,0x20,0x1b,0xf0] + vcvtbiasph2hf8s xmm22, ymm23, ymm24 + +// CHECK: vcvtbiasph2hf8s xmm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x44,0x27,0x1b,0xf0] + vcvtbiasph2hf8s xmm22 {k7}, ymm23, ymm24 + +// CHECK: vcvtbiasph2hf8s xmm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x44,0xa7,0x1b,0xf0] + vcvtbiasph2hf8s xmm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vcvtbiasph2hf8s xmm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x44,0x20,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8s xmm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtbiasph2hf8s xmm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x44,0x27,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8s xmm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtbiasph2hf8s xmm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x44,0x30,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2hf8s xmm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vcvtbiasph2hf8s xmm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x44,0x20,0x1b,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtbiasph2hf8s xmm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vcvtbiasph2hf8s xmm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x44,0xa7,0x1b,0x71,0x7f] + vcvtbiasph2hf8s xmm22 
{k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vcvtbiasph2hf8s xmm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x44,0xb7,0x1b,0x72,0x80] + vcvtbiasph2hf8s xmm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vcvtbiasph2hf8s ymm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x44,0x40,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8s ymm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtbiasph2hf8s ymm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x44,0x47,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8s ymm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtbiasph2hf8s ymm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x44,0x50,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2hf8s ymm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vcvtbiasph2hf8s ymm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x44,0x40,0x1b,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtbiasph2hf8s ymm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vcvtbiasph2hf8s ymm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x44,0xc7,0x1b,0x71,0x7f] + vcvtbiasph2hf8s ymm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vcvtbiasph2hf8s ymm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x44,0xd7,0x1b,0x72,0x80] + vcvtbiasph2hf8s ymm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vcvtbiasph2hf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x44,0x00,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtbiasph2hf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtbiasph2hf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x44,0x07,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtbiasph2hf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtbiasph2hf8s xmm22, 
xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x44,0x10,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtbiasph2hf8s xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vcvtbiasph2hf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x44,0x00,0x1b,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtbiasph2hf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vcvtbiasph2hf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x44,0x87,0x1b,0x71,0x7f] + vcvtbiasph2hf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vcvtbiasph2hf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x44,0x97,0x1b,0x72,0x80] + vcvtbiasph2hf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vcvthf82ph xmm22, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7f,0x08,0x1e,0xf7] + vcvthf82ph xmm22, xmm23 + +// CHECK: vcvthf82ph xmm22 {k7}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7f,0x0f,0x1e,0xf7] + vcvthf82ph xmm22 {k7}, xmm23 + +// CHECK: vcvthf82ph xmm22 {k7} {z}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7f,0x8f,0x1e,0xf7] + vcvthf82ph xmm22 {k7} {z}, xmm23 + +// CHECK: vcvthf82ph ymm22, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7f,0x28,0x1e,0xf7] + vcvthf82ph ymm22, xmm23 + +// CHECK: vcvthf82ph ymm22 {k7}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7f,0x2f,0x1e,0xf7] + vcvthf82ph ymm22 {k7}, xmm23 + +// CHECK: vcvthf82ph ymm22 {k7} {z}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7f,0xaf,0x1e,0xf7] + vcvthf82ph ymm22 {k7} {z}, xmm23 + +// CHECK: vcvthf82ph zmm22, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7f,0x48,0x1e,0xf7] + vcvthf82ph zmm22, ymm23 + +// CHECK: vcvthf82ph zmm22 {k7}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7f,0x4f,0x1e,0xf7] + vcvthf82ph zmm22 {k7}, ymm23 + +// CHECK: vcvthf82ph zmm22 {k7} {z}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7f,0xcf,0x1e,0xf7] + vcvthf82ph zmm22 {k7} {z}, ymm23 + +// CHECK: vcvthf82ph xmm22, qword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: 
[0x62,0xa5,0x7f,0x08,0x1e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvthf82ph xmm22, qword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvthf82ph xmm22 {k7}, qword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7f,0x0f,0x1e,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvthf82ph xmm22 {k7}, qword ptr [r8 + 4*rax + 291] + +// CHECK: vcvthf82ph xmm22, qword ptr [rip] +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x1e,0x35,0x00,0x00,0x00,0x00] + vcvthf82ph xmm22, qword ptr [rip] + +// CHECK: vcvthf82ph xmm22, qword ptr [2*rbp - 256] +// CHECK: encoding: [0x62,0xe5,0x7f,0x08,0x1e,0x34,0x6d,0x00,0xff,0xff,0xff] + vcvthf82ph xmm22, qword ptr [2*rbp - 256] + +// CHECK: vcvthf82ph xmm22 {k7} {z}, qword ptr [rcx + 1016] +// CHECK: encoding: [0x62,0xe5,0x7f,0x8f,0x1e,0x71,0x7f] + vcvthf82ph xmm22 {k7} {z}, qword ptr [rcx + 1016] + +// CHECK: vcvthf82ph xmm22 {k7} {z}, qword ptr [rdx - 1024] +// CHECK: encoding: [0x62,0xe5,0x7f,0x8f,0x1e,0x72,0x80] + vcvthf82ph xmm22 {k7} {z}, qword ptr [rdx - 1024] + +// CHECK: vcvthf82ph ymm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7f,0x28,0x1e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvthf82ph ymm22, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvthf82ph ymm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7f,0x2f,0x1e,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvthf82ph ymm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvthf82ph ymm22, xmmword ptr [rip] +// CHECK: encoding: [0x62,0xe5,0x7f,0x28,0x1e,0x35,0x00,0x00,0x00,0x00] + vcvthf82ph ymm22, xmmword ptr [rip] + +// CHECK: vcvthf82ph ymm22, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x7f,0x28,0x1e,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvthf82ph ymm22, xmmword ptr [2*rbp - 512] + +// CHECK: vcvthf82ph ymm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x7f,0xaf,0x1e,0x71,0x7f] + vcvthf82ph ymm22 {k7} {z}, xmmword ptr [rcx + 2032] + +// CHECK: vcvthf82ph ymm22 {k7} {z}, xmmword ptr [rdx - 2048] +// CHECK: encoding: 
[0x62,0xe5,0x7f,0xaf,0x1e,0x72,0x80] + vcvthf82ph ymm22 {k7} {z}, xmmword ptr [rdx - 2048] + +// CHECK: vcvthf82ph zmm22, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7f,0x48,0x1e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvthf82ph zmm22, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvthf82ph zmm22 {k7}, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7f,0x4f,0x1e,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvthf82ph zmm22 {k7}, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvthf82ph zmm22, ymmword ptr [rip] +// CHECK: encoding: [0x62,0xe5,0x7f,0x48,0x1e,0x35,0x00,0x00,0x00,0x00] + vcvthf82ph zmm22, ymmword ptr [rip] + +// CHECK: vcvthf82ph zmm22, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x7f,0x48,0x1e,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvthf82ph zmm22, ymmword ptr [2*rbp - 1024] + +// CHECK: vcvthf82ph zmm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x7f,0xcf,0x1e,0x71,0x7f] + vcvthf82ph zmm22 {k7} {z}, ymmword ptr [rcx + 4064] + +// CHECK: vcvthf82ph zmm22 {k7} {z}, ymmword ptr [rdx - 4096] +// CHECK: encoding: [0x62,0xe5,0x7f,0xcf,0x1e,0x72,0x80] + vcvthf82ph zmm22 {k7} {z}, ymmword ptr [rdx - 4096] + +// CHECK: vcvtne2ph2bf8 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x82,0x47,0x20,0x74,0xf0] + vcvtne2ph2bf8 ymm22, ymm23, ymm24 + +// CHECK: vcvtne2ph2bf8 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x82,0x47,0x27,0x74,0xf0] + vcvtne2ph2bf8 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x82,0x47,0xa7,0x74,0xf0] + vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vcvtne2ph2bf8 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x82,0x47,0x40,0x74,0xf0] + vcvtne2ph2bf8 zmm22, zmm23, zmm24 + +// CHECK: vcvtne2ph2bf8 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x82,0x47,0x47,0x74,0xf0] + vcvtne2ph2bf8 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: 
[0x62,0x82,0x47,0xc7,0x74,0xf0] + vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vcvtne2ph2bf8 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x82,0x47,0x00,0x74,0xf0] + vcvtne2ph2bf8 xmm22, xmm23, xmm24 + +// CHECK: vcvtne2ph2bf8 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x82,0x47,0x07,0x74,0xf0] + vcvtne2ph2bf8 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x82,0x47,0x87,0x74,0xf0] + vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vcvtne2ph2bf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa2,0x47,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtne2ph2bf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc2,0x47,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtne2ph2bf8 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe2,0x47,0x50,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2bf8 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vcvtne2ph2bf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe2,0x47,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2bf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe2,0x47,0xc7,0x74,0x71,0x7f] + vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe2,0x47,0xd7,0x74,0x72,0x80] + vcvtne2ph2bf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vcvtne2ph2bf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa2,0x47,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: 
vcvtne2ph2bf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc2,0x47,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtne2ph2bf8 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe2,0x47,0x30,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2bf8 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vcvtne2ph2bf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe2,0x47,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2bf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe2,0x47,0xa7,0x74,0x71,0x7f] + vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe2,0x47,0xb7,0x74,0x72,0x80] + vcvtne2ph2bf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vcvtne2ph2bf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa2,0x47,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtne2ph2bf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc2,0x47,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtne2ph2bf8 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe2,0x47,0x10,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2bf8 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vcvtne2ph2bf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe2,0x47,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2bf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe2,0x47,0x87,0x74,0x71,0x7f] + vcvtne2ph2bf8 xmm22 {k7} {z}, 
xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe2,0x47,0x97,0x74,0x72,0x80] + vcvtne2ph2bf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vcvtne2ph2bf8s ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x47,0x20,0x74,0xf0] + vcvtne2ph2bf8s ymm22, ymm23, ymm24 + +// CHECK: vcvtne2ph2bf8s ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x47,0x27,0x74,0xf0] + vcvtne2ph2bf8s ymm22 {k7}, ymm23, ymm24 + +// CHECK: vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x47,0xa7,0x74,0xf0] + vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vcvtne2ph2bf8s zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x47,0x40,0x74,0xf0] + vcvtne2ph2bf8s zmm22, zmm23, zmm24 + +// CHECK: vcvtne2ph2bf8s zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x47,0x47,0x74,0xf0] + vcvtne2ph2bf8s zmm22 {k7}, zmm23, zmm24 + +// CHECK: vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x47,0xc7,0x74,0xf0] + vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vcvtne2ph2bf8s xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x47,0x00,0x74,0xf0] + vcvtne2ph2bf8s xmm22, xmm23, xmm24 + +// CHECK: vcvtne2ph2bf8s xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x47,0x07,0x74,0xf0] + vcvtne2ph2bf8s xmm22 {k7}, xmm23, xmm24 + +// CHECK: vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x47,0x87,0x74,0xf0] + vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vcvtne2ph2bf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x47,0x40,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtne2ph2bf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x47,0x47,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 
4*rax + 291] + +// CHECK: vcvtne2ph2bf8s zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x47,0x50,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2bf8s zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vcvtne2ph2bf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x47,0x40,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2bf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x47,0xc7,0x74,0x71,0x7f] + vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x47,0xd7,0x74,0x72,0x80] + vcvtne2ph2bf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vcvtne2ph2bf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x47,0x20,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtne2ph2bf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x47,0x27,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtne2ph2bf8s ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x47,0x30,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2bf8s ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vcvtne2ph2bf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x47,0x20,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2bf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x47,0xa7,0x74,0x71,0x7f] + vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x47,0xb7,0x74,0x72,0x80] + 
vcvtne2ph2bf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vcvtne2ph2bf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x47,0x00,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2bf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtne2ph2bf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x47,0x07,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2bf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtne2ph2bf8s xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x47,0x10,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2bf8s xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vcvtne2ph2bf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x47,0x00,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2bf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x47,0x87,0x74,0x71,0x7f] + vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x47,0x97,0x74,0x72,0x80] + vcvtne2ph2bf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vcvtne2ph2hf8 ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x47,0x20,0x18,0xf0] + vcvtne2ph2hf8 ymm22, ymm23, ymm24 + +// CHECK: vcvtne2ph2hf8 ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x47,0x27,0x18,0xf0] + vcvtne2ph2hf8 ymm22 {k7}, ymm23, ymm24 + +// CHECK: vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x47,0xa7,0x18,0xf0] + vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vcvtne2ph2hf8 zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x47,0x40,0x18,0xf0] + vcvtne2ph2hf8 zmm22, zmm23, zmm24 + +// CHECK: vcvtne2ph2hf8 zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x47,0x47,0x18,0xf0] + 
vcvtne2ph2hf8 zmm22 {k7}, zmm23, zmm24 + +// CHECK: vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x47,0xc7,0x18,0xf0] + vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vcvtne2ph2hf8 xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x47,0x00,0x18,0xf0] + vcvtne2ph2hf8 xmm22, xmm23, xmm24 + +// CHECK: vcvtne2ph2hf8 xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x47,0x07,0x18,0xf0] + vcvtne2ph2hf8 xmm22 {k7}, xmm23, xmm24 + +// CHECK: vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x47,0x87,0x18,0xf0] + vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vcvtne2ph2hf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x47,0x40,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8 zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtne2ph2hf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x47,0x47,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8 zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtne2ph2hf8 zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x47,0x50,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2hf8 zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vcvtne2ph2hf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x47,0x40,0x18,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2hf8 zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x47,0xc7,0x18,0x71,0x7f] + vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x47,0xd7,0x18,0x72,0x80] + vcvtne2ph2hf8 zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vcvtne2ph2hf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: 
[0x62,0xa5,0x47,0x20,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8 ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtne2ph2hf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x47,0x27,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8 ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtne2ph2hf8 ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x47,0x30,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2hf8 ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vcvtne2ph2hf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x47,0x20,0x18,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2hf8 ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x47,0xa7,0x18,0x71,0x7f] + vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// CHECK: vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x47,0xb7,0x18,0x72,0x80] + vcvtne2ph2hf8 ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vcvtne2ph2hf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x47,0x00,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8 xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtne2ph2hf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x47,0x07,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8 xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtne2ph2hf8 xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x47,0x10,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2hf8 xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vcvtne2ph2hf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x47,0x00,0x18,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2hf8 xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vcvtne2ph2hf8 
xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x47,0x87,0x18,0x71,0x7f] + vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x47,0x97,0x18,0x72,0x80] + vcvtne2ph2hf8 xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vcvtne2ph2hf8s ymm22, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x47,0x20,0x1b,0xf0] + vcvtne2ph2hf8s ymm22, ymm23, ymm24 + +// CHECK: vcvtne2ph2hf8s ymm22 {k7}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x47,0x27,0x1b,0xf0] + vcvtne2ph2hf8s ymm22 {k7}, ymm23, ymm24 + +// CHECK: vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, ymm24 +// CHECK: encoding: [0x62,0x85,0x47,0xa7,0x1b,0xf0] + vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, ymm24 + +// CHECK: vcvtne2ph2hf8s zmm22, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x47,0x40,0x1b,0xf0] + vcvtne2ph2hf8s zmm22, zmm23, zmm24 + +// CHECK: vcvtne2ph2hf8s zmm22 {k7}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x47,0x47,0x1b,0xf0] + vcvtne2ph2hf8s zmm22 {k7}, zmm23, zmm24 + +// CHECK: vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, zmm24 +// CHECK: encoding: [0x62,0x85,0x47,0xc7,0x1b,0xf0] + vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, zmm24 + +// CHECK: vcvtne2ph2hf8s xmm22, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x47,0x00,0x1b,0xf0] + vcvtne2ph2hf8s xmm22, xmm23, xmm24 + +// CHECK: vcvtne2ph2hf8s xmm22 {k7}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x47,0x07,0x1b,0xf0] + vcvtne2ph2hf8s xmm22 {k7}, xmm23, xmm24 + +// CHECK: vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, xmm24 +// CHECK: encoding: [0x62,0x85,0x47,0x87,0x1b,0xf0] + vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, xmm24 + +// CHECK: vcvtne2ph2hf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x47,0x40,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8s zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtne2ph2hf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 
291] +// CHECK: encoding: [0x62,0xc5,0x47,0x47,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8s zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtne2ph2hf8s zmm22, zmm23, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x47,0x50,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2hf8s zmm22, zmm23, word ptr [rip]{1to32} + +// CHECK: vcvtne2ph2hf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x47,0x40,0x1b,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtne2ph2hf8s zmm22, zmm23, zmmword ptr [2*rbp - 2048] + +// CHECK: vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x47,0xc7,0x1b,0x71,0x7f] + vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128] + +// CHECK: vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x47,0xd7,0x1b,0x72,0x80] + vcvtne2ph2hf8s zmm22 {k7} {z}, zmm23, word ptr [rdx - 256]{1to32} + +// CHECK: vcvtne2ph2hf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x47,0x20,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8s ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtne2ph2hf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x47,0x27,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8s ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtne2ph2hf8s ymm22, ymm23, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x47,0x30,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2hf8s ymm22, ymm23, word ptr [rip]{1to16} + +// CHECK: vcvtne2ph2hf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x47,0x20,0x1b,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtne2ph2hf8s ymm22, ymm23, ymmword ptr [2*rbp - 1024] + +// CHECK: vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x47,0xa7,0x1b,0x71,0x7f] + vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064] + +// 
CHECK: vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x47,0xb7,0x1b,0x72,0x80] + vcvtne2ph2hf8s ymm22 {k7} {z}, ymm23, word ptr [rdx - 256]{1to16} + +// CHECK: vcvtne2ph2hf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x47,0x00,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtne2ph2hf8s xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtne2ph2hf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x47,0x07,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtne2ph2hf8s xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtne2ph2hf8s xmm22, xmm23, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x47,0x10,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtne2ph2hf8s xmm22, xmm23, word ptr [rip]{1to8} + +// CHECK: vcvtne2ph2hf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x47,0x00,0x1b,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtne2ph2hf8s xmm22, xmm23, xmmword ptr [2*rbp - 512] + +// CHECK: vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x47,0x87,0x1b,0x71,0x7f] + vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032] + +// CHECK: vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x47,0x97,0x1b,0x72,0x80] + vcvtne2ph2hf8s xmm22 {k7} {z}, xmm23, word ptr [rdx - 256]{1to8} + +// CHECK: vcvtneph2bf8 xmm22, xmm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x74,0xf7] + vcvtneph2bf8 xmm22, xmm23 + +// CHECK: vcvtneph2bf8 xmm22 {k7}, xmm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0x0f,0x74,0xf7] + vcvtneph2bf8 xmm22 {k7}, xmm23 + +// CHECK: vcvtneph2bf8 xmm22 {k7} {z}, xmm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0x8f,0x74,0xf7] + vcvtneph2bf8 xmm22 {k7} {z}, xmm23 + +// CHECK: vcvtneph2bf8 ymm22, zmm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0x48,0x74,0xf7] + vcvtneph2bf8 ymm22, zmm23 + +// CHECK: vcvtneph2bf8 ymm22 {k7}, zmm23 +// 
CHECK: encoding: [0x62,0xa2,0x7e,0x4f,0x74,0xf7] + vcvtneph2bf8 ymm22 {k7}, zmm23 + +// CHECK: vcvtneph2bf8 ymm22 {k7} {z}, zmm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0xcf,0x74,0xf7] + vcvtneph2bf8 ymm22 {k7} {z}, zmm23 + +// CHECK: vcvtneph2bf8 xmm22, ymm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x74,0xf7] + vcvtneph2bf8 xmm22, ymm23 + +// CHECK: vcvtneph2bf8 xmm22 {k7}, ymm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0x2f,0x74,0xf7] + vcvtneph2bf8 xmm22 {k7}, ymm23 + +// CHECK: vcvtneph2bf8 xmm22 {k7} {z}, ymm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0xaf,0x74,0xf7] + vcvtneph2bf8 xmm22 {k7} {z}, ymm23 + +// CHECK: vcvtneph2bf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2bf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtneph2bf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc2,0x7e,0x0f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtneph2bf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtneph2bf8 xmm22, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe2,0x7e,0x18,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtneph2bf8 xmm22, word ptr [rip]{1to8} + +// CHECK: vcvtneph2bf8 xmm22, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2bf8 xmm22, xmmword ptr [2*rbp - 512] + +// CHECK: vcvtneph2bf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe2,0x7e,0x8f,0x74,0x71,0x7f] + vcvtneph2bf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + +// CHECK: vcvtneph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe2,0x7e,0x9f,0x74,0x72,0x80] + vcvtneph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vcvtneph2bf8 xmm22, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe2,0x7e,0x38,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtneph2bf8 xmm22, word ptr [rip]{1to16} + +// CHECK: vcvtneph2bf8 xmm22, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: 
[0x62,0xe2,0x7e,0x28,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2bf8 xmm22, ymmword ptr [2*rbp - 1024] + +// CHECK: vcvtneph2bf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe2,0x7e,0xaf,0x74,0x71,0x7f] + vcvtneph2bf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] + +// CHECK: vcvtneph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe2,0x7e,0xbf,0x74,0x72,0x80] + vcvtneph2bf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vcvtneph2bf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa2,0x7e,0x48,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2bf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtneph2bf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc2,0x7e,0x4f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtneph2bf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtneph2bf8 ymm22, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe2,0x7e,0x58,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtneph2bf8 ymm22, word ptr [rip]{1to32} + +// CHECK: vcvtneph2bf8 ymm22, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe2,0x7e,0x48,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2bf8 ymm22, zmmword ptr [2*rbp - 2048] + +// CHECK: vcvtneph2bf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe2,0x7e,0xcf,0x74,0x71,0x7f] + vcvtneph2bf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] + +// CHECK: vcvtneph2bf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe2,0x7e,0xdf,0x74,0x72,0x80] + vcvtneph2bf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} + +// CHECK: vcvtneph2bf8s xmm22, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x74,0xf7] + vcvtneph2bf8s xmm22, xmm23 + +// CHECK: vcvtneph2bf8s xmm22 {k7}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x0f,0x74,0xf7] + vcvtneph2bf8s xmm22 {k7}, xmm23 + +// CHECK: vcvtneph2bf8s xmm22 {k7} {z}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x8f,0x74,0xf7] + vcvtneph2bf8s xmm22 {k7} 
{z}, xmm23 + +// CHECK: vcvtneph2bf8s ymm22, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x74,0xf7] + vcvtneph2bf8s ymm22, zmm23 + +// CHECK: vcvtneph2bf8s ymm22 {k7}, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x4f,0x74,0xf7] + vcvtneph2bf8s ymm22 {k7}, zmm23 + +// CHECK: vcvtneph2bf8s ymm22 {k7} {z}, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0xcf,0x74,0xf7] + vcvtneph2bf8s ymm22 {k7} {z}, zmm23 + +// CHECK: vcvtneph2bf8s xmm22, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x28,0x74,0xf7] + vcvtneph2bf8s xmm22, ymm23 + +// CHECK: vcvtneph2bf8s xmm22 {k7}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x2f,0x74,0xf7] + vcvtneph2bf8s xmm22 {k7}, ymm23 + +// CHECK: vcvtneph2bf8s xmm22 {k7} {z}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0xaf,0x74,0xf7] + vcvtneph2bf8s xmm22 {k7} {z}, ymm23 + +// CHECK: vcvtneph2bf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2bf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtneph2bf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7e,0x0f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtneph2bf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtneph2bf8s xmm22, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x7e,0x18,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtneph2bf8s xmm22, word ptr [rip]{1to8} + +// CHECK: vcvtneph2bf8s xmm22, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x74,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2bf8s xmm22, xmmword ptr [2*rbp - 512] + +// CHECK: vcvtneph2bf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x7e,0x8f,0x74,0x71,0x7f] + vcvtneph2bf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + +// CHECK: vcvtneph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x7e,0x9f,0x74,0x72,0x80] + vcvtneph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vcvtneph2bf8s xmm22, word ptr 
[rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x7e,0x38,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtneph2bf8s xmm22, word ptr [rip]{1to16} + +// CHECK: vcvtneph2bf8s xmm22, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x7e,0x28,0x74,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2bf8s xmm22, ymmword ptr [2*rbp - 1024] + +// CHECK: vcvtneph2bf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x7e,0xaf,0x74,0x71,0x7f] + vcvtneph2bf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] + +// CHECK: vcvtneph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x7e,0xbf,0x74,0x72,0x80] + vcvtneph2bf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vcvtneph2bf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x74,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2bf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtneph2bf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7e,0x4f,0x74,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtneph2bf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtneph2bf8s ymm22, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x7e,0x58,0x74,0x35,0x00,0x00,0x00,0x00] + vcvtneph2bf8s ymm22, word ptr [rip]{1to32} + +// CHECK: vcvtneph2bf8s ymm22, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x7e,0x48,0x74,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2bf8s ymm22, zmmword ptr [2*rbp - 2048] + +// CHECK: vcvtneph2bf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x7e,0xcf,0x74,0x71,0x7f] + vcvtneph2bf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] + +// CHECK: vcvtneph2bf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x7e,0xdf,0x74,0x72,0x80] + vcvtneph2bf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} + +// CHECK: vcvtneph2hf8 xmm22, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x18,0xf7] + vcvtneph2hf8 xmm22, xmm23 + +// CHECK: vcvtneph2hf8 xmm22 
{k7}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x0f,0x18,0xf7] + vcvtneph2hf8 xmm22 {k7}, xmm23 + +// CHECK: vcvtneph2hf8 xmm22 {k7} {z}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x8f,0x18,0xf7] + vcvtneph2hf8 xmm22 {k7} {z}, xmm23 + +// CHECK: vcvtneph2hf8 ymm22, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x18,0xf7] + vcvtneph2hf8 ymm22, zmm23 + +// CHECK: vcvtneph2hf8 ymm22 {k7}, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x4f,0x18,0xf7] + vcvtneph2hf8 ymm22 {k7}, zmm23 + +// CHECK: vcvtneph2hf8 ymm22 {k7} {z}, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0xcf,0x18,0xf7] + vcvtneph2hf8 ymm22 {k7} {z}, zmm23 + +// CHECK: vcvtneph2hf8 xmm22, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x28,0x18,0xf7] + vcvtneph2hf8 xmm22, ymm23 + +// CHECK: vcvtneph2hf8 xmm22 {k7}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x2f,0x18,0xf7] + vcvtneph2hf8 xmm22 {k7}, ymm23 + +// CHECK: vcvtneph2hf8 xmm22 {k7} {z}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0xaf,0x18,0xf7] + vcvtneph2hf8 xmm22 {k7} {z}, ymm23 + +// CHECK: vcvtneph2hf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2hf8 xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtneph2hf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7e,0x0f,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtneph2hf8 xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtneph2hf8 xmm22, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x7e,0x18,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtneph2hf8 xmm22, word ptr [rip]{1to8} + +// CHECK: vcvtneph2hf8 xmm22, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x18,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2hf8 xmm22, xmmword ptr [2*rbp - 512] + +// CHECK: vcvtneph2hf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x7e,0x8f,0x18,0x71,0x7f] + vcvtneph2hf8 xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + +// CHECK: vcvtneph2hf8 xmm22 {k7} 
{z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x7e,0x9f,0x18,0x72,0x80] + vcvtneph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vcvtneph2hf8 xmm22, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x7e,0x38,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtneph2hf8 xmm22, word ptr [rip]{1to16} + +// CHECK: vcvtneph2hf8 xmm22, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x7e,0x28,0x18,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2hf8 xmm22, ymmword ptr [2*rbp - 1024] + +// CHECK: vcvtneph2hf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x7e,0xaf,0x18,0x71,0x7f] + vcvtneph2hf8 xmm22 {k7} {z}, ymmword ptr [rcx + 4064] + +// CHECK: vcvtneph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x7e,0xbf,0x18,0x72,0x80] + vcvtneph2hf8 xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vcvtneph2hf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x18,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2hf8 ymm22, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtneph2hf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7e,0x4f,0x18,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtneph2hf8 ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtneph2hf8 ymm22, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x7e,0x58,0x18,0x35,0x00,0x00,0x00,0x00] + vcvtneph2hf8 ymm22, word ptr [rip]{1to32} + +// CHECK: vcvtneph2hf8 ymm22, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x7e,0x48,0x18,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2hf8 ymm22, zmmword ptr [2*rbp - 2048] + +// CHECK: vcvtneph2hf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x7e,0xcf,0x18,0x71,0x7f] + vcvtneph2hf8 ymm22 {k7} {z}, zmmword ptr [rcx + 8128] + +// CHECK: vcvtneph2hf8 ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x7e,0xdf,0x18,0x72,0x80] + vcvtneph2hf8 ymm22 {k7} {z}, word ptr 
[rdx - 256]{1to32} + +// CHECK: vcvtneph2hf8s xmm22, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x1b,0xf7] + vcvtneph2hf8s xmm22, xmm23 + +// CHECK: vcvtneph2hf8s xmm22 {k7}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x0f,0x1b,0xf7] + vcvtneph2hf8s xmm22 {k7}, xmm23 + +// CHECK: vcvtneph2hf8s xmm22 {k7} {z}, xmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x8f,0x1b,0xf7] + vcvtneph2hf8s xmm22 {k7} {z}, xmm23 + +// CHECK: vcvtneph2hf8s ymm22, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x1b,0xf7] + vcvtneph2hf8s ymm22, zmm23 + +// CHECK: vcvtneph2hf8s ymm22 {k7}, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x4f,0x1b,0xf7] + vcvtneph2hf8s ymm22 {k7}, zmm23 + +// CHECK: vcvtneph2hf8s ymm22 {k7} {z}, zmm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0xcf,0x1b,0xf7] + vcvtneph2hf8s ymm22 {k7} {z}, zmm23 + +// CHECK: vcvtneph2hf8s xmm22, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x28,0x1b,0xf7] + vcvtneph2hf8s xmm22, ymm23 + +// CHECK: vcvtneph2hf8s xmm22 {k7}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0x2f,0x1b,0xf7] + vcvtneph2hf8s xmm22 {k7}, ymm23 + +// CHECK: vcvtneph2hf8s xmm22 {k7} {z}, ymm23 +// CHECK: encoding: [0x62,0xa5,0x7e,0xaf,0x1b,0xf7] + vcvtneph2hf8s xmm22 {k7} {z}, ymm23 + +// CHECK: vcvtneph2hf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7e,0x08,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2hf8s xmm22, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtneph2hf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7e,0x0f,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtneph2hf8s xmm22 {k7}, xmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtneph2hf8s xmm22, word ptr [rip]{1to8} +// CHECK: encoding: [0x62,0xe5,0x7e,0x18,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtneph2hf8s xmm22, word ptr [rip]{1to8} + +// CHECK: vcvtneph2hf8s xmm22, xmmword ptr [2*rbp - 512] +// CHECK: encoding: [0x62,0xe5,0x7e,0x08,0x1b,0x34,0x6d,0x00,0xfe,0xff,0xff] + vcvtneph2hf8s xmm22, xmmword ptr [2*rbp - 512] + +// CHECK: 
vcvtneph2hf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xe5,0x7e,0x8f,0x1b,0x71,0x7f] + vcvtneph2hf8s xmm22 {k7} {z}, xmmword ptr [rcx + 2032] + +// CHECK: vcvtneph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xe5,0x7e,0x9f,0x1b,0x72,0x80] + vcvtneph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vcvtneph2hf8s xmm22, word ptr [rip]{1to16} +// CHECK: encoding: [0x62,0xe5,0x7e,0x38,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtneph2hf8s xmm22, word ptr [rip]{1to16} + +// CHECK: vcvtneph2hf8s xmm22, ymmword ptr [2*rbp - 1024] +// CHECK: encoding: [0x62,0xe5,0x7e,0x28,0x1b,0x34,0x6d,0x00,0xfc,0xff,0xff] + vcvtneph2hf8s xmm22, ymmword ptr [2*rbp - 1024] + +// CHECK: vcvtneph2hf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xe5,0x7e,0xaf,0x1b,0x71,0x7f] + vcvtneph2hf8s xmm22 {k7} {z}, ymmword ptr [rcx + 4064] + +// CHECK: vcvtneph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xe5,0x7e,0xbf,0x1b,0x72,0x80] + vcvtneph2hf8s xmm22 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vcvtneph2hf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa5,0x7e,0x48,0x1b,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcvtneph2hf8s ymm22, zmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vcvtneph2hf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] +// CHECK: encoding: [0x62,0xc5,0x7e,0x4f,0x1b,0xb4,0x80,0x23,0x01,0x00,0x00] + vcvtneph2hf8s ymm22 {k7}, zmmword ptr [r8 + 4*rax + 291] + +// CHECK: vcvtneph2hf8s ymm22, word ptr [rip]{1to32} +// CHECK: encoding: [0x62,0xe5,0x7e,0x58,0x1b,0x35,0x00,0x00,0x00,0x00] + vcvtneph2hf8s ymm22, word ptr [rip]{1to32} + +// CHECK: vcvtneph2hf8s ymm22, zmmword ptr [2*rbp - 2048] +// CHECK: encoding: [0x62,0xe5,0x7e,0x48,0x1b,0x34,0x6d,0x00,0xf8,0xff,0xff] + vcvtneph2hf8s ymm22, zmmword ptr [2*rbp - 2048] + +// CHECK: vcvtneph2hf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] +// CHECK: encoding: [0x62,0xe5,0x7e,0xcf,0x1b,0x71,0x7f] 
+ vcvtneph2hf8s ymm22 {k7} {z}, zmmword ptr [rcx + 8128] + +// CHECK: vcvtneph2hf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} +// CHECK: encoding: [0x62,0xe5,0x7e,0xdf,0x1b,0x72,0x80] + vcvtneph2hf8s ymm22 {k7} {z}, word ptr [rdx - 256]{1to32} + diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc index 523db92bc543ea..b88abbb461d087 100644 --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -1189,12 +1189,27 @@ static const X86FoldTableEntry Table1[] = { {X86::VCVTDQ2PSZ256rr, X86::VCVTDQ2PSZ256rm, 0}, {X86::VCVTDQ2PSZrr, X86::VCVTDQ2PSZrm, 0}, {X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0}, + {X86::VCVTHF82PHZ128rr, X86::VCVTHF82PHZ128rm, TB_NO_REVERSE}, + {X86::VCVTHF82PHZ256rr, X86::VCVTHF82PHZ256rm, 0}, + {X86::VCVTHF82PHZrr, X86::VCVTHF82PHZrm, 0}, {X86::VCVTNEBF162IBSZ128rr, X86::VCVTNEBF162IBSZ128rm, 0}, {X86::VCVTNEBF162IBSZ256rr, X86::VCVTNEBF162IBSZ256rm, 0}, {X86::VCVTNEBF162IBSZrr, X86::VCVTNEBF162IBSZrm, 0}, {X86::VCVTNEBF162IUBSZ128rr, X86::VCVTNEBF162IUBSZ128rm, 0}, {X86::VCVTNEBF162IUBSZ256rr, X86::VCVTNEBF162IUBSZ256rm, 0}, {X86::VCVTNEBF162IUBSZrr, X86::VCVTNEBF162IUBSZrm, 0}, + {X86::VCVTNEPH2BF8SZ128rr, X86::VCVTNEPH2BF8SZ128rm, 0}, + {X86::VCVTNEPH2BF8SZ256rr, X86::VCVTNEPH2BF8SZ256rm, 0}, + {X86::VCVTNEPH2BF8SZrr, X86::VCVTNEPH2BF8SZrm, 0}, + {X86::VCVTNEPH2BF8Z128rr, X86::VCVTNEPH2BF8Z128rm, 0}, + {X86::VCVTNEPH2BF8Z256rr, X86::VCVTNEPH2BF8Z256rm, 0}, + {X86::VCVTNEPH2BF8Zrr, X86::VCVTNEPH2BF8Zrm, 0}, + {X86::VCVTNEPH2HF8SZ128rr, X86::VCVTNEPH2HF8SZ128rm, 0}, + {X86::VCVTNEPH2HF8SZ256rr, X86::VCVTNEPH2HF8SZ256rm, 0}, + {X86::VCVTNEPH2HF8SZrr, X86::VCVTNEPH2HF8SZrm, 0}, + {X86::VCVTNEPH2HF8Z128rr, X86::VCVTNEPH2HF8Z128rm, 0}, + {X86::VCVTNEPH2HF8Z256rr, X86::VCVTNEPH2HF8Z256rm, 0}, + {X86::VCVTNEPH2HF8Zrr, X86::VCVTNEPH2HF8Zrm, 0}, {X86::VCVTNEPS2BF16Yrr, X86::VCVTNEPS2BF16Yrm, 0}, {X86::VCVTNEPS2BF16Z128rr, X86::VCVTNEPS2BF16Z128rm, 0}, {X86::VCVTNEPS2BF16Z256rr, 
X86::VCVTNEPS2BF16Z256rm, 0}, @@ -2440,6 +2455,21 @@ static const X86FoldTableEntry Table2[] = { {X86::VCMPSSZrri_Int, X86::VCMPSSZrmi_Int, TB_NO_REVERSE}, {X86::VCMPSSrri, X86::VCMPSSrmi, 0}, {X86::VCMPSSrri_Int, X86::VCMPSSrmi_Int, TB_NO_REVERSE}, + {X86::VCVT2PS2PHXZ128rr, X86::VCVT2PS2PHXZ128rm, 0}, + {X86::VCVT2PS2PHXZ256rr, X86::VCVT2PS2PHXZ256rm, 0}, + {X86::VCVT2PS2PHXZrr, X86::VCVT2PS2PHXZrm, 0}, + {X86::VCVTBIASPH2BF8SZ128rr, X86::VCVTBIASPH2BF8SZ128rm, 0}, + {X86::VCVTBIASPH2BF8SZ256rr, X86::VCVTBIASPH2BF8SZ256rm, 0}, + {X86::VCVTBIASPH2BF8SZrr, X86::VCVTBIASPH2BF8SZrm, 0}, + {X86::VCVTBIASPH2BF8Z128rr, X86::VCVTBIASPH2BF8Z128rm, 0}, + {X86::VCVTBIASPH2BF8Z256rr, X86::VCVTBIASPH2BF8Z256rm, 0}, + {X86::VCVTBIASPH2BF8Zrr, X86::VCVTBIASPH2BF8Zrm, 0}, + {X86::VCVTBIASPH2HF8SZ128rr, X86::VCVTBIASPH2HF8SZ128rm, 0}, + {X86::VCVTBIASPH2HF8SZ256rr, X86::VCVTBIASPH2HF8SZ256rm, 0}, + {X86::VCVTBIASPH2HF8SZrr, X86::VCVTBIASPH2HF8SZrm, 0}, + {X86::VCVTBIASPH2HF8Z128rr, X86::VCVTBIASPH2HF8Z128rm, 0}, + {X86::VCVTBIASPH2HF8Z256rr, X86::VCVTBIASPH2HF8Z256rm, 0}, + {X86::VCVTBIASPH2HF8Zrr, X86::VCVTBIASPH2HF8Zrm, 0}, {X86::VCVTDQ2PDZ128rrkz, X86::VCVTDQ2PDZ128rmkz, TB_NO_REVERSE}, {X86::VCVTDQ2PDZ256rrkz, X86::VCVTDQ2PDZ256rmkz, 0}, {X86::VCVTDQ2PDZrrkz, X86::VCVTDQ2PDZrmkz, 0}, @@ -2449,6 +2479,21 @@ static const X86FoldTableEntry Table2[] = { {X86::VCVTDQ2PSZ128rrkz, X86::VCVTDQ2PSZ128rmkz, 0}, {X86::VCVTDQ2PSZ256rrkz, X86::VCVTDQ2PSZ256rmkz, 0}, {X86::VCVTDQ2PSZrrkz, X86::VCVTDQ2PSZrmkz, 0}, + {X86::VCVTHF82PHZ128rrkz, X86::VCVTHF82PHZ128rmkz, TB_NO_REVERSE}, + {X86::VCVTHF82PHZ256rrkz, X86::VCVTHF82PHZ256rmkz, 0}, + {X86::VCVTHF82PHZrrkz, X86::VCVTHF82PHZrmkz, 0}, + {X86::VCVTNE2PH2BF8SZ128rr, X86::VCVTNE2PH2BF8SZ128rm, 0}, + {X86::VCVTNE2PH2BF8SZ256rr, X86::VCVTNE2PH2BF8SZ256rm, 0}, + {X86::VCVTNE2PH2BF8SZrr, X86::VCVTNE2PH2BF8SZrm, 0}, + {X86::VCVTNE2PH2BF8Z128rr, X86::VCVTNE2PH2BF8Z128rm, 0}, + {X86::VCVTNE2PH2BF8Z256rr, X86::VCVTNE2PH2BF8Z256rm, 0}, + 
{X86::VCVTNE2PH2BF8Zrr, X86::VCVTNE2PH2BF8Zrm, 0}, + {X86::VCVTNE2PH2HF8SZ128rr, X86::VCVTNE2PH2HF8SZ128rm, 0}, + {X86::VCVTNE2PH2HF8SZ256rr, X86::VCVTNE2PH2HF8SZ256rm, 0}, + {X86::VCVTNE2PH2HF8SZrr, X86::VCVTNE2PH2HF8SZrm, 0}, + {X86::VCVTNE2PH2HF8Z128rr, X86::VCVTNE2PH2HF8Z128rm, 0}, + {X86::VCVTNE2PH2HF8Z256rr, X86::VCVTNE2PH2HF8Z256rm, 0}, + {X86::VCVTNE2PH2HF8Zrr, X86::VCVTNE2PH2HF8Zrm, 0}, {X86::VCVTNE2PS2BF16Z128rr, X86::VCVTNE2PS2BF16Z128rm, 0}, {X86::VCVTNE2PS2BF16Z256rr, X86::VCVTNE2PS2BF16Z256rm, 0}, {X86::VCVTNE2PS2BF16Zrr, X86::VCVTNE2PS2BF16Zrm, 0}, @@ -2458,6 +2503,18 @@ static const X86FoldTableEntry Table2[] = { {X86::VCVTNEBF162IUBSZ128rrkz, X86::VCVTNEBF162IUBSZ128rmkz, 0}, {X86::VCVTNEBF162IUBSZ256rrkz, X86::VCVTNEBF162IUBSZ256rmkz, 0}, {X86::VCVTNEBF162IUBSZrrkz, X86::VCVTNEBF162IUBSZrmkz, 0}, + {X86::VCVTNEPH2BF8SZ128rrkz, X86::VCVTNEPH2BF8SZ128rmkz, 0}, + {X86::VCVTNEPH2BF8SZ256rrkz, X86::VCVTNEPH2BF8SZ256rmkz, 0}, + {X86::VCVTNEPH2BF8SZrrkz, X86::VCVTNEPH2BF8SZrmkz, 0}, + {X86::VCVTNEPH2BF8Z128rrkz, X86::VCVTNEPH2BF8Z128rmkz, 0}, + {X86::VCVTNEPH2BF8Z256rrkz, X86::VCVTNEPH2BF8Z256rmkz, 0}, + {X86::VCVTNEPH2BF8Zrrkz, X86::VCVTNEPH2BF8Zrmkz, 0}, + {X86::VCVTNEPH2HF8SZ128rrkz, X86::VCVTNEPH2HF8SZ128rmkz, 0}, + {X86::VCVTNEPH2HF8SZ256rrkz, X86::VCVTNEPH2HF8SZ256rmkz, 0}, + {X86::VCVTNEPH2HF8SZrrkz, X86::VCVTNEPH2HF8SZrmkz, 0}, + {X86::VCVTNEPH2HF8Z128rrkz, X86::VCVTNEPH2HF8Z128rmkz, 0}, + {X86::VCVTNEPH2HF8Z256rrkz, X86::VCVTNEPH2HF8Z256rmkz, 0}, + {X86::VCVTNEPH2HF8Zrrkz, X86::VCVTNEPH2HF8Zrmkz, 0}, {X86::VCVTNEPS2BF16Z128rrkz, X86::VCVTNEPS2BF16Z128rmkz, 0}, {X86::VCVTNEPS2BF16Z256rrkz, X86::VCVTNEPS2BF16Z256rmkz, 0}, {X86::VCVTNEPS2BF16Zrrkz, X86::VCVTNEPS2BF16Zrmkz, 0}, @@ -4070,6 +4127,21 @@ static const X86FoldTableEntry Table3[] = { {X86::VCMPSDZrri_Intk, X86::VCMPSDZrmi_Intk, TB_NO_REVERSE}, {X86::VCMPSHZrri_Intk, X86::VCMPSHZrmi_Intk, TB_NO_REVERSE}, {X86::VCMPSSZrri_Intk, X86::VCMPSSZrmi_Intk, TB_NO_REVERSE}, + 
{X86::VCVT2PS2PHXZ128rrkz, X86::VCVT2PS2PHXZ128rmkz, 0}, + {X86::VCVT2PS2PHXZ256rrkz, X86::VCVT2PS2PHXZ256rmkz, 0}, + {X86::VCVT2PS2PHXZrrkz, X86::VCVT2PS2PHXZrmkz, 0}, + {X86::VCVTBIASPH2BF8SZ128rrkz, X86::VCVTBIASPH2BF8SZ128rmkz, 0}, + {X86::VCVTBIASPH2BF8SZ256rrkz, X86::VCVTBIASPH2BF8SZ256rmkz, 0}, + {X86::VCVTBIASPH2BF8SZrrkz, X86::VCVTBIASPH2BF8SZrmkz, 0}, + {X86::VCVTBIASPH2BF8Z128rrkz, X86::VCVTBIASPH2BF8Z128rmkz, 0}, + {X86::VCVTBIASPH2BF8Z256rrkz, X86::VCVTBIASPH2BF8Z256rmkz, 0}, + {X86::VCVTBIASPH2BF8Zrrkz, X86::VCVTBIASPH2BF8Zrmkz, 0}, + {X86::VCVTBIASPH2HF8SZ128rrkz, X86::VCVTBIASPH2HF8SZ128rmkz, 0}, + {X86::VCVTBIASPH2HF8SZ256rrkz, X86::VCVTBIASPH2HF8SZ256rmkz, 0}, + {X86::VCVTBIASPH2HF8SZrrkz, X86::VCVTBIASPH2HF8SZrmkz, 0}, + {X86::VCVTBIASPH2HF8Z128rrkz, X86::VCVTBIASPH2HF8Z128rmkz, 0}, + {X86::VCVTBIASPH2HF8Z256rrkz, X86::VCVTBIASPH2HF8Z256rmkz, 0}, + {X86::VCVTBIASPH2HF8Zrrkz, X86::VCVTBIASPH2HF8Zrmkz, 0}, {X86::VCVTDQ2PDZ128rrk, X86::VCVTDQ2PDZ128rmk, TB_NO_REVERSE}, {X86::VCVTDQ2PDZ256rrk, X86::VCVTDQ2PDZ256rmk, 0}, {X86::VCVTDQ2PDZrrk, X86::VCVTDQ2PDZrmk, 0}, @@ -4079,6 +4151,21 @@ static const X86FoldTableEntry Table3[] = { {X86::VCVTDQ2PSZ128rrk, X86::VCVTDQ2PSZ128rmk, 0}, {X86::VCVTDQ2PSZ256rrk, X86::VCVTDQ2PSZ256rmk, 0}, {X86::VCVTDQ2PSZrrk, X86::VCVTDQ2PSZrmk, 0}, + {X86::VCVTHF82PHZ128rrk, X86::VCVTHF82PHZ128rmk, TB_NO_REVERSE}, + {X86::VCVTHF82PHZ256rrk, X86::VCVTHF82PHZ256rmk, 0}, + {X86::VCVTHF82PHZrrk, X86::VCVTHF82PHZrmk, 0}, + {X86::VCVTNE2PH2BF8SZ128rrkz, X86::VCVTNE2PH2BF8SZ128rmkz, 0}, + {X86::VCVTNE2PH2BF8SZ256rrkz, X86::VCVTNE2PH2BF8SZ256rmkz, 0}, + {X86::VCVTNE2PH2BF8SZrrkz, X86::VCVTNE2PH2BF8SZrmkz, 0}, + {X86::VCVTNE2PH2BF8Z128rrkz, X86::VCVTNE2PH2BF8Z128rmkz, 0}, + {X86::VCVTNE2PH2BF8Z256rrkz, X86::VCVTNE2PH2BF8Z256rmkz, 0}, + {X86::VCVTNE2PH2BF8Zrrkz, X86::VCVTNE2PH2BF8Zrmkz, 0}, + {X86::VCVTNE2PH2HF8SZ128rrkz, X86::VCVTNE2PH2HF8SZ128rmkz, 0}, + {X86::VCVTNE2PH2HF8SZ256rrkz, X86::VCVTNE2PH2HF8SZ256rmkz, 0}, + 
{X86::VCVTNE2PH2HF8SZrrkz, X86::VCVTNE2PH2HF8SZrmkz, 0}, + {X86::VCVTNE2PH2HF8Z128rrkz, X86::VCVTNE2PH2HF8Z128rmkz, 0}, + {X86::VCVTNE2PH2HF8Z256rrkz, X86::VCVTNE2PH2HF8Z256rmkz, 0}, + {X86::VCVTNE2PH2HF8Zrrkz, X86::VCVTNE2PH2HF8Zrmkz, 0}, {X86::VCVTNE2PS2BF16Z128rrkz, X86::VCVTNE2PS2BF16Z128rmkz, 0}, {X86::VCVTNE2PS2BF16Z256rrkz, X86::VCVTNE2PS2BF16Z256rmkz, 0}, {X86::VCVTNE2PS2BF16Zrrkz, X86::VCVTNE2PS2BF16Zrmkz, 0}, @@ -4088,6 +4175,18 @@ static const X86FoldTableEntry Table3[] = { {X86::VCVTNEBF162IUBSZ128rrk, X86::VCVTNEBF162IUBSZ128rmk, 0}, {X86::VCVTNEBF162IUBSZ256rrk, X86::VCVTNEBF162IUBSZ256rmk, 0}, {X86::VCVTNEBF162IUBSZrrk, X86::VCVTNEBF162IUBSZrmk, 0}, + {X86::VCVTNEPH2BF8SZ128rrk, X86::VCVTNEPH2BF8SZ128rmk, 0}, + {X86::VCVTNEPH2BF8SZ256rrk, X86::VCVTNEPH2BF8SZ256rmk, 0}, + {X86::VCVTNEPH2BF8SZrrk, X86::VCVTNEPH2BF8SZrmk, 0}, + {X86::VCVTNEPH2BF8Z128rrk, X86::VCVTNEPH2BF8Z128rmk, 0}, + {X86::VCVTNEPH2BF8Z256rrk, X86::VCVTNEPH2BF8Z256rmk, 0}, + {X86::VCVTNEPH2BF8Zrrk, X86::VCVTNEPH2BF8Zrmk, 0}, + {X86::VCVTNEPH2HF8SZ128rrk, X86::VCVTNEPH2HF8SZ128rmk, 0}, + {X86::VCVTNEPH2HF8SZ256rrk, X86::VCVTNEPH2HF8SZ256rmk, 0}, + {X86::VCVTNEPH2HF8SZrrk, X86::VCVTNEPH2HF8SZrmk, 0}, + {X86::VCVTNEPH2HF8Z128rrk, X86::VCVTNEPH2HF8Z128rmk, 0}, + {X86::VCVTNEPH2HF8Z256rrk, X86::VCVTNEPH2HF8Z256rmk, 0}, + {X86::VCVTNEPH2HF8Zrrk, X86::VCVTNEPH2HF8Zrmk, 0}, {X86::VCVTNEPS2BF16Z128rrk, X86::VCVTNEPS2BF16Z128rmk, 0}, {X86::VCVTNEPS2BF16Z256rrk, X86::VCVTNEPS2BF16Z256rmk, 0}, {X86::VCVTNEPS2BF16Zrrk, X86::VCVTNEPS2BF16Zrmk, 0}, @@ -5745,6 +5844,33 @@ static const X86FoldTableEntry Table4[] = { {X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0}, {X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0}, {X86::VANDPSZrrk, X86::VANDPSZrmk, 0}, + {X86::VCVT2PS2PHXZ128rrk, X86::VCVT2PS2PHXZ128rmk, 0}, + {X86::VCVT2PS2PHXZ256rrk, X86::VCVT2PS2PHXZ256rmk, 0}, + {X86::VCVT2PS2PHXZrrk, X86::VCVT2PS2PHXZrmk, 0}, + {X86::VCVTBIASPH2BF8SZ128rrk, X86::VCVTBIASPH2BF8SZ128rmk, 0}, + {X86::VCVTBIASPH2BF8SZ256rrk, 
X86::VCVTBIASPH2BF8SZ256rmk, 0}, + {X86::VCVTBIASPH2BF8SZrrk, X86::VCVTBIASPH2BF8SZrmk, 0}, + {X86::VCVTBIASPH2BF8Z128rrk, X86::VCVTBIASPH2BF8Z128rmk, 0}, + {X86::VCVTBIASPH2BF8Z256rrk, X86::VCVTBIASPH2BF8Z256rmk, 0}, + {X86::VCVTBIASPH2BF8Zrrk, X86::VCVTBIASPH2BF8Zrmk, 0}, + {X86::VCVTBIASPH2HF8SZ128rrk, X86::VCVTBIASPH2HF8SZ128rmk, 0}, + {X86::VCVTBIASPH2HF8SZ256rrk, X86::VCVTBIASPH2HF8SZ256rmk, 0}, + {X86::VCVTBIASPH2HF8SZrrk, X86::VCVTBIASPH2HF8SZrmk, 0}, + {X86::VCVTBIASPH2HF8Z128rrk, X86::VCVTBIASPH2HF8Z128rmk, 0}, + {X86::VCVTBIASPH2HF8Z256rrk, X86::VCVTBIASPH2HF8Z256rmk, 0}, + {X86::VCVTBIASPH2HF8Zrrk, X86::VCVTBIASPH2HF8Zrmk, 0}, + {X86::VCVTNE2PH2BF8SZ128rrk, X86::VCVTNE2PH2BF8SZ128rmk, 0}, + {X86::VCVTNE2PH2BF8SZ256rrk, X86::VCVTNE2PH2BF8SZ256rmk, 0}, + {X86::VCVTNE2PH2BF8SZrrk, X86::VCVTNE2PH2BF8SZrmk, 0}, + {X86::VCVTNE2PH2BF8Z128rrk, X86::VCVTNE2PH2BF8Z128rmk, 0}, + {X86::VCVTNE2PH2BF8Z256rrk, X86::VCVTNE2PH2BF8Z256rmk, 0}, + {X86::VCVTNE2PH2BF8Zrrk, X86::VCVTNE2PH2BF8Zrmk, 0}, + {X86::VCVTNE2PH2HF8SZ128rrk, X86::VCVTNE2PH2HF8SZ128rmk, 0}, + {X86::VCVTNE2PH2HF8SZ256rrk, X86::VCVTNE2PH2HF8SZ256rmk, 0}, + {X86::VCVTNE2PH2HF8SZrrk, X86::VCVTNE2PH2HF8SZrmk, 0}, + {X86::VCVTNE2PH2HF8Z128rrk, X86::VCVTNE2PH2HF8Z128rmk, 0}, + {X86::VCVTNE2PH2HF8Z256rrk, X86::VCVTNE2PH2HF8Z256rmk, 0}, + {X86::VCVTNE2PH2HF8Zrrk, X86::VCVTNE2PH2HF8Zrmk, 0}, {X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmk, 0}, {X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmk, 0}, {X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmk, 0}, @@ -6956,6 +7082,18 @@ static const X86FoldTableEntry BroadcastTable1[] = { {X86::VCVTNEBF162IUBSZ128rr, X86::VCVTNEBF162IUBSZ128rmb, TB_BCAST_SH}, {X86::VCVTNEBF162IUBSZ256rr, X86::VCVTNEBF162IUBSZ256rmb, TB_BCAST_SH}, {X86::VCVTNEBF162IUBSZrr, X86::VCVTNEBF162IUBSZrmb, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8SZ128rr, X86::VCVTNEPH2BF8SZ128rmb, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8SZ256rr, X86::VCVTNEPH2BF8SZ256rmb, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8SZrr, 
X86::VCVTNEPH2BF8SZrmb, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8Z128rr, X86::VCVTNEPH2BF8Z128rmb, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8Z256rr, X86::VCVTNEPH2BF8Z256rmb, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8Zrr, X86::VCVTNEPH2BF8Zrmb, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8SZ128rr, X86::VCVTNEPH2HF8SZ128rmb, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8SZ256rr, X86::VCVTNEPH2HF8SZ256rmb, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8SZrr, X86::VCVTNEPH2HF8SZrmb, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8Z128rr, X86::VCVTNEPH2HF8Z128rmb, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8Z256rr, X86::VCVTNEPH2HF8Z256rmb, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8Zrr, X86::VCVTNEPH2HF8Zrmb, TB_BCAST_SH}, {X86::VCVTNEPS2BF16Z128rr, X86::VCVTNEPS2BF16Z128rmb, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Z256rr, X86::VCVTNEPS2BF16Z256rmb, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Zrr, X86::VCVTNEPS2BF16Zrmb, TB_BCAST_SS}, @@ -7314,6 +7452,21 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCMPPSZ128rri, X86::VCMPPSZ128rmbi, TB_BCAST_SS}, {X86::VCMPPSZ256rri, X86::VCMPPSZ256rmbi, TB_BCAST_SS}, {X86::VCMPPSZrri, X86::VCMPPSZrmbi, TB_BCAST_SS}, + {X86::VCVT2PS2PHXZ128rr, X86::VCVT2PS2PHXZ128rmb, TB_BCAST_SS}, + {X86::VCVT2PS2PHXZ256rr, X86::VCVT2PS2PHXZ256rmb, TB_BCAST_SS}, + {X86::VCVT2PS2PHXZrr, X86::VCVT2PS2PHXZrmb, TB_BCAST_SS}, + {X86::VCVTBIASPH2BF8SZ128rr, X86::VCVTBIASPH2BF8SZ128rmb, TB_BCAST_SH}, + {X86::VCVTBIASPH2BF8SZ256rr, X86::VCVTBIASPH2BF8SZ256rmb, TB_BCAST_SH}, + {X86::VCVTBIASPH2BF8SZrr, X86::VCVTBIASPH2BF8SZrmb, TB_BCAST_SH}, + {X86::VCVTBIASPH2BF8Z128rr, X86::VCVTBIASPH2BF8Z128rmb, TB_BCAST_SH}, + {X86::VCVTBIASPH2BF8Z256rr, X86::VCVTBIASPH2BF8Z256rmb, TB_BCAST_SH}, + {X86::VCVTBIASPH2BF8Zrr, X86::VCVTBIASPH2BF8Zrmb, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8SZ128rr, X86::VCVTBIASPH2HF8SZ128rmb, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8SZ256rr, X86::VCVTBIASPH2HF8SZ256rmb, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8SZrr, X86::VCVTBIASPH2HF8SZrmb, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8Z128rr, X86::VCVTBIASPH2HF8Z128rmb, TB_BCAST_SH}, + 
{X86::VCVTBIASPH2HF8Z256rr, X86::VCVTBIASPH2HF8Z256rmb, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8Zrr, X86::VCVTBIASPH2HF8Zrmb, TB_BCAST_SH}, {X86::VCVTDQ2PDZ128rrkz, X86::VCVTDQ2PDZ128rmbkz, TB_BCAST_D}, {X86::VCVTDQ2PDZ256rrkz, X86::VCVTDQ2PDZ256rmbkz, TB_BCAST_D}, {X86::VCVTDQ2PDZrrkz, X86::VCVTDQ2PDZrmbkz, TB_BCAST_D}, @@ -7323,6 +7476,18 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCVTDQ2PSZ128rrkz, X86::VCVTDQ2PSZ128rmbkz, TB_BCAST_D}, {X86::VCVTDQ2PSZ256rrkz, X86::VCVTDQ2PSZ256rmbkz, TB_BCAST_D}, {X86::VCVTDQ2PSZrrkz, X86::VCVTDQ2PSZrmbkz, TB_BCAST_D}, + {X86::VCVTNE2PH2BF8SZ128rr, X86::VCVTNE2PH2BF8SZ128rmb, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8SZ256rr, X86::VCVTNE2PH2BF8SZ256rmb, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8SZrr, X86::VCVTNE2PH2BF8SZrmb, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8Z128rr, X86::VCVTNE2PH2BF8Z128rmb, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8Z256rr, X86::VCVTNE2PH2BF8Z256rmb, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8Zrr, X86::VCVTNE2PH2BF8Zrmb, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8SZ128rr, X86::VCVTNE2PH2HF8SZ128rmb, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8SZ256rr, X86::VCVTNE2PH2HF8SZ256rmb, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8SZrr, X86::VCVTNE2PH2HF8SZrmb, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8Z128rr, X86::VCVTNE2PH2HF8Z128rmb, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8Z256rr, X86::VCVTNE2PH2HF8Z256rmb, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8Zrr, X86::VCVTNE2PH2HF8Zrmb, TB_BCAST_SH}, {X86::VCVTNE2PS2BF16Z128rr, X86::VCVTNE2PS2BF16Z128rmb, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Z256rr, X86::VCVTNE2PS2BF16Z256rmb, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Zrr, X86::VCVTNE2PS2BF16Zrmb, TB_BCAST_SS}, @@ -7332,6 +7497,18 @@ static const X86FoldTableEntry BroadcastTable2[] = { {X86::VCVTNEBF162IUBSZ128rrkz, X86::VCVTNEBF162IUBSZ128rmbkz, TB_BCAST_SH}, {X86::VCVTNEBF162IUBSZ256rrkz, X86::VCVTNEBF162IUBSZ256rmbkz, TB_BCAST_SH}, {X86::VCVTNEBF162IUBSZrrkz, X86::VCVTNEBF162IUBSZrmbkz, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8SZ128rrkz, X86::VCVTNEPH2BF8SZ128rmbkz, TB_BCAST_SH}, + 
{X86::VCVTNEPH2BF8SZ256rrkz, X86::VCVTNEPH2BF8SZ256rmbkz, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8SZrrkz, X86::VCVTNEPH2BF8SZrmbkz, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8Z128rrkz, X86::VCVTNEPH2BF8Z128rmbkz, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8Z256rrkz, X86::VCVTNEPH2BF8Z256rmbkz, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8Zrrkz, X86::VCVTNEPH2BF8Zrmbkz, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8SZ128rrkz, X86::VCVTNEPH2HF8SZ128rmbkz, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8SZ256rrkz, X86::VCVTNEPH2HF8SZ256rmbkz, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8SZrrkz, X86::VCVTNEPH2HF8SZrmbkz, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8Z128rrkz, X86::VCVTNEPH2HF8Z128rmbkz, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8Z256rrkz, X86::VCVTNEPH2HF8Z256rmbkz, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8Zrrkz, X86::VCVTNEPH2HF8Zrmbkz, TB_BCAST_SH}, {X86::VCVTNEPS2BF16Z128rrkz, X86::VCVTNEPS2BF16Z128rmbkz, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Z256rrkz, X86::VCVTNEPS2BF16Z256rmbkz, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Zrrkz, X86::VCVTNEPS2BF16Zrmbkz, TB_BCAST_SS}, @@ -8027,6 +8204,21 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmbik, TB_BCAST_SS}, {X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmbik, TB_BCAST_SS}, {X86::VCMPPSZrrik, X86::VCMPPSZrmbik, TB_BCAST_SS}, + {X86::VCVT2PS2PHXZ128rrkz, X86::VCVT2PS2PHXZ128rmbkz, TB_BCAST_SS}, + {X86::VCVT2PS2PHXZ256rrkz, X86::VCVT2PS2PHXZ256rmbkz, TB_BCAST_SS}, + {X86::VCVT2PS2PHXZrrkz, X86::VCVT2PS2PHXZrmbkz, TB_BCAST_SS}, + {X86::VCVTBIASPH2BF8SZ128rrkz, X86::VCVTBIASPH2BF8SZ128rmbkz, TB_BCAST_SH}, + {X86::VCVTBIASPH2BF8SZ256rrkz, X86::VCVTBIASPH2BF8SZ256rmbkz, TB_BCAST_SH}, + {X86::VCVTBIASPH2BF8SZrrkz, X86::VCVTBIASPH2BF8SZrmbkz, TB_BCAST_SH}, + {X86::VCVTBIASPH2BF8Z128rrkz, X86::VCVTBIASPH2BF8Z128rmbkz, TB_BCAST_SH}, + {X86::VCVTBIASPH2BF8Z256rrkz, X86::VCVTBIASPH2BF8Z256rmbkz, TB_BCAST_SH}, + {X86::VCVTBIASPH2BF8Zrrkz, X86::VCVTBIASPH2BF8Zrmbkz, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8SZ128rrkz, X86::VCVTBIASPH2HF8SZ128rmbkz, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8SZ256rrkz, 
X86::VCVTBIASPH2HF8SZ256rmbkz, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8SZrrkz, X86::VCVTBIASPH2HF8SZrmbkz, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8Z128rrkz, X86::VCVTBIASPH2HF8Z128rmbkz, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8Z256rrkz, X86::VCVTBIASPH2HF8Z256rmbkz, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8Zrrkz, X86::VCVTBIASPH2HF8Zrmbkz, TB_BCAST_SH}, {X86::VCVTDQ2PDZ128rrk, X86::VCVTDQ2PDZ128rmbk, TB_BCAST_D}, {X86::VCVTDQ2PDZ256rrk, X86::VCVTDQ2PDZ256rmbk, TB_BCAST_D}, {X86::VCVTDQ2PDZrrk, X86::VCVTDQ2PDZrmbk, TB_BCAST_D}, @@ -8036,6 +8228,18 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCVTDQ2PSZ128rrk, X86::VCVTDQ2PSZ128rmbk, TB_BCAST_D}, {X86::VCVTDQ2PSZ256rrk, X86::VCVTDQ2PSZ256rmbk, TB_BCAST_D}, {X86::VCVTDQ2PSZrrk, X86::VCVTDQ2PSZrmbk, TB_BCAST_D}, + {X86::VCVTNE2PH2BF8SZ128rrkz, X86::VCVTNE2PH2BF8SZ128rmbkz, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8SZ256rrkz, X86::VCVTNE2PH2BF8SZ256rmbkz, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8SZrrkz, X86::VCVTNE2PH2BF8SZrmbkz, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8Z128rrkz, X86::VCVTNE2PH2BF8Z128rmbkz, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8Z256rrkz, X86::VCVTNE2PH2BF8Z256rmbkz, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8Zrrkz, X86::VCVTNE2PH2BF8Zrmbkz, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8SZ128rrkz, X86::VCVTNE2PH2HF8SZ128rmbkz, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8SZ256rrkz, X86::VCVTNE2PH2HF8SZ256rmbkz, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8SZrrkz, X86::VCVTNE2PH2HF8SZrmbkz, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8Z128rrkz, X86::VCVTNE2PH2HF8Z128rmbkz, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8Z256rrkz, X86::VCVTNE2PH2HF8Z256rmbkz, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8Zrrkz, X86::VCVTNE2PH2HF8Zrmbkz, TB_BCAST_SH}, {X86::VCVTNE2PS2BF16Z128rrkz, X86::VCVTNE2PS2BF16Z128rmbkz, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Z256rrkz, X86::VCVTNE2PS2BF16Z256rmbkz, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Zrrkz, X86::VCVTNE2PS2BF16Zrmbkz, TB_BCAST_SS}, @@ -8045,6 +8249,18 @@ static const X86FoldTableEntry BroadcastTable3[] = { {X86::VCVTNEBF162IUBSZ128rrk, X86::VCVTNEBF162IUBSZ128rmbk, 
TB_BCAST_SH}, {X86::VCVTNEBF162IUBSZ256rrk, X86::VCVTNEBF162IUBSZ256rmbk, TB_BCAST_SH}, {X86::VCVTNEBF162IUBSZrrk, X86::VCVTNEBF162IUBSZrmbk, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8SZ128rrk, X86::VCVTNEPH2BF8SZ128rmbk, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8SZ256rrk, X86::VCVTNEPH2BF8SZ256rmbk, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8SZrrk, X86::VCVTNEPH2BF8SZrmbk, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8Z128rrk, X86::VCVTNEPH2BF8Z128rmbk, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8Z256rrk, X86::VCVTNEPH2BF8Z256rmbk, TB_BCAST_SH}, + {X86::VCVTNEPH2BF8Zrrk, X86::VCVTNEPH2BF8Zrmbk, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8SZ128rrk, X86::VCVTNEPH2HF8SZ128rmbk, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8SZ256rrk, X86::VCVTNEPH2HF8SZ256rmbk, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8SZrrk, X86::VCVTNEPH2HF8SZrmbk, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8Z128rrk, X86::VCVTNEPH2HF8Z128rmbk, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8Z256rrk, X86::VCVTNEPH2HF8Z256rmbk, TB_BCAST_SH}, + {X86::VCVTNEPH2HF8Zrrk, X86::VCVTNEPH2HF8Zrmbk, TB_BCAST_SH}, {X86::VCVTNEPS2BF16Z128rrk, X86::VCVTNEPS2BF16Z128rmbk, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Z256rrk, X86::VCVTNEPS2BF16Z256rmbk, TB_BCAST_SS}, {X86::VCVTNEPS2BF16Zrrk, X86::VCVTNEPS2BF16Zrmbk, TB_BCAST_SS}, @@ -8986,6 +9202,33 @@ static const X86FoldTableEntry BroadcastTable4[] = { {X86::VANDPSZ128rrk, X86::VANDPSZ128rmbk, TB_BCAST_SS}, {X86::VANDPSZ256rrk, X86::VANDPSZ256rmbk, TB_BCAST_SS}, {X86::VANDPSZrrk, X86::VANDPSZrmbk, TB_BCAST_SS}, + {X86::VCVT2PS2PHXZ128rrk, X86::VCVT2PS2PHXZ128rmbk, TB_BCAST_SS}, + {X86::VCVT2PS2PHXZ256rrk, X86::VCVT2PS2PHXZ256rmbk, TB_BCAST_SS}, + {X86::VCVT2PS2PHXZrrk, X86::VCVT2PS2PHXZrmbk, TB_BCAST_SS}, + {X86::VCVTBIASPH2BF8SZ128rrk, X86::VCVTBIASPH2BF8SZ128rmbk, TB_BCAST_SH}, + {X86::VCVTBIASPH2BF8SZ256rrk, X86::VCVTBIASPH2BF8SZ256rmbk, TB_BCAST_SH}, + {X86::VCVTBIASPH2BF8SZrrk, X86::VCVTBIASPH2BF8SZrmbk, TB_BCAST_SH}, + {X86::VCVTBIASPH2BF8Z128rrk, X86::VCVTBIASPH2BF8Z128rmbk, TB_BCAST_SH}, + {X86::VCVTBIASPH2BF8Z256rrk, X86::VCVTBIASPH2BF8Z256rmbk, TB_BCAST_SH}, + 
{X86::VCVTBIASPH2BF8Zrrk, X86::VCVTBIASPH2BF8Zrmbk, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8SZ128rrk, X86::VCVTBIASPH2HF8SZ128rmbk, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8SZ256rrk, X86::VCVTBIASPH2HF8SZ256rmbk, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8SZrrk, X86::VCVTBIASPH2HF8SZrmbk, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8Z128rrk, X86::VCVTBIASPH2HF8Z128rmbk, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8Z256rrk, X86::VCVTBIASPH2HF8Z256rmbk, TB_BCAST_SH}, + {X86::VCVTBIASPH2HF8Zrrk, X86::VCVTBIASPH2HF8Zrmbk, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8SZ128rrk, X86::VCVTNE2PH2BF8SZ128rmbk, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8SZ256rrk, X86::VCVTNE2PH2BF8SZ256rmbk, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8SZrrk, X86::VCVTNE2PH2BF8SZrmbk, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8Z128rrk, X86::VCVTNE2PH2BF8Z128rmbk, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8Z256rrk, X86::VCVTNE2PH2BF8Z256rmbk, TB_BCAST_SH}, + {X86::VCVTNE2PH2BF8Zrrk, X86::VCVTNE2PH2BF8Zrmbk, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8SZ128rrk, X86::VCVTNE2PH2HF8SZ128rmbk, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8SZ256rrk, X86::VCVTNE2PH2HF8SZ256rmbk, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8SZrrk, X86::VCVTNE2PH2HF8SZrmbk, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8Z128rrk, X86::VCVTNE2PH2HF8Z128rmbk, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8Z256rrk, X86::VCVTNE2PH2HF8Z256rmbk, TB_BCAST_SH}, + {X86::VCVTNE2PH2HF8Zrrk, X86::VCVTNE2PH2HF8Zrmbk, TB_BCAST_SH}, {X86::VCVTNE2PS2BF16Z128rrk, X86::VCVTNE2PS2BF16Z128rmbk, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Z256rrk, X86::VCVTNE2PS2BF16Z256rmbk, TB_BCAST_SS}, {X86::VCVTNE2PS2BF16Zrrk, X86::VCVTNE2PS2BF16Zrmbk, TB_BCAST_SS}, From 9fa2386ff13289d46ebf31656f4be7859f501468 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Wed, 21 Aug 2024 08:45:45 +0100 Subject: [PATCH 024/426] [RISCV] Add Hazard3 Core as taped out for RP2350 (#102452) Luke Wren's Hazard3 is a configurable, open-source 32-bit RISC-V core. 
The core's source code and docs are available on github: https://github.com/wren6991/hazard3 This is the RISC-V core used in the RP2350, a recently announced SoC by Raspberry Pi (which also contains Arm cores): https://datasheets.raspberrypi.com/rp2350/rp2350-datasheet.pdf We have agreed to name this `-mcpu` option `rp2350-hazard3`, and it reflects exactly the options configured in the RP2350 chips. Notably, the Zbc is not configured, and nor is B because the `misa.B` bit is not either. --- clang/test/Driver/riscv-cpus.c | 15 +++++++++++++++ clang/test/Misc/target-invalid-cpu-note/riscv.c | 2 ++ llvm/docs/ReleaseNotes.rst | 2 ++ llvm/lib/Target/RISCV/RISCVProcessors.td | 16 ++++++++++++++++ 4 files changed, 35 insertions(+) diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c index 2fa5b1753745f8..481eaae9153e86 100644 --- a/clang/test/Driver/riscv-cpus.c +++ b/clang/test/Driver/riscv-cpus.c @@ -402,6 +402,21 @@ // MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvkt" // MCPU-SIFIVE-P670-SAME: "-target-abi" "lp64d" +// RUN: %clang -target riscv32 -### -c %s 2>&1 -mcpu=rp2350-hazard3 | FileCheck -check-prefix=MCPU-HAZARD3 %s +// MCPU-HAZARD3: "-target-cpu" "rp2350-hazard3" +// MCPU-HAZARD3-SAME: "-target-feature" "+m" +// MCPU-HAZARD3-SAME: "-target-feature" "+a" +// MCPU-HAZARD3-SAME: "-target-feature" "+c" +// MCPU-HAZARD3-SAME: "-target-feature" "+zicsr" +// MCPU-HAZARD3-SAME: "-target-feature" "+zifencei" +// MCPU-HAZARD3-SAME: "-target-feature" "+zcb" +// MCPU-HAZARD3-SAME: "-target-feature" "+zcmp" +// MCPU-HAZARD3-SAME: "-target-feature" "+zba" +// MCPU-HAZARD3-SAME: "-target-feature" "+zbb" +// MCPU-HAZARD3-SAME: "-target-feature" "+zbkb" +// MCPU-HAZARD3-SAME: "-target-feature" "+zbs" +// MCPU-HAZARD3-SAME: "-target-abi" "ilp32" + // Check failed cases // RUN: not %clang --target=riscv32 -### -c %s 2>&1 -mcpu=generic-rv321 | FileCheck -check-prefix=FAIL-MCPU-NAME %s diff --git a/clang/test/Misc/target-invalid-cpu-note/riscv.c 
b/clang/test/Misc/target-invalid-cpu-note/riscv.c index 0a49755de7d25f..96d3cefd434d78 100644 --- a/clang/test/Misc/target-invalid-cpu-note/riscv.c +++ b/clang/test/Misc/target-invalid-cpu-note/riscv.c @@ -7,6 +7,7 @@ // RISCV32-NEXT: note: valid target CPU values are: // RISCV32-SAME: {{^}} generic-rv32 // RISCV32-SAME: {{^}}, rocket-rv32 +// RISCV32-SAME: {{^}}, rp2350-hazard3 // RISCV32-SAME: {{^}}, sifive-e20 // RISCV32-SAME: {{^}}, sifive-e21 // RISCV32-SAME: {{^}}, sifive-e24 @@ -48,6 +49,7 @@ // TUNE-RISCV32-NEXT: note: valid target CPU values are: // TUNE-RISCV32-SAME: {{^}} generic-rv32 // TUNE-RISCV32-SAME: {{^}}, rocket-rv32 +// TUNE-RISCV32-SAME: {{^}}, rp2350-hazard3 // TUNE-RISCV32-SAME: {{^}}, sifive-e20 // TUNE-RISCV32-SAME: {{^}}, sifive-e21 // TUNE-RISCV32-SAME: {{^}}, sifive-e24 diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 005c59e00fb128..65fa21e517940b 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -108,6 +108,8 @@ Changes to the RISC-V Backend fill value) rather than NOPs. * Added Syntacore SCR4 and SCR5 CPUs: ``-mcpu=syntacore-scr4/5-rv32/64`` * ``-mcpu=sifive-p470`` was added. +* Added Hazard3 CPU as taped out for RP2350: ``-mcpu=rp2350-hazard3`` (32-bit + only). * Fixed length vector support using RVV instructions now requires VLEN>=64. This means Zve32x and Zve32f will also require Zvl64b. The prior support was largely untested. 
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 640fe9670d542b..d4ec5ecc6489c1 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -454,3 +454,19 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60", FeatureStdExtZvkt, FeatureStdExtZvl256b]), [TuneDLenFactor2]>; + +def RP2350_HAZARD3 : RISCVProcessorModel<"rp2350-hazard3", + NoSchedModel, + [Feature32Bit, + FeatureStdExtI, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtC, + FeatureStdExtZicsr, + FeatureStdExtZifencei, + FeatureStdExtZba, + FeatureStdExtZbb, + FeatureStdExtZbs, + FeatureStdExtZbkb, + FeatureStdExtZcb, + FeatureStdExtZcmp]>; From 7063c9427e11b5028ab2e926768faa7ff431bb85 Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Wed, 21 Aug 2024 15:49:10 +0800 Subject: [PATCH 025/426] [mlir][Linalg] Bugfix for folder of `linalg.transpose` (#102888) Folder of linalg transpose only support tensor type. Fix #102576. --- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 4 +++ mlir/test/Dialect/Linalg/canonicalize.mlir | 16 ++++++++++ mlir/test/Dialect/Linalg/loops.mlir | 36 ++++++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 775ed8f37344ed..76df3ecf2d2bd4 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -1908,6 +1908,10 @@ void TransposeOp::getEffects( LogicalResult TransposeOp::fold(FoldAdaptor adaptor, SmallVectorImpl &result) { + // Only the tensor type is supported. + if (!isa(getInput().getType())) + return failure(); + // Single dimension transpose. 
if (getPermutation().size() == 0) { result.push_back(getInput()); diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir index a50fbb0fc3b86c..4bc2ed140da91a 100644 --- a/mlir/test/Dialect/Linalg/canonicalize.mlir +++ b/mlir/test/Dialect/Linalg/canonicalize.mlir @@ -1216,3 +1216,19 @@ func.func @concats_of_fill( // CHECK: %[[CONCAT:.+]] = tensor.concat dim(1) %[[EMPTY0]], %[[EMPTY1]] // CHECK: %[[FILL:.+]] = linalg.fill ins(%[[CST]] : f32) outs(%[[CONCAT]] : // CHECK: return %[[FILL]] + +// ----- + +func.func @transpose_buffer(%input: memref, + %init: memref) { + linalg.transpose ins(%input:memref) + outs(%init:memref) + permutation = [0] + func.return +} + +// CHECK-LABEL: func.func @transpose_buffer( +// CHECK-SAME: %[[VAL_0:.*]]: memref, +// CHECK-SAME: %[[VAL_1:.*]]: memref) { +// CHECK: linalg.transpose ins(%[[VAL_0]] : memref) +// CHECK-SAME: outs(%[[VAL_1]] : memref) permutation = [0] diff --git a/mlir/test/Dialect/Linalg/loops.mlir b/mlir/test/Dialect/Linalg/loops.mlir index b818170a8e7974..6ddbd06389f5eb 100644 --- a/mlir/test/Dialect/Linalg/loops.mlir +++ b/mlir/test/Dialect/Linalg/loops.mlir @@ -873,3 +873,39 @@ func.func @lower_to_loops_with_rank_reducing_subviews( // CHECKPARALLEL: %[[VAL:.+]] = memref.load %{{.+}}[%[[IV]]] // CHECKPARALLEL: memref.store %[[VAL]], %{{.+}}[%[[IV]]] // CHECKPARALLEL: } + +// ----- + +func.func @transpose(%input: memref, + %init: memref) { + linalg.transpose ins(%input:memref) + outs(%init:memref) + permutation = [0] + return +} +// CHECK-LABEL: func.func @transpose( +// CHECK-SAME: %[[VAL_0:.*]]: memref, +// CHECK-SAME: %[[VAL_1:.*]]: memref) { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = memref.dim %[[VAL_0]], %[[VAL_3]] : memref +// CHECK: scf.for %[[VAL_5:.*]] = %[[VAL_3]] to %[[VAL_4]] step %[[VAL_2]] { +// CHECK: %[[VAL_6:.*]] = memref.load %[[VAL_0]]{{\[}}%[[VAL_5]]] : memref +// 
CHECK: memref.store %[[VAL_6]], %[[VAL_1]]{{\[}}%[[VAL_5]]] : memref +// CHECK: } +// CHECK: return +// CHECK: } + +// CHECKPARALLEL-LABEL: func.func @transpose( +// CHECKPARALLEL-SAME: %[[VAL_0:.*]]: memref, +// CHECKPARALLEL-SAME: %[[VAL_1:.*]]: memref) { +// CHECKPARALLEL: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECKPARALLEL: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECKPARALLEL: %[[VAL_4:.*]] = memref.dim %[[VAL_0]], %[[VAL_3]] : memref +// CHECKPARALLEL: scf.parallel (%[[VAL_5:.*]]) = (%[[VAL_3]]) to (%[[VAL_4]]) step (%[[VAL_2]]) { +// CHECKPARALLEL: %[[VAL_6:.*]] = memref.load %[[VAL_0]]{{\[}}%[[VAL_5]]] : memref +// CHECKPARALLEL: memref.store %[[VAL_6]], %[[VAL_1]]{{\[}}%[[VAL_5]]] : memref +// CHECKPARALLEL: scf.reduce +// CHECKPARALLEL: } +// CHECKPARALLEL: return +// CHECKPARALLEL: } From 749ba7f6b29887a74d8f68837b21f478cd6e1486 Mon Sep 17 00:00:00 2001 From: Hugo Trachino Date: Wed, 21 Aug 2024 08:55:50 +0100 Subject: [PATCH 026/426] [mlir][vector] Add more tests for ConvertVectorToLLVM (5/n) (#104784) This patch aims to disambiguate test names for some of the Vector-To-LLVM conversion pass. Covers the following Ops: * vector.extractelement * vector.extract * vector.insertelement * vector.insert 1. Tests targetting `vector.{insert|extract}` Ops do not have names like `{insert|extract}_element*` which was confusing against `vector.{insert|extract}element` ops targetting tests. 2. Tests mention the type of the target/source buffer. e.g. `@extractelement` => `@extractelement_from_vec_1d` 3. Align LIT ligns consistently with other tests. 4. Tests with a different type for position have a name updated accordingly. `@extractelement_index` =>`@extractelement_index_position` 5. Tests with a dynamic value for position have a name updated accordingly. `@extract_element_with_value_1d` =>`@extract_scalar_dynamic_position_from_vec_1d` 6. 
Added the scalable flavour of the tests `insert_scalar_into_vec_2d_dynamic_position` and `@extract_scalar_from_vec_2d_dynamic_position` --- .../VectorToLLVM/vector-to-llvm.mlir | 232 ++++++++++-------- 1 file changed, 127 insertions(+), 105 deletions(-) diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index 0cd7ee6fb424fc..e7da21dbfbcdf3 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -1049,59 +1049,58 @@ func.func @shuffle_2D(%a: vector<1x4xf32>, %b: vector<2x4xf32>) -> vector<3x4xf3 // ----- -// CHECK-LABEL: @extractelement_0d -func.func @extractelement_0d(%a: vector) -> f32 { - // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : index) : i64 - // CHECK: llvm.extractelement %{{.*}}[%[[C0]] : {{.*}}] : vector<1xf32> - %1 = vector.extractelement %a[] : vector +func.func @extractelement_from_vec_0d_f32(%arg0: vector) -> f32 { + %1 = vector.extractelement %arg0[] : vector return %1 : f32 } +// CHECK-LABEL: @extractelement_from_vec_0d_f32 +// CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : index) : i64 +// CHECK: llvm.extractelement %{{.*}}[%[[C0]] : {{.*}}] : vector<1xf32> // ----- -func.func @extractelement(%arg0: vector<16xf32>) -> f32 { +func.func @extractelement_from_vec_1d_f32_idx_as_i32(%arg0: vector<16xf32>) -> f32 { %0 = arith.constant 15 : i32 %1 = vector.extractelement %arg0[%0 : i32]: vector<16xf32> return %1 : f32 } -// CHECK-LABEL: @extractelement( -// CHECK-SAME: %[[A:.*]]: vector<16xf32>) +// CHECK-LABEL: @extractelement_from_vec_1d_f32_idx_as_i32( +// CHECK-SAME: %[[A:.*]]: vector<16xf32>) // CHECK: %[[c:.*]] = arith.constant 15 : i32 // CHECK: %[[x:.*]] = llvm.extractelement %[[A]][%[[c]] : i32] : vector<16xf32> // CHECK: return %[[x]] : f32 -func.func @extractelement_scalable(%arg0: vector<[16]xf32>) -> f32 { +func.func @extractelement_from_vec_1d_f32_idx_as_i32_scalable(%arg0: vector<[16]xf32>) -> f32 { %0 
= arith.constant 15 : i32 %1 = vector.extractelement %arg0[%0 : i32]: vector<[16]xf32> return %1 : f32 } -// CHECK-LABEL: @extractelement_scalable( -// CHECK-SAME: %[[A:.*]]: vector<[16]xf32>) +// CHECK-LABEL: @extractelement_from_vec_1d_f32_idx_as_i32_scalable( +// CHECK-SAME: %[[A:.*]]: vector<[16]xf32>) // CHECK: %[[c:.*]] = arith.constant 15 : i32 // CHECK: %[[x:.*]] = llvm.extractelement %[[A]][%[[c]] : i32] : vector<[16]xf32> // CHECK: return %[[x]] : f32 // ----- - -func.func @extractelement_index(%arg0: vector<16xf32>) -> f32 { +func.func @extractelement_from_vec_1d_f32_idx_as_index(%arg0: vector<16xf32>) -> f32 { %0 = arith.constant 15 : index %1 = vector.extractelement %arg0[%0 : index]: vector<16xf32> return %1 : f32 } -// CHECK-LABEL: @extractelement_index( -// CHECK-SAME: %[[A:.*]]: vector<16xf32>) +// CHECK-LABEL: @extractelement_from_vec_1d_f32_idx_as_index( +// CHECK-SAME: %[[A:.*]]: vector<16xf32>) // CHECK: %[[c:.*]] = arith.constant 15 : index // CHECK: %[[i:.*]] = builtin.unrealized_conversion_cast %[[c]] : index to i64 // CHECK: %[[x:.*]] = llvm.extractelement %[[A]][%[[i]] : i64] : vector<16xf32> // CHECK: return %[[x]] : f32 -func.func @extractelement_index_scalable(%arg0: vector<[16]xf32>) -> f32 { +func.func @extractelement_from_vec_1d_f32_idx_as_index_scalable(%arg0: vector<[16]xf32>) -> f32 { %0 = arith.constant 15 : index %1 = vector.extractelement %arg0[%0 : index]: vector<[16]xf32> return %1 : f32 } -// CHECK-LABEL: @extractelement_index_scalable( -// CHECK-SAME: %[[A:.*]]: vector<[16]xf32>) +// CHECK-LABEL: @extractelement_from_vec_1d_f32_idx_as_index_scalable( +// CHECK-SAME: %[[A:.*]]: vector<[16]xf32>) // CHECK: %[[c:.*]] = arith.constant 15 : index // CHECK: %[[i:.*]] = builtin.unrealized_conversion_cast %[[c]] : index to i64 // CHECK: %[[x:.*]] = llvm.extractelement %[[A]][%[[i]] : i64] : vector<[16]xf32> @@ -1109,44 +1108,44 @@ func.func @extractelement_index_scalable(%arg0: vector<[16]xf32>) -> f32 { // ----- -func.func 
@extract_element_from_vec_1d(%arg0: vector<16xf32>) -> f32 { +func.func @extract_scalar_from_vec_1d_f32(%arg0: vector<16xf32>) -> f32 { %0 = vector.extract %arg0[15]: f32 from vector<16xf32> return %0 : f32 } -// CHECK-LABEL: @extract_element_from_vec_1d +// CHECK-LABEL: @extract_scalar_from_vec_1d_f32 // CHECK: llvm.mlir.constant(15 : i64) : i64 // CHECK: llvm.extractelement {{.*}}[{{.*}} : i64] : vector<16xf32> // CHECK: return {{.*}} : f32 -func.func @extract_element_from_vec_1d_scalable(%arg0: vector<[16]xf32>) -> f32 { +func.func @extract_scalar_from_vec_1d_f32_scalable(%arg0: vector<[16]xf32>) -> f32 { %0 = vector.extract %arg0[15]: f32 from vector<[16]xf32> return %0 : f32 } -// CHECK-LABEL: @extract_element_from_vec_1d_scalable +// CHECK-LABEL: @extract_scalar_from_vec_1d_f32_scalable // CHECK: llvm.mlir.constant(15 : i64) : i64 // CHECK: llvm.extractelement {{.*}}[{{.*}} : i64] : vector<[16]xf32> // CHECK: return {{.*}} : f32 // ----- -func.func @extract_index_element_from_vec_1d(%arg0: vector<16xindex>) -> index { +func.func @extract_scalar_from_vec_1d_index(%arg0: vector<16xindex>) -> index { %0 = vector.extract %arg0[15]: index from vector<16xindex> return %0 : index } -// CHECK-LABEL: @extract_index_element_from_vec_1d( -// CHECK-SAME: %[[A:.*]]: vector<16xindex>) +// CHECK-LABEL: @extract_scalar_from_vec_1d_index( +// CHECK-SAME: %[[A:.*]]: vector<16xindex>) // CHECK: %[[T0:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<16xindex> to vector<16xi64> // CHECK: %[[T1:.*]] = llvm.mlir.constant(15 : i64) : i64 // CHECK: %[[T2:.*]] = llvm.extractelement %[[T0]][%[[T1]] : i64] : vector<16xi64> // CHECK: %[[T3:.*]] = builtin.unrealized_conversion_cast %[[T2]] : i64 to index // CHECK: return %[[T3]] : index -func.func @extract_index_element_from_vec_1d_scalable(%arg0: vector<[16]xindex>) -> index { +func.func @extract_scalar_from_vec_1d_index_scalable(%arg0: vector<[16]xindex>) -> index { %0 = vector.extract %arg0[15]: index from vector<[16]xindex> 
return %0 : index } -// CHECK-LABEL: @extract_index_element_from_vec_1d_scalable( -// CHECK-SAME: %[[A:.*]]: vector<[16]xindex>) +// CHECK-LABEL: @extract_scalar_from_vec_1d_index_scalable( +// CHECK-SAME: %[[A:.*]]: vector<[16]xindex>) // CHECK: %[[T0:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<[16]xindex> to vector<[16]xi64> // CHECK: %[[T1:.*]] = llvm.mlir.constant(15 : i64) : i64 // CHECK: %[[T2:.*]] = llvm.extractelement %[[T0]][%[[T1]] : i64] : vector<[16]xi64> @@ -1155,57 +1154,57 @@ func.func @extract_index_element_from_vec_1d_scalable(%arg0: vector<[16]xindex>) // ----- -func.func @extract_vec_2d_from_vec_3d(%arg0: vector<4x3x16xf32>) -> vector<3x16xf32> { +func.func @extract_vec_2d_from_vec_3d_f32(%arg0: vector<4x3x16xf32>) -> vector<3x16xf32> { %0 = vector.extract %arg0[0]: vector<3x16xf32> from vector<4x3x16xf32> return %0 : vector<3x16xf32> } -// CHECK-LABEL: @extract_vec_2d_from_vec_3d +// CHECK-LABEL: @extract_vec_2d_from_vec_3d_f32 // CHECK: llvm.extractvalue {{.*}}[0] : !llvm.array<4 x array<3 x vector<16xf32>>> // CHECK: return {{.*}} : vector<3x16xf32> -func.func @extract_vec_2d_from_vec_3d_scalable(%arg0: vector<4x3x[16]xf32>) -> vector<3x[16]xf32> { +func.func @extract_vec_2d_from_vec_3d_f32_scalable(%arg0: vector<4x3x[16]xf32>) -> vector<3x[16]xf32> { %0 = vector.extract %arg0[0]: vector<3x[16]xf32> from vector<4x3x[16]xf32> return %0 : vector<3x[16]xf32> } -// CHECK-LABEL: @extract_vec_2d_from_vec_3d_scalable +// CHECK-LABEL: @extract_vec_2d_from_vec_3d_f32_scalable // CHECK: llvm.extractvalue {{.*}}[0] : !llvm.array<4 x array<3 x vector<[16]xf32>>> // CHECK: return {{.*}} : vector<3x[16]xf32> // ----- -func.func @extract_vec_1d_from_vec_3d(%arg0: vector<4x3x16xf32>) -> vector<16xf32> { +func.func @extract_vec_1d_from_vec_3d_f32(%arg0: vector<4x3x16xf32>) -> vector<16xf32> { %0 = vector.extract %arg0[0, 0]: vector<16xf32> from vector<4x3x16xf32> return %0 : vector<16xf32> } -// CHECK-LABEL: @extract_vec_1d_from_vec_3d +// 
CHECK-LABEL: @extract_vec_1d_from_vec_3d_f32 // CHECK: llvm.extractvalue {{.*}}[0, 0] : !llvm.array<4 x array<3 x vector<16xf32>>> // CHECK: return {{.*}} : vector<16xf32> -func.func @extract_vec_1d_from_vec_3d_scalable(%arg0: vector<4x3x[16]xf32>) -> vector<[16]xf32> { +func.func @extract_vec_1d_from_vec_3d_f32_scalable(%arg0: vector<4x3x[16]xf32>) -> vector<[16]xf32> { %0 = vector.extract %arg0[0, 0]: vector<[16]xf32> from vector<4x3x[16]xf32> return %0 : vector<[16]xf32> } -// CHECK-LABEL: @extract_vec_1d_from_vec_3d_scalable +// CHECK-LABEL: @extract_vec_1d_from_vec_3d_f32_scalable // CHECK: llvm.extractvalue {{.*}}[0, 0] : !llvm.array<4 x array<3 x vector<[16]xf32>>> // CHECK: return {{.*}} : vector<[16]xf32> // ----- -func.func @extract_element_from_vec_3d(%arg0: vector<4x3x16xf32>) -> f32 { +func.func @extract_scalar_from_vec_3d_f32(%arg0: vector<4x3x16xf32>) -> f32 { %0 = vector.extract %arg0[0, 0, 0]: f32 from vector<4x3x16xf32> return %0 : f32 } -// CHECK-LABEL: @extract_element_from_vec_3d +// CHECK-LABEL: @extract_scalar_from_vec_3d_f32 // CHECK: llvm.extractvalue {{.*}}[0, 0] : !llvm.array<4 x array<3 x vector<16xf32>>> // CHECK: llvm.mlir.constant(0 : i64) : i64 // CHECK: llvm.extractelement {{.*}}[{{.*}} : i64] : vector<16xf32> // CHECK: return {{.*}} : f32 -func.func @extract_element_from_vec_3d_scalable(%arg0: vector<4x3x[16]xf32>) -> f32 { +func.func @extract_scalar_from_vec_3d_f32_scalable(%arg0: vector<4x3x[16]xf32>) -> f32 { %0 = vector.extract %arg0[0, 0, 0]: f32 from vector<4x3x[16]xf32> return %0 : f32 } -// CHECK-LABEL: @extract_element_from_vec_3d_scalable +// CHECK-LABEL: @extract_scalar_from_vec_3d_f32_scalable // CHECK: llvm.extractvalue {{.*}}[0, 0] : !llvm.array<4 x array<3 x vector<[16]xf32>>> // CHECK: llvm.mlir.constant(0 : i64) : i64 // CHECK: llvm.extractelement {{.*}}[{{.*}} : i64] : vector<[16]xf32> @@ -1213,98 +1212,108 @@ func.func @extract_element_from_vec_3d_scalable(%arg0: vector<4x3x[16]xf32>) -> // ----- -func.func 
@extract_element_with_value_1d(%arg0: vector<16xf32>, %arg1: index) -> f32 { +func.func @extract_scalar_from_vec_1d_f32_dynamic_idx(%arg0: vector<16xf32>, %arg1: index) -> f32 { %0 = vector.extract %arg0[%arg1]: f32 from vector<16xf32> return %0 : f32 } -// CHECK-LABEL: @extract_element_with_value_1d +// CHECK-LABEL: @extract_scalar_from_vec_1d_f32_dynamic_idx // CHECK-SAME: %[[VEC:.+]]: vector<16xf32>, %[[INDEX:.+]]: index // CHECK: %[[UC:.+]] = builtin.unrealized_conversion_cast %[[INDEX]] : index to i64 // CHECK: llvm.extractelement %[[VEC]][%[[UC]] : i64] : vector<16xf32> -func.func @extract_element_with_value_1d_scalable(%arg0: vector<[16]xf32>, %arg1: index) -> f32 { +func.func @extract_scalar_from_vec_1d_f32_dynamic_idx_scalable(%arg0: vector<[16]xf32>, %arg1: index) -> f32 { %0 = vector.extract %arg0[%arg1]: f32 from vector<[16]xf32> return %0 : f32 } -// CHECK-LABEL: @extract_element_with_value_1d_scalable +// CHECK-LABEL: @extract_scalar_from_vec_1d_f32_dynamic_idx_scalable // CHECK-SAME: %[[VEC:.+]]: vector<[16]xf32>, %[[INDEX:.+]]: index // CHECK: %[[UC:.+]] = builtin.unrealized_conversion_cast %[[INDEX]] : index to i64 // CHECK: llvm.extractelement %[[VEC]][%[[UC]] : i64] : vector<[16]xf32> // ----- -func.func @extract_element_with_value_2d(%arg0: vector<1x16xf32>, %arg1: index) -> f32 { +func.func @extract_scalar_from_vec_2d_f32_dynamic_idx(%arg0: vector<1x16xf32>, %arg1: index) -> f32 { %0 = vector.extract %arg0[0, %arg1]: f32 from vector<1x16xf32> return %0 : f32 } // Multi-dim vectors are not supported but this test shouldn't crash. -// CHECK-LABEL: @extract_element_with_value_2d( +// CHECK-LABEL: @extract_scalar_from_vec_2d_f32_dynamic_idx( +// CHECK: vector.extract + +func.func @extract_scalar_from_vec_2d_f32_dynamic_idx_scalable(%arg0: vector<1x[16]xf32>, %arg1: index) -> f32 { + %0 = vector.extract %arg0[0, %arg1]: f32 from vector<1x[16]xf32> + return %0 : f32 +} + +// Multi-dim vectors are not supported but this test shouldn't crash. 
+ +// CHECK-LABEL: @extract_scalar_from_vec_2d_f32_dynamic_idx_scalable( // CHECK: vector.extract // ----- -// CHECK-LABEL: @insertelement_0d -// CHECK-SAME: %[[A:.*]]: f32, -func.func @insertelement_0d(%a: f32, %b: vector) -> vector { - // CHECK: %[[B:.*]] = builtin.unrealized_conversion_cast %{{.*}} : - // CHECK: vector to vector<1xf32> - // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : index) : i64 - // CHECK: %[[x:.*]] = llvm.insertelement %[[A]], %[[B]][%[[C0]] : {{.*}}] : vector<1xf32> - %1 = vector.insertelement %a, %b[] : vector +func.func @insertelement_into_vec_0d_f32(%arg0: f32, %arg1: vector) -> vector { + %1 = vector.insertelement %arg0, %arg1[] : vector return %1 : vector } +// CHECK-LABEL: @insertelement_into_vec_0d_f32 +// CHECK-SAME: %[[A:.*]]: f32, +// CHECK: %[[B:.*]] = builtin.unrealized_conversion_cast %{{.*}} : +// CHECK: vector to vector<1xf32> +// CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : index) : i64 +// CHECK: %[[x:.*]] = llvm.insertelement %[[A]], %[[B]][%[[C0]] : {{.*}}] : vector<1xf32> // ----- -func.func @insertelement(%arg0: f32, %arg1: vector<4xf32>) -> vector<4xf32> { +func.func @insertelement_into_vec_1d_f32_idx_as_i32(%arg0: f32, %arg1: vector<4xf32>) -> vector<4xf32> { %0 = arith.constant 3 : i32 %1 = vector.insertelement %arg0, %arg1[%0 : i32] : vector<4xf32> return %1 : vector<4xf32> } -// CHECK-LABEL: @insertelement( -// CHECK-SAME: %[[A:.*]]: f32, -// CHECK-SAME: %[[B:.*]]: vector<4xf32>) +// CHECK-LABEL: @insertelement_into_vec_1d_f32_idx_as_i32( +// CHECK-SAME: %[[A:.*]]: f32, +// CHECK-SAME: %[[B:.*]]: vector<4xf32>) // CHECK: %[[c:.*]] = arith.constant 3 : i32 // CHECK: %[[x:.*]] = llvm.insertelement %[[A]], %[[B]][%[[c]] : i32] : vector<4xf32> // CHECK: return %[[x]] : vector<4xf32> -func.func @insertelement_scalable(%arg0: f32, %arg1: vector<[4]xf32>) -> vector<[4]xf32> { +func.func @insertelement_into_vec_1d_f32_idx_as_i32_scalable(%arg0: f32, %arg1: vector<[4]xf32>) -> vector<[4]xf32> { %0 = arith.constant 3 : i32 %1 = 
vector.insertelement %arg0, %arg1[%0 : i32] : vector<[4]xf32> return %1 : vector<[4]xf32> } -// CHECK-LABEL: @insertelement_scalable( -// CHECK-SAME: %[[A:.*]]: f32, -// CHECK-SAME: %[[B:.*]]: vector<[4]xf32>) +// CHECK-LABEL: @insertelement_into_vec_1d_f32_idx_as_i32_scalable( +// CHECK-SAME: %[[A:.*]]: f32, +// CHECK-SAME: %[[B:.*]]: vector<[4]xf32>) // CHECK: %[[c:.*]] = arith.constant 3 : i32 // CHECK: %[[x:.*]] = llvm.insertelement %[[A]], %[[B]][%[[c]] : i32] : vector<[4]xf32> // CHECK: return %[[x]] : vector<[4]xf32> // ----- -func.func @insertelement_index(%arg0: f32, %arg1: vector<4xf32>) -> vector<4xf32> { +func.func @insertelement_into_vec_1d_f32_scalable_idx_as_index(%arg0: f32, %arg1: vector<4xf32>) -> vector<4xf32> { %0 = arith.constant 3 : index %1 = vector.insertelement %arg0, %arg1[%0 : index] : vector<4xf32> return %1 : vector<4xf32> } -// CHECK-LABEL: @insertelement_index( -// CHECK-SAME: %[[A:.*]]: f32, -// CHECK-SAME: %[[B:.*]]: vector<4xf32>) +// CHECK-LABEL: @insertelement_into_vec_1d_f32_scalable_idx_as_index( +// CHECK-SAME: %[[A:.*]]: f32, +// CHECK-SAME: %[[B:.*]]: vector<4xf32>) // CHECK: %[[c:.*]] = arith.constant 3 : index // CHECK: %[[i:.*]] = builtin.unrealized_conversion_cast %[[c]] : index to i64 // CHECK: %[[x:.*]] = llvm.insertelement %[[A]], %[[B]][%[[i]] : i64] : vector<4xf32> // CHECK: return %[[x]] : vector<4xf32> -func.func @insertelement_index_scalable(%arg0: f32, %arg1: vector<[4]xf32>) -> vector<[4]xf32> { +func.func @insertelement_into_vec_1d_f32_scalable_idx_as_index_scalable(%arg0: f32, %arg1: vector<[4]xf32>) -> vector<[4]xf32> { %0 = arith.constant 3 : index %1 = vector.insertelement %arg0, %arg1[%0 : index] : vector<[4]xf32> return %1 : vector<[4]xf32> } -// CHECK-LABEL: @insertelement_index_scalable( -// CHECK-SAME: %[[A:.*]]: f32, -// CHECK-SAME: %[[B:.*]]: vector<[4]xf32>) +// CHECK-LABEL: @insertelement_into_vec_1d_f32_scalable_idx_as_index_scalable( +// CHECK-SAME: %[[A:.*]]: f32, +// CHECK-SAME: %[[B:.*]]: 
vector<[4]xf32>) // CHECK: %[[c:.*]] = arith.constant 3 : index // CHECK: %[[i:.*]] = builtin.unrealized_conversion_cast %[[c]] : index to i64 // CHECK: %[[x:.*]] = llvm.insertelement %[[A]], %[[B]][%[[i]] : i64] : vector<[4]xf32> @@ -1312,50 +1321,50 @@ func.func @insertelement_index_scalable(%arg0: f32, %arg1: vector<[4]xf32>) -> v // ----- -func.func @insert_element_into_vec_1d(%arg0: f32, %arg1: vector<4xf32>) -> vector<4xf32> { +func.func @insert_scalar_into_vec_1d_f32(%arg0: f32, %arg1: vector<4xf32>) -> vector<4xf32> { %0 = vector.insert %arg0, %arg1[3] : f32 into vector<4xf32> return %0 : vector<4xf32> } -// CHECK-LABEL: @insert_element_into_vec_1d +// CHECK-LABEL: @insert_scalar_into_vec_1d_f32 // CHECK: llvm.mlir.constant(3 : i64) : i64 // CHECK: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : i64] : vector<4xf32> // CHECK: return {{.*}} : vector<4xf32> -func.func @insert_element_into_vec_1d_scalable(%arg0: f32, %arg1: vector<[4]xf32>) -> vector<[4]xf32> { +func.func @insert_scalar_into_vec_1d_f32_scalable(%arg0: f32, %arg1: vector<[4]xf32>) -> vector<[4]xf32> { %0 = vector.insert %arg0, %arg1[3] : f32 into vector<[4]xf32> return %0 : vector<[4]xf32> } -// CHECK-LABEL: @insert_element_into_vec_1d_scalable +// CHECK-LABEL: @insert_scalar_into_vec_1d_f32_scalable // CHECK: llvm.mlir.constant(3 : i64) : i64 // CHECK: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : i64] : vector<[4]xf32> // CHECK: return {{.*}} : vector<[4]xf32> // ----- -func.func @insert_index_element_into_vec_1d(%arg0: index, %arg1: vector<4xindex>) -> vector<4xindex> { +func.func @insert_scalar_into_vec_1d_index(%arg0: index, %arg1: vector<4xindex>) -> vector<4xindex> { %0 = vector.insert %arg0, %arg1[3] : index into vector<4xindex> return %0 : vector<4xindex> } -// CHECK-LABEL: @insert_index_element_into_vec_1d( -// CHECK-SAME: %[[A:.*]]: index, -// CHECK-SAME: %[[B:.*]]: vector<4xindex>) -// CHECK-DAG: %[[T0:.*]] = builtin.unrealized_conversion_cast %[[A]] : index to i64 -// CHECK-DAG: %[[T1:.*]] 
= builtin.unrealized_conversion_cast %[[B]] : vector<4xindex> to vector<4xi64> +// CHECK-LABEL: @insert_scalar_into_vec_1d_index( +// CHECK-SAME: %[[A:.*]]: index, +// CHECK-SAME: %[[B:.*]]: vector<4xindex>) +// CHECK-DAG: %[[T0:.*]] = builtin.unrealized_conversion_cast %[[A]] : index to i64 +// CHECK-DAG: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[B]] : vector<4xindex> to vector<4xi64> // CHECK: %[[T3:.*]] = llvm.mlir.constant(3 : i64) : i64 // CHECK: %[[T4:.*]] = llvm.insertelement %[[T0]], %[[T1]][%[[T3]] : i64] : vector<4xi64> // CHECK: %[[T5:.*]] = builtin.unrealized_conversion_cast %[[T4]] : vector<4xi64> to vector<4xindex> // CHECK: return %[[T5]] : vector<4xindex> -func.func @insert_index_element_into_vec_1d_scalable(%arg0: index, %arg1: vector<[4]xindex>) -> vector<[4]xindex> { +func.func @insert_scalar_into_vec_1d_index_scalable(%arg0: index, %arg1: vector<[4]xindex>) -> vector<[4]xindex> { %0 = vector.insert %arg0, %arg1[3] : index into vector<[4]xindex> return %0 : vector<[4]xindex> } -// CHECK-LABEL: @insert_index_element_into_vec_1d_scalable( -// CHECK-SAME: %[[A:.*]]: index, -// CHECK-SAME: %[[B:.*]]: vector<[4]xindex>) -// CHECK-DAG: %[[T0:.*]] = builtin.unrealized_conversion_cast %[[A]] : index to i64 -// CHECK-DAG: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[B]] : vector<[4]xindex> to vector<[4]xi64> +// CHECK-LABEL: @insert_scalar_into_vec_1d_index_scalable( +// CHECK-SAME: %[[A:.*]]: index, +// CHECK-SAME: %[[B:.*]]: vector<[4]xindex>) +// CHECK-DAG: %[[T0:.*]] = builtin.unrealized_conversion_cast %[[A]] : index to i64 +// CHECK-DAG: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[B]] : vector<[4]xindex> to vector<[4]xi64> // CHECK: %[[T3:.*]] = llvm.mlir.constant(3 : i64) : i64 // CHECK: %[[T4:.*]] = llvm.insertelement %[[T0]], %[[T1]][%[[T3]] : i64] : vector<[4]xi64> // CHECK: %[[T5:.*]] = builtin.unrealized_conversion_cast %[[T4]] : vector<[4]xi64> to vector<[4]xindex> @@ -1363,58 +1372,58 @@ func.func 
@insert_index_element_into_vec_1d_scalable(%arg0: index, %arg1: vector // ----- -func.func @insert_vec_2d_into_vec_3d(%arg0: vector<8x16xf32>, %arg1: vector<4x8x16xf32>) -> vector<4x8x16xf32> { +func.func @insert_vec_2d_into_vec_3d_f32(%arg0: vector<8x16xf32>, %arg1: vector<4x8x16xf32>) -> vector<4x8x16xf32> { %0 = vector.insert %arg0, %arg1[3] : vector<8x16xf32> into vector<4x8x16xf32> return %0 : vector<4x8x16xf32> } -// CHECK-LABEL: @insert_vec_2d_into_vec_3d +// CHECK-LABEL: @insert_vec_2d_into_vec_3d_f32 // CHECK: llvm.insertvalue {{.*}}, {{.*}}[3] : !llvm.array<4 x array<8 x vector<16xf32>>> // CHECK: return {{.*}} : vector<4x8x16xf32> -func.func @insert_vec_2d_into_vec_3d_scalable(%arg0: vector<8x[16]xf32>, %arg1: vector<4x8x[16]xf32>) -> vector<4x8x[16]xf32> { +func.func @insert_vec_2d_into_vec_3d_f32_scalable(%arg0: vector<8x[16]xf32>, %arg1: vector<4x8x[16]xf32>) -> vector<4x8x[16]xf32> { %0 = vector.insert %arg0, %arg1[3] : vector<8x[16]xf32> into vector<4x8x[16]xf32> return %0 : vector<4x8x[16]xf32> } -// CHECK-LABEL: @insert_vec_2d_into_vec_3d_scalable +// CHECK-LABEL: @insert_vec_2d_into_vec_3d_f32_scalable // CHECK: llvm.insertvalue {{.*}}, {{.*}}[3] : !llvm.array<4 x array<8 x vector<[16]xf32>>> // CHECK: return {{.*}} : vector<4x8x[16]xf32> // ----- -func.func @insert_vec_1d_into_vec_3d(%arg0: vector<16xf32>, %arg1: vector<4x8x16xf32>) -> vector<4x8x16xf32> { +func.func @insert_vec_1d_into_vec_3d_f32(%arg0: vector<16xf32>, %arg1: vector<4x8x16xf32>) -> vector<4x8x16xf32> { %0 = vector.insert %arg0, %arg1[3, 7] : vector<16xf32> into vector<4x8x16xf32> return %0 : vector<4x8x16xf32> } -// CHECK-LABEL: @insert_vec_1d_into_vec_3d +// CHECK-LABEL: @insert_vec_1d_into_vec_3d_f32 // CHECK: llvm.insertvalue {{.*}}, {{.*}}[3, 7] : !llvm.array<4 x array<8 x vector<16xf32>>> // CHECK: return {{.*}} : vector<4x8x16xf32> -func.func @insert_vec_1d_into_vec_3d_scalable(%arg0: vector<[16]xf32>, %arg1: vector<4x8x[16]xf32>) -> vector<4x8x[16]xf32> { +func.func 
@insert_vec_1d_into_vec_3d_f32_scalable(%arg0: vector<[16]xf32>, %arg1: vector<4x8x[16]xf32>) -> vector<4x8x[16]xf32> { %0 = vector.insert %arg0, %arg1[3, 7] : vector<[16]xf32> into vector<4x8x[16]xf32> return %0 : vector<4x8x[16]xf32> } -// CHECK-LABEL: @insert_vec_1d_into_vec_3d_scalable +// CHECK-LABEL: @insert_vec_1d_into_vec_3d_f32_scalable // CHECK: llvm.insertvalue {{.*}}, {{.*}}[3, 7] : !llvm.array<4 x array<8 x vector<[16]xf32>>> // CHECK: return {{.*}} : vector<4x8x[16]xf32> // ----- -func.func @insert_element_into_vec_3d(%arg0: f32, %arg1: vector<4x8x16xf32>) -> vector<4x8x16xf32> { +func.func @insert_scalar_into_vec_3d_f32(%arg0: f32, %arg1: vector<4x8x16xf32>) -> vector<4x8x16xf32> { %0 = vector.insert %arg0, %arg1[3, 7, 15] : f32 into vector<4x8x16xf32> return %0 : vector<4x8x16xf32> } -// CHECK-LABEL: @insert_element_into_vec_3d +// CHECK-LABEL: @insert_scalar_into_vec_3d_f32 // CHECK: llvm.extractvalue {{.*}}[3, 7] : !llvm.array<4 x array<8 x vector<16xf32>>> // CHECK: llvm.mlir.constant(15 : i64) : i64 // CHECK: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : i64] : vector<16xf32> // CHECK: llvm.insertvalue {{.*}}, {{.*}}[3, 7] : !llvm.array<4 x array<8 x vector<16xf32>>> // CHECK: return {{.*}} : vector<4x8x16xf32> -func.func @insert_element_into_vec_3d_scalable(%arg0: f32, %arg1: vector<4x8x[16]xf32>) -> vector<4x8x[16]xf32> { +func.func @insert_scalar_into_vec_3d_f32_scalable(%arg0: f32, %arg1: vector<4x8x[16]xf32>) -> vector<4x8x[16]xf32> { %0 = vector.insert %arg0, %arg1[3, 7, 15] : f32 into vector<4x8x[16]xf32> return %0 : vector<4x8x[16]xf32> } -// CHECK-LABEL: @insert_element_into_vec_3d_scalable +// CHECK-LABEL: @insert_scalar_into_vec_3d_f32_scalable // CHECK: llvm.extractvalue {{.*}}[3, 7] : !llvm.array<4 x array<8 x vector<[16]xf32>>> // CHECK: llvm.mlir.constant(15 : i64) : i64 // CHECK: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : i64] : vector<[16]xf32> @@ -1423,39 +1432,52 @@ func.func @insert_element_into_vec_3d_scalable(%arg0: f32, 
%arg1: vector<4x8x[16 // ----- -func.func @insert_element_with_value_1d(%arg0: vector<16xf32>, %arg1: f32, %arg2: index) +func.func @insert_scalar_into_vec_1d_f32_dynamic_idx(%arg0: vector<16xf32>, %arg1: f32, %arg2: index) -> vector<16xf32> { %0 = vector.insert %arg1, %arg0[%arg2]: f32 into vector<16xf32> return %0 : vector<16xf32> } -// CHECK-LABEL: @insert_element_with_value_1d +// CHECK-LABEL: @insert_scalar_into_vec_1d_f32_dynamic_idx // CHECK-SAME: %[[DST:.+]]: vector<16xf32>, %[[SRC:.+]]: f32, %[[INDEX:.+]]: index // CHECK: %[[UC:.+]] = builtin.unrealized_conversion_cast %[[INDEX]] : index to i64 // CHECK: llvm.insertelement %[[SRC]], %[[DST]][%[[UC]] : i64] : vector<16xf32> -func.func @insert_element_with_value_1d_scalable(%arg0: vector<[16]xf32>, %arg1: f32, %arg2: index) +func.func @insert_scalar_into_vec_1d_f32_dynamic_idx_scalable(%arg0: vector<[16]xf32>, %arg1: f32, %arg2: index) -> vector<[16]xf32> { %0 = vector.insert %arg1, %arg0[%arg2]: f32 into vector<[16]xf32> return %0 : vector<[16]xf32> } -// CHECK-LABEL: @insert_element_with_value_1d_scalable +// CHECK-LABEL: @insert_scalar_into_vec_1d_f32_dynamic_idx_scalable // CHECK-SAME: %[[DST:.+]]: vector<[16]xf32>, %[[SRC:.+]]: f32, %[[INDEX:.+]]: index // CHECK: %[[UC:.+]] = builtin.unrealized_conversion_cast %[[INDEX]] : index to i64 // CHECK: llvm.insertelement %[[SRC]], %[[DST]][%[[UC]] : i64] : vector<[16]xf32> // ----- -func.func @insert_element_with_value_2d(%base: vector<1x16xf32>, %value: f32, %idx: index) +func.func @insert_scalar_into_vec_2d_f32_dynamic_idx(%arg0: vector<1x16xf32>, %arg1: f32, %idx: index) -> vector<1x16xf32> { - %0 = vector.insert %value, %base[0, %idx]: f32 into vector<1x16xf32> + %0 = vector.insert %arg1, %arg0[0, %idx]: f32 into vector<1x16xf32> return %0 : vector<1x16xf32> } // Multi-dim vectors are not supported but this test shouldn't crash. 
-// CHECK-LABEL: @insert_element_with_value_2d( +// CHECK-LABEL: @insert_scalar_into_vec_2d_f32_dynamic_idx( +// CHECK: vector.insert + +// ----- + +func.func @insert_scalar_into_vec_2d_f32_dynamic_idx_scalable(%arg0: vector<1x[16]xf32>, %arg1: f32, %idx: index) + -> vector<1x[16]xf32> { + %0 = vector.insert %arg1, %arg0[0, %idx]: f32 into vector<1x[16]xf32> + return %0 : vector<1x[16]xf32> +} + +// Multi-dim vectors are not supported but this test shouldn't crash. + +// CHECK-LABEL: @insert_scalar_into_vec_2d_f32_dynamic_idx_scalable( // CHECK: vector.insert // ----- From 25ffd2efb33476f2a235f6cb1377759bab367324 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 21 Aug 2024 07:57:15 +0000 Subject: [PATCH 027/426] [gn build] Port 7c4cadfc4333 --- llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index b38de8d65536b0..73cee07a8b9d7d 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -134,9 +134,11 @@ copy("Headers") { "arm_cmse.h", "arm_neon_sve_bridge.h", "armintr.h", + "avx10_2_512convertintrin.h", "avx10_2_512minmaxintrin.h", "avx10_2_512niintrin.h", "avx10_2_512satcvtintrin.h", + "avx10_2convertintrin.h", "avx10_2minmaxintrin.h", "avx10_2niintrin.h", "avx10_2satcvtintrin.h", From 99741ac28522f519713907d7bea4438ea5412e21 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 21 Aug 2024 10:06:44 +0200 Subject: [PATCH 028/426] [VPlan] Introduce explicit ExtractFromEnd recipes for live-outs. (#100658) Introduce explicit ExtractFromEnd recipes to extract the final values for live-outs instead of implicitly extracting in VPLiveOut::fixPhi. 
This is a follow-up to the recent changes of modeling extracts for recurrences and consolidates live-out extract creation for fixed-order recurrences at a single place: addLiveOutsForFirstOrderRecurrences. It is also in preparation of replacing VPLiveOut with VPIRInstructions wrapping the original scalar phis. PR: https://github.com/llvm/llvm-project/pull/100658 --- .../Transforms/Vectorize/LoopVectorize.cpp | 205 +++++++++++++++--- .../Transforms/Vectorize/VPlanPatternMatch.h | 5 + .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 94 -------- .../RISCV/vplan-vp-intrinsics-reduction.ll | 9 +- .../first-order-recurrence-chains.ll | 26 +-- .../first-order-recurrence-complex.ll | 2 +- ...-order-recurrence-sink-replicate-region.ll | 3 +- .../instruction-only-used-outside-of-loop.ll | 22 +- llvm/test/Transforms/LoopVectorize/pr36983.ll | 2 +- .../pr55167-fold-tail-live-out.ll | 2 +- .../LoopVectorize/select-cmp-multiuse.ll | 4 +- .../LoopVectorize/vplan-printing.ll | 11 +- 13 files changed, 224 insertions(+), 169 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 86e50a7f914372..364166b3ab5380 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8527,9 +8527,11 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); } -// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the -// original exit block. -static void addUsersInExitBlock( +// Collect (ExitPhi, ExitingValue) pairs phis in the original exit block that +// are modeled in VPlan. Some exiting values are not modeled explicitly yet and +// won't be included. Those are un-truncated VPWidenIntOrFpInductionRecipe, +// VPWidenPointerInductionRecipe and induction increments. 
+static MapVector collectUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { auto MiddleVPBB = @@ -8538,9 +8540,8 @@ static void addUsersInExitBlock( // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. if (MiddleVPBB->getNumSuccessors() != 2) - return; - - // Introduce VPUsers modeling the exit values. + return {}; + MapVector ExitingValuesToFix; BasicBlock *ExitBB = cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); @@ -8561,15 +8562,52 @@ static void addUsersInExitBlock( return P && Inductions.contains(P); }))) continue; - Plan.addLiveOut(&ExitPhi, V); + ExitingValuesToFix.insert({&ExitPhi, V}); } + return ExitingValuesToFix; } -/// Feed a resume value for every FOR from the vector loop to the scalar loop, -/// if middle block branches to scalar preheader, by introducing ExtractFromEnd -/// and ResumePhi recipes in each, respectively, and a VPLiveOut which uses the -/// latter and corresponds to the scalar header. -static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { +// Add exit values to \p Plan. Extracts and VPLiveOuts are added for each entry +// in \p ExitingValuesToFix. +static void +addUsersInExitBlock(VPlan &Plan, + MapVector &ExitingValuesToFix) { + if (ExitingValuesToFix.empty()) + return; + + auto MiddleVPBB = + cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); + BasicBlock *ExitBB = + cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); + // TODO: set B to MiddleVPBB->getFirstNonPhi(), taking care of affected tests. + VPBuilder B(MiddleVPBB); + if (auto *Terminator = MiddleVPBB->getTerminator()) { + auto *Condition = dyn_cast(Terminator->getOperand(0)); + assert((!Condition || Condition->getParent() == MiddleVPBB) && + "Condition expected in MiddleVPBB"); + B.setInsertPoint(Condition ? Condition : Terminator); + } + + // Introduce VPUsers modeling the exit values. 
+ for (const auto &[ExitPhi, V] : ExitingValuesToFix) { + VPValue *Ext = B.createNaryOp( + VPInstruction::ExtractFromEnd, + {V, Plan.getOrAddLiveIn(ConstantInt::get( + IntegerType::get(ExitBB->getContext(), 32), 1))}); + Plan.addLiveOut(ExitPhi, Ext); + } +} + +/// Handle live-outs for first order reductions, both in the scalar preheader +/// and the original exit block: +/// 1. Feed a resume value for every FOR from the vector loop to the scalar +/// loop, if middle block branches to scalar preheader, by introducing +/// ExtractFromEnd and ResumePhi recipes in each, respectively, and a +/// VPLiveOut which uses the latter and corresponds to the scalar header. +/// 2. Feed the penultimate value of recurrences to their LCSSA phi users in +/// the original exit block using a VPLiveOut. +static void addLiveOutsForFirstOrderRecurrences( + VPlan &Plan, MapVector &ExitingValuesToFix) { VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); // Start by finding out if middle block branches to scalar preheader, which is @@ -8578,21 +8616,31 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { // TODO: Should be replaced by // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the // scalar region is modeled as well. - VPBasicBlock *ScalarPHVPBB = nullptr; auto *MiddleVPBB = cast(VectorRegion->getSingleSuccessor()); - for (VPBlockBase *Succ : MiddleVPBB->getSuccessors()) { - if (isa(Succ)) - continue; - assert(!ScalarPHVPBB && "Two candidates for ScalarPHVPBB?"); - ScalarPHVPBB = cast(Succ); + BasicBlock *ExitBB = nullptr; + VPBasicBlock *ScalarPHVPBB = nullptr; + if (MiddleVPBB->getNumSuccessors() == 2) { + // Order is strict: first is the exit block, second is the scalar preheader. 
+ ExitBB = + cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); + ScalarPHVPBB = cast(MiddleVPBB->getSuccessors()[1]); + } else if (ExitingValuesToFix.empty()) { + ScalarPHVPBB = cast(MiddleVPBB->getSingleSuccessor()); + } else { + ExitBB = cast(MiddleVPBB->getSingleSuccessor()) + ->getIRBasicBlock(); } - if (!ScalarPHVPBB) + if (!ScalarPHVPBB) { + assert(ExitingValuesToFix.empty() && + "missed inserting extracts for exiting values"); return; + } VPBuilder ScalarPHBuilder(ScalarPHVPBB); VPBuilder MiddleBuilder(MiddleVPBB); // Reset insert point so new recipes are inserted before terminator and // condition, if there is either the former or both. + // TODO: set MiddleBuilder to MiddleVPBB->getFirstNonPhi(). if (auto *Terminator = MiddleVPBB->getTerminator()) { auto *Condition = dyn_cast(Terminator->getOperand(0)); assert((!Condition || Condition->getParent() == MiddleVPBB) && @@ -8601,12 +8649,81 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { } VPValue *OneVPV = Plan.getOrAddLiveIn( ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); + VPValue *TwoVPV = Plan.getOrAddLiveIn( + ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2)); for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) { auto *FOR = dyn_cast(&HeaderPhi); if (!FOR) continue; + // This is the second phase of vectorizing first-order recurrences, creating + // extract for users outside the loop. An overview of the transformation is + // described below. Suppose we have the following loop with some use after + // the loop of the last a[i-1], + // + // for (int i = 0; i < n; ++i) { + // t = a[i - 1]; + // b[i] = a[i] - t; + // } + // use t; + // + // There is a first-order recurrence on "a". 
For this loop, the shorthand + // scalar IR looks like: + // + // scalar.ph: + // s.init = a[-1] + // br scalar.body + // + // scalar.body: + // i = phi [0, scalar.ph], [i+1, scalar.body] + // s1 = phi [s.init, scalar.ph], [s2, scalar.body] + // s2 = a[i] + // b[i] = s2 - s1 + // br cond, scalar.body, exit.block + // + // exit.block: + // use = lcssa.phi [s1, scalar.body] + // + // In this example, s1 is a recurrence because it's value depends on the + // previous iteration. In the first phase of vectorization, we created a + // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts + // for users in the scalar preheader and exit block. + // + // vector.ph: + // v_init = vector(..., ..., ..., a[-1]) + // br vector.body + // + // vector.body + // i = phi [0, vector.ph], [i+4, vector.body] + // v1 = phi [v_init, vector.ph], [v2, vector.body] + // v2 = a[i, i+1, i+2, i+3] + // b[i] = v2 - v1 + // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2)) + // b[i, i+1, i+2, i+3] = v2 - v1 + // br cond, vector.body, middle.block + // + // middle.block: + // vector.recur.extract.for.phi = v2(2) + // vector.recur.extract = v2(3) + // br cond, scalar.ph, exit.block + // + // scalar.ph: + // scalar.recur.init = phi [vector.recur.extract, middle.block], + // [s.init, otherwise] + // br scalar.body + // + // scalar.body: + // i = phi [0, scalar.ph], [i+1, scalar.body] + // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body] + // s2 = a[i] + // b[i] = s2 - s1 + // br cond, scalar.body, exit.block + // + // exit.block: + // lo = lcssa.phi [s1, scalar.body], + // [vector.recur.extract.for.phi, middle.block] + // // Extract the resume value and create a new VPLiveOut for it. 
auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), OneVPV}, @@ -8614,7 +8731,28 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp( VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {}, "scalar.recur.init"); - Plan.addLiveOut(cast(FOR->getUnderlyingInstr()), ResumePhiRecipe); + auto *FORPhi = cast(FOR->getUnderlyingInstr()); + Plan.addLiveOut(FORPhi, ResumePhiRecipe); + + // Now create VPLiveOuts for users in the exit block. + // Extract the penultimate value of the recurrence and add VPLiveOut + // users of the recurrence splice. + + // No edge from the middle block to the unique exit block has been inserted + // and there is nothing to fix from vector loop; phis should have incoming + // from scalar loop only. + if (ExitingValuesToFix.empty()) + continue; + for (User *U : FORPhi->users()) { + auto *UI = cast(U); + if (UI->getParent() != ExitBB) + continue; + VPValue *Ext = MiddleBuilder.createNaryOp( + VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {}, + "vector.recur.extract.for.phi"); + Plan.addLiveOut(cast(UI), Ext); + ExitingValuesToFix.erase(cast(UI)); + } } } @@ -8769,16 +8907,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // After here, VPBB should not be used. 
VPBB = nullptr; - addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, - Legal->getInductionVars()); - assert(isa(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); - addLiveOutsForFirstOrderRecurrences(*Plan); + MapVector ExitingValuesToFix = collectUsersInExitBlock( + OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); + + addLiveOutsForFirstOrderRecurrences(*Plan, ExitingValuesToFix); + addUsersInExitBlock(*Plan, ExitingValuesToFix); // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to @@ -8931,6 +9070,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // iteration. The final value is selected by the final ComputeReductionResult. void LoopVectorizationPlanner::adjustRecipesForReductions( VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { + using namespace VPlanPatternMatch; VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores @@ -8988,10 +9128,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( for (unsigned I = 0; I != Worklist.size(); ++I) { VPSingleDefRecipe *Cur = Worklist[I]; for (VPUser *U : Cur->users()) { - auto *UserRecipe = dyn_cast(U); - if (!UserRecipe) { - assert(isa(U) && - "U must either be a VPSingleDef or VPLiveOut"); + auto *UserRecipe = cast(U); + if (!UserRecipe->getParent()->getEnclosingLoopRegion()) { + assert(match(U, m_Binary( + m_VPValue(), m_VPValue())) && + "U must be an ExtractFromEnd VPInstruction"); continue; } Worklist.insert(UserRecipe); @@ -9208,9 +9349,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( auto *FinalReductionResult = new VPInstruction( 
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); FinalReductionResult->insertBefore(*MiddleVPBB, IP); - OrigExitingVPV->replaceUsesWithIf( - FinalReductionResult, - [](VPUser &User, unsigned) { return isa(&User); }); + OrigExitingVPV->replaceUsesWithIf(FinalReductionResult, [](VPUser &User, + unsigned) { + return match(&User, m_Binary(m_VPValue(), + m_VPValue())); + }); } VPlanTransforms::clearReductionWrapFlags(*Plan); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 9cd7712624bac4..5f86f2c969651b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -29,6 +29,11 @@ template bool match(Val *V, const Pattern &P) { return const_cast(P).match(V); } +template bool match(VPUser *U, const Pattern &P) { + auto *R = dyn_cast(U); + return R && match(R, P); +} + template struct class_match { template bool match(ITy *V) { return isa(V); } }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index c9d603612aecea..aea5e681b081c6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -194,9 +194,6 @@ bool VPRecipeBase::mayHaveSideEffects() const { void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { VPValue *ExitValue = getOperand(0); - auto Lane = vputils::isUniformAfterVectorization(ExitValue) - ? VPLane::getFirstLane() - : VPLane::getLastLaneForVF(State.VF); VPBasicBlock *MiddleVPBB = cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe(); @@ -207,10 +204,7 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { ? MiddleVPBB : ExitingVPBB; BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; - // Set insertion point in PredBB in case an extract needs to be generated. - // TODO: Model extracts explicitly. 
- State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt()); - Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane)); + Value *V = State.get(ExitValue, VPIteration(0, 0)); if (Phi->getBasicBlockIndex(PredBB) != -1) Phi->setIncomingValueForBlock(PredBB, V); else diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 045f6c356669fa..a2496f067024cb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -826,20 +826,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, if (auto *FOR = dyn_cast(&R)) RecurrencePhis.push_back(FOR); - VPBasicBlock *MiddleVPBB = - cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); - VPBuilder MiddleBuilder; - // Set insert point so new recipes are inserted before terminator and - // condition, if there is either the former or both. - if (auto *Term = - dyn_cast_or_null(MiddleVPBB->getTerminator())) { - if (auto *Cmp = dyn_cast(Term->getOperand(0))) - MiddleBuilder.setInsertPoint(Cmp); - else - MiddleBuilder.setInsertPoint(Term); - } else - MiddleBuilder.setInsertPoint(MiddleVPBB); - for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) { SmallPtrSet SeenPhis; VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe(); @@ -872,86 +858,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, // Set the first operand of RecurSplice to FOR again, after replacing // all users. RecurSplice->setOperand(0, FOR); - - // This is the second phase of vectorizing first-order recurrences. An - // overview of the transformation is described below. Suppose we have the - // following loop with some use after the loop of the last a[i-1], - // - // for (int i = 0; i < n; ++i) { - // t = a[i - 1]; - // b[i] = a[i] - t; - // } - // use t; - // - // There is a first-order recurrence on "a". 
For this loop, the shorthand - // scalar IR looks like: - // - // scalar.ph: - // s_init = a[-1] - // br scalar.body - // - // scalar.body: - // i = phi [0, scalar.ph], [i+1, scalar.body] - // s1 = phi [s_init, scalar.ph], [s2, scalar.body] - // s2 = a[i] - // b[i] = s2 - s1 - // br cond, scalar.body, exit.block - // - // exit.block: - // use = lcssa.phi [s1, scalar.body] - // - // In this example, s1 is a recurrence because it's value depends on the - // previous iteration. In the first phase of vectorization, we created a - // vector phi v1 for s1. We now complete the vectorization and produce the - // shorthand vector IR shown below (for VF = 4, UF = 1). - // - // vector.ph: - // v_init = vector(..., ..., ..., a[-1]) - // br vector.body - // - // vector.body - // i = phi [0, vector.ph], [i+4, vector.body] - // v1 = phi [v_init, vector.ph], [v2, vector.body] - // v2 = a[i, i+1, i+2, i+3]; - // v3 = vector(v1(3), v2(0, 1, 2)) - // b[i, i+1, i+2, i+3] = v2 - v3 - // br cond, vector.body, middle.block - // - // middle.block: - // s_penultimate = v2(2) = v3(3) - // s_resume = v2(3) - // br cond, scalar.ph, exit.block - // - // scalar.ph: - // s_init' = phi [s_resume, middle.block], [s_init, otherwise] - // br scalar.body - // - // scalar.body: - // i = phi [0, scalar.ph], [i+1, scalar.body] - // s1 = phi [s_init', scalar.ph], [s2, scalar.body] - // s2 = a[i] - // b[i] = s2 - s1 - // br cond, scalar.body, exit.block - // - // exit.block: - // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block] - // - // After execution completes the vector loop, we extract the next value of - // the recurrence (x) to use as the initial value in the scalar loop. This - // is modeled by ExtractFromEnd. - Type *IntTy = Plan.getCanonicalIV()->getScalarType(); - - // Extract the penultimate value of the recurrence and update VPLiveOut - // users of the recurrence splice. 
Note that the extract of the final value - // used to resume in the scalar loop is created earlier during VPlan - // construction. - auto *Penultimate = cast(MiddleBuilder.createNaryOp( - VPInstruction::ExtractFromEnd, - {FOR->getBackedgeValue(), - Plan.getOrAddLiveIn(ConstantInt::get(IntTy, 2))}, - {}, "vector.recur.extract.for.phi")); - RecurSplice->replaceUsesWithIf( - Penultimate, [](VPUser &U, unsigned) { return isa(&U); }); } return true; } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 16db6cf828af8a..f14ffe854a3a6b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -55,6 +55,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-EMPTY: ; IF-EVL-INLOOP-NEXT: middle.block: ; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> +; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> ; IF-EVL-INLOOP-NEXT: EMIT branch-on-cond ir ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph ; IF-EVL-INLOOP-EMPTY: @@ -64,7 +65,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: scalar.ph: ; IF-EVL-INLOOP-NEXT: No successors ; IF-EVL-INLOOP-EMPTY: -; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]> +; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]> ; IF-EVL-INLOOP-NEXT: } ; @@ -93,6 +94,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-EMPTY: ; NO-VP-OUTLOOP-NEXT: middle.block: ; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> ; NO-VP-OUTLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> ; NO-VP-OUTLOOP-NEXT: EMIT branch-on-cond 
vp<[[BOC]]> ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb, scalar.ph @@ -103,7 +105,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: scalar.ph: ; NO-VP-OUTLOOP-NEXT: No successors ; NO-VP-OUTLOOP-EMPTY: -; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]> +; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]> ; NO-VP-OUTLOOP-NEXT: } ; @@ -132,6 +134,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-EMPTY: ; NO-VP-INLOOP-NEXT: middle.block: ; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> +; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> ; NO-VP-INLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> ; NO-VP-INLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]> ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph @@ -142,7 +145,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: scalar.ph: ; NO-VP-INLOOP-NEXT: No successors ; NO-VP-INLOOP-EMPTY: -; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]> +; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]> ; NO-VP-INLOOP-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll index 94f35ad453670b..1e34e1d0d517d1 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll @@ -19,8 +19,8 @@ define i16 @test_chained_first_order_recurrences_1(ptr %ptr) { ; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: 
[[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 ; entry: @@ -62,8 +62,8 @@ define i16 @test_chained_first_order_recurrences_2(ptr %ptr) { ; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body, !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; entry: @@ -108,10 +108,10 @@ define i16 @test_chained_first_order_recurrences_3(ptr %ptr) { ; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 ; entry: @@ -220,10 +220,10 @@ define i16 @test_chained_first_order_recurrences_3_reordered_1(ptr %ptr) { ; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop 
[[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; entry: @@ -271,10 +271,10 @@ define i16 @test_chained_first_order_recurrences_3_reordered_2(ptr %ptr) { ; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; entry: @@ -322,10 +322,10 @@ define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(ptr %ptr) ; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: 
[[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 ; entry: @@ -372,10 +372,10 @@ define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(ptr % ; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 ; entry: @@ -421,8 +421,8 @@ define double @test_chained_first_order_recurrence_sink_users_1(ptr %ptr) { ; CHECK-NEXT: br i1 [[TMP9]], label %middle.block, label %vector.body, !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> 
[[WIDE_LOAD]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x double> [[TMP4]], i32 2 ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll index 371c58e8eb9cc9..eda92aae095ddf 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll @@ -1125,7 +1125,7 @@ define void @test_for_sink_instruction_after_same_incoming_2(ptr %ptr) { ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ] ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY]] ] ; CHECK-NEXT: [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index 06fbeafba31c01..9e49cf6b42c6b0 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -220,6 +220,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 
%y, ptr %ptr) optsize { ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%and.red>, vp<[[SEL]]> ; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%recur.next>, ir<1> +; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1> ; CHECK-NEXT: EMIT branch-on-cond ir ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: @@ -230,8 +231,8 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i32 %res = vp<[[RED_RES]]> ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]> +; CHECK-NEXT: Live-out i32 %res = vp<[[RED_EX]]> ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll index 5f5cd78dc2d30c..553fc374e0fdf2 100644 --- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll @@ -34,7 +34,7 @@ define i32 @one_direct_branch(ptr %src) { ; CHECK-NEXT: [[PHI_XOR:%.*]] = phi i32 [ [[XOR]], [[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[XOR_LCSSA]] @@ -198,6 +198,8 @@ exit: } ; Test case for PR54370. +; TODO: Should either compute the final value of the truncated IV independent +; of loop or scalarize the vector IV. 
define i32 @optimizable_trunc_used_outside() { ; CHECK-LABEL: @optimizable_trunc_used_outside( ; CHECK-NEXT: entry: @@ -205,16 +207,14 @@ define i32 @optimizable_trunc_used_outside() { ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 3 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3 ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -226,7 +226,7 @@ define i32 @optimizable_trunc_used_outside() { ; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[IV_TRUNC_LCSSA:%.*]] = phi i32 [ [[IV_TRUNC]], [[LOOP]] ], [ [[TMP4]], 
[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[IV_TRUNC_LCSSA:%.*]] = phi i32 [ [[IV_TRUNC]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[IV_TRUNC_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/pr36983.ll b/llvm/test/Transforms/LoopVectorize/pr36983.ll index f4da4f355eb0d1..20689456fa4d16 100644 --- a/llvm/test/Transforms/LoopVectorize/pr36983.ll +++ b/llvm/test/Transforms/LoopVectorize/pr36983.ll @@ -3,7 +3,7 @@ ; There could be more than one LCSSA PHIs in loop exit block. ; CHECK-LABEL: bb1.bb3_crit_edge: -; CHECK: %_tmp133.lcssa1 = phi i16 [ %_tmp133, %bb2 ], [ %vector.recur.extract.for.phi, %middle.block ] +; CHECK: %_tmp133.lcssa1 = phi i16 [ %_tmp133, %bb2 ], [ %vector.recur.extract.for.phi1, %middle.block ] ; CHECK: %_tmp133.lcssa = phi i16 [ %_tmp133, %bb2 ], [ %vector.recur.extract.for.phi, %middle.block ] define void @f1() { diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll index 72f8cf22cafa78..b79525bc3e440d 100644 --- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll +++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll @@ -34,8 +34,8 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 176 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[PREDPHI5]], i32 1 ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI7]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[PREDPHI5]], i32 1 ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 182, [[MIDDLE_BLOCK]] ], [ 6, [[BB:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll 
b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll index 9eb90099214e1c..b88e597e6bc8e8 100644 --- a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll @@ -916,13 +916,13 @@ define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no ; CHECK-VF4-IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4-IC1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-VF4-IC1: middle.block: -; CHECK-VF4-IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 ; CHECK-VF4-IC1-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-VF4-IC1-NEXT: [[TMP10:%.*]] = freeze i1 [[TMP9]] ; CHECK-VF4-IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP10]], i1 false, i1 true ; CHECK-VF4-IC1-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-VF4-IC1-NEXT: [[TMP12:%.*]] = freeze i1 [[TMP11]] ; CHECK-VF4-IC1-NEXT: [[RDX_SELECT2:%.*]] = select i1 [[TMP12]], i1 true, i1 false +; CHECK-VF4-IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 ; CHECK-VF4-IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-VF4-IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-VF4-IC1: scalar.ph: @@ -986,7 +986,6 @@ define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no ; CHECK-VF4-IC2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4-IC2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-VF4-IC2: middle.block: -; CHECK-VF4-IC2-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 ; CHECK-VF4-IC2-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP13]], [[TMP12]] ; CHECK-VF4-IC2-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX]]) ; CHECK-VF4-IC2-NEXT: [[TMP17:%.*]] = freeze i1 [[TMP16]] @@ -995,6 
+994,7 @@ define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no ; CHECK-VF4-IC2-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX5]]) ; CHECK-VF4-IC2-NEXT: [[TMP19:%.*]] = freeze i1 [[TMP18]] ; CHECK-VF4-IC2-NEXT: [[RDX_SELECT6:%.*]] = select i1 [[TMP19]], i1 true, i1 false +; CHECK-VF4-IC2-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 ; CHECK-VF4-IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-VF4-IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-VF4-IC2: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 3a664de748d2d7..f18ed825a6b886 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -154,6 +154,7 @@ define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VEC_TC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph @@ -164,7 +165,7 @@ define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_RES]]> +; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_EX]]> ; CHECK-NEXT: } ; entry: @@ -435,6 +436,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%sum.07>, ir<[[MULADD]]> +; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VEC_TC]]> ; CHECK-NEXT: EMIT 
branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph @@ -445,7 +447,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_RES]]> +; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_EX]]> ; CHECK-NEXT:} entry: @@ -654,6 +656,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[EXIT:%.+]]> = extract-from-end ir<%add>, ir<1> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VEC_TC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph @@ -664,7 +667,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) { ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i32 %lcssa = ir<%add> +; CHECK-NEXT: Live-out i32 %lcssa = vp<[[EXIT]]> ; CHECK-NEXT: } ; entry: @@ -1036,8 +1039,8 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) { ; CHECK-NEXT: EMIT vp<[[RESUME_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22> ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i16 %for.1.lcssa = vp<[[FOR_RESULT]]> ; CHECK-NEXT: Live-out i16 %for.1 = vp<[[RESUME_P]]> +; CHECK-NEXT: Live-out i16 %for.1.lcssa = vp<[[FOR_RESULT]]> ; CHECK-NEXT: } ; entry: From 9d739e54f4506bf9bd220c5d65f710b86a39f6d5 Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Wed, 21 Aug 2024 09:09:08 +0100 Subject: [PATCH 029/426] [Clang] Implement CWG2351 `void{}` (#78060) Per [CWG2351](https://wg21.link/CWG2351), allow `void{}`, treated the same as `void()`: a prvalue expression of type `void` that performs no initialization. 
Note that the AST for the expression `T{}` looks like: ``` // using T = int; CXXFunctionalCastExpr 'T':'int' functional cast to T `-InitListExpr 'T':'int' // using T = const int; CXXFunctionalCastExpr 'int' functional cast to T `-InitListExpr 'int' // using T = void; CXXFunctionalCastExpr 'T':'void' functional cast to T `-InitListExpr 'void' // using T = const void; CXXFunctionalCastExpr 'void' functional cast to T `-InitListExpr 'void' ``` As for `void()`/`T() [T = const void]`, that looked like `CXXScalarValueInitExpr 'void'` and is unchanged after this. For reference, C++98 [5.2.3p2] says: > The expression `T()`, where `T` is a simple-type-specifier (7.1.5.2) for a non-array complete object type or the (possibly cv-qualified) void type, creates an rvalue of the specified type, whose value is determined by default-initialization (8.5; no initialization is done for the `void()` case). [*Note:* if `T` is a non-class type that is *cv-qualified*, the `cv-qualifiers` are ignored when determining the type of the resulting rvalue (3.10). ] Though it is a bit of a misnomer that, for `T = void`, `CXXScalarValueInitExpr` does not perform value initialization, it would be a breaking change to change the AST node for `void()`, so I simply reworded the doc comment. 
--- clang/docs/ReleaseNotes.rst | 3 ++ clang/include/clang/AST/ExprCXX.h | 5 +-- clang/lib/Sema/SemaExprCXX.cpp | 23 +++++++++---- clang/lib/Sema/SemaInit.cpp | 1 + clang/test/CXX/drs/cwg23xx.cpp | 39 +++++++++++++++++++++- clang/test/SemaCXX/attr-annotate.cpp | 4 +-- clang/test/SemaCXX/cxx2a-explicit-bool.cpp | 4 +-- clang/test/SemaCXX/sugared-auto.cpp | 6 ++++ clang/www/cxx_dr_status.html | 2 +- 9 files changed, 73 insertions(+), 14 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 1df3f0e7e75ca3..127b9541d5c5d8 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -152,6 +152,9 @@ Resolutions to C++ Defect Reports - ``nullptr`` is now promoted to ``void*`` when passed to a C-style variadic function. (`CWG722: Can nullptr be passed to an ellipsis? `_) +- Allow ``void{}`` as a prvalue of type ``void``. + (`CWG2351: void{} `_). + C Language Changes ------------------ diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 847a6ea408e98e..975bcdac5069b9 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -2176,8 +2176,9 @@ class LambdaExpr final : public Expr, const_child_range children() const; }; -/// An expression "T()" which creates a value-initialized rvalue of type -/// T, which is a non-class type. See (C++98 [5.2.3p2]). +/// An expression "T()" which creates an rvalue of a non-class type T. +/// For non-void T, the rvalue is value-initialized. +/// See (C++98 [5.2.3p2]). 
class CXXScalarValueInitExpr : public Expr { friend class ASTStmtReader; diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 5356bcf172f752..746c67ff1e979f 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -1646,12 +1646,23 @@ Sema::BuildCXXTypeConstructExpr(TypeSourceInfo *TInfo, return ExprError(Diag(TyBeginLoc, diag::err_init_for_function_type) << Ty << FullRange); - // C++17 [expr.type.conv]p2: - // If the type is cv void and the initializer is (), the expression is a - // prvalue of the specified type that performs no initialization. - if (!Ty->isVoidType() && - RequireCompleteType(TyBeginLoc, ElemTy, - diag::err_invalid_incomplete_type_use, FullRange)) + // C++17 [expr.type.conv]p2, per DR2351: + // If the type is cv void and the initializer is () or {}, the expression is + // a prvalue of the specified type that performs no initialization. + if (Ty->isVoidType()) { + if (Exprs.empty()) + return new (Context) CXXScalarValueInitExpr( + Ty.getUnqualifiedType(), TInfo, Kind.getRange().getEnd()); + if (ListInitialization && + cast(Exprs[0])->getNumInits() == 0) { + return CXXFunctionalCastExpr::Create( + Context, Ty.getUnqualifiedType(), VK_PRValue, TInfo, CK_ToVoid, + Exprs[0], /*Path=*/nullptr, CurFPFeatureOverrides(), + Exprs[0]->getBeginLoc(), Exprs[0]->getEndLoc()); + } + } else if (RequireCompleteType(TyBeginLoc, ElemTy, + diag::err_invalid_incomplete_type_use, + FullRange)) return ExprError(); // Otherwise, the expression is a prvalue of the specified type whose diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 2666e60c0dd67c..dcfe3bc80c87ac 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -5490,6 +5490,7 @@ static void TryValueInitialization(Sema &S, // // To value-initialize an object of type T means: QualType T = Entity.getType(); + assert(!T->isVoidType() && "Cannot value-init void"); // -- if T is an array type, then each element is 
value-initialized; T = S.Context.getBaseElementType(T); diff --git a/clang/test/CXX/drs/cwg23xx.cpp b/clang/test/CXX/drs/cwg23xx.cpp index 77fd6a337436e3..7f57d237526bc5 100644 --- a/clang/test/CXX/drs/cwg23xx.cpp +++ b/clang/test/CXX/drs/cwg23xx.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c++98 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s +// RUN: %clang_cc1 -std=c++98 %s -verify=expected,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s // RUN: %clang_cc1 -std=c++11 %s -verify=expected,cxx11-14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s // RUN: %clang_cc1 -std=c++14 %s -verify=expected,cxx11-14,since-cxx11,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s // RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s @@ -213,6 +213,43 @@ namespace cwg2346 { // cwg2346: 11 } } +namespace cwg2351 { // cwg2351: 20 +#if __cplusplus >= 201103L + static_assert((void{}, true), ""); + + void f() { + return void{}; + } + + template + void g() { + return T{}; + } + template void g(); + template void g(); + + void h() { + return {}; + // since-cxx11-error@-1 {{void function 'h' must not return a value}} + } + + template + T i() { + return T{I...}; + } + template void i(); + template const void i(); + + static_assert((void({}), true), ""); + // since-cxx11-error@-1 {{cannot initialize non-class type 'void' with a parenthesized initializer list}} +#else + int I = (void{}, 0); + // cxx98-error@-1 {{expected ')'}} + // cxx98-note@-2 {{to match this '('}} + // cxx98-error@-3 {{expected expression}} +#endif +} + namespace cwg2352 { // cwg2352: 10 int **p; const int *const *const &f1() { return p; } diff --git a/clang/test/SemaCXX/attr-annotate.cpp b/clang/test/SemaCXX/attr-annotate.cpp index 3854f72bbcac1c..846ef4119f1d7c 100644 --- 
a/clang/test/SemaCXX/attr-annotate.cpp +++ b/clang/test/SemaCXX/attr-annotate.cpp @@ -43,10 +43,10 @@ namespace test0 { template struct B { [[clang::annotate("test", ((void)T{}, 9))]] void t() {} - // expected-error@-1 {{illegal initializer type 'void'}} + // expected-error@-1 {{cannot create object of function type 'void ()'}} }; B b; - B b1; + B b1; // expected-note@-1 {{in instantiation of template class}} } diff --git a/clang/test/SemaCXX/cxx2a-explicit-bool.cpp b/clang/test/SemaCXX/cxx2a-explicit-bool.cpp index 03799c52654a5f..c106de1e5efd09 100644 --- a/clang/test/SemaCXX/cxx2a-explicit-bool.cpp +++ b/clang/test/SemaCXX/cxx2a-explicit-bool.cpp @@ -75,11 +75,11 @@ struct D { template struct E { // expected-note@-1+ {{candidate constructor}} explicit((T{}, false)) - // expected-error@-1 {{illegal initializer type 'void'}} + // expected-error@-1 {{cannot create object of function type 'void ()'}} E(int); }; -E e = 1; +E e = 1; // expected-error@-1 {{no viable conversion}} // expected-note@-2 {{in instantiation of}} diff --git a/clang/test/SemaCXX/sugared-auto.cpp b/clang/test/SemaCXX/sugared-auto.cpp index 5fdfb09689b667..b5bb4f0f85a775 100644 --- a/clang/test/SemaCXX/sugared-auto.cpp +++ b/clang/test/SemaCXX/sugared-auto.cpp @@ -112,6 +112,12 @@ N t6 = [] { // expected-error {{rvalue of type 'void'}} return; }(); +N t7 = [] { // expected-error {{rvalue of type 'Virus' (aka 'void')}} + if (true) + return Ebola(); + return SARS{}; +}(); + } // namespace function_multiple_basic #define TEST_AUTO(X, A, B) \ diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index e6c955a5c0e255..a8d2d813d0f536 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -13921,7 +13921,7 @@

C++ defect report implementation status

2351 CD5 void{} - Unknown + Clang 20 2352 From 0b0ccd56c9cabef48a6c99abe6e0b41ac69e5faa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Wed, 21 Aug 2024 10:10:36 +0200 Subject: [PATCH 030/426] [GlobalIsel] Push cast through build vector (#104634) Credits: https://github.com/llvm/llvm-project/pull/100563 --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 + .../include/llvm/Target/GlobalISel/Combine.td | 13 +- .../GlobalISel/CombinerHelperCasts.cpp | 40 ++ .../AArch64/GISel/AArch64LegalizerInfo.cpp | 1 + .../AArch64/GlobalISel/combine-cast.mir | 95 +++ .../GlobalISel/combine-extract-vec-elt.mir | 4 +- .../AArch64/GlobalISel/combine-with-flags.mir | 45 +- .../CodeGen/AArch64/arm64-subvector-extend.ll | 456 +++++++----- llvm/test/CodeGen/AArch64/neon-extadd.ll | 376 ++++++---- llvm/test/CodeGen/AArch64/sext.ll | 354 ++++++---- llvm/test/CodeGen/AArch64/vecreduce-add.ll | 664 +++++++++++------- llvm/test/CodeGen/AArch64/xtn.ll | 46 +- llvm/test/CodeGen/AArch64/zext.ll | 263 ++++--- 13 files changed, 1450 insertions(+), 910 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 32effc536eb35d..9b62d6067be39c 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -900,6 +900,9 @@ class CombinerHelper { bool matchExtOfExt(const MachineInstr &FirstMI, const MachineInstr &SecondMI, BuildFnTy &MatchInfo); + bool matchCastOfBuildVector(const MachineInstr &CastMI, + const MachineInstr &BVMI, BuildFnTy &MatchInfo); + private: /// Checks for legality of an indexed variant of \p LdSt. 
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 814c5e789cb374..525cc815e73cef 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1857,6 +1857,16 @@ def anyext_of_anyext : ext_of_ext_opcodes; def anyext_of_zext : ext_of_ext_opcodes; def anyext_of_sext : ext_of_ext_opcodes; +// Push cast through build vector. +class buildvector_of_opcode : GICombineRule < + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (G_BUILD_VECTOR $bv, GIVariadic<>:$unused):$Build, + (castOpcode $root, $bv):$Cast, + [{ return Helper.matchCastOfBuildVector(*${Cast}, *${Build}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${Cast}, ${matchinfo}); }])>; + +def buildvector_of_truncate : buildvector_of_opcode; + def cast_combines: GICombineGroup<[ truncate_of_zext, truncate_of_sext, @@ -1870,7 +1880,8 @@ def cast_combines: GICombineGroup<[ sext_of_anyext, anyext_of_anyext, anyext_of_zext, - anyext_of_sext + anyext_of_sext, + buildvector_of_truncate ]>; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp index 494d8da84445d1..8714fdabf65494 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp @@ -273,3 +273,43 @@ bool CombinerHelper::matchExtOfExt(const MachineInstr &FirstMI, return false; } + +bool CombinerHelper::matchCastOfBuildVector(const MachineInstr &CastMI, + const MachineInstr &BVMI, + BuildFnTy &MatchInfo) { + const GExtOrTruncOp *Cast = cast(&CastMI); + const GBuildVector *BV = cast(&BVMI); + + if (!MRI.hasOneNonDBGUse(BV->getReg(0))) + return false; + + Register Dst = Cast->getReg(0); + // The type of the new build vector. + LLT DstTy = MRI.getType(Dst); + // The scalar or element type of the new build vector. 
+ LLT ElemTy = DstTy.getScalarType(); + // The scalar or element type of the old build vector. + LLT InputElemTy = MRI.getType(BV->getReg(0)).getElementType(); + + // Check legality of new build vector, the scalar casts, and profitability of + // the many casts. + if (!isLegalOrBeforeLegalizer( + {TargetOpcode::G_BUILD_VECTOR, {DstTy, ElemTy}}) || + !isLegalOrBeforeLegalizer({Cast->getOpcode(), {ElemTy, InputElemTy}}) || + !isCastFree(Cast->getOpcode(), ElemTy, InputElemTy)) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + SmallVector Casts; + unsigned Elements = BV->getNumSources(); + for (unsigned I = 0; I < Elements; ++I) { + auto CastI = + B.buildInstr(Cast->getOpcode(), {ElemTy}, {BV->getSourceReg(I)}); + Casts.push_back(CastI.getReg(0)); + } + + B.buildBuildVector(Dst, Casts); + }; + + return true; +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index d3c5742cee3eb4..33a1fa1ad04fdf 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -953,6 +953,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampNumElements(0, v2s64, v2s64) .minScalarOrElt(0, s8) .widenVectorEltsToVectorMinSize(0, 64) + .widenScalarOrEltToNextPow2(0) .minScalarSameAs(1, 0); getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower(); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir index 0f436127ea2eb6..ae04cc77dcaf13 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir @@ -129,3 +129,98 @@ body: | %res:_(<2 x s64>) = G_SELECT %cond(<2 x s32>), %bv, %bv2 %small:_(<2 x s32>) = G_TRUNC %res(<2 x s64>) $x0 = COPY %small(<2 x s32>) +... 
+--- +name: test_combine_trunc_build_vector +legalized: true +body: | + bb.1: + ; CHECK-PRE-LABEL: name: test_combine_trunc_build_vector + ; CHECK-PRE: %arg1:_(s64) = COPY $x0 + ; CHECK-PRE-NEXT: %arg2:_(s64) = COPY $x0 + ; CHECK-PRE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %arg1(s64) + ; CHECK-PRE-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC %arg2(s64) + ; CHECK-PRE-NEXT: %small:_(<2 x s32>) = G_BUILD_VECTOR [[TRUNC]](s32), [[TRUNC1]](s32) + ; CHECK-PRE-NEXT: $x0 = COPY %small(<2 x s32>) + ; + ; CHECK-POST-LABEL: name: test_combine_trunc_build_vector + ; CHECK-POST: %arg1:_(s64) = COPY $x0 + ; CHECK-POST-NEXT: %arg2:_(s64) = COPY $x0 + ; CHECK-POST-NEXT: %bv:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + ; CHECK-POST-NEXT: %small:_(<2 x s32>) = G_TRUNC %bv(<2 x s64>) + ; CHECK-POST-NEXT: $x0 = COPY %small(<2 x s32>) + %arg1:_(s64) = COPY $x0 + %arg2:_(s64) = COPY $x0 + %bv:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + %small:_(<2 x s32>) = G_TRUNC %bv(<2 x s64>) + $x0 = COPY %small(<2 x s32>) +... +--- +name: test_combine_zext_build_vector +legalized: true +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_zext_build_vector + ; CHECK: %arg1:_(s32) = COPY $w0 + ; CHECK-NEXT: %arg2:_(s32) = COPY $w0 + ; CHECK-NEXT: %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32) + ; CHECK-NEXT: %large:_(<2 x s64>) = G_ZEXT %bv(<2 x s32>) + ; CHECK-NEXT: $q0 = COPY %large(<2 x s64>) + %arg1:_(s32) = COPY $w0 + %arg2:_(s32) = COPY $w0 + %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32) + %large:_(<2 x s64>) = G_ZEXT %bv(<2 x s32>) + $q0 = COPY %large(<2 x s64>) +... 
+--- +name: test_combine_anyext_build_vector +legalized: true +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_anyext_build_vector + ; CHECK: %arg1:_(s32) = COPY $w0 + ; CHECK-NEXT: %arg2:_(s32) = COPY $w0 + ; CHECK-NEXT: %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32) + ; CHECK-NEXT: %large:_(<2 x s64>) = G_ANYEXT %bv(<2 x s32>) + ; CHECK-NEXT: $q0 = COPY %large(<2 x s64>) + %arg1:_(s32) = COPY $w0 + %arg2:_(s32) = COPY $w0 + %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32) + %large:_(<2 x s64>) = G_ANYEXT %bv(<2 x s32>) + $q0 = COPY %large(<2 x s64>) +... +--- +name: test_combine_sext_build_vector +legalized: true +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_sext_build_vector + ; CHECK: %arg1:_(s32) = COPY $w0 + ; CHECK-NEXT: %arg2:_(s32) = COPY $w0 + ; CHECK-NEXT: %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32) + ; CHECK-NEXT: %large:_(<2 x s64>) = G_SEXT %bv(<2 x s32>) + ; CHECK-NEXT: $q0 = COPY %large(<2 x s64>) + %arg1:_(s32) = COPY $w0 + %arg2:_(s32) = COPY $w0 + %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32) + %large:_(<2 x s64>) = G_SEXT %bv(<2 x s32>) + $q0 = COPY %large(<2 x s64>) +... 
+--- +name: test_combine_anyext_build_vector_multi_use +legalized: true +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_anyext_build_vector_multi_use + ; CHECK: %arg1:_(s32) = COPY $w0 + ; CHECK-NEXT: %arg2:_(s32) = COPY $w0 + ; CHECK-NEXT: %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32) + ; CHECK-NEXT: %large:_(<2 x s64>) = G_ANYEXT %bv(<2 x s32>) + ; CHECK-NEXT: $q0 = COPY %large(<2 x s64>) + ; CHECK-NEXT: $d0 = COPY %bv(<2 x s32>) + %arg1:_(s32) = COPY $w0 + %arg2:_(s32) = COPY $w0 + %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32) + %large:_(<2 x s64>) = G_ANYEXT %bv(<2 x s32>) + $q0 = COPY %large(<2 x s64>) + $d0 = COPY %bv(<2 x s32>) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir index 70241e71aa593f..c98dcf6ccb7966 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir @@ -49,8 +49,8 @@ body: | ; CHECK: liveins: $x0, $x1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %arg1:_(s64) = COPY $x0 - ; CHECK-NEXT: %extract:_(s32) = G_TRUNC %arg1(s64) - ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %extract(s32) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %arg1(s64) + ; CHECK-NEXT: %zext:_(s64) = G_ZEXT [[TRUNC]](s32) ; CHECK-NEXT: $x0 = COPY %zext(s64) ; CHECK-NEXT: RET_ReallyLR implicit $x0 %arg1:_(s64) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir index 6eece5c56258dc..8cb44605246ffa 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir @@ -60,8 +60,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 - ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY]](s32), 
[[COPY1]](s32) - ; CHECK-NEXT: $q0 = COPY %bv0(<4 x s32>) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK-NEXT: %trunc:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16), [[TRUNC1]](s16) + ; CHECK-NEXT: %zext:_(<4 x s32>) = G_ZEXT %trunc(<4 x s16>) + ; CHECK-NEXT: $q0 = COPY %zext(<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(s32) = COPY $w0 %1:_(s32) = COPY $w1 @@ -165,8 +168,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3 - ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK-NEXT: $q0 = COPY %bv0(<4 x s32>) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) + ; CHECK-NEXT: %t:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CHECK-NEXT: %s:_(<4 x s32>) = G_SEXT %t(<4 x s16>) + ; CHECK-NEXT: $q0 = COPY %s(<4 x s32>) %0:_(s32) = COPY $w0 %1:_(s32) = COPY $w1 %2:_(s32) = COPY $w2 @@ -188,8 +196,11 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3 - ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK-NEXT: %t:_(<4 x s16>) = G_TRUNC %bv0(<4 x s32>) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) + ; 
CHECK-NEXT: %t:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) ; CHECK-NEXT: %z:_(<4 x s32>) = G_ZEXT %t(<4 x s16>) ; CHECK-NEXT: $q0 = COPY %z(<4 x s32>) %0:_(s32) = COPY $w0 @@ -213,8 +224,11 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3 - ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK-NEXT: %t:_(<4 x s16>) = nsw G_TRUNC %bv0(<4 x s32>) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) + ; CHECK-NEXT: %t:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) ; CHECK-NEXT: %z:_(<4 x s32>) = G_ZEXT %t(<4 x s16>) ; CHECK-NEXT: $q0 = COPY %z(<4 x s32>) %0:_(s32) = COPY $w0 @@ -238,8 +252,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3 - ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK-NEXT: $q0 = COPY %bv0(<4 x s32>) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) + ; CHECK-NEXT: %t:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CHECK-NEXT: %z:_(<4 x s32>) = G_ZEXT %t(<4 x s16>) + ; CHECK-NEXT: $q0 = COPY %z(<4 x s32>) %0:_(s32) = COPY $w0 %1:_(s32) = COPY $w1 %2:_(s32) = COPY $w2 @@ -259,8 +278,10 @@ body: | ; CHECK-NEXT: {{ $}} ; 
CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 - ; CHECK-NEXT: %bv0:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY1]](s64) - ; CHECK-NEXT: %z:_(<2 x s32>) = nuw G_TRUNC %bv0(<2 x s64>) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: %t:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; CHECK-NEXT: %z:_(<2 x s32>) = G_ZEXT %t(<2 x s16>) ; CHECK-NEXT: $d0 = COPY %z(<2 x s32>) %0:_(s64) = COPY $x0 %1:_(s64) = COPY $x1 diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll index abf2e1272d6450..1f5654d59926dc 100644 --- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll +++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll @@ -466,62 +466,92 @@ define <32 x i8> @sext_v32i1(<32 x i1> %arg) { ; ; CHECK-GI-LABEL: sext_v32i1: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov s17, w0 -; CHECK-GI-NEXT: fmov s19, w4 -; CHECK-GI-NEXT: ldr s0, [sp] -; CHECK-GI-NEXT: ldr s21, [sp, #8] -; CHECK-GI-NEXT: ldr s1, [sp, #32] -; CHECK-GI-NEXT: ldr s22, [sp, #40] -; CHECK-GI-NEXT: ldr s2, [sp, #64] -; CHECK-GI-NEXT: ldr s23, [sp, #72] -; CHECK-GI-NEXT: ldr s3, [sp, #96] -; CHECK-GI-NEXT: ldr s24, [sp, #104] -; CHECK-GI-NEXT: mov.s v17[1], w1 -; CHECK-GI-NEXT: mov.s v19[1], w5 -; CHECK-GI-NEXT: ldr s5, [sp, #128] -; CHECK-GI-NEXT: ldr s20, [sp, #136] -; CHECK-GI-NEXT: mov.s v0[1], v21[0] -; CHECK-GI-NEXT: ldr s7, [sp, #160] -; CHECK-GI-NEXT: ldr s25, [sp, #168] -; CHECK-GI-NEXT: mov.s v1[1], v22[0] -; CHECK-GI-NEXT: mov.s v2[1], v23[0] -; CHECK-GI-NEXT: mov.s v3[1], v24[0] -; CHECK-GI-NEXT: mov.s v5[1], v20[0] -; CHECK-GI-NEXT: mov.s v7[1], v25[0] -; CHECK-GI-NEXT: ldr s16, [sp, #16] -; CHECK-GI-NEXT: ldr s18, [sp, #48] -; CHECK-GI-NEXT: ldr s20, [sp, #80] -; CHECK-GI-NEXT: ldr s21, [sp, #112] -; CHECK-GI-NEXT: ldr s22, [sp, #144] -; CHECK-GI-NEXT: ldr 
s23, [sp, #176] -; CHECK-GI-NEXT: mov.s v17[2], w2 -; CHECK-GI-NEXT: mov.s v19[2], w6 -; CHECK-GI-NEXT: mov.s v0[2], v16[0] -; CHECK-GI-NEXT: mov.s v1[2], v18[0] -; CHECK-GI-NEXT: mov.s v2[2], v20[0] -; CHECK-GI-NEXT: mov.s v3[2], v21[0] -; CHECK-GI-NEXT: mov.s v5[2], v22[0] -; CHECK-GI-NEXT: mov.s v7[2], v23[0] -; CHECK-GI-NEXT: ldr s4, [sp, #24] -; CHECK-GI-NEXT: ldr s6, [sp, #56] -; CHECK-GI-NEXT: ldr s16, [sp, #88] -; CHECK-GI-NEXT: ldr s18, [sp, #120] -; CHECK-GI-NEXT: ldr s20, [sp, #152] -; CHECK-GI-NEXT: ldr s21, [sp, #184] -; CHECK-GI-NEXT: mov.s v17[3], w3 -; CHECK-GI-NEXT: mov.s v19[3], w7 -; CHECK-GI-NEXT: mov.s v0[3], v4[0] -; CHECK-GI-NEXT: mov.s v1[3], v6[0] -; CHECK-GI-NEXT: mov.s v2[3], v16[0] -; CHECK-GI-NEXT: mov.s v3[3], v18[0] -; CHECK-GI-NEXT: mov.s v5[3], v20[0] -; CHECK-GI-NEXT: mov.s v7[3], v21[0] -; CHECK-GI-NEXT: uzp1.8h v4, v17, v19 -; CHECK-GI-NEXT: uzp1.8h v0, v0, v1 -; CHECK-GI-NEXT: uzp1.8h v1, v2, v3 -; CHECK-GI-NEXT: uzp1.8h v2, v5, v7 -; CHECK-GI-NEXT: uzp1.16b v0, v4, v0 -; CHECK-GI-NEXT: uzp1.16b v1, v1, v2 +; CHECK-GI-NEXT: ldr w9, [sp, #64] +; CHECK-GI-NEXT: ldr w8, [sp, #72] +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s2, w1 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #80] +; CHECK-GI-NEXT: ldr w9, [sp, #128] +; CHECK-GI-NEXT: mov.b v0[1], v2[0] +; CHECK-GI-NEXT: fmov s2, w2 +; CHECK-GI-NEXT: mov.b v1[1], v3[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #88] +; CHECK-GI-NEXT: mov.b v0[2], v2[0] +; CHECK-GI-NEXT: fmov s2, w3 +; CHECK-GI-NEXT: mov.b v1[2], v3[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #96] +; CHECK-GI-NEXT: mov.b v0[3], v2[0] +; CHECK-GI-NEXT: fmov s2, w4 +; CHECK-GI-NEXT: mov.b v1[3], v3[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #104] +; CHECK-GI-NEXT: mov.b v0[4], v2[0] +; CHECK-GI-NEXT: fmov s2, w5 +; CHECK-GI-NEXT: mov.b v1[4], v3[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, 
#112] +; CHECK-GI-NEXT: mov.b v0[5], v2[0] +; CHECK-GI-NEXT: fmov s2, w6 +; CHECK-GI-NEXT: mov.b v1[5], v3[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #120] +; CHECK-GI-NEXT: mov.b v0[6], v2[0] +; CHECK-GI-NEXT: fmov s2, w7 +; CHECK-GI-NEXT: mov.b v1[6], v3[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp] +; CHECK-GI-NEXT: mov.b v0[7], v2[0] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #8] +; CHECK-GI-NEXT: mov.b v1[7], v3[0] +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #136] +; CHECK-GI-NEXT: mov.b v0[8], v2[0] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #16] +; CHECK-GI-NEXT: mov.b v1[8], v3[0] +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #144] +; CHECK-GI-NEXT: mov.b v0[9], v2[0] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #24] +; CHECK-GI-NEXT: mov.b v1[9], v3[0] +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #152] +; CHECK-GI-NEXT: mov.b v0[10], v2[0] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #32] +; CHECK-GI-NEXT: mov.b v1[10], v3[0] +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #160] +; CHECK-GI-NEXT: mov.b v0[11], v2[0] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #40] +; CHECK-GI-NEXT: mov.b v1[11], v3[0] +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #168] +; CHECK-GI-NEXT: mov.b v0[12], v2[0] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #48] +; CHECK-GI-NEXT: mov.b v1[12], v3[0] +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #176] +; CHECK-GI-NEXT: mov.b v0[13], v2[0] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #56] +; CHECK-GI-NEXT: mov.b v1[13], v3[0] +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #184] +; CHECK-GI-NEXT: mov.b v0[14], v2[0] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov.b v1[14], v3[0] +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov.b v0[15], v2[0] +; CHECK-GI-NEXT: mov.b 
v1[15], v3[0] ; CHECK-GI-NEXT: shl.16b v0, v0, #7 ; CHECK-GI-NEXT: shl.16b v1, v1, #7 ; CHECK-GI-NEXT: sshr.16b v0, v0, #7 @@ -807,140 +837,198 @@ define <64 x i8> @sext_v64i1(<64 x i1> %arg) { ; ; CHECK-GI-LABEL: sext_v64i1: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: stp d9, d8, [sp, #-32]! // 16-byte Folded Spill -; CHECK-GI-NEXT: str x29, [sp, #16] // 8-byte Folded Spill -; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: .cfi_offset w29, -16 -; CHECK-GI-NEXT: .cfi_offset b8, -24 -; CHECK-GI-NEXT: .cfi_offset b9, -32 -; CHECK-GI-NEXT: ldr s0, [sp, #32] -; CHECK-GI-NEXT: ldr s4, [sp, #40] -; CHECK-GI-NEXT: ldr s2, [sp, #96] -; CHECK-GI-NEXT: ldr s5, [sp, #104] -; CHECK-GI-NEXT: ldr s1, [sp, #64] -; CHECK-GI-NEXT: ldr s23, [sp, #72] -; CHECK-GI-NEXT: mov.s v0[1], v4[0] -; CHECK-GI-NEXT: ldr s28, [sp, #200] -; CHECK-GI-NEXT: ldr s3, [sp, #128] -; CHECK-GI-NEXT: mov.s v2[1], v5[0] -; CHECK-GI-NEXT: mov.s v1[1], v23[0] -; CHECK-GI-NEXT: ldr s5, [sp, #192] -; CHECK-GI-NEXT: ldr s7, [sp, #136] -; CHECK-GI-NEXT: ldr s4, [sp, #160] -; CHECK-GI-NEXT: ldr s24, [sp, #168] -; CHECK-GI-NEXT: mov.s v5[1], v28[0] -; CHECK-GI-NEXT: ldr s6, [sp, #48] -; CHECK-GI-NEXT: ldr s21, [sp, #80] -; CHECK-GI-NEXT: mov.s v3[1], v7[0] -; CHECK-GI-NEXT: mov.s v4[1], v24[0] -; CHECK-GI-NEXT: ldr s16, [sp, #112] -; CHECK-GI-NEXT: ldr s29, [sp, #208] -; CHECK-GI-NEXT: mov.s v0[2], v6[0] -; CHECK-GI-NEXT: mov.s v1[2], v21[0] -; CHECK-GI-NEXT: ldr s6, [sp, #224] -; CHECK-GI-NEXT: ldr s30, [sp, #232] -; CHECK-GI-NEXT: mov.s v2[2], v16[0] -; CHECK-GI-NEXT: ldr s20, [sp, #144] -; CHECK-GI-NEXT: ldr s27, [sp, #176] -; CHECK-GI-NEXT: mov.s v5[2], v29[0] -; CHECK-GI-NEXT: mov.s v6[1], v30[0] -; CHECK-GI-NEXT: ldr s18, [sp, #88] -; CHECK-GI-NEXT: ldr s19, [sp, #120] -; CHECK-GI-NEXT: ldr s7, [sp, #256] -; CHECK-GI-NEXT: ldr s31, [sp, #264] -; CHECK-GI-NEXT: mov.s v3[2], v20[0] -; CHECK-GI-NEXT: mov.s 
v4[2], v27[0] -; CHECK-GI-NEXT: ldr s25, [sp, #216] -; CHECK-GI-NEXT: ldr s26, [sp, #240] -; CHECK-GI-NEXT: ldr s17, [sp, #56] -; CHECK-GI-NEXT: ldr s22, [sp, #152] -; CHECK-GI-NEXT: mov.s v1[3], v18[0] -; CHECK-GI-NEXT: ldr s23, [sp, #184] -; CHECK-GI-NEXT: mov.s v2[3], v19[0] -; CHECK-GI-NEXT: ldr s18, [sp, #320] -; CHECK-GI-NEXT: ldr s27, [sp, #328] -; CHECK-GI-NEXT: mov.s v7[1], v31[0] -; CHECK-GI-NEXT: ldr s19, [sp, #352] -; CHECK-GI-NEXT: ldr s29, [sp, #360] -; CHECK-GI-NEXT: mov.s v5[3], v25[0] -; CHECK-GI-NEXT: mov.s v6[2], v26[0] -; CHECK-GI-NEXT: fmov s25, w0 -; CHECK-GI-NEXT: fmov s26, w4 -; CHECK-GI-NEXT: ldr s28, [sp, #272] -; CHECK-GI-NEXT: mov.s v0[3], v17[0] -; CHECK-GI-NEXT: ldr s17, [sp, #288] -; CHECK-GI-NEXT: ldr s8, [sp, #296] -; CHECK-GI-NEXT: mov.s v3[3], v22[0] -; CHECK-GI-NEXT: ldr s20, [sp, #384] -; CHECK-GI-NEXT: mov.s v4[3], v23[0] -; CHECK-GI-NEXT: ldr s30, [sp, #392] -; CHECK-GI-NEXT: ldr s22, [sp, #416] -; CHECK-GI-NEXT: ldr s31, [sp, #424] -; CHECK-GI-NEXT: ldr s23, [sp, #448] -; CHECK-GI-NEXT: mov.s v18[1], v27[0] -; CHECK-GI-NEXT: mov.s v19[1], v29[0] -; CHECK-GI-NEXT: ldr s27, [sp, #456] -; CHECK-GI-NEXT: ldr s24, [sp, #336] -; CHECK-GI-NEXT: mov.s v17[1], v8[0] -; CHECK-GI-NEXT: mov.s v7[2], v28[0] -; CHECK-GI-NEXT: mov.s v25[1], w1 -; CHECK-GI-NEXT: mov.s v26[1], w5 -; CHECK-GI-NEXT: mov.s v20[1], v30[0] -; CHECK-GI-NEXT: ldr s28, [sp, #368] -; CHECK-GI-NEXT: mov.s v22[1], v31[0] -; CHECK-GI-NEXT: mov.s v23[1], v27[0] -; CHECK-GI-NEXT: ldr s9, [sp, #304] -; CHECK-GI-NEXT: ldr s27, [sp, #400] -; CHECK-GI-NEXT: mov.s v18[2], v24[0] -; CHECK-GI-NEXT: ldr s24, [sp, #432] -; CHECK-GI-NEXT: mov.s v19[2], v28[0] -; CHECK-GI-NEXT: ldr s28, [sp, #464] -; CHECK-GI-NEXT: ldr s16, [sp, #248] -; CHECK-GI-NEXT: ldr s21, [sp, #280] -; CHECK-GI-NEXT: mov.s v17[2], v9[0] -; CHECK-GI-NEXT: mov.s v25[2], w2 -; CHECK-GI-NEXT: mov.s v26[2], w6 -; CHECK-GI-NEXT: mov.s v20[2], v27[0] -; CHECK-GI-NEXT: mov.s v22[2], v24[0] -; CHECK-GI-NEXT: mov.s 
v23[2], v28[0] -; CHECK-GI-NEXT: ldr s29, [sp, #312] -; CHECK-GI-NEXT: ldr s27, [sp, #344] -; CHECK-GI-NEXT: ldr s24, [sp, #376] -; CHECK-GI-NEXT: ldr s28, [sp, #408] -; CHECK-GI-NEXT: mov.s v6[3], v16[0] -; CHECK-GI-NEXT: ldr s16, [sp, #440] -; CHECK-GI-NEXT: mov.s v7[3], v21[0] -; CHECK-GI-NEXT: ldr s21, [sp, #472] -; CHECK-GI-NEXT: mov.s v25[3], w3 -; CHECK-GI-NEXT: mov.s v26[3], w7 -; CHECK-GI-NEXT: mov.s v17[3], v29[0] -; CHECK-GI-NEXT: mov.s v18[3], v27[0] -; CHECK-GI-NEXT: mov.s v19[3], v24[0] -; CHECK-GI-NEXT: mov.s v20[3], v28[0] -; CHECK-GI-NEXT: mov.s v22[3], v16[0] -; CHECK-GI-NEXT: mov.s v23[3], v21[0] -; CHECK-GI-NEXT: uzp1.8h v0, v0, v1 -; CHECK-GI-NEXT: uzp1.8h v1, v2, v3 -; CHECK-GI-NEXT: uzp1.8h v2, v4, v5 -; CHECK-GI-NEXT: uzp1.8h v3, v6, v7 -; CHECK-GI-NEXT: ldr x29, [sp, #16] // 8-byte Folded Reload -; CHECK-GI-NEXT: uzp1.8h v16, v25, v26 -; CHECK-GI-NEXT: uzp1.8h v4, v17, v18 -; CHECK-GI-NEXT: uzp1.8h v5, v19, v20 -; CHECK-GI-NEXT: uzp1.8h v6, v22, v23 -; CHECK-GI-NEXT: uzp1.16b v1, v1, v2 -; CHECK-GI-NEXT: uzp1.16b v0, v16, v0 -; CHECK-GI-NEXT: uzp1.16b v2, v3, v4 -; CHECK-GI-NEXT: uzp1.16b v3, v5, v6 +; CHECK-GI-NEXT: ldr w9, [sp, #80] +; CHECK-GI-NEXT: ldr w11, [sp, #88] +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s3, w1 +; CHECK-GI-NEXT: ldr w8, [sp, #208] +; CHECK-GI-NEXT: ldr w10, [sp, #216] +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: fmov s4, w11 +; CHECK-GI-NEXT: ldr w9, [sp, #336] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s5, w10 +; CHECK-GI-NEXT: ldr w11, [sp, #344] +; CHECK-GI-NEXT: mov.b v0[1], v3[0] +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #224] +; CHECK-GI-NEXT: mov.b v1[1], v4[0] +; CHECK-GI-NEXT: fmov s4, w2 +; CHECK-GI-NEXT: fmov s6, w11 +; CHECK-GI-NEXT: mov.b v2[1], v5[0] +; CHECK-GI-NEXT: ldr w8, [sp, #96] +; CHECK-GI-NEXT: ldr w10, [sp, #352] +; CHECK-GI-NEXT: ldr w11, [sp, #16] +; CHECK-GI-NEXT: mov.b v0[2], v4[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ldr w9, [sp, 
#232] +; CHECK-GI-NEXT: mov.b v3[1], v6[0] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w8, [sp, #104] +; CHECK-GI-NEXT: ldr w10, [sp, #360] +; CHECK-GI-NEXT: mov.b v2[2], v4[0] +; CHECK-GI-NEXT: fmov s4, w3 +; CHECK-GI-NEXT: mov.b v1[2], v5[0] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #112] +; CHECK-GI-NEXT: mov.b v3[2], v6[0] +; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #368] +; CHECK-GI-NEXT: mov.b v0[3], v4[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #240] +; CHECK-GI-NEXT: mov.b v1[3], v5[0] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #120] +; CHECK-GI-NEXT: mov.b v2[3], v4[0] +; CHECK-GI-NEXT: fmov s4, w4 +; CHECK-GI-NEXT: mov.b v3[3], v6[0] +; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #376] +; CHECK-GI-NEXT: mov.b v0[4], v4[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #248] +; CHECK-GI-NEXT: mov.b v1[4], v5[0] +; CHECK-GI-NEXT: mov.b v3[4], v6[0] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w8, [sp, #128] +; CHECK-GI-NEXT: ldr w10, [sp, #384] +; CHECK-GI-NEXT: mov.b v2[4], v4[0] +; CHECK-GI-NEXT: fmov s4, w5 +; CHECK-GI-NEXT: mov.b v1[5], v5[0] +; CHECK-GI-NEXT: mov.b v3[5], v6[0] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: mov.b v0[5], v4[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #256] +; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w8, [sp, #136] +; CHECK-GI-NEXT: ldr w10, [sp, #392] +; CHECK-GI-NEXT: mov.b v2[5], v4[0] +; CHECK-GI-NEXT: fmov s4, w6 +; CHECK-GI-NEXT: mov.b v1[6], v5[0] +; CHECK-GI-NEXT: mov.b v3[6], v6[0] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w8, [sp, #144] +; CHECK-GI-NEXT: ldr w10, [sp, #400] +; CHECK-GI-NEXT: mov.b v0[6], v4[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #264] +; CHECK-GI-NEXT: mov.b v1[7], v5[0] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: 
ldr w8, [sp, #152] +; CHECK-GI-NEXT: mov.b v3[7], v6[0] +; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #408] +; CHECK-GI-NEXT: mov.b v2[6], v4[0] +; CHECK-GI-NEXT: fmov s4, w7 +; CHECK-GI-NEXT: mov.b v1[8], v5[0] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #160] +; CHECK-GI-NEXT: mov.b v0[7], v4[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #272] +; CHECK-GI-NEXT: mov.b v3[8], v6[0] +; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #416] +; CHECK-GI-NEXT: mov.b v2[7], v4[0] +; CHECK-GI-NEXT: fmov s4, w11 +; CHECK-GI-NEXT: ldr w11, [sp, #24] +; CHECK-GI-NEXT: mov.b v1[9], v5[0] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #168] +; CHECK-GI-NEXT: mov.b v3[9], v6[0] +; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #424] +; CHECK-GI-NEXT: mov.b v0[8], v4[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #280] +; CHECK-GI-NEXT: mov.b v1[10], v5[0] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #176] +; CHECK-GI-NEXT: mov.b v2[8], v4[0] +; CHECK-GI-NEXT: fmov s4, w11 +; CHECK-GI-NEXT: ldr w11, [sp, #32] +; CHECK-GI-NEXT: mov.b v3[10], v6[0] +; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #432] +; CHECK-GI-NEXT: mov.b v0[9], v4[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #288] +; CHECK-GI-NEXT: mov.b v1[11], v5[0] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #184] +; CHECK-GI-NEXT: mov.b v3[11], v6[0] +; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #440] +; CHECK-GI-NEXT: mov.b v2[9], v4[0] +; CHECK-GI-NEXT: fmov s4, w11 +; CHECK-GI-NEXT: ldr w11, [sp, #40] +; CHECK-GI-NEXT: mov.b v1[12], v5[0] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #192] +; CHECK-GI-NEXT: mov.b v0[10], v4[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #296] +; CHECK-GI-NEXT: mov.b v3[12], v6[0] +; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #448] +; CHECK-GI-NEXT: 
mov.b v2[10], v4[0] +; CHECK-GI-NEXT: fmov s4, w11 +; CHECK-GI-NEXT: ldr w11, [sp, #48] +; CHECK-GI-NEXT: mov.b v1[13], v5[0] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #200] +; CHECK-GI-NEXT: mov.b v3[13], v6[0] +; CHECK-GI-NEXT: fmov s6, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #456] +; CHECK-GI-NEXT: mov.b v0[11], v4[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #304] +; CHECK-GI-NEXT: fmov s7, w10 +; CHECK-GI-NEXT: mov.b v1[14], v5[0] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: mov.b v2[11], v4[0] +; CHECK-GI-NEXT: fmov s4, w11 +; CHECK-GI-NEXT: ldr w11, [sp, #56] +; CHECK-GI-NEXT: mov.b v3[14], v6[0] +; CHECK-GI-NEXT: mov.b v0[12], v4[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #312] +; CHECK-GI-NEXT: mov.b v1[15], v5[0] +; CHECK-GI-NEXT: mov.b v3[15], v7[0] +; CHECK-GI-NEXT: mov.b v2[12], v4[0] +; CHECK-GI-NEXT: fmov s4, w11 +; CHECK-GI-NEXT: ldr w11, [sp, #64] ; CHECK-GI-NEXT: shl.16b v1, v1, #7 -; CHECK-GI-NEXT: shl.16b v0, v0, #7 -; CHECK-GI-NEXT: shl.16b v2, v2, #7 +; CHECK-GI-NEXT: mov.b v0[13], v4[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #320] ; CHECK-GI-NEXT: shl.16b v3, v3, #7 ; CHECK-GI-NEXT: sshr.16b v1, v1, #7 +; CHECK-GI-NEXT: mov.b v2[13], v4[0] +; CHECK-GI-NEXT: fmov s4, w11 +; CHECK-GI-NEXT: ldr w11, [sp, #72] +; CHECK-GI-NEXT: sshr.16b v3, v3, #7 +; CHECK-GI-NEXT: mov.b v0[14], v4[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #328] +; CHECK-GI-NEXT: fmov s6, w9 +; CHECK-GI-NEXT: mov.b v2[14], v4[0] +; CHECK-GI-NEXT: fmov s4, w11 +; CHECK-GI-NEXT: mov.b v0[15], v4[0] +; CHECK-GI-NEXT: mov.b v2[15], v6[0] +; CHECK-GI-NEXT: shl.16b v0, v0, #7 +; CHECK-GI-NEXT: shl.16b v2, v2, #7 ; CHECK-GI-NEXT: sshr.16b v0, v0, #7 ; CHECK-GI-NEXT: sshr.16b v2, v2, #7 -; CHECK-GI-NEXT: sshr.16b v3, v3, #7 -; CHECK-GI-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-GI-NEXT: ret %res = sext <64 x 
i1> %arg to <64 x i8> ret <64 x i8> %res diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll index 402682c89124bd..6f4b090fb22bd6 100644 --- a/llvm/test/CodeGen/AArch64/neon-extadd.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll @@ -1266,95 +1266,133 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) { ; ; CHECK-GI-LABEL: v20: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr s0, [sp] -; CHECK-GI-NEXT: ldr s4, [sp, #8] -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: ldr s2, [sp, #32] -; CHECK-GI-NEXT: ldr s19, [sp, #40] -; CHECK-GI-NEXT: fmov s3, w4 -; CHECK-GI-NEXT: mov v0.s[1], v4.s[0] -; CHECK-GI-NEXT: ldr s16, [sp, #96] -; CHECK-GI-NEXT: ldr s22, [sp, #104] -; CHECK-GI-NEXT: mov v2.s[1], v19.s[0] -; CHECK-GI-NEXT: ldr s19, [sp, #128] -; CHECK-GI-NEXT: ldr s23, [sp, #136] -; CHECK-GI-NEXT: ldr s18, [sp, #16] -; CHECK-GI-NEXT: mov v1.s[1], w1 -; CHECK-GI-NEXT: mov v3.s[1], w5 -; CHECK-GI-NEXT: mov v16.s[1], v22.s[0] -; CHECK-GI-NEXT: mov v19.s[1], v23.s[0] -; CHECK-GI-NEXT: ldr s4, [sp, #64] -; CHECK-GI-NEXT: ldr s21, [sp, #72] -; CHECK-GI-NEXT: mov v0.s[2], v18.s[0] -; CHECK-GI-NEXT: ldr s18, [sp, #160] -; CHECK-GI-NEXT: ldr s24, [sp, #168] -; CHECK-GI-NEXT: ldr s20, [sp, #192] -; CHECK-GI-NEXT: ldr s25, [sp, #200] -; CHECK-GI-NEXT: ldr s22, [sp, #224] -; CHECK-GI-NEXT: ldr s27, [sp, #232] -; CHECK-GI-NEXT: ldr s23, [sp, #112] -; CHECK-GI-NEXT: ldr s26, [sp, #144] -; CHECK-GI-NEXT: mov v18.s[1], v24.s[0] -; CHECK-GI-NEXT: mov v20.s[1], v25.s[0] -; CHECK-GI-NEXT: mov v4.s[1], v21.s[0] -; CHECK-GI-NEXT: mov v22.s[1], v27.s[0] -; CHECK-GI-NEXT: mov v1.s[2], w2 -; CHECK-GI-NEXT: ldr s17, [sp, #48] -; CHECK-GI-NEXT: mov v3.s[2], w6 -; CHECK-GI-NEXT: mov v16.s[2], v23.s[0] -; CHECK-GI-NEXT: mov v19.s[2], v26.s[0] -; CHECK-GI-NEXT: ldr s7, [sp, #80] -; CHECK-GI-NEXT: ldr s21, [sp, #176] -; CHECK-GI-NEXT: ldr s24, [sp, #208] -; CHECK-GI-NEXT: ldr s25, [sp, #240] -; CHECK-GI-NEXT: mov v2.s[2], v17.s[0] -; 
CHECK-GI-NEXT: ldr s17, [sp, #120] -; CHECK-GI-NEXT: ldr s23, [sp, #152] -; CHECK-GI-NEXT: ldr s5, [sp, #24] -; CHECK-GI-NEXT: mov v18.s[2], v21.s[0] -; CHECK-GI-NEXT: mov v20.s[2], v24.s[0] -; CHECK-GI-NEXT: mov v4.s[2], v7.s[0] -; CHECK-GI-NEXT: mov v22.s[2], v25.s[0] -; CHECK-GI-NEXT: mov v1.s[3], w3 -; CHECK-GI-NEXT: mov v3.s[3], w7 -; CHECK-GI-NEXT: mov v16.s[3], v17.s[0] -; CHECK-GI-NEXT: mov v19.s[3], v23.s[0] -; CHECK-GI-NEXT: ldr s6, [sp, #56] -; CHECK-GI-NEXT: ldr s7, [sp, #184] -; CHECK-GI-NEXT: ldr s21, [sp, #216] -; CHECK-GI-NEXT: ldr s17, [sp, #88] -; CHECK-GI-NEXT: mov v0.s[3], v5.s[0] -; CHECK-GI-NEXT: ldr s5, [sp, #248] -; CHECK-GI-NEXT: mov v2.s[3], v6.s[0] -; CHECK-GI-NEXT: mov v18.s[3], v7.s[0] -; CHECK-GI-NEXT: mov v20.s[3], v21.s[0] -; CHECK-GI-NEXT: mov v4.s[3], v17.s[0] -; CHECK-GI-NEXT: mov v22.s[3], v5.s[0] -; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v3.8h -; CHECK-GI-NEXT: movi v3.2d, #0xff00ff00ff00ff -; CHECK-GI-NEXT: uzp1 v5.8h, v16.8h, v19.8h -; CHECK-GI-NEXT: dup v6.4s, w8 -; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-GI-NEXT: uzp1 v2.8h, v18.8h, v20.8h -; CHECK-GI-NEXT: uzp1 v4.8h, v4.8h, v6.8h -; CHECK-GI-NEXT: uzp1 v6.8h, v22.8h, v6.8h -; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-GI-NEXT: and v5.16b, v5.16b, v3.16b -; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b -; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b -; CHECK-GI-NEXT: add v1.8h, v1.8h, v5.8h -; CHECK-GI-NEXT: and v4.16b, v4.16b, v3.16b -; CHECK-GI-NEXT: and v3.16b, v6.16b, v3.16b -; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h -; CHECK-GI-NEXT: ushll v2.4s, v1.4h, #0 -; CHECK-GI-NEXT: add v3.4h, v4.4h, v3.4h +; CHECK-GI-NEXT: ldr w9, [sp, #64] +; CHECK-GI-NEXT: ldr w10, [sp, #72] +; CHECK-GI-NEXT: and w13, w2, #0xff +; CHECK-GI-NEXT: ldr w11, [sp, #80] +; CHECK-GI-NEXT: ldr w12, [sp, #88] +; CHECK-GI-NEXT: fmov s19, w13 +; CHECK-GI-NEXT: fmov s0, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #224] +; CHECK-GI-NEXT: fmov s16, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #232] +; CHECK-GI-NEXT: 
fmov s3, w11 +; CHECK-GI-NEXT: ldr w11, [sp, #240] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #248] +; CHECK-GI-NEXT: fmov s1, w12 +; CHECK-GI-NEXT: fmov s7, w10 +; CHECK-GI-NEXT: and w10, w1, #0xff +; CHECK-GI-NEXT: fmov s5, w11 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: and w9, w0, #0xff +; CHECK-GI-NEXT: ldrb w11, [sp] +; CHECK-GI-NEXT: ldrb w12, [sp, #8] +; CHECK-GI-NEXT: fmov s6, w9 +; CHECK-GI-NEXT: fmov s20, w10 +; CHECK-GI-NEXT: ldrb w9, [sp, #96] +; CHECK-GI-NEXT: ldrb w10, [sp, #104] +; CHECK-GI-NEXT: fmov s17, w11 +; CHECK-GI-NEXT: fmov s21, w12 +; CHECK-GI-NEXT: ldrb w11, [sp, #160] +; CHECK-GI-NEXT: mov v0.b[1], v16.b[0] +; CHECK-GI-NEXT: fmov s18, w9 +; CHECK-GI-NEXT: fmov s22, w10 +; CHECK-GI-NEXT: ldrb w9, [sp, #168] +; CHECK-GI-NEXT: mov v6.h[1], v20.h[0] +; CHECK-GI-NEXT: fmov s20, w11 +; CHECK-GI-NEXT: ldrb w10, [sp, #16] +; CHECK-GI-NEXT: mov v17.h[1], v21.h[0] +; CHECK-GI-NEXT: fmov s21, w9 +; CHECK-GI-NEXT: ldrb w9, [sp, #112] +; CHECK-GI-NEXT: mov v18.h[1], v22.h[0] +; CHECK-GI-NEXT: fmov s23, w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #176] +; CHECK-GI-NEXT: and w11, w3, #0xff +; CHECK-GI-NEXT: mov v2.b[1], v7.b[0] +; CHECK-GI-NEXT: mov v0.b[2], v3.b[0] +; CHECK-GI-NEXT: mov v6.h[2], v19.h[0] +; CHECK-GI-NEXT: fmov s19, w9 +; CHECK-GI-NEXT: mov v20.h[1], v21.h[0] +; CHECK-GI-NEXT: ldrb w9, [sp, #24] +; CHECK-GI-NEXT: fmov s22, w11 +; CHECK-GI-NEXT: mov v17.h[2], v23.h[0] +; CHECK-GI-NEXT: and w11, w4, #0xff +; CHECK-GI-NEXT: mov v18.h[2], v19.h[0] +; CHECK-GI-NEXT: fmov s19, w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #120] +; CHECK-GI-NEXT: fmov s23, w9 +; CHECK-GI-NEXT: ldrb w9, [sp, #184] +; CHECK-GI-NEXT: mov v6.h[3], v22.h[0] +; CHECK-GI-NEXT: fmov s21, w11 +; CHECK-GI-NEXT: and w11, w6, #0xff +; CHECK-GI-NEXT: mov v2.b[2], v5.b[0] +; CHECK-GI-NEXT: mov v20.h[2], v19.h[0] +; CHECK-GI-NEXT: fmov s19, w10 +; CHECK-GI-NEXT: fmov s16, w9 +; CHECK-GI-NEXT: ldrb w9, [sp, #128] +; CHECK-GI-NEXT: and w10, w5, #0xff +; CHECK-GI-NEXT: 
mov v17.h[3], v23.h[0] +; CHECK-GI-NEXT: mov v6.h[4], v21.h[0] +; CHECK-GI-NEXT: mov v0.b[3], v1.b[0] +; CHECK-GI-NEXT: mov v18.h[3], v19.h[0] +; CHECK-GI-NEXT: fmov s19, w9 +; CHECK-GI-NEXT: ldrb w9, [sp, #192] +; CHECK-GI-NEXT: mov v20.h[3], v16.h[0] +; CHECK-GI-NEXT: fmov s16, w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #32] +; CHECK-GI-NEXT: mov v2.b[3], v4.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov v18.h[4], v19.h[0] +; CHECK-GI-NEXT: fmov s19, w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #136] +; CHECK-GI-NEXT: mov v6.h[5], v16.h[0] +; CHECK-GI-NEXT: fmov s16, w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #48] +; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-GI-NEXT: mov v17.h[4], v19.h[0] +; CHECK-GI-NEXT: fmov s19, w9 +; CHECK-GI-NEXT: ldrb w9, [sp, #40] +; CHECK-GI-NEXT: mov v18.h[5], v16.h[0] +; CHECK-GI-NEXT: fmov s16, w9 +; CHECK-GI-NEXT: ldrb w9, [sp, #144] +; CHECK-GI-NEXT: mov v20.h[4], v19.h[0] +; CHECK-GI-NEXT: fmov s19, w11 +; CHECK-GI-NEXT: ldrb w11, [sp, #200] +; CHECK-GI-NEXT: add v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: fmov s7, w11 +; CHECK-GI-NEXT: mov v17.h[5], v16.h[0] +; CHECK-GI-NEXT: fmov s16, w9 +; CHECK-GI-NEXT: ldrb w11, [sp, #208] +; CHECK-GI-NEXT: mov v6.h[6], v19.h[0] +; CHECK-GI-NEXT: ldrb w9, [sp, #56] +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: mov v20.h[5], v7.h[0] +; CHECK-GI-NEXT: fmov s7, w10 +; CHECK-GI-NEXT: mov v18.h[6], v16.h[0] +; CHECK-GI-NEXT: fmov s16, w11 +; CHECK-GI-NEXT: ldrb w10, [sp, #152] +; CHECK-GI-NEXT: and w11, w7, #0xff +; CHECK-GI-NEXT: fmov s3, w11 +; CHECK-GI-NEXT: str q0, [x8, #64] +; CHECK-GI-NEXT: fmov s5, w10 +; CHECK-GI-NEXT: ldrb w10, [sp, #216] +; CHECK-GI-NEXT: mov v17.h[6], v7.h[0] +; CHECK-GI-NEXT: mov v20.h[6], v16.h[0] +; CHECK-GI-NEXT: fmov s7, w9 +; CHECK-GI-NEXT: mov v6.h[7], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w10 +; CHECK-GI-NEXT: mov v18.h[7], v5.h[0] +; CHECK-GI-NEXT: mov v17.h[7], v7.h[0] +; CHECK-GI-NEXT: mov v20.h[7], v3.h[0] +; CHECK-GI-NEXT: add v1.8h, v6.8h, v18.8h 
+; CHECK-GI-NEXT: add v3.8h, v17.8h, v20.8h +; CHECK-GI-NEXT: ushll v4.4s, v1.4h, #0 ; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: stp q2, q1, [x8] ; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0 -; CHECK-GI-NEXT: stp q4, q0, [x8, #32] -; CHECK-GI-NEXT: str q2, [x8, #64] +; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-GI-NEXT: stp q4, q1, [x8] +; CHECK-GI-NEXT: stp q2, q3, [x8, #32] ; CHECK-GI-NEXT: ret entry: %s0s = zext <20 x i8> %s0 to <20 x i32> @@ -1459,69 +1497,107 @@ define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) { ; ; CHECK-GI-LABEL: i12: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: fmov s4, w4 -; CHECK-GI-NEXT: ldr s0, [sp] -; CHECK-GI-NEXT: ldr s20, [sp, #8] -; CHECK-GI-NEXT: ldr s2, [sp, #32] -; CHECK-GI-NEXT: ldr s21, [sp, #40] -; CHECK-GI-NEXT: ldr s16, [sp, #64] -; CHECK-GI-NEXT: ldr s22, [sp, #72] -; CHECK-GI-NEXT: ldr s17, [sp, #96] -; CHECK-GI-NEXT: ldr s23, [sp, #104] -; CHECK-GI-NEXT: mov v1.s[1], w1 -; CHECK-GI-NEXT: mov v4.s[1], w5 -; CHECK-GI-NEXT: ldr s18, [sp, #128] -; CHECK-GI-NEXT: ldr s24, [sp, #136] -; CHECK-GI-NEXT: mov v0.s[1], v20.s[0] -; CHECK-GI-NEXT: ldr s19, [sp, #160] -; CHECK-GI-NEXT: ldr s25, [sp, #168] -; CHECK-GI-NEXT: mov v2.s[1], v21.s[0] -; CHECK-GI-NEXT: mov v16.s[1], v22.s[0] -; CHECK-GI-NEXT: mov v17.s[1], v23.s[0] -; CHECK-GI-NEXT: mov v18.s[1], v24.s[0] -; CHECK-GI-NEXT: mov v19.s[1], v25.s[0] -; CHECK-GI-NEXT: ldr s6, [sp, #16] -; CHECK-GI-NEXT: ldr s7, [sp, #48] -; CHECK-GI-NEXT: ldr s20, [sp, #80] -; CHECK-GI-NEXT: ldr s21, [sp, #112] -; CHECK-GI-NEXT: ldr s22, [sp, #144] -; CHECK-GI-NEXT: ldr s23, [sp, #176] -; CHECK-GI-NEXT: mov v1.s[2], w2 -; CHECK-GI-NEXT: mov v4.s[2], w6 -; CHECK-GI-NEXT: mov v0.s[2], v6.s[0] -; CHECK-GI-NEXT: mov v2.s[2], v7.s[0] -; CHECK-GI-NEXT: mov v16.s[2], v20.s[0] -; CHECK-GI-NEXT: mov v17.s[2], v21.s[0] -; CHECK-GI-NEXT: mov v18.s[2], v22.s[0] -; 
CHECK-GI-NEXT: mov v19.s[2], v23.s[0] -; CHECK-GI-NEXT: ldr s3, [sp, #24] -; CHECK-GI-NEXT: ldr s5, [sp, #56] -; CHECK-GI-NEXT: ldr s6, [sp, #88] -; CHECK-GI-NEXT: ldr s7, [sp, #120] -; CHECK-GI-NEXT: ldr s20, [sp, #152] -; CHECK-GI-NEXT: ldr s21, [sp, #184] -; CHECK-GI-NEXT: mov v1.s[3], w3 -; CHECK-GI-NEXT: mov v4.s[3], w7 -; CHECK-GI-NEXT: movi v22.4s, #15, msl #8 -; CHECK-GI-NEXT: mov v0.s[3], v3.s[0] -; CHECK-GI-NEXT: mov v2.s[3], v5.s[0] -; CHECK-GI-NEXT: mov v16.s[3], v6.s[0] -; CHECK-GI-NEXT: mov v17.s[3], v7.s[0] -; CHECK-GI-NEXT: mov v18.s[3], v20.s[0] -; CHECK-GI-NEXT: mov v19.s[3], v21.s[0] -; CHECK-GI-NEXT: and v1.16b, v1.16b, v22.16b -; CHECK-GI-NEXT: and v3.16b, v4.16b, v22.16b -; CHECK-GI-NEXT: and v4.16b, v0.16b, v22.16b -; CHECK-GI-NEXT: and v5.16b, v2.16b, v22.16b -; CHECK-GI-NEXT: and v0.16b, v16.16b, v22.16b -; CHECK-GI-NEXT: and v2.16b, v17.16b, v22.16b -; CHECK-GI-NEXT: and v6.16b, v18.16b, v22.16b -; CHECK-GI-NEXT: and v7.16b, v19.16b, v22.16b -; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: add v1.4s, v3.4s, v2.4s -; CHECK-GI-NEXT: add v2.4s, v4.4s, v6.4s -; CHECK-GI-NEXT: add v3.4s, v5.4s, v7.4s +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w1 +; CHECK-GI-NEXT: ldr w8, [sp] +; CHECK-GI-NEXT: fmov s2, w5 +; CHECK-GI-NEXT: ldr w9, [sp, #8] +; CHECK-GI-NEXT: ldr w11, [sp, #32] +; CHECK-GI-NEXT: ldr w12, [sp, #40] +; CHECK-GI-NEXT: fmov s5, w7 +; CHECK-GI-NEXT: ldr w10, [sp, #16] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s1, w4 +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: fmov s4, w12 +; CHECK-GI-NEXT: ldr w12, [sp, #96] +; CHECK-GI-NEXT: ldr w13, [sp, #104] +; CHECK-GI-NEXT: ldr w14, [sp, #128] +; CHECK-GI-NEXT: ldr w15, [sp, #136] +; CHECK-GI-NEXT: ldr w16, [sp, #160] +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: fmov s2, w2 +; CHECK-GI-NEXT: fmov s7, w13 +; CHECK-GI-NEXT: fmov s16, w15 +; CHECK-GI-NEXT: ldr w17, [sp, #168] +; CHECK-GI-NEXT: ldr w9, [sp, #24] +; CHECK-GI-NEXT: ldr w13, 
[sp, #176] +; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: fmov s2, w6 +; CHECK-GI-NEXT: fmov s17, w17 +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #56] +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w11 +; CHECK-GI-NEXT: ldr w11, [sp, #48] +; CHECK-GI-NEXT: mov v1.h[3], v5.h[0] +; CHECK-GI-NEXT: fmov s5, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #64] +; CHECK-GI-NEXT: mov v3.h[1], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w3 +; CHECK-GI-NEXT: mov v2.h[2], v5.h[0] +; CHECK-GI-NEXT: fmov s5, w11 +; CHECK-GI-NEXT: ldr w11, [sp, #72] +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: fmov s6, w11 +; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: mov v3.h[2], v5.h[0] +; CHECK-GI-NEXT: fmov s5, w10 +; CHECK-GI-NEXT: ldr w9, [sp, #80] +; CHECK-GI-NEXT: ldr w10, [sp, #112] +; CHECK-GI-NEXT: ldr w11, [sp, #144] +; CHECK-GI-NEXT: mov v2.h[3], v4.h[0] +; CHECK-GI-NEXT: mov v5.h[1], v6.h[0] +; CHECK-GI-NEXT: fmov s6, w12 +; CHECK-GI-NEXT: fmov s18, w11 +; CHECK-GI-NEXT: ldr w12, [sp, #88] +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: mov v6.h[1], v7.h[0] +; CHECK-GI-NEXT: fmov s7, w14 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: mov v7.h[1], v16.h[0] +; CHECK-GI-NEXT: fmov s16, w16 +; CHECK-GI-NEXT: mov v16.h[1], v17.h[0] +; CHECK-GI-NEXT: fmov s17, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #152] +; CHECK-GI-NEXT: mov v7.h[2], v18.h[0] +; CHECK-GI-NEXT: fmov s18, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #120] +; CHECK-GI-NEXT: mov v5.h[2], v17.h[0] +; CHECK-GI-NEXT: fmov s17, w10 +; CHECK-GI-NEXT: ldr w10, [sp, #184] +; CHECK-GI-NEXT: mov v3.h[3], v18.h[0] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: fmov s18, w10 +; CHECK-GI-NEXT: mov v6.h[2], v17.h[0] +; CHECK-GI-NEXT: fmov s17, w13 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: mov v16.h[2], v17.h[0] +; CHECK-GI-NEXT: fmov s17, w12 +; CHECK-GI-NEXT: mov v6.h[3], v4.h[0] +; 
CHECK-GI-NEXT: movi v4.4s, #15, msl #8 +; CHECK-GI-NEXT: mov v5.h[3], v17.h[0] +; CHECK-GI-NEXT: fmov s17, w9 +; CHECK-GI-NEXT: mov v16.h[3], v18.h[0] +; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-GI-NEXT: mov v7.h[3], v17.h[0] +; CHECK-GI-NEXT: and v2.16b, v2.16b, v4.16b +; CHECK-GI-NEXT: and v3.16b, v3.16b, v4.16b +; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0 +; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0 +; CHECK-GI-NEXT: and v6.16b, v6.16b, v4.16b +; CHECK-GI-NEXT: ushll v7.4s, v7.4h, #0 +; CHECK-GI-NEXT: and v5.16b, v5.16b, v4.16b +; CHECK-GI-NEXT: add v1.4s, v1.4s, v6.4s +; CHECK-GI-NEXT: and v7.16b, v7.16b, v4.16b +; CHECK-GI-NEXT: and v4.16b, v16.16b, v4.16b +; CHECK-GI-NEXT: add v0.4s, v0.4s, v5.4s +; CHECK-GI-NEXT: add v2.4s, v2.4s, v7.4s +; CHECK-GI-NEXT: add v3.4s, v3.4s, v4.4s ; CHECK-GI-NEXT: ret entry: %s0s = zext <16 x i12> %s0 to <16 x i32> diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll index 5237a3491de9b4..529a3b72e09714 100644 --- a/llvm/test/CodeGen/AArch64/sext.ll +++ b/llvm/test/CodeGen/AArch64/sext.ll @@ -219,12 +219,21 @@ define <3 x i16> @sext_v3i8_v3i16(<3 x i8> %a) { ; ; CHECK-GI-LABEL: sext_v3i8_v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: mov v0.s[1], w1 -; CHECK-GI-NEXT: mov v0.s[2], w2 -; CHECK-GI-NEXT: xtn v0.4h, v0.4s -; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8 -; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8 +; CHECK-GI-NEXT: lsl w8, w0, #8 +; CHECK-GI-NEXT: lsl w9, w1, #8 +; CHECK-GI-NEXT: lsl w10, w2, #8 +; CHECK-GI-NEXT: sxth w8, w8 +; CHECK-GI-NEXT: sxth w9, w9 +; CHECK-GI-NEXT: sxth w10, w10 +; CHECK-GI-NEXT: asr w8, w8, #8 +; CHECK-GI-NEXT: asr w9, w9, #8 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: asr w8, w10, #8 +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: // 
kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i8> %a to <3 x i16> @@ -244,16 +253,12 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) { ; ; CHECK-GI-LABEL: sext_v3i8_v3i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #24 // =0x18 -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.s[1], w1 -; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: mov v0.s[2], w2 -; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: neg v1.4s, v1.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: sxtb w8, w0 +; CHECK-GI-NEXT: sxtb w9, w1 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: sxtb w8, w2 +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i8> %a to <3 x i32> @@ -280,16 +285,15 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) { ; ; CHECK-GI-LABEL: sext_v3i8_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-GI-NEXT: sxtb x8, w2 -; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: mov v0.s[1], w1 -; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: shl v0.2d, v0.2d, #56 -; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #56 -; CHECK-GI-NEXT: mov d1, v0.d[1] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: sxtb x8, w0 +; CHECK-GI-NEXT: sxtb x9, w1 +; CHECK-GI-NEXT: sxtb x10, w2 +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: fmov d2, x10 ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i8> %a to <3 x i64> @@ -382,12 +386,21 @@ define <3 x i16> @sext_v3i10_v3i16(<3 x i10> %a) { ; ; CHECK-GI-LABEL: sext_v3i10_v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: mov v0.s[1], w1 -; CHECK-GI-NEXT: mov v0.s[2], w2 -; CHECK-GI-NEXT: xtn v0.4h, 
v0.4s -; CHECK-GI-NEXT: shl v0.4h, v0.4h, #6 -; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #6 +; CHECK-GI-NEXT: lsl w8, w0, #6 +; CHECK-GI-NEXT: lsl w9, w1, #6 +; CHECK-GI-NEXT: lsl w10, w2, #6 +; CHECK-GI-NEXT: sxth w8, w8 +; CHECK-GI-NEXT: sxth w9, w9 +; CHECK-GI-NEXT: sxth w10, w10 +; CHECK-GI-NEXT: asr w8, w8, #6 +; CHECK-GI-NEXT: asr w9, w9, #6 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: asr w8, w10, #6 +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i10> %a to <3 x i16> @@ -407,16 +420,12 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) { ; ; CHECK-GI-LABEL: sext_v3i10_v3i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #22 // =0x16 -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.s[1], w1 -; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: mov v0.s[2], w2 -; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: neg v1.4s, v1.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: sbfx w8, w0, #0, #10 +; CHECK-GI-NEXT: sbfx w9, w1, #0, #10 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: sbfx w8, w2, #0, #10 +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i10> %a to <3 x i32> @@ -443,16 +452,15 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) { ; ; CHECK-GI-LABEL: sext_v3i10_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-GI-NEXT: sbfx x8, x2, #0, #10 -; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: mov v0.s[1], w1 -; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: shl v0.2d, v0.2d, #54 -; CHECK-GI-NEXT: sshr v0.2d, v0.2d, 
#54 -; CHECK-GI-NEXT: mov d1, v0.d[1] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: sbfx x8, x0, #0, #10 +; CHECK-GI-NEXT: sbfx x9, x1, #0, #10 +; CHECK-GI-NEXT: sbfx x10, x2, #0, #10 +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: fmov d2, x10 ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i10> %a to <3 x i64> @@ -1024,34 +1032,48 @@ define <16 x i16> @sext_v16i10_v16i16(<16 x i10> %a) { ; ; CHECK-GI-LABEL: sext_v16i10_v16i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s4, w0 -; CHECK-GI-NEXT: fmov s5, w4 -; CHECK-GI-NEXT: ldr s0, [sp] -; CHECK-GI-NEXT: ldr s1, [sp, #8] -; CHECK-GI-NEXT: ldr s2, [sp, #32] -; CHECK-GI-NEXT: ldr s3, [sp, #40] -; CHECK-GI-NEXT: mov v4.s[1], w1 -; CHECK-GI-NEXT: mov v5.s[1], w5 -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: ldr s1, [sp, #16] -; CHECK-GI-NEXT: ldr s3, [sp, #48] -; CHECK-GI-NEXT: mov v4.s[2], w2 -; CHECK-GI-NEXT: mov v5.s[2], w6 -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[2], v3.s[0] -; CHECK-GI-NEXT: ldr s1, [sp, #24] -; CHECK-GI-NEXT: ldr s3, [sp, #56] -; CHECK-GI-NEXT: mov v4.s[3], w3 -; CHECK-GI-NEXT: mov v5.s[3], w7 -; CHECK-GI-NEXT: mov v0.s[3], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[3], v3.s[0] -; CHECK-GI-NEXT: uzp1 v1.8h, v4.8h, v5.8h -; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: ldr w8, [sp] +; CHECK-GI-NEXT: ldr w9, [sp, #8] +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s2, w1 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: ldr w8, [sp, #16] +; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] +; CHECK-GI-NEXT: fmov s2, w2 +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #24] +; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: fmov s2, w3 +; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #32] +; CHECK-GI-NEXT: mov v0.h[3], v2.h[0] +; 
CHECK-GI-NEXT: fmov s2, w4 +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #40] +; CHECK-GI-NEXT: mov v0.h[4], v2.h[0] +; CHECK-GI-NEXT: fmov s2, w5 +; CHECK-GI-NEXT: mov v1.h[4], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #48] +; CHECK-GI-NEXT: mov v0.h[5], v2.h[0] +; CHECK-GI-NEXT: fmov s2, w6 +; CHECK-GI-NEXT: mov v1.h[5], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #56] +; CHECK-GI-NEXT: mov v0.h[6], v2.h[0] +; CHECK-GI-NEXT: fmov s2, w7 +; CHECK-GI-NEXT: mov v1.h[6], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v0.h[7], v2.h[0] +; CHECK-GI-NEXT: mov v1.h[7], v3.h[0] +; CHECK-GI-NEXT: shl v0.8h, v0.8h, #6 ; CHECK-GI-NEXT: shl v1.8h, v1.8h, #6 -; CHECK-GI-NEXT: shl v2.8h, v0.8h, #6 -; CHECK-GI-NEXT: sshr v0.8h, v1.8h, #6 -; CHECK-GI-NEXT: sshr v1.8h, v2.8h, #6 +; CHECK-GI-NEXT: sshr v0.8h, v0.8h, #6 +; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #6 ; CHECK-GI-NEXT: ret entry: %c = sext <16 x i10> %a to <16 x i16> @@ -1101,36 +1123,54 @@ define <16 x i32> @sext_v16i10_v16i32(<16 x i10> %a) { ; ; CHECK-GI-LABEL: sext_v16i10_v16i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s4, w0 -; CHECK-GI-NEXT: fmov s5, w4 -; CHECK-GI-NEXT: ldr s0, [sp] -; CHECK-GI-NEXT: ldr s1, [sp, #8] -; CHECK-GI-NEXT: ldr s2, [sp, #32] -; CHECK-GI-NEXT: ldr s3, [sp, #40] -; CHECK-GI-NEXT: mov v4.s[1], w1 -; CHECK-GI-NEXT: mov v5.s[1], w5 -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: ldr s1, [sp, #16] -; CHECK-GI-NEXT: ldr s3, [sp, #48] -; CHECK-GI-NEXT: mov v4.s[2], w2 -; CHECK-GI-NEXT: mov v5.s[2], w6 -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[2], v3.s[0] -; CHECK-GI-NEXT: ldr s1, [sp, #24] -; CHECK-GI-NEXT: ldr s3, [sp, #56] -; CHECK-GI-NEXT: mov v4.s[3], w3 -; CHECK-GI-NEXT: mov v5.s[3], w7 -; CHECK-GI-NEXT: mov v0.s[3], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[3], v3.s[0] -; CHECK-GI-NEXT: shl v1.4s, 
v4.4s, #22 -; CHECK-GI-NEXT: shl v3.4s, v5.4s, #22 -; CHECK-GI-NEXT: shl v4.4s, v0.4s, #22 -; CHECK-GI-NEXT: shl v5.4s, v2.4s, #22 -; CHECK-GI-NEXT: sshr v0.4s, v1.4s, #22 -; CHECK-GI-NEXT: sshr v1.4s, v3.4s, #22 -; CHECK-GI-NEXT: sshr v2.4s, v4.4s, #22 -; CHECK-GI-NEXT: sshr v3.4s, v5.4s, #22 +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w1 +; CHECK-GI-NEXT: ldr w8, [sp] +; CHECK-GI-NEXT: fmov s2, w5 +; CHECK-GI-NEXT: ldr w9, [sp, #8] +; CHECK-GI-NEXT: ldr w10, [sp, #32] +; CHECK-GI-NEXT: ldr w11, [sp, #40] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #16] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s1, w4 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: fmov s5, w10 +; CHECK-GI-NEXT: fmov s6, w11 +; CHECK-GI-NEXT: ldr w9, [sp, #48] +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: fmov s2, w2 +; CHECK-GI-NEXT: mov v3.h[1], v4.h[0] +; CHECK-GI-NEXT: mov v5.h[1], v6.h[0] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: fmov s6, w9 +; CHECK-GI-NEXT: ldr w8, [sp, #24] +; CHECK-GI-NEXT: ldr w9, [sp, #56] +; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: fmov s2, w6 +; CHECK-GI-NEXT: mov v3.h[2], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v5.h[2], v6.h[0] +; CHECK-GI-NEXT: fmov s6, w9 +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: fmov s2, w3 +; CHECK-GI-NEXT: mov v3.h[3], v4.h[0] +; CHECK-GI-NEXT: mov v0.h[3], v2.h[0] +; CHECK-GI-NEXT: fmov s2, w7 +; CHECK-GI-NEXT: mov v5.h[3], v6.h[0] +; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] +; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0 +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v5.4h, #0 +; CHECK-GI-NEXT: shl v2.4s, v2.4s, #22 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: shl v0.4s, v0.4s, #22 +; CHECK-GI-NEXT: shl v3.4s, v3.4s, #22 +; CHECK-GI-NEXT: sshr v2.4s, v2.4s, #22 +; CHECK-GI-NEXT: shl v1.4s, v1.4s, #22 +; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #22 +; CHECK-GI-NEXT: sshr v3.4s, v3.4s, #22 +; CHECK-GI-NEXT: sshr 
v1.4s, v1.4s, #22 ; CHECK-GI-NEXT: ret entry: %c = sext <16 x i10> %a to <16 x i32> @@ -1188,49 +1228,69 @@ define <16 x i64> @sext_v16i10_v16i64(<16 x i10> %a) { ; ; CHECK-GI-LABEL: sext_v16i10_v16i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s7, w0 -; CHECK-GI-NEXT: fmov s17, w2 -; CHECK-GI-NEXT: ldr s0, [sp] -; CHECK-GI-NEXT: fmov s18, w4 -; CHECK-GI-NEXT: fmov s19, w6 -; CHECK-GI-NEXT: ldr s1, [sp, #8] -; CHECK-GI-NEXT: ldr s2, [sp, #16] -; CHECK-GI-NEXT: ldr s3, [sp, #24] -; CHECK-GI-NEXT: ldr s4, [sp, #32] -; CHECK-GI-NEXT: ldr s5, [sp, #40] -; CHECK-GI-NEXT: ldr s6, [sp, #48] -; CHECK-GI-NEXT: ldr s16, [sp, #56] -; CHECK-GI-NEXT: mov v7.s[1], w1 -; CHECK-GI-NEXT: mov v17.s[1], w3 -; CHECK-GI-NEXT: mov v18.s[1], w5 -; CHECK-GI-NEXT: mov v19.s[1], w7 -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: mov v4.s[1], v5.s[0] -; CHECK-GI-NEXT: mov v6.s[1], v16.s[0] -; CHECK-GI-NEXT: ushll v1.2d, v7.2s, #0 -; CHECK-GI-NEXT: ushll v3.2d, v17.2s, #0 -; CHECK-GI-NEXT: ushll v5.2d, v18.2s, #0 -; CHECK-GI-NEXT: ushll v7.2d, v19.2s, #0 -; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: ushll v2.2d, v2.2s, #0 -; CHECK-GI-NEXT: ushll v4.2d, v4.2s, #0 -; CHECK-GI-NEXT: ushll v6.2d, v6.2s, #0 -; CHECK-GI-NEXT: shl v1.2d, v1.2d, #54 -; CHECK-GI-NEXT: shl v3.2d, v3.2d, #54 +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w1 +; CHECK-GI-NEXT: ldr w8, [sp] +; CHECK-GI-NEXT: fmov s2, w5 +; CHECK-GI-NEXT: ldr w9, [sp, #8] +; CHECK-GI-NEXT: ldr w10, [sp, #32] +; CHECK-GI-NEXT: ldr w11, [sp, #40] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s1, w4 +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: fmov s4, w11 +; CHECK-GI-NEXT: ldr w9, [sp, #48] +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #16] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #24] +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w10 +; 
CHECK-GI-NEXT: mov v3.h[1], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w2 +; CHECK-GI-NEXT: mov v2.h[2], v5.h[0] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: mov v0.h[2], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w6 +; CHECK-GI-NEXT: mov v2.h[3], v5.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #56] +; CHECK-GI-NEXT: mov v3.h[2], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w3 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w7 +; CHECK-GI-NEXT: ushll v6.2d, v2.2s, #0 +; CHECK-GI-NEXT: ushll2 v2.2d, v2.4s, #0 +; CHECK-GI-NEXT: mov v1.h[3], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: shl v6.2d, v6.2d, #54 +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: shl v18.2d, v2.2d, #54 +; CHECK-GI-NEXT: mov v3.h[3], v4.h[0] +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0 +; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: ushll v5.2d, v1.2s, #0 +; CHECK-GI-NEXT: ushll2 v1.2d, v1.4s, #0 +; CHECK-GI-NEXT: shl v4.2d, v4.2d, #54 +; CHECK-GI-NEXT: shl v16.2d, v0.2d, #54 +; CHECK-GI-NEXT: ushll v7.2d, v3.2s, #0 +; CHECK-GI-NEXT: ushll2 v3.2d, v3.4s, #0 ; CHECK-GI-NEXT: shl v5.2d, v5.2d, #54 +; CHECK-GI-NEXT: shl v17.2d, v1.2d, #54 +; CHECK-GI-NEXT: sshr v0.2d, v4.2d, #54 +; CHECK-GI-NEXT: sshr v1.2d, v16.2d, #54 +; CHECK-GI-NEXT: sshr v4.2d, v6.2d, #54 ; CHECK-GI-NEXT: shl v7.2d, v7.2d, #54 -; CHECK-GI-NEXT: shl v16.2d, v0.2d, #54 -; CHECK-GI-NEXT: shl v17.2d, v2.2d, #54 -; CHECK-GI-NEXT: shl v18.2d, v4.2d, #54 -; CHECK-GI-NEXT: shl v19.2d, v6.2d, #54 -; CHECK-GI-NEXT: sshr v0.2d, v1.2d, #54 -; CHECK-GI-NEXT: sshr v1.2d, v3.2d, #54 +; CHECK-GI-NEXT: shl v19.2d, v3.2d, #54 ; CHECK-GI-NEXT: sshr v2.2d, v5.2d, #54 -; CHECK-GI-NEXT: sshr v3.2d, v7.2d, #54 -; CHECK-GI-NEXT: sshr v4.2d, v16.2d, #54 -; CHECK-GI-NEXT: sshr v5.2d, v17.2d, #54 -; CHECK-GI-NEXT: sshr v6.2d, v18.2d, #54 +; 
CHECK-GI-NEXT: sshr v3.2d, v17.2d, #54 +; CHECK-GI-NEXT: sshr v5.2d, v18.2d, #54 +; CHECK-GI-NEXT: sshr v6.2d, v7.2d, #54 ; CHECK-GI-NEXT: sshr v7.2d, v19.2d, #54 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index c81fd26a775256..54ada05c904487 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -3812,51 +3812,72 @@ define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) { ; ; CHECK-GI-LABEL: add_v24i8_v24i16_zext: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s4, w0 -; CHECK-GI-NEXT: fmov s5, w4 -; CHECK-GI-NEXT: ldr s0, [sp] -; CHECK-GI-NEXT: ldr s6, [sp, #8] -; CHECK-GI-NEXT: ldr s1, [sp, #32] -; CHECK-GI-NEXT: ldr s7, [sp, #40] -; CHECK-GI-NEXT: ldr s2, [sp, #64] -; CHECK-GI-NEXT: ldr s16, [sp, #72] -; CHECK-GI-NEXT: ldr s3, [sp, #96] -; CHECK-GI-NEXT: ldr s17, [sp, #104] -; CHECK-GI-NEXT: mov v4.s[1], w1 -; CHECK-GI-NEXT: mov v5.s[1], w5 -; CHECK-GI-NEXT: mov v0.s[1], v6.s[0] -; CHECK-GI-NEXT: mov v1.s[1], v7.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v16.s[0] -; CHECK-GI-NEXT: mov v3.s[1], v17.s[0] -; CHECK-GI-NEXT: ldr s6, [sp, #16] -; CHECK-GI-NEXT: ldr s7, [sp, #48] -; CHECK-GI-NEXT: ldr s16, [sp, #80] -; CHECK-GI-NEXT: ldr s17, [sp, #112] -; CHECK-GI-NEXT: mov v4.s[2], w2 -; CHECK-GI-NEXT: mov v5.s[2], w6 -; CHECK-GI-NEXT: mov v0.s[2], v6.s[0] -; CHECK-GI-NEXT: mov v1.s[2], v7.s[0] -; CHECK-GI-NEXT: mov v2.s[2], v16.s[0] -; CHECK-GI-NEXT: mov v3.s[2], v17.s[0] -; CHECK-GI-NEXT: ldr s6, [sp, #24] -; CHECK-GI-NEXT: ldr s7, [sp, #56] -; CHECK-GI-NEXT: ldr s16, [sp, #88] -; CHECK-GI-NEXT: ldr s17, [sp, #120] -; CHECK-GI-NEXT: mov v4.s[3], w3 -; CHECK-GI-NEXT: mov v5.s[3], w7 -; CHECK-GI-NEXT: mov v0.s[3], v6.s[0] -; CHECK-GI-NEXT: mov v1.s[3], v7.s[0] -; CHECK-GI-NEXT: mov v2.s[3], v16.s[0] -; CHECK-GI-NEXT: mov v3.s[3], v17.s[0] -; CHECK-GI-NEXT: uzp1 v4.8h, v4.8h, v5.8h -; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; 
CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v3.8h -; CHECK-GI-NEXT: uzp1 v0.16b, v4.16b, v0.16b -; CHECK-GI-NEXT: xtn v1.8b, v1.8h -; CHECK-GI-NEXT: uaddlv h0, v0.16b +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w1 +; CHECK-GI-NEXT: ldr w8, [sp] +; CHECK-GI-NEXT: ldr w9, [sp, #64] +; CHECK-GI-NEXT: ldr w10, [sp, #72] +; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w2 +; CHECK-GI-NEXT: fmov s2, w10 +; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w3 +; CHECK-GI-NEXT: mov v0.b[3], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w4 +; CHECK-GI-NEXT: mov v0.b[4], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w5 +; CHECK-GI-NEXT: mov v0.b[5], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w6 +; CHECK-GI-NEXT: mov v0.b[6], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w7 +; CHECK-GI-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #8] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #16] +; CHECK-GI-NEXT: mov v0.b[8], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #80] +; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #88] +; CHECK-GI-NEXT: mov v0.b[9], v3.b[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #24] +; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #96] +; CHECK-GI-NEXT: mov v0.b[10], v3.b[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #32] +; CHECK-GI-NEXT: mov v1.b[3], v2.b[0] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #104] +; CHECK-GI-NEXT: mov v0.b[11], v3.b[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #40] +; CHECK-GI-NEXT: mov v1.b[4], v2.b[0] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #112] +; CHECK-GI-NEXT: mov v0.b[12], v3.b[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #48] +; CHECK-GI-NEXT: mov v1.b[5], v2.b[0] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #120] +; 
CHECK-GI-NEXT: mov v0.b[13], v3.b[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #56] +; CHECK-GI-NEXT: mov v1.b[6], v2.b[0] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: mov v0.b[14], v3.b[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[7], v2.b[0] +; CHECK-GI-NEXT: mov v0.b[15], v3.b[0] ; CHECK-GI-NEXT: uaddlv h1, v1.8b -; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: uaddlv h0, v0.16b ; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: add w0, w8, w9 ; CHECK-GI-NEXT: ret entry: @@ -3938,51 +3959,72 @@ define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) { ; ; CHECK-GI-LABEL: add_v24i8_v24i16_sext: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s4, w0 -; CHECK-GI-NEXT: fmov s5, w4 -; CHECK-GI-NEXT: ldr s0, [sp] -; CHECK-GI-NEXT: ldr s6, [sp, #8] -; CHECK-GI-NEXT: ldr s1, [sp, #32] -; CHECK-GI-NEXT: ldr s7, [sp, #40] -; CHECK-GI-NEXT: ldr s2, [sp, #64] -; CHECK-GI-NEXT: ldr s16, [sp, #72] -; CHECK-GI-NEXT: ldr s3, [sp, #96] -; CHECK-GI-NEXT: ldr s17, [sp, #104] -; CHECK-GI-NEXT: mov v4.s[1], w1 -; CHECK-GI-NEXT: mov v5.s[1], w5 -; CHECK-GI-NEXT: mov v0.s[1], v6.s[0] -; CHECK-GI-NEXT: mov v1.s[1], v7.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v16.s[0] -; CHECK-GI-NEXT: mov v3.s[1], v17.s[0] -; CHECK-GI-NEXT: ldr s6, [sp, #16] -; CHECK-GI-NEXT: ldr s7, [sp, #48] -; CHECK-GI-NEXT: ldr s16, [sp, #80] -; CHECK-GI-NEXT: ldr s17, [sp, #112] -; CHECK-GI-NEXT: mov v4.s[2], w2 -; CHECK-GI-NEXT: mov v5.s[2], w6 -; CHECK-GI-NEXT: mov v0.s[2], v6.s[0] -; CHECK-GI-NEXT: mov v1.s[2], v7.s[0] -; CHECK-GI-NEXT: mov v2.s[2], v16.s[0] -; CHECK-GI-NEXT: mov v3.s[2], v17.s[0] -; CHECK-GI-NEXT: ldr s6, [sp, #24] -; CHECK-GI-NEXT: ldr s7, [sp, #56] -; CHECK-GI-NEXT: ldr s16, [sp, #88] -; CHECK-GI-NEXT: ldr s17, [sp, #120] -; CHECK-GI-NEXT: mov v4.s[3], w3 -; CHECK-GI-NEXT: mov v5.s[3], w7 -; CHECK-GI-NEXT: mov v0.s[3], v6.s[0] -; CHECK-GI-NEXT: mov v1.s[3], v7.s[0] -; CHECK-GI-NEXT: mov v2.s[3], v16.s[0] -; CHECK-GI-NEXT: mov v3.s[3], 
v17.s[0] -; CHECK-GI-NEXT: uzp1 v4.8h, v4.8h, v5.8h -; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v3.8h -; CHECK-GI-NEXT: uzp1 v0.16b, v4.16b, v0.16b -; CHECK-GI-NEXT: xtn v1.8b, v1.8h -; CHECK-GI-NEXT: saddlv h0, v0.16b +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w1 +; CHECK-GI-NEXT: ldr w8, [sp] +; CHECK-GI-NEXT: ldr w9, [sp, #64] +; CHECK-GI-NEXT: ldr w10, [sp, #72] +; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w2 +; CHECK-GI-NEXT: fmov s2, w10 +; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w3 +; CHECK-GI-NEXT: mov v0.b[3], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w4 +; CHECK-GI-NEXT: mov v0.b[4], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w5 +; CHECK-GI-NEXT: mov v0.b[5], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w6 +; CHECK-GI-NEXT: mov v0.b[6], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w7 +; CHECK-GI-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #8] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #16] +; CHECK-GI-NEXT: mov v0.b[8], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #80] +; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #88] +; CHECK-GI-NEXT: mov v0.b[9], v3.b[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #24] +; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #96] +; CHECK-GI-NEXT: mov v0.b[10], v3.b[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #32] +; CHECK-GI-NEXT: mov v1.b[3], v2.b[0] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #104] +; CHECK-GI-NEXT: mov v0.b[11], v3.b[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #40] +; CHECK-GI-NEXT: mov v1.b[4], v2.b[0] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #112] +; CHECK-GI-NEXT: mov v0.b[12], v3.b[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #48] +; 
CHECK-GI-NEXT: mov v1.b[5], v2.b[0] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #120] +; CHECK-GI-NEXT: mov v0.b[13], v3.b[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #56] +; CHECK-GI-NEXT: mov v1.b[6], v2.b[0] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: mov v0.b[14], v3.b[0] +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.b[7], v2.b[0] +; CHECK-GI-NEXT: mov v0.b[15], v3.b[0] ; CHECK-GI-NEXT: saddlv h1, v1.8b -; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: saddlv h0, v0.16b ; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: add w0, w8, w9 ; CHECK-GI-NEXT: ret entry: @@ -4125,106 +4167,149 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) { ; ; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_zext: ; CHECK-GI-BASE: // %bb.0: // %entry -; CHECK-GI-BASE-NEXT: fmov s4, w0 -; CHECK-GI-BASE-NEXT: fmov s5, w4 -; CHECK-GI-BASE-NEXT: ldr s0, [sp] -; CHECK-GI-BASE-NEXT: ldr s6, [sp, #8] -; CHECK-GI-BASE-NEXT: ldr s1, [sp, #32] -; CHECK-GI-BASE-NEXT: ldr s7, [sp, #40] -; CHECK-GI-BASE-NEXT: ldr s2, [sp, #64] -; CHECK-GI-BASE-NEXT: ldr s16, [sp, #72] -; CHECK-GI-BASE-NEXT: ldr s3, [sp, #96] -; CHECK-GI-BASE-NEXT: ldr s17, [sp, #104] -; CHECK-GI-BASE-NEXT: mov v4.s[1], w1 -; CHECK-GI-BASE-NEXT: mov v5.s[1], w5 -; CHECK-GI-BASE-NEXT: mov v0.s[1], v6.s[0] -; CHECK-GI-BASE-NEXT: mov v1.s[1], v7.s[0] -; CHECK-GI-BASE-NEXT: mov v2.s[1], v16.s[0] -; CHECK-GI-BASE-NEXT: mov v3.s[1], v17.s[0] -; CHECK-GI-BASE-NEXT: ldr s6, [sp, #16] -; CHECK-GI-BASE-NEXT: ldr s7, [sp, #48] -; CHECK-GI-BASE-NEXT: ldr s16, [sp, #80] -; CHECK-GI-BASE-NEXT: ldr s17, [sp, #112] -; CHECK-GI-BASE-NEXT: mov v4.s[2], w2 -; CHECK-GI-BASE-NEXT: mov v5.s[2], w6 -; CHECK-GI-BASE-NEXT: mov v0.s[2], v6.s[0] -; CHECK-GI-BASE-NEXT: mov v1.s[2], v7.s[0] -; CHECK-GI-BASE-NEXT: mov v2.s[2], v16.s[0] -; CHECK-GI-BASE-NEXT: mov v3.s[2], v17.s[0] -; CHECK-GI-BASE-NEXT: ldr s6, [sp, #24] -; CHECK-GI-BASE-NEXT: ldr s7, [sp, #56] -; CHECK-GI-BASE-NEXT: ldr s16, [sp, 
#88] -; CHECK-GI-BASE-NEXT: ldr s17, [sp, #120] -; CHECK-GI-BASE-NEXT: mov v4.s[3], w3 -; CHECK-GI-BASE-NEXT: mov v5.s[3], w7 -; CHECK-GI-BASE-NEXT: mov v0.s[3], v6.s[0] -; CHECK-GI-BASE-NEXT: mov v1.s[3], v7.s[0] -; CHECK-GI-BASE-NEXT: mov v2.s[3], v16.s[0] -; CHECK-GI-BASE-NEXT: mov v3.s[3], v17.s[0] -; CHECK-GI-BASE-NEXT: uzp1 v4.8h, v4.8h, v5.8h -; CHECK-GI-BASE-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-GI-BASE-NEXT: uzp1 v1.8h, v2.8h, v3.8h -; CHECK-GI-BASE-NEXT: uzp1 v0.16b, v4.16b, v0.16b -; CHECK-GI-BASE-NEXT: xtn v1.8b, v1.8h -; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b +; CHECK-GI-BASE-NEXT: fmov s0, w0 +; CHECK-GI-BASE-NEXT: fmov s1, w1 +; CHECK-GI-BASE-NEXT: ldr w8, [sp] +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #64] +; CHECK-GI-BASE-NEXT: ldr w10, [sp, #72] +; CHECK-GI-BASE-NEXT: mov v0.b[1], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w2 +; CHECK-GI-BASE-NEXT: fmov s2, w10 +; CHECK-GI-BASE-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w3 +; CHECK-GI-BASE-NEXT: mov v0.b[3], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w4 +; CHECK-GI-BASE-NEXT: mov v0.b[4], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w5 +; CHECK-GI-BASE-NEXT: mov v0.b[5], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w6 +; CHECK-GI-BASE-NEXT: mov v0.b[6], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w7 +; CHECK-GI-BASE-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #8] +; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #16] +; CHECK-GI-BASE-NEXT: mov v0.b[8], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #80] +; CHECK-GI-BASE-NEXT: mov v1.b[1], v2.b[0] +; CHECK-GI-BASE-NEXT: fmov s2, w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88] +; CHECK-GI-BASE-NEXT: mov v0.b[9], v3.b[0] +; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #24] +; CHECK-GI-BASE-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-BASE-NEXT: fmov s2, w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96] +; CHECK-GI-BASE-NEXT: mov v0.b[10], v3.b[0] 
+; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #32] +; CHECK-GI-BASE-NEXT: mov v1.b[3], v2.b[0] +; CHECK-GI-BASE-NEXT: fmov s2, w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104] +; CHECK-GI-BASE-NEXT: mov v0.b[11], v3.b[0] +; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #40] +; CHECK-GI-BASE-NEXT: mov v1.b[4], v2.b[0] +; CHECK-GI-BASE-NEXT: fmov s2, w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112] +; CHECK-GI-BASE-NEXT: mov v0.b[12], v3.b[0] +; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #48] +; CHECK-GI-BASE-NEXT: mov v1.b[5], v2.b[0] +; CHECK-GI-BASE-NEXT: fmov s2, w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #120] +; CHECK-GI-BASE-NEXT: mov v0.b[13], v3.b[0] +; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #56] +; CHECK-GI-BASE-NEXT: mov v1.b[6], v2.b[0] +; CHECK-GI-BASE-NEXT: fmov s2, w9 +; CHECK-GI-BASE-NEXT: mov v0.b[14], v3.b[0] +; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[7], v2.b[0] +; CHECK-GI-BASE-NEXT: mov v0.b[15], v3.b[0] ; CHECK-GI-BASE-NEXT: uaddlv h1, v1.8b -; CHECK-GI-BASE-NEXT: fmov w8, s0 +; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b ; CHECK-GI-BASE-NEXT: fmov w9, s1 +; CHECK-GI-BASE-NEXT: fmov w8, s0 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff ; CHECK-GI-BASE-NEXT: ret ; ; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_zext: ; CHECK-GI-DOT: // %bb.0: // %entry -; CHECK-GI-DOT-NEXT: fmov s4, w0 -; CHECK-GI-DOT-NEXT: fmov s5, w4 -; CHECK-GI-DOT-NEXT: ldr s0, [sp] -; CHECK-GI-DOT-NEXT: ldr s6, [sp, #8] -; CHECK-GI-DOT-NEXT: ldr s1, [sp, #32] -; CHECK-GI-DOT-NEXT: ldr s7, [sp, #40] -; CHECK-GI-DOT-NEXT: ldr s2, [sp, #64] -; CHECK-GI-DOT-NEXT: ldr s16, [sp, #72] -; CHECK-GI-DOT-NEXT: ldr s3, [sp, #96] -; CHECK-GI-DOT-NEXT: ldr s17, [sp, #104] -; CHECK-GI-DOT-NEXT: mov v4.s[1], w1 -; CHECK-GI-DOT-NEXT: mov v5.s[1], w5 -; CHECK-GI-DOT-NEXT: mov v0.s[1], v6.s[0] -; CHECK-GI-DOT-NEXT: mov v1.s[1], v7.s[0] -; CHECK-GI-DOT-NEXT: mov 
v2.s[1], v16.s[0] -; CHECK-GI-DOT-NEXT: mov v3.s[1], v17.s[0] -; CHECK-GI-DOT-NEXT: ldr s6, [sp, #16] -; CHECK-GI-DOT-NEXT: ldr s7, [sp, #48] -; CHECK-GI-DOT-NEXT: ldr s16, [sp, #80] -; CHECK-GI-DOT-NEXT: ldr s17, [sp, #112] -; CHECK-GI-DOT-NEXT: mov v4.s[2], w2 -; CHECK-GI-DOT-NEXT: mov v5.s[2], w6 -; CHECK-GI-DOT-NEXT: mov v0.s[2], v6.s[0] -; CHECK-GI-DOT-NEXT: mov v1.s[2], v7.s[0] -; CHECK-GI-DOT-NEXT: mov v2.s[2], v16.s[0] -; CHECK-GI-DOT-NEXT: mov v3.s[2], v17.s[0] -; CHECK-GI-DOT-NEXT: ldr s6, [sp, #24] -; CHECK-GI-DOT-NEXT: ldr s7, [sp, #56] -; CHECK-GI-DOT-NEXT: ldr s16, [sp, #88] -; CHECK-GI-DOT-NEXT: ldr s17, [sp, #120] -; CHECK-GI-DOT-NEXT: mov v4.s[3], w3 -; CHECK-GI-DOT-NEXT: mov v5.s[3], w7 -; CHECK-GI-DOT-NEXT: mov v0.s[3], v6.s[0] -; CHECK-GI-DOT-NEXT: mov v1.s[3], v7.s[0] -; CHECK-GI-DOT-NEXT: mov v2.s[3], v16.s[0] -; CHECK-GI-DOT-NEXT: mov v3.s[3], v17.s[0] -; CHECK-GI-DOT-NEXT: uzp1 v4.8h, v4.8h, v5.8h -; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-GI-DOT-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; CHECK-GI-DOT-NEXT: fmov s0, w0 +; CHECK-GI-DOT-NEXT: fmov s1, w1 +; CHECK-GI-DOT-NEXT: ldr w8, [sp] +; CHECK-GI-DOT-NEXT: ldr w9, [sp, #64] +; CHECK-GI-DOT-NEXT: ldr w10, [sp, #72] +; CHECK-GI-DOT-NEXT: movi v4.8b, #1 +; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8] +; CHECK-GI-DOT-NEXT: mov v0.b[1], v1.b[0] +; CHECK-GI-DOT-NEXT: fmov s1, w2 +; CHECK-GI-DOT-NEXT: fmov s3, w10 +; CHECK-GI-DOT-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-DOT-NEXT: fmov s1, w3 +; CHECK-GI-DOT-NEXT: mov v0.b[3], v1.b[0] +; CHECK-GI-DOT-NEXT: fmov s1, w4 +; CHECK-GI-DOT-NEXT: mov v0.b[4], v1.b[0] +; CHECK-GI-DOT-NEXT: fmov s1, w5 +; CHECK-GI-DOT-NEXT: mov v0.b[5], v1.b[0] +; CHECK-GI-DOT-NEXT: fmov s1, w6 +; CHECK-GI-DOT-NEXT: mov v0.b[6], v1.b[0] +; CHECK-GI-DOT-NEXT: fmov s1, w7 +; CHECK-GI-DOT-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-DOT-NEXT: fmov s1, w9 +; CHECK-GI-DOT-NEXT: ldr w9, [sp, #80] +; 
CHECK-GI-DOT-NEXT: mov v1.b[1], v3.b[0] +; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: ldr w9, [sp, #88] +; CHECK-GI-DOT-NEXT: mov v0.b[8], v2.b[0] +; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #16] +; CHECK-GI-DOT-NEXT: mov v1.b[2], v3.b[0] +; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: ldr w9, [sp, #96] +; CHECK-GI-DOT-NEXT: mov v0.b[9], v2.b[0] +; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24] +; CHECK-GI-DOT-NEXT: mov v1.b[3], v3.b[0] +; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: ldr w9, [sp, #104] +; CHECK-GI-DOT-NEXT: mov v0.b[10], v2.b[0] +; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32] +; CHECK-GI-DOT-NEXT: mov v1.b[4], v3.b[0] +; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: ldr w9, [sp, #112] +; CHECK-GI-DOT-NEXT: mov v0.b[11], v2.b[0] +; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40] +; CHECK-GI-DOT-NEXT: mov v1.b[5], v3.b[0] +; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: ldr w9, [sp, #120] +; CHECK-GI-DOT-NEXT: mov v0.b[12], v2.b[0] +; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #48] +; CHECK-GI-DOT-NEXT: fmov s5, w9 +; CHECK-GI-DOT-NEXT: mov v1.b[6], v3.b[0] +; CHECK-GI-DOT-NEXT: fmov s3, w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #56] +; CHECK-GI-DOT-NEXT: mov v0.b[13], v2.b[0] ; CHECK-GI-DOT-NEXT: movi v2.8b, #1 -; CHECK-GI-DOT-NEXT: movi v3.8b, #1 -; CHECK-GI-DOT-NEXT: uzp1 v0.16b, v4.16b, v0.16b -; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: xtn v1.8b, v1.8h -; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0] -; CHECK-GI-DOT-NEXT: udot v5.4s, v0.16b, v3.16b -; CHECK-GI-DOT-NEXT: udot v4.4s, v1.16b, v2.16b -; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s +; CHECK-GI-DOT-NEXT: mov v1.b[7], v5.b[0] +; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: mov v0.b[14], v3.b[0] +; CHECK-GI-DOT-NEXT: fmov s3, w8 +; CHECK-GI-DOT-NEXT: mov v4.d[1], v2.d[0] +; 
CHECK-GI-DOT-NEXT: fmov d1, d1 +; CHECK-GI-DOT-NEXT: mov v0.b[15], v3.b[0] +; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: udot v5.4s, v0.16b, v4.16b +; CHECK-GI-DOT-NEXT: udot v3.4s, v1.16b, v2.16b +; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v3.4s ; CHECK-GI-DOT-NEXT: addv s0, v0.4s ; CHECK-GI-DOT-NEXT: fmov w0, s0 ; CHECK-GI-DOT-NEXT: ret @@ -4398,106 +4483,149 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) { ; ; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_sext: ; CHECK-GI-BASE: // %bb.0: // %entry -; CHECK-GI-BASE-NEXT: fmov s4, w0 -; CHECK-GI-BASE-NEXT: fmov s5, w4 -; CHECK-GI-BASE-NEXT: ldr s0, [sp] -; CHECK-GI-BASE-NEXT: ldr s6, [sp, #8] -; CHECK-GI-BASE-NEXT: ldr s1, [sp, #32] -; CHECK-GI-BASE-NEXT: ldr s7, [sp, #40] -; CHECK-GI-BASE-NEXT: ldr s2, [sp, #64] -; CHECK-GI-BASE-NEXT: ldr s16, [sp, #72] -; CHECK-GI-BASE-NEXT: ldr s3, [sp, #96] -; CHECK-GI-BASE-NEXT: ldr s17, [sp, #104] -; CHECK-GI-BASE-NEXT: mov v4.s[1], w1 -; CHECK-GI-BASE-NEXT: mov v5.s[1], w5 -; CHECK-GI-BASE-NEXT: mov v0.s[1], v6.s[0] -; CHECK-GI-BASE-NEXT: mov v1.s[1], v7.s[0] -; CHECK-GI-BASE-NEXT: mov v2.s[1], v16.s[0] -; CHECK-GI-BASE-NEXT: mov v3.s[1], v17.s[0] -; CHECK-GI-BASE-NEXT: ldr s6, [sp, #16] -; CHECK-GI-BASE-NEXT: ldr s7, [sp, #48] -; CHECK-GI-BASE-NEXT: ldr s16, [sp, #80] -; CHECK-GI-BASE-NEXT: ldr s17, [sp, #112] -; CHECK-GI-BASE-NEXT: mov v4.s[2], w2 -; CHECK-GI-BASE-NEXT: mov v5.s[2], w6 -; CHECK-GI-BASE-NEXT: mov v0.s[2], v6.s[0] -; CHECK-GI-BASE-NEXT: mov v1.s[2], v7.s[0] -; CHECK-GI-BASE-NEXT: mov v2.s[2], v16.s[0] -; CHECK-GI-BASE-NEXT: mov v3.s[2], v17.s[0] -; CHECK-GI-BASE-NEXT: ldr s6, [sp, #24] -; CHECK-GI-BASE-NEXT: ldr s7, [sp, #56] -; CHECK-GI-BASE-NEXT: ldr s16, [sp, #88] -; CHECK-GI-BASE-NEXT: ldr s17, [sp, #120] -; CHECK-GI-BASE-NEXT: mov v4.s[3], w3 -; CHECK-GI-BASE-NEXT: mov v5.s[3], w7 -; CHECK-GI-BASE-NEXT: mov v0.s[3], v6.s[0] -; CHECK-GI-BASE-NEXT: mov v1.s[3], v7.s[0] -; CHECK-GI-BASE-NEXT: mov v2.s[3], v16.s[0] -; 
CHECK-GI-BASE-NEXT: mov v3.s[3], v17.s[0] -; CHECK-GI-BASE-NEXT: uzp1 v4.8h, v4.8h, v5.8h -; CHECK-GI-BASE-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-GI-BASE-NEXT: uzp1 v1.8h, v2.8h, v3.8h -; CHECK-GI-BASE-NEXT: uzp1 v0.16b, v4.16b, v0.16b -; CHECK-GI-BASE-NEXT: xtn v1.8b, v1.8h -; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b +; CHECK-GI-BASE-NEXT: fmov s0, w0 +; CHECK-GI-BASE-NEXT: fmov s1, w1 +; CHECK-GI-BASE-NEXT: ldr w8, [sp] +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #64] +; CHECK-GI-BASE-NEXT: ldr w10, [sp, #72] +; CHECK-GI-BASE-NEXT: mov v0.b[1], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w2 +; CHECK-GI-BASE-NEXT: fmov s2, w10 +; CHECK-GI-BASE-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w3 +; CHECK-GI-BASE-NEXT: mov v0.b[3], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w4 +; CHECK-GI-BASE-NEXT: mov v0.b[4], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w5 +; CHECK-GI-BASE-NEXT: mov v0.b[5], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w6 +; CHECK-GI-BASE-NEXT: mov v0.b[6], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w7 +; CHECK-GI-BASE-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #8] +; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #16] +; CHECK-GI-BASE-NEXT: mov v0.b[8], v1.b[0] +; CHECK-GI-BASE-NEXT: fmov s1, w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #80] +; CHECK-GI-BASE-NEXT: mov v1.b[1], v2.b[0] +; CHECK-GI-BASE-NEXT: fmov s2, w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88] +; CHECK-GI-BASE-NEXT: mov v0.b[9], v3.b[0] +; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #24] +; CHECK-GI-BASE-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-BASE-NEXT: fmov s2, w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96] +; CHECK-GI-BASE-NEXT: mov v0.b[10], v3.b[0] +; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #32] +; CHECK-GI-BASE-NEXT: mov v1.b[3], v2.b[0] +; CHECK-GI-BASE-NEXT: fmov s2, w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104] +; CHECK-GI-BASE-NEXT: mov v0.b[11], v3.b[0] +; 
CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #40] +; CHECK-GI-BASE-NEXT: mov v1.b[4], v2.b[0] +; CHECK-GI-BASE-NEXT: fmov s2, w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112] +; CHECK-GI-BASE-NEXT: mov v0.b[12], v3.b[0] +; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #48] +; CHECK-GI-BASE-NEXT: mov v1.b[5], v2.b[0] +; CHECK-GI-BASE-NEXT: fmov s2, w9 +; CHECK-GI-BASE-NEXT: ldr w9, [sp, #120] +; CHECK-GI-BASE-NEXT: mov v0.b[13], v3.b[0] +; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: ldr w8, [sp, #56] +; CHECK-GI-BASE-NEXT: mov v1.b[6], v2.b[0] +; CHECK-GI-BASE-NEXT: fmov s2, w9 +; CHECK-GI-BASE-NEXT: mov v0.b[14], v3.b[0] +; CHECK-GI-BASE-NEXT: fmov s3, w8 +; CHECK-GI-BASE-NEXT: mov v1.b[7], v2.b[0] +; CHECK-GI-BASE-NEXT: mov v0.b[15], v3.b[0] ; CHECK-GI-BASE-NEXT: saddlv h1, v1.8b -; CHECK-GI-BASE-NEXT: fmov w8, s0 +; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b ; CHECK-GI-BASE-NEXT: fmov w9, s1 +; CHECK-GI-BASE-NEXT: fmov w8, s0 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: sxth w0, w8 ; CHECK-GI-BASE-NEXT: ret ; ; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_sext: ; CHECK-GI-DOT: // %bb.0: // %entry -; CHECK-GI-DOT-NEXT: fmov s4, w0 -; CHECK-GI-DOT-NEXT: fmov s5, w4 -; CHECK-GI-DOT-NEXT: ldr s0, [sp] -; CHECK-GI-DOT-NEXT: ldr s6, [sp, #8] -; CHECK-GI-DOT-NEXT: ldr s1, [sp, #32] -; CHECK-GI-DOT-NEXT: ldr s7, [sp, #40] -; CHECK-GI-DOT-NEXT: ldr s2, [sp, #64] -; CHECK-GI-DOT-NEXT: ldr s16, [sp, #72] -; CHECK-GI-DOT-NEXT: ldr s3, [sp, #96] -; CHECK-GI-DOT-NEXT: ldr s17, [sp, #104] -; CHECK-GI-DOT-NEXT: mov v4.s[1], w1 -; CHECK-GI-DOT-NEXT: mov v5.s[1], w5 -; CHECK-GI-DOT-NEXT: mov v0.s[1], v6.s[0] -; CHECK-GI-DOT-NEXT: mov v1.s[1], v7.s[0] -; CHECK-GI-DOT-NEXT: mov v2.s[1], v16.s[0] -; CHECK-GI-DOT-NEXT: mov v3.s[1], v17.s[0] -; CHECK-GI-DOT-NEXT: ldr s6, [sp, #16] -; CHECK-GI-DOT-NEXT: ldr s7, [sp, #48] -; CHECK-GI-DOT-NEXT: ldr s16, [sp, #80] -; CHECK-GI-DOT-NEXT: ldr s17, [sp, #112] -; CHECK-GI-DOT-NEXT: mov 
v4.s[2], w2 -; CHECK-GI-DOT-NEXT: mov v5.s[2], w6 -; CHECK-GI-DOT-NEXT: mov v0.s[2], v6.s[0] -; CHECK-GI-DOT-NEXT: mov v1.s[2], v7.s[0] -; CHECK-GI-DOT-NEXT: mov v2.s[2], v16.s[0] -; CHECK-GI-DOT-NEXT: mov v3.s[2], v17.s[0] -; CHECK-GI-DOT-NEXT: ldr s6, [sp, #24] -; CHECK-GI-DOT-NEXT: ldr s7, [sp, #56] -; CHECK-GI-DOT-NEXT: ldr s16, [sp, #88] -; CHECK-GI-DOT-NEXT: ldr s17, [sp, #120] -; CHECK-GI-DOT-NEXT: mov v4.s[3], w3 -; CHECK-GI-DOT-NEXT: mov v5.s[3], w7 -; CHECK-GI-DOT-NEXT: mov v0.s[3], v6.s[0] -; CHECK-GI-DOT-NEXT: mov v1.s[3], v7.s[0] -; CHECK-GI-DOT-NEXT: mov v2.s[3], v16.s[0] -; CHECK-GI-DOT-NEXT: mov v3.s[3], v17.s[0] -; CHECK-GI-DOT-NEXT: uzp1 v4.8h, v4.8h, v5.8h -; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-GI-DOT-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; CHECK-GI-DOT-NEXT: fmov s0, w0 +; CHECK-GI-DOT-NEXT: fmov s1, w1 +; CHECK-GI-DOT-NEXT: ldr w8, [sp] +; CHECK-GI-DOT-NEXT: ldr w9, [sp, #64] +; CHECK-GI-DOT-NEXT: ldr w10, [sp, #72] +; CHECK-GI-DOT-NEXT: movi v4.8b, #1 +; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8] +; CHECK-GI-DOT-NEXT: mov v0.b[1], v1.b[0] +; CHECK-GI-DOT-NEXT: fmov s1, w2 +; CHECK-GI-DOT-NEXT: fmov s3, w10 +; CHECK-GI-DOT-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-DOT-NEXT: fmov s1, w3 +; CHECK-GI-DOT-NEXT: mov v0.b[3], v1.b[0] +; CHECK-GI-DOT-NEXT: fmov s1, w4 +; CHECK-GI-DOT-NEXT: mov v0.b[4], v1.b[0] +; CHECK-GI-DOT-NEXT: fmov s1, w5 +; CHECK-GI-DOT-NEXT: mov v0.b[5], v1.b[0] +; CHECK-GI-DOT-NEXT: fmov s1, w6 +; CHECK-GI-DOT-NEXT: mov v0.b[6], v1.b[0] +; CHECK-GI-DOT-NEXT: fmov s1, w7 +; CHECK-GI-DOT-NEXT: mov v0.b[7], v1.b[0] +; CHECK-GI-DOT-NEXT: fmov s1, w9 +; CHECK-GI-DOT-NEXT: ldr w9, [sp, #80] +; CHECK-GI-DOT-NEXT: mov v1.b[1], v3.b[0] +; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: ldr w9, [sp, #88] +; CHECK-GI-DOT-NEXT: mov v0.b[8], v2.b[0] +; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #16] +; CHECK-GI-DOT-NEXT: mov 
v1.b[2], v3.b[0] +; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: ldr w9, [sp, #96] +; CHECK-GI-DOT-NEXT: mov v0.b[9], v2.b[0] +; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24] +; CHECK-GI-DOT-NEXT: mov v1.b[3], v3.b[0] +; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: ldr w9, [sp, #104] +; CHECK-GI-DOT-NEXT: mov v0.b[10], v2.b[0] +; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32] +; CHECK-GI-DOT-NEXT: mov v1.b[4], v3.b[0] +; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: ldr w9, [sp, #112] +; CHECK-GI-DOT-NEXT: mov v0.b[11], v2.b[0] +; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40] +; CHECK-GI-DOT-NEXT: mov v1.b[5], v3.b[0] +; CHECK-GI-DOT-NEXT: fmov s3, w9 +; CHECK-GI-DOT-NEXT: ldr w9, [sp, #120] +; CHECK-GI-DOT-NEXT: mov v0.b[12], v2.b[0] +; CHECK-GI-DOT-NEXT: fmov s2, w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #48] +; CHECK-GI-DOT-NEXT: fmov s5, w9 +; CHECK-GI-DOT-NEXT: mov v1.b[6], v3.b[0] +; CHECK-GI-DOT-NEXT: fmov s3, w8 +; CHECK-GI-DOT-NEXT: ldr w8, [sp, #56] +; CHECK-GI-DOT-NEXT: mov v0.b[13], v2.b[0] ; CHECK-GI-DOT-NEXT: movi v2.8b, #1 -; CHECK-GI-DOT-NEXT: movi v3.8b, #1 -; CHECK-GI-DOT-NEXT: uzp1 v0.16b, v4.16b, v0.16b -; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000 -; CHECK-GI-DOT-NEXT: xtn v1.8b, v1.8h -; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0] -; CHECK-GI-DOT-NEXT: sdot v5.4s, v0.16b, v3.16b -; CHECK-GI-DOT-NEXT: sdot v4.4s, v1.16b, v2.16b -; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s +; CHECK-GI-DOT-NEXT: mov v1.b[7], v5.b[0] +; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: mov v0.b[14], v3.b[0] +; CHECK-GI-DOT-NEXT: fmov s3, w8 +; CHECK-GI-DOT-NEXT: mov v4.d[1], v2.d[0] +; CHECK-GI-DOT-NEXT: fmov d1, d1 +; CHECK-GI-DOT-NEXT: mov v0.b[15], v3.b[0] +; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000 +; CHECK-GI-DOT-NEXT: sdot v5.4s, v0.16b, v4.16b +; CHECK-GI-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b +; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, 
v3.4s ; CHECK-GI-DOT-NEXT: addv s0, v0.4s ; CHECK-GI-DOT-NEXT: fmov w0, s0 ; CHECK-GI-DOT-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll index 3c86f4bf9eb213..e536ba240453e2 100644 --- a/llvm/test/CodeGen/AArch64/xtn.ll +++ b/llvm/test/CodeGen/AArch64/xtn.ll @@ -127,19 +127,12 @@ entry: } define <2 x i8> @xtn_v2i128_v2i8(<2 x i128> %a) { -; CHECK-SD-LABEL: xtn_v2i128_v2i8: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fmov s0, w0 -; CHECK-SD-NEXT: mov v0.s[1], w2 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: xtn_v2i128_v2i8: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov d0, x0 -; CHECK-GI-NEXT: mov v0.d[1], x2 -; CHECK-GI-NEXT: xtn v0.2s, v0.2d -; CHECK-GI-NEXT: ret +; CHECK-LABEL: xtn_v2i128_v2i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: mov v0.s[1], w2 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret entry: %arg1 = trunc <2 x i128> %a to <2 x i8> ret <2 x i8> %arg1 @@ -174,9 +167,11 @@ define <2 x i16> @xtn_v2i128_v2i16(<2 x i128> %a) { ; ; CHECK-GI-LABEL: xtn_v2i128_v2i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov d0, x0 -; CHECK-GI-NEXT: mov v0.d[1], x2 -; CHECK-GI-NEXT: xtn v0.2s, v0.2d +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w2 +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %arg1 = trunc <2 x i128> %a to <2 x i16> @@ -194,19 +189,12 @@ entry: } define <2 x i32> @xtn_v2i128_v2i32(<2 x i128> %a) { -; CHECK-SD-LABEL: xtn_v2i128_v2i32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fmov s0, w0 -; CHECK-SD-NEXT: mov v0.s[1], w2 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: xtn_v2i128_v2i32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov d0, x0 -; CHECK-GI-NEXT: mov v0.d[1], x2 -; 
CHECK-GI-NEXT: xtn v0.2s, v0.2d -; CHECK-GI-NEXT: ret +; CHECK-LABEL: xtn_v2i128_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: mov v0.s[1], w2 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret entry: %arg1 = trunc <2 x i128> %a to <2 x i32> ret <2 x i32> %arg1 diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll index 716d2398996be2..bb968c8eb00fcb 100644 --- a/llvm/test/CodeGen/AArch64/zext.ll +++ b/llvm/test/CodeGen/AArch64/zext.ll @@ -242,16 +242,15 @@ define <3 x i16> @zext_v3i8_v3i16(<3 x i8> %a) { ; ; CHECK-GI-LABEL: zext_v3i8_v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: mov w8, #255 // =0xff +; CHECK-GI-NEXT: and w8, w0, #0xff +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: and w8, w2, #0xff +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.s[1], w1 -; CHECK-GI-NEXT: mov v2.16b, v1.16b -; CHECK-GI-NEXT: mov v0.s[2], w2 -; CHECK-GI-NEXT: mov v2.h[1], v1.h[0] -; CHECK-GI-NEXT: xtn v0.4h, v0.4s -; CHECK-GI-NEXT: mov v2.h[2], v1.h[0] -; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %c = zext <3 x i8> %a to <3 x i16> @@ -271,14 +270,12 @@ define <3 x i32> @zext_v3i8_v3i32(<3 x i8> %a) { ; ; CHECK-GI-LABEL: zext_v3i8_v3i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #255 // =0xff -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.s[1], w1 -; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: mov v0.s[2], w2 -; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: and w8, w0, #0xff +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: and w8, w2, #0xff +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: 
mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: %c = zext <3 x i8> %a to <3 x i32> @@ -305,16 +302,15 @@ define <3 x i64> @zext_v3i8_v3i64(<3 x i8> %a) { ; ; CHECK-GI-LABEL: zext_v3i8_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: movi v1.2d, #0x000000000000ff +; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-GI-NEXT: and x8, x2, #0xff -; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: mov v0.s[1], w1 -; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: mov d1, v0.d[1] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: and x8, x0, #0xff +; CHECK-GI-NEXT: and x9, x1, #0xff +; CHECK-GI-NEXT: and x10, x2, #0xff +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: fmov d2, x10 ; CHECK-GI-NEXT: ret entry: %c = zext <3 x i8> %a to <3 x i64> @@ -407,16 +403,15 @@ define <3 x i16> @zext_v3i10_v3i16(<3 x i10> %a) { ; ; CHECK-GI-LABEL: zext_v3i10_v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: mov w8, #1023 // =0x3ff +; CHECK-GI-NEXT: and w8, w0, #0x3ff +; CHECK-GI-NEXT: and w9, w1, #0x3ff +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: and w8, w2, #0x3ff +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.s[1], w1 -; CHECK-GI-NEXT: mov v2.16b, v1.16b -; CHECK-GI-NEXT: mov v0.s[2], w2 -; CHECK-GI-NEXT: mov v2.h[1], v1.h[0] -; CHECK-GI-NEXT: xtn v0.4h, v0.4s -; CHECK-GI-NEXT: mov v2.h[2], v1.h[0] -; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret entry: %c = zext <3 x i10> %a to <3 x i16> @@ -436,14 +431,12 @@ define <3 x i32> @zext_v3i10_v3i32(<3 x i10> %a) { ; ; CHECK-GI-LABEL: zext_v3i10_v3i32: ; CHECK-GI: // 
%bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #1023 // =0x3ff -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: mov v0.s[1], w1 -; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: mov v0.s[2], w2 -; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: and w8, w0, #0x3ff +; CHECK-GI-NEXT: and w9, w1, #0x3ff +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: and w8, w2, #0x3ff +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: ret entry: %c = zext <3 x i10> %a to <3 x i32> @@ -469,17 +462,15 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) { ; ; CHECK-GI-LABEL: zext_v3i10_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: adrp x8, .LCPI27_0 +; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI27_0] -; CHECK-GI-NEXT: and x8, x2, #0x3ff -; CHECK-GI-NEXT: fmov d2, x8 -; CHECK-GI-NEXT: mov v0.s[1], w1 -; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: mov d1, v0.d[1] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: and x8, x0, #0x3ff +; CHECK-GI-NEXT: and x9, x1, #0x3ff +; CHECK-GI-NEXT: and x10, x2, #0x3ff +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: fmov d2, x10 ; CHECK-GI-NEXT: ret entry: %c = zext <3 x i10> %a to <3 x i64> @@ -1098,33 +1089,51 @@ define <16 x i32> @zext_v16i10_v16i32(<16 x i10> %a) { ; ; CHECK-GI-LABEL: zext_v16i10_v16i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s4, w0 -; CHECK-GI-NEXT: fmov s5, w4 -; CHECK-GI-NEXT: ldr s2, [sp] -; CHECK-GI-NEXT: ldr s0, [sp, #8] -; CHECK-GI-NEXT: ldr s3, [sp, #32] -; CHECK-GI-NEXT: ldr s1, [sp, #40] -; CHECK-GI-NEXT: movi v6.4s, #3, msl #8 -; CHECK-GI-NEXT: mov v4.s[1], w1 -; CHECK-GI-NEXT: mov v5.s[1], w5 -; 
CHECK-GI-NEXT: mov v2.s[1], v0.s[0] -; CHECK-GI-NEXT: mov v3.s[1], v1.s[0] -; CHECK-GI-NEXT: ldr s0, [sp, #16] -; CHECK-GI-NEXT: ldr s1, [sp, #48] -; CHECK-GI-NEXT: mov v4.s[2], w2 -; CHECK-GI-NEXT: mov v5.s[2], w6 -; CHECK-GI-NEXT: mov v2.s[2], v0.s[0] -; CHECK-GI-NEXT: mov v3.s[2], v1.s[0] -; CHECK-GI-NEXT: ldr s0, [sp, #24] -; CHECK-GI-NEXT: ldr s1, [sp, #56] -; CHECK-GI-NEXT: mov v4.s[3], w3 -; CHECK-GI-NEXT: mov v5.s[3], w7 -; CHECK-GI-NEXT: mov v2.s[3], v0.s[0] -; CHECK-GI-NEXT: mov v3.s[3], v1.s[0] -; CHECK-GI-NEXT: and v0.16b, v4.16b, v6.16b -; CHECK-GI-NEXT: and v1.16b, v5.16b, v6.16b -; CHECK-GI-NEXT: and v2.16b, v2.16b, v6.16b -; CHECK-GI-NEXT: and v3.16b, v3.16b, v6.16b +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w1 +; CHECK-GI-NEXT: ldr w8, [sp] +; CHECK-GI-NEXT: fmov s3, w5 +; CHECK-GI-NEXT: ldr w9, [sp, #8] +; CHECK-GI-NEXT: ldr w10, [sp, #32] +; CHECK-GI-NEXT: ldr w11, [sp, #40] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #16] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s1, w4 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: fmov s5, w10 +; CHECK-GI-NEXT: fmov s6, w11 +; CHECK-GI-NEXT: ldr w9, [sp, #48] +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w2 +; CHECK-GI-NEXT: mov v2.h[1], v4.h[0] +; CHECK-GI-NEXT: mov v5.h[1], v6.h[0] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: fmov s6, w9 +; CHECK-GI-NEXT: ldr w8, [sp, #24] +; CHECK-GI-NEXT: ldr w9, [sp, #56] +; CHECK-GI-NEXT: mov v0.h[2], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w6 +; CHECK-GI-NEXT: mov v2.h[2], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v5.h[2], v6.h[0] +; CHECK-GI-NEXT: fmov s6, w9 +; CHECK-GI-NEXT: mov v1.h[2], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w3 +; CHECK-GI-NEXT: mov v2.h[3], v4.h[0] +; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w7 +; CHECK-GI-NEXT: mov v5.h[3], v6.h[0] +; CHECK-GI-NEXT: mov v1.h[3], v3.h[0] +; CHECK-GI-NEXT: movi v3.4s, #3, msl #8 +; CHECK-GI-NEXT: ushll v2.4s, 
v2.4h, #0 +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v4.4s, v5.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: and v3.16b, v4.16b, v3.16b ; CHECK-GI-NEXT: ret entry: %c = zext <16 x i10> %a to <16 x i32> @@ -1176,44 +1185,64 @@ define <16 x i64> @zext_v16i10_v16i64(<16 x i10> %a) { ; ; CHECK-GI-LABEL: zext_v16i10_v16i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s16, w0 -; CHECK-GI-NEXT: fmov s17, w2 -; CHECK-GI-NEXT: ldr s0, [sp] -; CHECK-GI-NEXT: fmov s18, w4 -; CHECK-GI-NEXT: fmov s19, w6 -; CHECK-GI-NEXT: ldr s1, [sp, #8] -; CHECK-GI-NEXT: ldr s2, [sp, #16] -; CHECK-GI-NEXT: ldr s3, [sp, #24] -; CHECK-GI-NEXT: ldr s4, [sp, #32] -; CHECK-GI-NEXT: ldr s5, [sp, #40] -; CHECK-GI-NEXT: ldr s6, [sp, #48] -; CHECK-GI-NEXT: ldr s7, [sp, #56] -; CHECK-GI-NEXT: mov v16.s[1], w1 -; CHECK-GI-NEXT: mov v17.s[1], w3 -; CHECK-GI-NEXT: mov v18.s[1], w5 -; CHECK-GI-NEXT: mov v19.s[1], w7 -; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] -; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] -; CHECK-GI-NEXT: mov v4.s[1], v5.s[0] -; CHECK-GI-NEXT: mov v6.s[1], v7.s[0] +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w1 +; CHECK-GI-NEXT: ldr w8, [sp] +; CHECK-GI-NEXT: fmov s2, w5 +; CHECK-GI-NEXT: ldr w9, [sp, #8] +; CHECK-GI-NEXT: ldr w10, [sp, #32] +; CHECK-GI-NEXT: ldr w11, [sp, #40] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s1, w4 +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: fmov s4, w11 +; CHECK-GI-NEXT: ldr w9, [sp, #48] +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #16] +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: ldr w8, [sp, #24] +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w10 +; CHECK-GI-NEXT: mov v3.h[1], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w2 +; CHECK-GI-NEXT: mov v2.h[2], v5.h[0] +; CHECK-GI-NEXT: fmov 
s5, w8 ; CHECK-GI-NEXT: adrp x8, .LCPI54_0 -; CHECK-GI-NEXT: ushll v1.2d, v16.2s, #0 -; CHECK-GI-NEXT: ushll v3.2d, v17.2s, #0 -; CHECK-GI-NEXT: ushll v5.2d, v18.2s, #0 -; CHECK-GI-NEXT: ushll v7.2d, v19.2s, #0 -; CHECK-GI-NEXT: ushll v16.2d, v0.2s, #0 -; CHECK-GI-NEXT: ushll v18.2d, v2.2s, #0 -; CHECK-GI-NEXT: ushll v19.2d, v4.2s, #0 -; CHECK-GI-NEXT: ushll v20.2d, v6.2s, #0 -; CHECK-GI-NEXT: ldr q17, [x8, :lo12:.LCPI54_0] -; CHECK-GI-NEXT: and v0.16b, v1.16b, v17.16b -; CHECK-GI-NEXT: and v1.16b, v3.16b, v17.16b -; CHECK-GI-NEXT: and v2.16b, v5.16b, v17.16b -; CHECK-GI-NEXT: and v3.16b, v7.16b, v17.16b -; CHECK-GI-NEXT: and v4.16b, v16.16b, v17.16b -; CHECK-GI-NEXT: and v5.16b, v18.16b, v17.16b -; CHECK-GI-NEXT: and v6.16b, v19.16b, v17.16b -; CHECK-GI-NEXT: and v7.16b, v20.16b, v17.16b +; CHECK-GI-NEXT: ldr q7, [x8, :lo12:.LCPI54_0] +; CHECK-GI-NEXT: mov v0.h[2], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w6 +; CHECK-GI-NEXT: mov v2.h[3], v5.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ldr w9, [sp, #56] +; CHECK-GI-NEXT: mov v3.h[2], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w3 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w7 +; CHECK-GI-NEXT: ushll v17.2d, v2.2s, #0 +; CHECK-GI-NEXT: ushll2 v18.2d, v2.4s, #0 +; CHECK-GI-NEXT: mov v1.h[3], v4.h[0] +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: mov v3.h[3], v4.h[0] +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0 +; CHECK-GI-NEXT: ushll2 v5.2d, v0.4s, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: ushll v6.2d, v1.2s, #0 +; CHECK-GI-NEXT: ushll2 v16.2d, v1.4s, #0 +; CHECK-GI-NEXT: and v0.16b, v4.16b, v7.16b +; CHECK-GI-NEXT: and v1.16b, v5.16b, v7.16b +; CHECK-GI-NEXT: and v4.16b, v17.16b, v7.16b +; CHECK-GI-NEXT: and v5.16b, v18.16b, v7.16b +; CHECK-GI-NEXT: ushll v19.2d, v3.2s, #0 +; CHECK-GI-NEXT: ushll2 v20.2d, v3.4s, #0 +; 
CHECK-GI-NEXT: and v2.16b, v6.16b, v7.16b +; CHECK-GI-NEXT: and v3.16b, v16.16b, v7.16b +; CHECK-GI-NEXT: and v6.16b, v19.16b, v7.16b +; CHECK-GI-NEXT: and v7.16b, v20.16b, v7.16b ; CHECK-GI-NEXT: ret entry: %c = zext <16 x i10> %a to <16 x i64> From a811f263356af4fcf5b479c7a32d1bab44ac8954 Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Wed, 21 Aug 2024 10:20:39 +0200 Subject: [PATCH 031/426] [llvm][test] Write temporary files into a temporary directory --- llvm/unittests/Analysis/GraphWriterTest.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llvm/unittests/Analysis/GraphWriterTest.cpp b/llvm/unittests/Analysis/GraphWriterTest.cpp index a723c92d157618..4017e63a8c8675 100644 --- a/llvm/unittests/Analysis/GraphWriterTest.cpp +++ b/llvm/unittests/Analysis/GraphWriterTest.cpp @@ -12,6 +12,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/Testing/Support/SupportHelpers.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include "gtest/gtest.h" @@ -56,7 +57,10 @@ class GraphWriterTest : public testing::Test { static void writeCFGToDotFile(Function &F, std::string Name, bool CFGOnly = false) { std::error_code EC; - raw_fd_ostream File(Name + ".dot", EC, sys::fs::OpenFlags::OF_Text); + llvm::unittest::TempDir Tmp("tmpdir", /*Unique=*/true); + SmallString<128> FileName(Tmp.path().begin(), Tmp.path().end()); + sys::path::append(FileName, Name + ".dot"); + raw_fd_ostream File(FileName, EC, sys::fs::OpenFlags::OF_Text); DOTFuncInfo CFGInfo(&F); From 768598bcc3528ff5c4cd2c8a9b74d023614e1a9e Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Wed, 21 Aug 2024 10:12:37 +0200 Subject: [PATCH 032/426] Revert "[LLVM] [X86] Fix integer overflows in frame layout for huge frames (#101840)" This casuses assertion failures targeting 32-bit x86: lib/Target/X86/X86RegisterInfo.cpp:989: virtual bool llvm::X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator, 
int, unsigned int, RegScavenger *) const: Assertion `(Is64Bit || FitsIn32Bits) && "Requesting 64-bit offset in 32-bit immediate!"' failed. See comment on the PR. > Fix 32-bit integer overflows in the X86 target frame layout when dealing > with frames larger than 4gb. When this occurs, we'll scavenge a scratch > register to be able to hold the correct stack offset for frame locals. > > This completes reapplying #84114. > > Fixes #48911 > Fixes #75944 > Fixes #87154 This reverts commit 0abb7791614947bc24931dd851ade31d02496977. --- llvm/lib/CodeGen/PrologEpilogInserter.cpp | 2 +- llvm/lib/Target/X86/X86FrameLowering.cpp | 11 +-- llvm/lib/Target/X86/X86RegisterInfo.cpp | 36 ++-------- llvm/lib/Target/X86/X86RegisterInfo.h | 9 --- llvm/test/CodeGen/X86/avx512f-large-stack.ll | 23 ------ llvm/test/CodeGen/X86/huge-stack.ll | 72 +++---------------- .../CodeGen/X86/win64-stackprobe-overflow.ll | 2 +- 7 files changed, 19 insertions(+), 136 deletions(-) delete mode 100644 llvm/test/CodeGen/X86/avx512f-large-stack.ll diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index c03ea587805213..ee03eaa8ae527c 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -1553,7 +1553,7 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF, // If this instruction has a FrameIndex operand, we need to // use that target machine register info object to eliminate // it. - TRI.eliminateFrameIndex(MI, SPAdj, i, RS); + TRI.eliminateFrameIndex(MI, SPAdj, i); // Reset the iterator if we were at the beginning of the BB. 
if (AtBeginning) { diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index d850f4fd768311..8404f2231680d6 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -24,7 +24,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/EHPersonalities.h" @@ -2617,7 +2616,7 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, // object. // We need to factor in additional offsets applied during the prologue to the // frame, base, and stack pointer depending on which is used. - int64_t Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea(); + int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea(); const X86MachineFunctionInfo *X86FI = MF.getInfo(); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t StackSize = MFI.getStackSize(); @@ -4141,14 +4140,6 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized( // emitPrologue if it gets called and emits CFI. MF.setHasWinCFI(false); - MachineFrameInfo &MFI = MF.getFrameInfo(); - // If the frame is big enough that we might need to scavenge a register to - // handle huge offsets, reserve a stack slot for that now. - if (!isInt<32>(MFI.estimateStackSize(MF))) { - int FI = MFI.CreateStackObject(SlotSize, Align(SlotSize), false); - RS->addScavengingFrameIndex(FI); - } - // If we are using Windows x64 CFI, ensure that the stack is always 8 byte // aligned. The format doesn't support misaligned stack adjustments. 
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 79ee9ecfdf3ce7..638eb1c4f11e41 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "X86RegisterInfo.h" -#include "MCTargetDesc/X86BaseInfo.h" #include "X86FrameLowering.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" @@ -25,7 +24,6 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TileShapeInfo.h" @@ -907,7 +905,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); // Determine base register and offset. - int64_t FIOffset; + int FIOffset; Register BasePtr; if (MI.isReturn()) { assert((!hasStackRealignment(MF) || @@ -958,34 +956,10 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } if (MI.getOperand(FIOperandNum+3).isImm()) { - int64_t Imm = MI.getOperand(FIOperandNum + 3).getImm(); - int64_t Offset = FIOffset + Imm; - bool FitsIn32Bits = isInt<32>(Offset); - // If the offset will not fit in a 32-bit displacement, - // then for 64-bit targets, scavenge a register to hold it. - // Otherwise, for 32-bit targets, this is a bug! 
- if (Is64Bit && !FitsIn32Bits) { - assert(RS && "RegisterScavenger was NULL"); - const X86InstrInfo *TII = MF.getSubtarget().getInstrInfo(); - const DebugLoc &DL = MI.getDebugLoc(); - - RS->enterBasicBlockEnd(MBB); - RS->backward(std::next(II)); - - Register ScratchReg = RS->scavengeRegisterBackwards( - X86::GR64RegClass, II, /*RestoreAfter=*/false, /*SPAdj=*/0, - /*AllowSpill=*/true); - assert(ScratchReg != 0 && "scratch reg was 0"); - RS->setRegUsed(ScratchReg); - - BuildMI(MBB, II, DL, TII->get(X86::MOV64ri), ScratchReg).addImm(Offset); - - MI.getOperand(FIOperandNum + 3).setImm(0); - MI.getOperand(FIOperandNum + 2).setReg(ScratchReg); - - return false; - } - assert((Is64Bit || FitsIn32Bits) && + // Offset is a 32-bit integer. + int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm()); + int Offset = FIOffset + Imm; + assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) && "Requesting 64-bit offset in 32-bit immediate!"); if (Offset != 0 || !tryOptimizeLEAtoMOV(II)) MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset); diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h index dd03a108fb8e69..7296a5f021e4ad 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/llvm/lib/Target/X86/X86RegisterInfo.h @@ -13,8 +13,6 @@ #ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H #define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #define GET_REGINFO_HEADER @@ -178,13 +176,6 @@ class X86RegisterInfo final : public X86GenRegisterInfo { SmallVectorImpl &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override; - - bool requiresRegisterScavenging(const MachineFunction &MF) const override { - const MachineFrameInfo &MFI = MF.getFrameInfo(); - - // We need to register scavenge if the frame is very large. 
- return !isInt<32>(MFI.estimateStackSize(MF)); - } }; } // End llvm namespace diff --git a/llvm/test/CodeGen/X86/avx512f-large-stack.ll b/llvm/test/CodeGen/X86/avx512f-large-stack.ll deleted file mode 100644 index 3cb5391c56abf5..00000000000000 --- a/llvm/test/CodeGen/X86/avx512f-large-stack.ll +++ /dev/null @@ -1,23 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --version 4 -; RUN: llc -O0 -mtriple=x86_64 -mattr=+avx512f -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK -define void @f(i16 %LGV2, i1 %LGV3) { -; CHECK-LABEL: f: -; CHECK: # %bb.0: # %BB -; CHECK-NEXT: subq $2147483528, %rsp # imm = 0x7FFFFF88 -; CHECK-NEXT: .cfi_def_cfa_offset 2147483536 -; CHECK-NEXT: movb %sil, %cl -; CHECK-NEXT: movw %di, %ax -; CHECK-NEXT: movswq %ax, %rax -; CHECK-NEXT: andb $1, %cl -; CHECK-NEXT: movabsq $-2147483768, %rdx # imm = 0xFFFFFFFF7FFFFF88 -; CHECK-NEXT: movb %cl, (%rsp,%rdx) -; CHECK-NEXT: addq $2147483528, %rsp # imm = 0x7FFFFF88 -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq -BB: - %A = alloca i1, i33 2147483648, align 1 - %G = getelementptr i1, ptr %A, i16 %LGV2 - %G4 = getelementptr i1, ptr %G, i32 -2147483648 - store i1 %LGV3, ptr %G4, align 1 - ret void -} diff --git a/llvm/test/CodeGen/X86/huge-stack.ll b/llvm/test/CodeGen/X86/huge-stack.ll index 41b8a0141b63d8..920033ba1182c3 100644 --- a/llvm/test/CodeGen/X86/huge-stack.ll +++ b/llvm/test/CodeGen/X86/huge-stack.ll @@ -5,70 +5,20 @@ define void @foo() unnamed_addr #0 { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: -; CHECK-NEXT: movabsq $8589934472, %rax # imm = 0x1FFFFFF88 +; CHECK-NEXT: movabsq $8589934462, %rax # imm = 0x1FFFFFF7E ; CHECK-NEXT: subq %rax, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 8589934480 -; CHECK-NEXT: movabsq $4294967177, %rax # imm = 0xFFFFFF89 -; CHECK-NEXT: movb $42, (%rsp,%rax) -; CHECK-NEXT: movb $43, -118(%rsp) -; CHECK-NEXT: movabsq $8589934472, %rax # imm = 0x1FFFFFF88 +; CHECK-NEXT: 
.cfi_def_cfa_offset 8589934470 +; CHECK-NEXT: movb $42, -129(%rsp) +; CHECK-NEXT: movb $43, -128(%rsp) +; CHECK-NEXT: movabsq $8589934462, %rax # imm = 0x1FFFFFF7E ; CHECK-NEXT: addq %rax, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq - %large1 = alloca %large, align 1 - %large2 = alloca %large, align 1 - %ptrLarge1 = getelementptr inbounds %large, ptr %large1, i64 0, i64 0 - store i8 42, ptr %ptrLarge1, align 1 - %ptrLarge2 = getelementptr inbounds %large, ptr %large2, i64 0, i64 0 - store i8 43, ptr %ptrLarge2, align 1 + %1 = alloca %large, align 1 + %2 = alloca %large, align 1 + %3 = getelementptr inbounds %large, ptr %1, i64 0, i64 0 + store i8 42, ptr %3, align 1 + %4 = getelementptr inbounds %large, ptr %2, i64 0, i64 0 + store i8 43, ptr %4, align 1 ret void } - -declare ptr @baz(ptr, ptr, ptr, ptr) - -define ptr @scavenge_spill() unnamed_addr #0 { -; CHECK-LABEL: scavenge_spill: -; CHECK: # %bb.0: -; CHECK-NEXT: movabsq $25769803816, %rax # imm = 0x600000028 -; CHECK-NEXT: subq %rax, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 25769803824 -; CHECK-NEXT: movabsq $21474836521, %rax # imm = 0x500000029 -; CHECK-NEXT: leaq (%rsp,%rax), %rdi -; CHECK-NEXT: movabsq $17179869226, %rax # imm = 0x40000002A -; CHECK-NEXT: leaq (%rsp,%rax), %rsi -; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movabsq $12884901931, %rax # imm = 0x30000002B -; CHECK-NEXT: leaq (%rsp,%rax), %rdx -; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movabsq $8589934636, %rax # imm = 0x20000002C -; CHECK-NEXT: leaq (%rsp,%rax), %rcx -; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: callq baz@PLT -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: leaq 46(%rsp), 
%rdi -; CHECK-NEXT: callq baz@PLT -; CHECK-NEXT: # kill: def $rcx killed $rax -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: movabsq $25769803816, %rcx # imm = 0x600000028 -; CHECK-NEXT: addq %rcx, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq - %large1 = alloca %large, align 1 - %ptrLarge1 = getelementptr inbounds %large, ptr %large1, i64 0, i64 0 - %large2 = alloca %large, align 1 - %ptrLarge2 = getelementptr inbounds %large, ptr %large2, i64 0, i64 0 - %large3 = alloca %large, align 1 - %ptrLarge3 = getelementptr inbounds %large, ptr %large3, i64 0, i64 0 - %large4 = alloca %large, align 1 - %ptrLarge4 = getelementptr inbounds %large, ptr %large4, i64 0, i64 0 - %large5 = alloca %large, align 1 - %ptrLarge5 = getelementptr inbounds %large, ptr %large5, i64 0, i64 0 - %ret1 = call ptr @baz(ptr %ptrLarge1, ptr %ptrLarge2, ptr %ptrLarge3, ptr %ptrLarge4) - %large6 = alloca %large, align 1 - %ptrLarge6 = getelementptr inbounds %large, ptr %large6, i64 0, i64 0 - %ret2 = call ptr @baz(ptr %ptrLarge6, ptr %ptrLarge2, ptr %ptrLarge3, ptr %ptrLarge4) - ret ptr %ret1 -} diff --git a/llvm/test/CodeGen/X86/win64-stackprobe-overflow.ll b/llvm/test/CodeGen/X86/win64-stackprobe-overflow.ll index 732fc6543e3141..9555ce032db90c 100644 --- a/llvm/test/CodeGen/X86/win64-stackprobe-overflow.ll +++ b/llvm/test/CodeGen/X86/win64-stackprobe-overflow.ll @@ -10,5 +10,5 @@ start: attributes #0 = { nonlazybind uwtable "probe-stack"="probe_stack" "target-cpu"="x86-64" } ; CHECK-LABEL: foo: -; CHECK: movabsq $4294967312, %rax +; CHECK: movabsq $4294967304, %rax ; CHECK-NEXT: callq probe_stack From bacedb5684c79d35af61c4e30fb5d7fd9c2daf97 Mon Sep 17 00:00:00 2001 From: Shao-Ce SUN Date: Wed, 21 Aug 2024 16:42:16 +0800 Subject: [PATCH 033/426] [RISCV] Remove experimental for Ssqosid ext (#105476) Ratified: https://github.com/riscv/riscv-ssqosid/releases/tag/v1.0 --- .../Driver/print-supported-extensions-riscv.c | 2 +- 
.../test/Preprocessor/riscv-target-features.c | 18 +++++++++--------- llvm/docs/RISCVUsage.rst | 4 +--- llvm/lib/Target/RISCV/RISCVFeatures.td | 6 +++--- llvm/test/CodeGen/RISCV/attributes.ll | 8 ++++---- .../TargetParser/RISCVISAInfoTest.cpp | 2 +- 6 files changed, 19 insertions(+), 21 deletions(-) diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 132422393170a9..9497d01a832604 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -126,6 +126,7 @@ // CHECK-NEXT: sscofpmf 1.0 'Sscofpmf' (Count Overflow and Mode-Based Filtering) // CHECK-NEXT: sscounterenw 1.0 'Sscounterenw' (Support writeable scounteren enable bit for any hpmcounter that is not read-only zero) // CHECK-NEXT: sscsrind 1.0 'Sscsrind' (Indirect CSR Access Supervisor Level) +// CHECK-NEXT: ssqosid 1.0 'Ssqosid' (Quality-of-Service (QoS) Identifiers) // CHECK-NEXT: ssstateen 1.0 'Ssstateen' (Supervisor-mode view of the state-enable extension) // CHECK-NEXT: ssstrict 1.0 'Ssstrict' (No non-conforming extensions are present) // CHECK-NEXT: sstc 1.0 'Sstc' (Supervisor-mode timer interrupts) @@ -178,7 +179,6 @@ // CHECK-NEXT: smnpm 1.0 'Smnpm' (Machine-level Pointer Masking for next lower privilege mode) // CHECK-NEXT: ssnpm 1.0 'Ssnpm' (Supervisor-level Pointer Masking for next lower privilege mode) // CHECK-NEXT: sspm 1.0 'Sspm' (Indicates Supervisor-mode Pointer Masking) -// CHECK-NEXT: ssqosid 1.0 'Ssqosid' (Quality-of-Service (QoS) Identifiers) // CHECK-NEXT: supm 1.0 'Supm' (Indicates User-mode Pointer Masking) // CHECK-EMPTY: // CHECK-NEXT: Supported Profiles diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 517702fab5b919..5bb6c10f85f1a7 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -37,6 +37,7 @@ // CHECK-NOT: 
__riscv_sscofpmf {{.*$}} // CHECK-NOT: __riscv_sscounterenw {{.*$}} // CHECK-NOT: __riscv_sscsrind {{.*$}} +// CHECK-NOT: __riscv_ssqosid{{.*$}} // CHECK-NOT: __riscv_ssstateen {{.*$}} // CHECK-NOT: __riscv_ssstrict {{.*$}} // CHECK-NOT: __riscv_sstc {{.*$}} @@ -179,7 +180,6 @@ // CHECK-NOT: __riscv_smnpm{{.*$}} // CHECK-NOT: __riscv_ssnpm{{.*$}} // CHECK-NOT: __riscv_sspm{{.*$}} -// CHECK-NOT: __riscv_ssqosid{{.*$}} // CHECK-NOT: __riscv_supm{{.*$}} // CHECK-NOT: __riscv_zacas {{.*$}} // CHECK-NOT: __riscv_zalasr {{.*$}} @@ -1415,6 +1415,14 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-SSCSRIND-EXT %s // CHECK-SSCSRIND-EXT: __riscv_sscsrind 1000000{{$}} +// RUN: %clang --target=riscv32 \ +// RUN: -march=rv32i_ssqosid1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SSQOSID-EXT %s +// RUN: %clang --target=riscv64 \ +// RUN: -march=rv64i_ssqosid1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SSQOSID-EXT %s +// CHECK-SSQOSID-EXT: __riscv_ssqosid 1000000{{$}} + // RUN: %clang --target=riscv32 \ // RUN: -march=rv32ismcdeleg1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SMCDELEG-EXT %s @@ -1740,14 +1748,6 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-SUPM-EXT %s // CHECK-SUPM-EXT: __riscv_supm 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32i_ssqosid1p0 -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-SSQOSID-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64i_ssqosid1p0 -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-SSQOSID-EXT %s -// CHECK-SSQOSID-EXT: __riscv_ssqosid 1000000{{$}} - // Misaligned // RUN: %clang --target=riscv32-unknown-linux-gnu -march=rv32i -E -dM %s \ diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 3af1428138c021..4e50f55e4cb60b 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -136,6 +136,7 @@ on support follow. 
``Sscofpmf`` Assembly Support ``Sscounterenw`` Assembly Support (`See note <#riscv-profiles-extensions-note>`__) ``Sscsrind`` Supported + ``Ssqosid`` Assembly Support ``Ssstateen`` Assembly Support (`See note <#riscv-profiles-extensions-note>`__) ``Ssstrict`` Assembly Support (`See note <#riscv-profiles-extensions-note>`__) ``Sstc`` Assembly Support @@ -290,9 +291,6 @@ The primary goal of experimental support is to assist in the process of ratifica ``experimental-ssnpm``, ``experimental-smnpm``, ``experimental-smmpm``, ``experimental-sspm``, ``experimental-supm`` LLVM implements the `v1.0.0-rc2 specification `__. -``experimental-ssqosid`` - LLVM implements assembler support for the `v1.0-rc1 draft specification `_. - ``experimental-zacas`` LLVM implements the `1.0 release specification `__. amocas.w will be used for i32 cmpxchg. amocas.d will be used i64 cmpxchg on RV64. The compiler will not generate amocas.d on RV32 or amocas.q on RV64 due to ABI compatibilty. These can only be used in the assembler. The extension will be left as experimental until `an ABI issue `__ is resolved. 
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index a439e75a3d5af4..d448f9301f3ae8 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -968,9 +968,9 @@ def FeatureStdExtSstc : RISCVExtension<"sstc", 1, 0, "'Sstc' (Supervisor-mode timer interrupts)">; -def FeaturesStdExtSsqosid - : RISCVExperimentalExtension<"ssqosid", 1, 0, - "'Ssqosid' (Quality-of-Service (QoS) Identifiers)">; +def FeatureStdExtSsqosid + : RISCVExtension<"ssqosid", 1, 0, + "'Ssqosid' (Quality-of-Service (QoS) Identifiers)">; def FeatureStdExtShtvala : RISCVExtension<"shtvala", 1, 0, diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 3aee484beeaa35..2a02327cd3c7b0 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -115,6 +115,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+ssaia %s -o - | FileCheck --check-prefixes=CHECK,RV32SSAIA %s ; RUN: llc -mtriple=riscv32 -mattr=+smcsrind %s -o - | FileCheck --check-prefixes=CHECK,RV32SMCSRIND %s ; RUN: llc -mtriple=riscv32 -mattr=+sscsrind %s -o - | FileCheck --check-prefixes=CHECK,RV32SSCSRIND %s +; RUN: llc -mtriple=riscv32 -mattr=+ssqosid %s -o - | FileCheck --check-prefix=RV32SSQOSID %s ; RUN: llc -mtriple=riscv32 -mattr=+smcdeleg %s -o - | FileCheck --check-prefixes=CHECK,RV32SMCDELEG %s ; RUN: llc -mtriple=riscv32 -mattr=+smepmp %s -o - | FileCheck --check-prefixes=CHECK,RV32SMEPMP %s ; RUN: llc -mtriple=riscv32 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZFBFMIN %s @@ -132,7 +133,6 @@ ; RUN: llc -mtriple=riscv32 -mattr=+experimental-smmpm %s -o - | FileCheck --check-prefix=RV32SMMPM %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-sspm %s -o - | FileCheck --check-prefix=RV32SSPM %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-supm %s -o - | FileCheck --check-prefix=RV32SUPM %s -; RUN: llc -mtriple=riscv32 -mattr=+experimental-ssqosid %s -o - | 
FileCheck --check-prefix=RV32SSQOSID %s ; RUN: llc -mtriple=riscv64 %s -o - | FileCheck %s ; RUN: llc -mtriple=riscv64 -mattr=+m %s -o - | FileCheck --check-prefixes=CHECK,RV64M %s @@ -256,6 +256,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+ssaia %s -o - | FileCheck --check-prefixes=CHECK,RV64SSAIA %s ; RUN: llc -mtriple=riscv64 -mattr=+smcsrind %s -o - | FileCheck --check-prefixes=CHECK,RV64SMCSRIND %s ; RUN: llc -mtriple=riscv64 -mattr=+sscsrind %s -o - | FileCheck --check-prefixes=CHECK,RV64SSCSRIND %s +; RUN: llc -mtriple=riscv64 -mattr=+ssqosid %s -o - | FileCheck --check-prefix=RV64SSQOSID %s ; RUN: llc -mtriple=riscv64 -mattr=+smcdeleg %s -o - | FileCheck --check-prefixes=CHECK,RV64SMCDELEG %s ; RUN: llc -mtriple=riscv64 -mattr=+smepmp %s -o - | FileCheck --check-prefixes=CHECK,RV64SMEPMP %s ; RUN: llc -mtriple=riscv64 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZFBFMIN %s @@ -272,7 +273,6 @@ ; RUN: llc -mtriple=riscv64 -mattr=+experimental-smmpm %s -o - | FileCheck --check-prefix=RV64SMMPM %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-sspm %s -o - | FileCheck --check-prefix=RV64SSPM %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-supm %s -o - | FileCheck --check-prefix=RV64SUPM %s -; RUN: llc -mtriple=riscv64 -mattr=+experimental-ssqosid %s -o - | FileCheck --check-prefix=RV64SSQOSID %s ; Tests for profile features. 
; RUN: llc -mtriple=riscv32 -mattr=+rvi20u32 %s -o - | FileCheck --check-prefix=RVI20U32 %s @@ -403,6 +403,7 @@ ; RV32SSAIA: .attribute 5, "rv32i2p1_ssaia1p0" ; RV32SMCSRIND: .attribute 5, "rv32i2p1_smcsrind1p0" ; RV32SSCSRIND: .attribute 5, "rv32i2p1_sscsrind1p0" +; RV32SSQOSID: .attribute 5, "rv32i2p1_ssqosid1p0" ; RV32SMCDELEG: .attribute 5, "rv32i2p1_smcdeleg1p0" ; RV32SMEPMP: .attribute 5, "rv32i2p1_smepmp1p0" ; RV32ZFBFMIN: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0" @@ -420,7 +421,6 @@ ; RV32SMMPM: .attribute 5, "rv32i2p1_smmpm1p0" ; RV32SSPM: .attribute 5, "rv32i2p1_sspm1p0" ; RV32SUPM: .attribute 5, "rv32i2p1_supm1p0" -; RV32SSQOSID: .attribute 5, "rv32i2p1_ssqosid1p0" ; RV64M: .attribute 5, "rv64i2p1_m2p0_zmmul1p0" ; RV64ZMMUL: .attribute 5, "rv64i2p1_zmmul1p0" @@ -542,6 +542,7 @@ ; RV64SSAIA: .attribute 5, "rv64i2p1_ssaia1p0" ; RV64SMCSRIND: .attribute 5, "rv64i2p1_smcsrind1p0" ; RV64SSCSRIND: .attribute 5, "rv64i2p1_sscsrind1p0" +; RV64SSQOSID: .attribute 5, "rv64i2p1_ssqosid1p0" ; RV64SMCDELEG: .attribute 5, "rv64i2p1_smcdeleg1p0" ; RV64SMEPMP: .attribute 5, "rv64i2p1_smepmp1p0" ; RV64ZFBFMIN: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0" @@ -558,7 +559,6 @@ ; RV64SMMPM: .attribute 5, "rv64i2p1_smmpm1p0" ; RV64SSPM: .attribute 5, "rv64i2p1_sspm1p0" ; RV64SUPM: .attribute 5, "rv64i2p1_supm1p0" -; RV64SSQOSID: .attribute 5, "rv64i2p1_ssqosid1p0" ; RVI20U32: .attribute 5, "rv32i2p1" ; RVI20U64: .attribute 5, "rv64i2p1" diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index d1de96477a5ca0..6172e48c484ce8 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -1071,6 +1071,7 @@ R"(All available -march extensions for RISC-V sscofpmf 1.0 sscounterenw 1.0 sscsrind 1.0 + ssqosid 1.0 ssstateen 1.0 ssstrict 1.0 sstc 1.0 @@ -1123,7 +1124,6 @@ Experimental extensions smnpm 1.0 ssnpm 1.0 sspm 1.0 - ssqosid 1.0 supm 1.0 Supported 
Profiles From 6c189eaea9941898e7379903d10274dbf6e2c545 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 21 Aug 2024 09:44:01 +0100 Subject: [PATCH 034/426] [AArch64] Add SME peephole optimizer pass (#104612) This pass removes back-to-back smstart/smstop instructions to reduce the number of streaming mode changes in a function. The implementation as proposed doesn't aim to solve all problems yet and suggests a number of cases that can be optimized in the future. --- llvm/lib/Target/AArch64/AArch64.h | 2 + .../Target/AArch64/AArch64TargetMachine.cpp | 9 + llvm/lib/Target/AArch64/CMakeLists.txt | 1 + llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp | 260 +++++++++ llvm/test/CodeGen/AArch64/O3-pipeline.ll | 1 + .../test/CodeGen/AArch64/sme-darwin-sve-vg.ll | 2 +- .../test/CodeGen/AArch64/sme-peephole-opts.ll | 505 ++++++++++++++++++ .../CodeGen/AArch64/sme-streaming-body.ll | 46 +- .../AArch64/sme-streaming-interface.ll | 2 - .../CodeGen/AArch64/sme-toggle-pstateza.ll | 7 +- llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 12 - .../streaming-compatible-memory-ops.ll | 2 - 12 files changed, 789 insertions(+), 60 deletions(-) create mode 100644 llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp create mode 100644 llvm/test/CodeGen/AArch64/sme-peephole-opts.ll diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index ff19327c692021..62fbf94e803f0c 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -59,6 +59,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); FunctionPass *createAArch64CollectLOHPass(); FunctionPass *createSMEABIPass(); +FunctionPass *createSMEPeepholeOptPass(); ModulePass *createSVEIntrinsicOptsPass(); InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &, @@ -110,6 +111,7 @@ void initializeFalkorHWPFFixPass(PassRegistry&); void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&); void initializeLDTLSCleanupPass(PassRegistry&); void 
initializeSMEABIPass(PassRegistry &); +void initializeSMEPeepholeOptPass(PassRegistry &); void initializeSVEIntrinsicOptsPass(PassRegistry &); void initializeAArch64Arm64ECCallLoweringPass(PassRegistry &); } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index bcd677310d1247..bd5684a287381a 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -167,6 +167,11 @@ static cl::opt cl::desc("Enable SVE intrinsic opts"), cl::init(true)); +static cl::opt + EnableSMEPeepholeOpt("enable-aarch64-sme-peephole-opt", cl::init(true), + cl::Hidden, + cl::desc("Perform SME peephole optimization")); + static cl::opt EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", cl::init(true), cl::Hidden); @@ -256,6 +261,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { initializeLDTLSCleanupPass(*PR); initializeKCFIPass(*PR); initializeSMEABIPass(*PR); + initializeSMEPeepholeOptPass(*PR); initializeSVEIntrinsicOptsPass(*PR); initializeAArch64SpeculationHardeningPass(*PR); initializeAArch64SLSHardeningPass(*PR); @@ -754,6 +760,9 @@ bool AArch64PassConfig::addGlobalInstructionSelect() { } void AArch64PassConfig::addMachineSSAOptimization() { + if (TM->getOptLevel() != CodeGenOptLevel::None && EnableSMEPeepholeOpt) + addPass(createSMEPeepholeOptPass()); + // Run default MachineSSAOptimization first. 
TargetPassConfig::addMachineSSAOptimization(); diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index 639bc0707dff24..da13db8e68b0e6 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -87,6 +87,7 @@ add_llvm_target(AArch64CodeGen AArch64TargetObjectFile.cpp AArch64TargetTransformInfo.cpp SMEABIPass.cpp + SMEPeepholeOpt.cpp SVEIntrinsicOpts.cpp AArch64SIMDInstrOpt.cpp diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp new file mode 100644 index 00000000000000..ba737afadaf943 --- /dev/null +++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp @@ -0,0 +1,260 @@ +//===- SMEPeepholeOpt.cpp - SME peephole optimization pass-----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This pass tries to remove back-to-back (smstart, smstop) and +// (smstop, smstart) sequences. The pass is conservative when it cannot +// determine that it is safe to remove these sequences. 
+//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "Utils/AArch64SMEAttributes.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-sme-peephole-opt" + +namespace { + +struct SMEPeepholeOpt : public MachineFunctionPass { + static char ID; + + SMEPeepholeOpt() : MachineFunctionPass(ID) { + initializeSMEPeepholeOptPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SME Peephole Optimization pass"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool optimizeStartStopPairs(MachineBasicBlock &MBB, + bool &HasRemovedAllSMChanges) const; +}; + +char SMEPeepholeOpt::ID = 0; + +} // end anonymous namespace + +static bool isConditionalStartStop(const MachineInstr *MI) { + return MI->getOpcode() == AArch64::MSRpstatePseudo; +} + +static bool isMatchingStartStopPair(const MachineInstr *MI1, + const MachineInstr *MI2) { + // We only consider the same type of streaming mode change here, i.e. + // start/stop SM, or start/stop ZA pairs. + if (MI1->getOperand(0).getImm() != MI2->getOperand(0).getImm()) + return false; + + // One must be 'start', the other must be 'stop' + if (MI1->getOperand(1).getImm() == MI2->getOperand(1).getImm()) + return false; + + bool IsConditional = isConditionalStartStop(MI2); + if (isConditionalStartStop(MI1) != IsConditional) + return false; + + if (!IsConditional) + return true; + + // Check to make sure the conditional start/stop pairs are identical. 
+ if (MI1->getOperand(2).getImm() != MI2->getOperand(2).getImm()) + return false; + + // Ensure reg masks are identical. + if (MI1->getOperand(4).getRegMask() != MI2->getOperand(4).getRegMask()) + return false; + + // This optimisation is unlikely to happen in practice for conditional + // smstart/smstop pairs as the virtual registers for pstate.sm will always + // be different. + // TODO: For this optimisation to apply to conditional smstart/smstop, + // this pass will need to do more work to remove redundant calls to + // __arm_sme_state. + + // Only consider conditional start/stop pairs which read the same register + // holding the original value of pstate.sm, as some conditional start/stops + // require the state on entry to the function. + if (MI1->getOperand(3).isReg() && MI2->getOperand(3).isReg()) { + Register Reg1 = MI1->getOperand(3).getReg(); + Register Reg2 = MI2->getOperand(3).getReg(); + if (Reg1.isPhysical() || Reg2.isPhysical() || Reg1 != Reg2) + return false; + } + + return true; +} + +static bool ChangesStreamingMode(const MachineInstr *MI) { + assert((MI->getOpcode() == AArch64::MSRpstatesvcrImm1 || + MI->getOpcode() == AArch64::MSRpstatePseudo) && + "Expected MI to be a smstart/smstop instruction"); + return MI->getOperand(0).getImm() == AArch64SVCR::SVCRSM || + MI->getOperand(0).getImm() == AArch64SVCR::SVCRSMZA; +} + +static bool isSVERegOp(const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + const MachineOperand &MO) { + if (!MO.isReg()) + return false; + + Register R = MO.getReg(); + if (R.isPhysical()) + return llvm::any_of(TRI.subregs_inclusive(R), [](const MCPhysReg &SR) { + return AArch64::ZPRRegClass.contains(SR) || + AArch64::PPRRegClass.contains(SR); + }); + + const TargetRegisterClass *RC = MRI.getRegClass(R); + return TRI.getCommonSubClass(&AArch64::ZPRRegClass, RC) || + TRI.getCommonSubClass(&AArch64::PPRRegClass, RC); +} + +bool SMEPeepholeOpt::optimizeStartStopPairs( + MachineBasicBlock &MBB, bool 
&HasRemovedAllSMChanges) const { + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterInfo &TRI = + *MBB.getParent()->getSubtarget().getRegisterInfo(); + + bool Changed = false; + MachineInstr *Prev = nullptr; + SmallVector ToBeRemoved; + + // Convenience function to reset the matching of a sequence. + auto Reset = [&]() { + Prev = nullptr; + ToBeRemoved.clear(); + }; + + // Walk through instructions in the block trying to find pairs of smstart + // and smstop nodes that cancel each other out. We only permit a limited + // set of instructions to appear between them, otherwise we reset our + // tracking. + unsigned NumSMChanges = 0; + unsigned NumSMChangesRemoved = 0; + for (MachineInstr &MI : make_early_inc_range(MBB)) { + switch (MI.getOpcode()) { + case AArch64::MSRpstatesvcrImm1: + case AArch64::MSRpstatePseudo: { + if (ChangesStreamingMode(&MI)) + NumSMChanges++; + + if (!Prev) + Prev = &MI; + else if (isMatchingStartStopPair(Prev, &MI)) { + // If they match, we can remove them, and possibly any instructions + // that we marked for deletion in between. + Prev->eraseFromParent(); + MI.eraseFromParent(); + for (MachineInstr *TBR : ToBeRemoved) + TBR->eraseFromParent(); + ToBeRemoved.clear(); + Prev = nullptr; + Changed = true; + NumSMChangesRemoved += 2; + } else { + Reset(); + Prev = &MI; + } + continue; + } + default: + if (!Prev) + // Avoid doing expensive checks when Prev is nullptr. + continue; + break; + } + + // Test if the instructions in between the start/stop sequence are agnostic + // of streaming mode. If not, the algorithm should reset. + switch (MI.getOpcode()) { + default: + Reset(); + break; + case AArch64::COALESCER_BARRIER_FPR16: + case AArch64::COALESCER_BARRIER_FPR32: + case AArch64::COALESCER_BARRIER_FPR64: + case AArch64::COALESCER_BARRIER_FPR128: + case AArch64::COPY: + // These instructions should be safe when executed on their own, but + // the code remains conservative when SVE registers are used. 
There may + // exist subtle cases where executing a COPY in a different mode results + // in different behaviour, even if we can't yet come up with any + // concrete example/test-case. + if (isSVERegOp(TRI, MRI, MI.getOperand(0)) || + isSVERegOp(TRI, MRI, MI.getOperand(1))) + Reset(); + break; + case AArch64::ADJCALLSTACKDOWN: + case AArch64::ADJCALLSTACKUP: + case AArch64::ANDXri: + case AArch64::ADDXri: + // We permit these as they don't generate SVE/NEON instructions. + break; + case AArch64::VGRestorePseudo: + case AArch64::VGSavePseudo: + // When the smstart/smstop are removed, we should also remove + // the pseudos that save/restore the VG value for CFI info. + ToBeRemoved.push_back(&MI); + break; + case AArch64::MSRpstatesvcrImm1: + case AArch64::MSRpstatePseudo: + llvm_unreachable("Should have been handled"); + } + } + + HasRemovedAllSMChanges = + NumSMChanges && (NumSMChanges == NumSMChangesRemoved); + return Changed; +} + +INITIALIZE_PASS(SMEPeepholeOpt, "aarch64-sme-peephole-opt", + "SME Peephole Optimization", false, false) + +bool SMEPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + if (!MF.getSubtarget().hasSME()) + return false; + + assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!"); + + bool Changed = false; + bool FunctionHasAllSMChangesRemoved = false; + + // Even if the block lives in a function with no SME attributes attached we + // still have to analyze all the blocks because we may call a streaming + // function that requires smstart/smstop pairs. 
+ for (MachineBasicBlock &MBB : MF) { + bool BlockHasAllSMChangesRemoved; + Changed |= optimizeStartStopPairs(MBB, BlockHasAllSMChangesRemoved); + FunctionHasAllSMChangesRemoved |= BlockHasAllSMChangesRemoved; + } + + AArch64FunctionInfo *AFI = MF.getInfo(); + if (FunctionHasAllSMChangesRemoved) + AFI->setHasStreamingModeChanges(false); + + return Changed; +} + +FunctionPass *llvm::createSMEPeepholeOptPass() { return new SMEPeepholeOpt(); } diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 72a888bde5ebbc..3465b717261cf5 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -122,6 +122,7 @@ ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: AArch64 Local Dynamic TLS Access Clean-up ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions +; CHECK-NEXT: SME Peephole Optimization pass ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs diff --git a/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll b/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll index c32e9cbc053939..cad529062102cf 100644 --- a/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll +++ b/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64-darwin -mattr=+sve -mattr=+sme -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-darwin -mattr=+sve -mattr=+sme -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s declare void @normal_callee(); diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll new file mode 100644 index 00000000000000..cb8a825a201ad6 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -0,0 +1,505 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu 
-mattr=+sve,+sme2 < %s | FileCheck %s + +declare void @callee() +declare void @callee_farg(float) +declare float @callee_farg_fret(float) + +; normal caller -> streaming callees +define void @test0() nounwind { +; CHECK-LABEL: test0: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: bl callee +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee() "aarch64_pstate_sm_enabled" + call void @callee() "aarch64_pstate_sm_enabled" + ret void +} + +; streaming caller -> normal callees +define void @test1() nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: test1: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl callee +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee() + call void @callee() + ret void +} + +; streaming-compatible caller -> normal callees +; these conditional smstart/smstop are not yet optimized away. +define void @test2() nounwind "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: test2: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbz w19, #0, .LBB2_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: bl callee +; CHECK-NEXT: tbz w19, #0, .LBB2_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB2_4: +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbz w19, #0, .LBB2_6 +; CHECK-NEXT: // %bb.5: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB2_6: +; CHECK-NEXT: bl callee +; CHECK-NEXT: tbz w19, #0, .LBB2_8 +; CHECK-NEXT: // %bb.7: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB2_8: +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee() + call void @callee() + ret void +} + +; streaming-compatible caller -> mixed callees +define void @test3() nounwind "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: test3: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbnz w19, #0, .LBB3_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: bl callee +; CHECK-NEXT: tbnz w19, #0, .LBB3_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB3_4: +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbz w19, #0, .LBB3_6 +; CHECK-NEXT: // %bb.5: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB3_6: +; CHECK-NEXT: bl callee +; CHECK-NEXT: tbz w19, #0, .LBB3_8 +; CHECK-NEXT: // %bb.7: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB3_8: +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbnz w19, #0, .LBB3_10 +; CHECK-NEXT: // %bb.9: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB3_10: +; CHECK-NEXT: bl callee +; CHECK-NEXT: tbnz w19, #0, .LBB3_12 +; CHECK-NEXT: // %bb.11: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB3_12: +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee() "aarch64_pstate_sm_enabled" + call void @callee() + call void @callee() "aarch64_pstate_sm_enabled" + ret void +} + +; streaming caller -> normal callees (pass 0.0f) +define void @test4() nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: test4: +; CHECK: // %bb.0: +; CHECK-NEXT: 
stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: fmov s0, wzr +; CHECK-NEXT: bl callee_farg +; CHECK-NEXT: fmov s0, wzr +; CHECK-NEXT: bl callee_farg +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee_farg(float zeroinitializer) + call void @callee_farg(float zeroinitializer) + ret void +} + +; streaming caller -> normal callees (pass fp arg) +define void @test5(float %f) nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: test5: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: bl callee_farg +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: bl callee_farg +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; 
CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + call void @callee_farg(float %f) + call void @callee_farg(float %f) + ret void +} + +define float @test6(float %f) nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: test6: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: bl callee_farg_fret +; CHECK-NEXT: bl callee_farg_fret +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res0 = call float @callee_farg_fret(float %f) + %res1 = call float @callee_farg_fret(float %res0) + ret float %res1 +} + +; save/restore zt0 to stack is not yet optimised away by the pass, +; because of the ldr/str of zt0, which will need some further analysis +; to make sure if the redundant str can be removed. 
+define void @test7() nounwind "aarch64_inout_zt0" { +; CHECK-LABEL: test7: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #144 +; CHECK-NEXT: stp x30, x19, [sp, #128] // 16-byte Folded Spill +; CHECK-NEXT: add x19, sp, #64 +; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: smstop za +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: smstop za +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #128] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #144 +; CHECK-NEXT: ret + call void @callee() + call void @callee() + ret void +} + +; test that 'smstop za' is not cancelled out with 'smstart sm'. +define void @test8() nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: test8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstop za +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee() + call void @llvm.aarch64.sme.za.disable() + ret void +} + +; test that the 'smstart' and 'smstop' are entirely removed, +; along with any code to read 'vg' for the CFI. +define void @test9() "aarch64_pstate_sm_body" { +; CHECK-LABEL: test9: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl callee +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @callee() + ret void +} + +; similar to above, but in this case only the first +; 'smstart, smstop' pair can be removed and the code required +; for the CFI is still needed. +define void @test10() "aarch64_pstate_sm_body" { +; CHECK-LABEL: test10: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: lsr x9, x9, #3 +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: bl callee +; CHECK-NEXT: .cfi_offset vg, -24 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl callee +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; 
CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret + call void @callee() + call void @callee() "aarch64_pstate_sm_enabled" + call void @callee() + ret void +} + +; test that an operation like a store is executed in the right +; streaming mode and blocks the optimization. +define void @test11(ptr %p) nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: test11: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: mov z0.b, #0 // =0x0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee() + store zeroinitializer, ptr %p + call void @callee() + ret void +} + +; test that 'smstart sm' and 'smstop za' don't get folded away together. +; we can further optimize this test by considering streaming mode +; separately from ZA. +define void @test12() "aarch64_pstate_sm_body" { +; CHECK-LABEL: test12: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: lsr x9, x9, #3 +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstop za +; CHECK-NEXT: .cfi_offset vg, -24 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: smstart za +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.za.disable() + call void @callee() + call void @llvm.aarch64.sme.za.enable() + ret void +} + +; We conservatively don't remove the smstart/smstop pair yet when there are COPY +; instructions that copy SVE registers, because we can't yet conclusively prove +; that 
this is safe (although for this example, it would be). +define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: test13: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl callee_farg_fret +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl callee_farg_fret +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res0 = call @callee_farg_fret( zeroinitializer) + %res1 = call @callee_farg_fret( %res0) + store %res1, ptr %ptr + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll index 3afd571ffba28e..572b1fff3520a9 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll +++ 
b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll @@ -136,25 +136,9 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: locally_streaming_caller_locally_streaming_callee: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: lsr x9, x9, #3 -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: smstart sm -; CHECK-NEXT: smstop sm +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: bl locally_streaming_caller_streaming_callee -; CHECK-NEXT: smstart sm -; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret call void @locally_streaming_caller_streaming_callee(); @@ -272,31 +256,9 @@ declare void @use_ptr(ptr) "aarch64_pstate_sm_compatible" define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" { ; CHECK-LABEL: call_to_intrinsic_without_chain: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: lsr x9, x9, #3 -; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: 
stp x30, x9, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill -; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill -; CHECK-NEXT: smstart sm -; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: bl cos -; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill -; CHECK-NEXT: smstart sm -; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #112 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: %0 = call fast double @llvm.cos.f64(double %x) diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll index 4321493434230f..bd0734df9e23e6 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -269,8 +269,6 @@ define @smstart_clobber_sve_duplicate( %x) ; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl streaming_callee -; CHECK-NEXT: smstop sm -; CHECK-NEXT: smstart sm ; CHECK-NEXT: bl streaming_callee ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll b/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll index 3c50ab54e561e6..cc119dae1aa4d5 100644 --- a/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll +++ b/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll @@ -1,7 +1,12 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64 -mattr=+sme -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64 -mattr=+sme -enable-aarch64-sme-peephole-opt=true -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-OPT +; RUN: llc -mtriple=aarch64 -mattr=+sme -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK define void @toggle_pstate_za() { +; CHECK-OPT-LABEL: toggle_pstate_za: +; CHECK-OPT: // %bb.0: +; CHECK-OPT-NEXT: ret +; ; CHECK-LABEL: toggle_pstate_za: ; CHECK: // %bb.0: ; CHECK-NEXT: smstart za diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll index 6264ce0cf4ae6d..a96f9e382ed1a8 100644 --- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll +++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll @@ -669,9 +669,6 @@ define void @vg_locally_streaming_fn() #3 { ; CHECK-NEXT: .cfi_offset b13, -80 ; CHECK-NEXT: .cfi_offset b14, -88 ; CHECK-NEXT: .cfi_offset b15, -96 -; CHECK-NEXT: smstart sm -; CHECK-NEXT: .cfi_offset vg, -24 -; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .cfi_restore vg @@ -679,9 +676,6 @@ define void @vg_locally_streaming_fn() #3 { ; CHECK-NEXT: .cfi_offset vg, -24 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee -; CHECK-NEXT: smstart sm -; CHECK-NEXT: .cfi_restore vg -; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload @@ -725,9 +719,6 @@ define void @vg_locally_streaming_fn() #3 { ; FP-CHECK-NEXT: .cfi_offset b13, -80 ; FP-CHECK-NEXT: .cfi_offset b14, -88 ; FP-CHECK-NEXT: .cfi_offset b15, -96 -; FP-CHECK-NEXT: smstart sm -; FP-CHECK-NEXT: .cfi_offset vg, -16 -; FP-CHECK-NEXT: smstop sm ; FP-CHECK-NEXT: bl callee ; FP-CHECK-NEXT: smstart sm ; FP-CHECK-NEXT: .cfi_restore vg @@ -735,9 +726,6 @@ 
define void @vg_locally_streaming_fn() #3 { ; FP-CHECK-NEXT: .cfi_offset vg, -16 ; FP-CHECK-NEXT: smstop sm ; FP-CHECK-NEXT: bl callee -; FP-CHECK-NEXT: smstart sm -; FP-CHECK-NEXT: .cfi_restore vg -; FP-CHECK-NEXT: smstop sm ; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96 ; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload ; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll index 106d6190e88b9c..20faeb23eed59d 100644 --- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll +++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll @@ -264,8 +264,6 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind { ; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src] ; CHECK-NO-SME-ROUTINES-NEXT: smstop sm ; CHECK-NO-SME-ROUTINES-NEXT: bl memcpy -; CHECK-NO-SME-ROUTINES-NEXT: smstart sm -; CHECK-NO-SME-ROUTINES-NEXT: smstop sm ; CHECK-NO-SME-ROUTINES-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NO-SME-ROUTINES-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload From 126b56a234486a2cd05a8beca78bcf89fe47d167 Mon Sep 17 00:00:00 2001 From: Piyou Chen Date: Wed, 21 Aug 2024 16:46:59 +0800 Subject: [PATCH 035/426] [RISCV] Make EmitRISCVCpuSupports accept multiple features (#104917) This patch creates an additional EmitRISCVCpuSupports function to handle situations with multiple features. It also modifies the original EmitRISCVCpuSupports function to invoke the new one. 
--- clang/lib/CodeGen/CGBuiltin.cpp | 72 +++++++++++++------ clang/lib/CodeGen/CodeGenFunction.h | 1 + clang/test/CodeGen/builtin-cpu-supports.c | 16 ++--- llvm/include/llvm/TargetParser/RISCVISAInfo.h | 3 + 4 files changed, 61 insertions(+), 31 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 3e787cad6e82fa..3d77b118235ca0 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14439,33 +14439,59 @@ Value *CodeGenFunction::EmitRISCVCpuSupports(const CallExpr *E) { if (!getContext().getTargetInfo().validateCpuSupports(FeatureStr)) return Builder.getFalse(); - // Note: We are making an unchecked assumption that the size of the - // feature array is >= 1. This holds for any version of compiler-rt - // which defines this interface. - llvm::ArrayType *ArrayOfInt64Ty = llvm::ArrayType::get(Int64Ty, 1); + return EmitRISCVCpuSupports(ArrayRef(FeatureStr)); +} + +static Value *loadRISCVFeatureBits(unsigned Index, CGBuilderTy &Builder, + CodeGenModule &CGM) { + llvm::Type *Int32Ty = Builder.getInt32Ty(); + llvm::Type *Int64Ty = Builder.getInt64Ty(); + llvm::ArrayType *ArrayOfInt64Ty = + llvm::ArrayType::get(Int64Ty, llvm::RISCVISAInfo::FeatureBitSize); llvm::Type *StructTy = llvm::StructType::get(Int32Ty, ArrayOfInt64Ty); llvm::Constant *RISCVFeaturesBits = CGM.CreateRuntimeVariable(StructTy, "__riscv_feature_bits"); - auto *GV = cast(RISCVFeaturesBits); - GV->setDSOLocal(true); - - auto LoadFeatureBit = [&](unsigned Index) { - // Create GEP then load. 
- Value *IndexVal = llvm::ConstantInt::get(Int32Ty, Index); - llvm::Value *GEPIndices[] = {Builder.getInt32(0), Builder.getInt32(1), - IndexVal}; - Value *Ptr = - Builder.CreateInBoundsGEP(StructTy, RISCVFeaturesBits, GEPIndices); - Value *FeaturesBit = - Builder.CreateAlignedLoad(Int64Ty, Ptr, CharUnits::fromQuantity(8)); - return FeaturesBit; - }; + cast(RISCVFeaturesBits)->setDSOLocal(true); + Value *IndexVal = llvm::ConstantInt::get(Int32Ty, Index); + llvm::Value *GEPIndices[] = {Builder.getInt32(0), Builder.getInt32(1), + IndexVal}; + Value *Ptr = + Builder.CreateInBoundsGEP(StructTy, RISCVFeaturesBits, GEPIndices); + Value *FeaturesBit = + Builder.CreateAlignedLoad(Int64Ty, Ptr, CharUnits::fromQuantity(8)); + return FeaturesBit; +} + +Value *CodeGenFunction::EmitRISCVCpuSupports(ArrayRef FeaturesStrs) { + const unsigned RISCVFeatureLength = llvm::RISCVISAInfo::FeatureBitSize; + uint64_t RequireBitMasks[RISCVFeatureLength] = {0}; + + for (auto Feat : FeaturesStrs) { + auto [GroupID, BitPos] = RISCVISAInfo::getRISCVFeaturesBitsInfo(Feat); + + // If there isn't BitPos for this feature, skip this version. + // It also report the warning to user during compilation. + if (BitPos == -1) + return Builder.getFalse(); - auto [GroupID, BitPos] = RISCVISAInfo::getRISCVFeaturesBitsInfo(FeatureStr); - assert(BitPos != -1 && "validation should have rejected this feature"); - Value *MaskV = Builder.getInt64(1ULL << BitPos); - Value *Bitset = Builder.CreateAnd(LoadFeatureBit(GroupID), MaskV); - return Builder.CreateICmpEQ(Bitset, MaskV); + RequireBitMasks[GroupID] |= (1ULL << BitPos); + } + + Value *Result = nullptr; + for (unsigned Idx = 0; Idx < RISCVFeatureLength; Idx++) { + if (RequireBitMasks[Idx] == 0) + continue; + + Value *Mask = Builder.getInt64(RequireBitMasks[Idx]); + Value *Bitset = + Builder.CreateAnd(loadRISCVFeatureBits(Idx, Builder, CGM), Mask); + Value *CmpV = Builder.CreateICmpEQ(Bitset, Mask); + Result = (!Result) ? 
CmpV : Builder.CreateAnd(Result, CmpV); + } + + assert(Result && "Should have value here."); + + return Result; } Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 57e0b7f91e9bf8..e1b9ada3c1e1fd 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4704,6 +4704,7 @@ class CodeGenFunction : public CodeGenTypeCache { ReturnValueSlot ReturnValue); llvm::Value *EmitRISCVCpuSupports(const CallExpr *E); + llvm::Value *EmitRISCVCpuSupports(ArrayRef FeaturesStrs); llvm::Value *EmitRISCVCpuInit(); void AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, diff --git a/clang/test/CodeGen/builtin-cpu-supports.c b/clang/test/CodeGen/builtin-cpu-supports.c index b252484fc3df95..72fc9a433dd6e8 100644 --- a/clang/test/CodeGen/builtin-cpu-supports.c +++ b/clang/test/CodeGen/builtin-cpu-supports.c @@ -251,7 +251,7 @@ int test_ppc(int a) { // CHECK-RV32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 // CHECK-RV32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 // CHECK-RV32-NEXT: call void @__init_riscv_feature_bits(ptr null) -// CHECK-RV32-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [1 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 +// CHECK-RV32-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 // CHECK-RV32-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1 // CHECK-RV32-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1 // CHECK-RV32-NEXT: br i1 [[TMP2]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] @@ -259,7 +259,7 @@ int test_ppc(int a) { // CHECK-RV32-NEXT: store i32 3, ptr [[RETVAL]], align 4 // CHECK-RV32-NEXT: br label [[RETURN:%.*]] // CHECK-RV32: if.else: -// CHECK-RV32-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [1 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 +// 
CHECK-RV32-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 // CHECK-RV32-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], 4 // CHECK-RV32-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 4 // CHECK-RV32-NEXT: br i1 [[TMP5]], label [[IF_THEN1:%.*]], label [[IF_ELSE2:%.*]] @@ -267,7 +267,7 @@ int test_ppc(int a) { // CHECK-RV32-NEXT: store i32 7, ptr [[RETVAL]], align 4 // CHECK-RV32-NEXT: br label [[RETURN]] // CHECK-RV32: if.else2: -// CHECK-RV32-NEXT: [[TMP6:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [1 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 +// CHECK-RV32-NEXT: [[TMP6:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 // CHECK-RV32-NEXT: [[TMP7:%.*]] = and i64 [[TMP6]], 2097152 // CHECK-RV32-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 2097152 // CHECK-RV32-NEXT: br i1 [[TMP8]], label [[IF_THEN3:%.*]], label [[IF_ELSE4:%.*]] @@ -275,7 +275,7 @@ int test_ppc(int a) { // CHECK-RV32-NEXT: store i32 11, ptr [[RETVAL]], align 4 // CHECK-RV32-NEXT: br label [[RETURN]] // CHECK-RV32: if.else4: -// CHECK-RV32-NEXT: [[TMP9:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [1 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 1), align 8 +// CHECK-RV32-NEXT: [[TMP9:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 1), align 8 // CHECK-RV32-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 8 // CHECK-RV32-NEXT: [[TMP11:%.*]] = icmp eq i64 [[TMP10]], 8 // CHECK-RV32-NEXT: br i1 [[TMP11]], label [[IF_THEN5:%.*]], label [[IF_END:%.*]] @@ -302,7 +302,7 @@ int test_ppc(int a) { // CHECK-RV64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 // CHECK-RV64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 // CHECK-RV64-NEXT: call void @__init_riscv_feature_bits(ptr null) -// CHECK-RV64-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [1 x 
i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 +// CHECK-RV64-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 // CHECK-RV64-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1 // CHECK-RV64-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1 // CHECK-RV64-NEXT: br i1 [[TMP2]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] @@ -310,7 +310,7 @@ int test_ppc(int a) { // CHECK-RV64-NEXT: store i32 3, ptr [[RETVAL]], align 4 // CHECK-RV64-NEXT: br label [[RETURN:%.*]] // CHECK-RV64: if.else: -// CHECK-RV64-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [1 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 +// CHECK-RV64-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 // CHECK-RV64-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], 4 // CHECK-RV64-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 4 // CHECK-RV64-NEXT: br i1 [[TMP5]], label [[IF_THEN1:%.*]], label [[IF_ELSE2:%.*]] @@ -318,7 +318,7 @@ int test_ppc(int a) { // CHECK-RV64-NEXT: store i32 7, ptr [[RETVAL]], align 4 // CHECK-RV64-NEXT: br label [[RETURN]] // CHECK-RV64: if.else2: -// CHECK-RV64-NEXT: [[TMP6:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [1 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 +// CHECK-RV64-NEXT: [[TMP6:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 0), align 8 // CHECK-RV64-NEXT: [[TMP7:%.*]] = and i64 [[TMP6]], 2097152 // CHECK-RV64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 2097152 // CHECK-RV64-NEXT: br i1 [[TMP8]], label [[IF_THEN3:%.*]], label [[IF_ELSE4:%.*]] @@ -326,7 +326,7 @@ int test_ppc(int a) { // CHECK-RV64-NEXT: store i32 11, ptr [[RETVAL]], align 4 // CHECK-RV64-NEXT: br label [[RETURN]] // CHECK-RV64: if.else4: -// CHECK-RV64-NEXT: [[TMP9:%.*]] = load i64, ptr getelementptr inbounds ({ i32, 
[1 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 1), align 8 +// CHECK-RV64-NEXT: [[TMP9:%.*]] = load i64, ptr getelementptr inbounds ({ i32, [2 x i64] }, ptr @__riscv_feature_bits, i32 0, i32 1, i32 1), align 8 // CHECK-RV64-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 8 // CHECK-RV64-NEXT: [[TMP11:%.*]] = icmp eq i64 [[TMP10]], 8 // CHECK-RV64-NEXT: br i1 [[TMP11]], label [[IF_THEN5:%.*]], label [[IF_END:%.*]] diff --git a/llvm/include/llvm/TargetParser/RISCVISAInfo.h b/llvm/include/llvm/TargetParser/RISCVISAInfo.h index dd00e12cdf6c1e..5b2b6f29fd3db8 100644 --- a/llvm/include/llvm/TargetParser/RISCVISAInfo.h +++ b/llvm/include/llvm/TargetParser/RISCVISAInfo.h @@ -84,6 +84,9 @@ class RISCVISAInfo { /// <-1, -1> if not supported. static std::pair getRISCVFeaturesBitsInfo(StringRef Ext); + // The maximum value of the group ID obtained from getRISCVFeaturesBitsInfo. + static constexpr unsigned FeatureBitSize = 2; + private: RISCVISAInfo(unsigned XLen) : XLen(XLen) {} From 5f91de9d18cfa136645c2cbc91901b676c10df81 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Wed, 21 Aug 2024 04:54:20 -0400 Subject: [PATCH 036/426] [AMDGPU][True16][test] added missing true16 flag in gfx12 asm vop1 (#104884) added missing true16 flag in gfx12 asm vop1 --- llvm/test/MC/AMDGPU/gfx12_asm_vop1-fake16.s | 4 +- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 424 ++++++++++---------- 2 files changed, 214 insertions(+), 214 deletions(-) diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1-fake16.s index 8fef2ab26dfdd8..17d44e027d94d6 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1-fake16.s @@ -1,7 +1,7 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | %extract-encodings | llvm-mc 
-triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -disassemble -show-encoding | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -disassemble -show-encoding | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s v_bfrev_b32_e32 v5, v1 // GFX12: v_bfrev_b32_e32 v5, v1 ; encoding: [0x01,0x71,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index bd640e786e4ac1..7a6bb874b105df 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -1,7 +1,7 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding -comment-column=0 %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -disassemble -show-encoding -comment-column=0 | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s +// 
RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -disassemble -show-encoding -comment-column=0 | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding -comment-column=0 %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding -comment-column=0 | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding -comment-column=0 | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s // this file will be converted to true16 format when more true16 instructions are supported @@ -50,50 +50,50 @@ v_bfrev_b32 v5, src_scc v_bfrev_b32 v255, 0xaf123456 // GFX12: v_bfrev_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x70,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_ceil_f16 v5, v1 -// GFX12: v_ceil_f16_e32 v5, v1 ; encoding: [0x01,0xb9,0x0a,0x7e] +v_ceil_f16 v5.l, v1.l +// GFX12: v_ceil_f16_e32 v5.l, v1.l ; encoding: [0x01,0xb9,0x0a,0x7e] -v_ceil_f16 v5, v127 -// GFX12: v_ceil_f16_e32 v5, v127 ; encoding: [0x7f,0xb9,0x0a,0x7e] +v_ceil_f16 v5.l, v127.l +// GFX12: v_ceil_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xb9,0x0a,0x7e] -v_ceil_f16 v5, s1 -// GFX12: v_ceil_f16_e32 v5, s1 ; encoding: [0x01,0xb8,0x0a,0x7e] +v_ceil_f16 v5.l, s1 +// GFX12: v_ceil_f16_e32 v5.l, s1 ; encoding: [0x01,0xb8,0x0a,0x7e] -v_ceil_f16 v5, s105 -// GFX12: v_ceil_f16_e32 v5, s105 ; encoding: 
[0x69,0xb8,0x0a,0x7e] +v_ceil_f16 v5.l, s105 +// GFX12: v_ceil_f16_e32 v5.l, s105 ; encoding: [0x69,0xb8,0x0a,0x7e] -v_ceil_f16 v5, vcc_lo -// GFX12: v_ceil_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb8,0x0a,0x7e] +v_ceil_f16 v5.l, vcc_lo +// GFX12: v_ceil_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xb8,0x0a,0x7e] -v_ceil_f16 v5, vcc_hi -// GFX12: v_ceil_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb8,0x0a,0x7e] +v_ceil_f16 v5.l, vcc_hi +// GFX12: v_ceil_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xb8,0x0a,0x7e] -v_ceil_f16 v5, ttmp15 -// GFX12: v_ceil_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb8,0x0a,0x7e] +v_ceil_f16 v5.l, ttmp15 +// GFX12: v_ceil_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xb8,0x0a,0x7e] -v_ceil_f16 v5, m0 -// GFX12: v_ceil_f16_e32 v5, m0 ; encoding: [0x7d,0xb8,0x0a,0x7e] +v_ceil_f16 v5.l, m0 +// GFX12: v_ceil_f16_e32 v5.l, m0 ; encoding: [0x7d,0xb8,0x0a,0x7e] -v_ceil_f16 v5, exec_lo -// GFX12: v_ceil_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb8,0x0a,0x7e] +v_ceil_f16 v5.l, exec_lo +// GFX12: v_ceil_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xb8,0x0a,0x7e] -v_ceil_f16 v5, exec_hi -// GFX12: v_ceil_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb8,0x0a,0x7e] +v_ceil_f16 v5.l, exec_hi +// GFX12: v_ceil_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xb8,0x0a,0x7e] -v_ceil_f16 v5, null -// GFX12: v_ceil_f16_e32 v5, null ; encoding: [0x7c,0xb8,0x0a,0x7e] +v_ceil_f16 v5.l, null +// GFX12: v_ceil_f16_e32 v5.l, null ; encoding: [0x7c,0xb8,0x0a,0x7e] -v_ceil_f16 v5, -1 -// GFX12: v_ceil_f16_e32 v5, -1 ; encoding: [0xc1,0xb8,0x0a,0x7e] +v_ceil_f16 v5.l, -1 +// GFX12: v_ceil_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb8,0x0a,0x7e] -v_ceil_f16 v5, 0.5 -// GFX12: v_ceil_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb8,0x0a,0x7e] +v_ceil_f16 v5.l, 0.5 +// GFX12: v_ceil_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb8,0x0a,0x7e] -v_ceil_f16 v5, src_scc -// GFX12: v_ceil_f16_e32 v5, src_scc ; encoding: [0xfd,0xb8,0x0a,0x7e] +v_ceil_f16 v5.l, src_scc +// GFX12: v_ceil_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb8,0x0a,0x7e] 
-v_ceil_f16 v127, 0xfe0b -// GFX12: v_ceil_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb8,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_ceil_f16 v127.l, 0xfe0b +// GFX12: v_ceil_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xb8,0xfe,0x7e,0x0b,0xfe,0x00,0x00] v_ceil_f32 v5, v1 // GFX12: v_ceil_f32_e32 v5, v1 ; encoding: [0x01,0x45,0x0a,0x7e] @@ -1728,50 +1728,50 @@ v_cvt_u32_u16 v5, src_scc v_cvt_u32_u16 v255, 0xfe0b // GFX12: v_cvt_u32_u16_e32 v255, 0xfe0b ; encoding: [0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00] -v_exp_f16 v5, v1 -// GFX12: v_exp_f16_e32 v5, v1 ; encoding: [0x01,0xb1,0x0a,0x7e] +v_exp_f16 v5.l, v1.l +// GFX12: v_exp_f16_e32 v5.l, v1.l ; encoding: [0x01,0xb1,0x0a,0x7e] -v_exp_f16 v5, v127 -// GFX12: v_exp_f16_e32 v5, v127 ; encoding: [0x7f,0xb1,0x0a,0x7e] +v_exp_f16 v5.l, v127.l +// GFX12: v_exp_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xb1,0x0a,0x7e] -v_exp_f16 v5, s1 -// GFX12: v_exp_f16_e32 v5, s1 ; encoding: [0x01,0xb0,0x0a,0x7e] +v_exp_f16 v5.l, s1 +// GFX12: v_exp_f16_e32 v5.l, s1 ; encoding: [0x01,0xb0,0x0a,0x7e] -v_exp_f16 v5, s105 -// GFX12: v_exp_f16_e32 v5, s105 ; encoding: [0x69,0xb0,0x0a,0x7e] +v_exp_f16 v5.l, s105 +// GFX12: v_exp_f16_e32 v5.l, s105 ; encoding: [0x69,0xb0,0x0a,0x7e] -v_exp_f16 v5, vcc_lo -// GFX12: v_exp_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb0,0x0a,0x7e] +v_exp_f16 v5.l, vcc_lo +// GFX12: v_exp_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xb0,0x0a,0x7e] -v_exp_f16 v5, vcc_hi -// GFX12: v_exp_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb0,0x0a,0x7e] +v_exp_f16 v5.l, vcc_hi +// GFX12: v_exp_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xb0,0x0a,0x7e] -v_exp_f16 v5, ttmp15 -// GFX12: v_exp_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb0,0x0a,0x7e] +v_exp_f16 v5.l, ttmp15 +// GFX12: v_exp_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xb0,0x0a,0x7e] -v_exp_f16 v5, m0 -// GFX12: v_exp_f16_e32 v5, m0 ; encoding: [0x7d,0xb0,0x0a,0x7e] +v_exp_f16 v5.l, m0 +// GFX12: v_exp_f16_e32 v5.l, m0 ; encoding: [0x7d,0xb0,0x0a,0x7e] -v_exp_f16 v5, exec_lo -// GFX12: v_exp_f16_e32 v5, 
exec_lo ; encoding: [0x7e,0xb0,0x0a,0x7e] +v_exp_f16 v5.l, exec_lo +// GFX12: v_exp_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xb0,0x0a,0x7e] -v_exp_f16 v5, exec_hi -// GFX12: v_exp_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb0,0x0a,0x7e] +v_exp_f16 v5.l, exec_hi +// GFX12: v_exp_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xb0,0x0a,0x7e] -v_exp_f16 v5, null -// GFX12: v_exp_f16_e32 v5, null ; encoding: [0x7c,0xb0,0x0a,0x7e] +v_exp_f16 v5.l, null +// GFX12: v_exp_f16_e32 v5.l, null ; encoding: [0x7c,0xb0,0x0a,0x7e] -v_exp_f16 v5, -1 -// GFX12: v_exp_f16_e32 v5, -1 ; encoding: [0xc1,0xb0,0x0a,0x7e] +v_exp_f16 v5.l, -1 +// GFX12: v_exp_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb0,0x0a,0x7e] -v_exp_f16 v5, 0.5 -// GFX12: v_exp_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb0,0x0a,0x7e] +v_exp_f16 v5.l, 0.5 +// GFX12: v_exp_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb0,0x0a,0x7e] -v_exp_f16 v5, src_scc -// GFX12: v_exp_f16_e32 v5, src_scc ; encoding: [0xfd,0xb0,0x0a,0x7e] +v_exp_f16 v5.l, src_scc +// GFX12: v_exp_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb0,0x0a,0x7e] -v_exp_f16 v127, 0xfe0b -// GFX12: v_exp_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_exp_f16 v127.l, 0xfe0b +// GFX12: v_exp_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xb0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] v_exp_f32 v5, v1 // GFX12: v_exp_f32_e32 v5, v1 ; encoding: [0x01,0x4b,0x0a,0x7e] @@ -1953,50 +1953,50 @@ v_ffbl_b32 v5, src_scc v_ffbl_b32 v255, 0xaf123456 // GFX12: v_ctz_i32_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_floor_f16 v5, v1 -// GFX12: v_floor_f16_e32 v5, v1 ; encoding: [0x01,0xb7,0x0a,0x7e] +v_floor_f16 v5.l, v1.l +// GFX12: v_floor_f16_e32 v5.l, v1.l ; encoding: [0x01,0xb7,0x0a,0x7e] -v_floor_f16 v5, v127 -// GFX12: v_floor_f16_e32 v5, v127 ; encoding: [0x7f,0xb7,0x0a,0x7e] +v_floor_f16 v5.l, v127.l +// GFX12: v_floor_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xb7,0x0a,0x7e] -v_floor_f16 v5, s1 -// GFX12: v_floor_f16_e32 v5, s1 ; encoding: 
[0x01,0xb6,0x0a,0x7e] +v_floor_f16 v5.l, s1 +// GFX12: v_floor_f16_e32 v5.l, s1 ; encoding: [0x01,0xb6,0x0a,0x7e] -v_floor_f16 v5, s105 -// GFX12: v_floor_f16_e32 v5, s105 ; encoding: [0x69,0xb6,0x0a,0x7e] +v_floor_f16 v5.l, s105 +// GFX12: v_floor_f16_e32 v5.l, s105 ; encoding: [0x69,0xb6,0x0a,0x7e] -v_floor_f16 v5, vcc_lo -// GFX12: v_floor_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb6,0x0a,0x7e] +v_floor_f16 v5.l, vcc_lo +// GFX12: v_floor_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xb6,0x0a,0x7e] -v_floor_f16 v5, vcc_hi -// GFX12: v_floor_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb6,0x0a,0x7e] +v_floor_f16 v5.l, vcc_hi +// GFX12: v_floor_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xb6,0x0a,0x7e] -v_floor_f16 v5, ttmp15 -// GFX12: v_floor_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb6,0x0a,0x7e] +v_floor_f16 v5.l, ttmp15 +// GFX12: v_floor_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xb6,0x0a,0x7e] -v_floor_f16 v5, m0 -// GFX12: v_floor_f16_e32 v5, m0 ; encoding: [0x7d,0xb6,0x0a,0x7e] +v_floor_f16 v5.l, m0 +// GFX12: v_floor_f16_e32 v5.l, m0 ; encoding: [0x7d,0xb6,0x0a,0x7e] -v_floor_f16 v5, exec_lo -// GFX12: v_floor_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb6,0x0a,0x7e] +v_floor_f16 v5.l, exec_lo +// GFX12: v_floor_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xb6,0x0a,0x7e] -v_floor_f16 v5, exec_hi -// GFX12: v_floor_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb6,0x0a,0x7e] +v_floor_f16 v5.l, exec_hi +// GFX12: v_floor_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xb6,0x0a,0x7e] -v_floor_f16 v5, null -// GFX12: v_floor_f16_e32 v5, null ; encoding: [0x7c,0xb6,0x0a,0x7e] +v_floor_f16 v5.l, null +// GFX12: v_floor_f16_e32 v5.l, null ; encoding: [0x7c,0xb6,0x0a,0x7e] -v_floor_f16 v5, -1 -// GFX12: v_floor_f16_e32 v5, -1 ; encoding: [0xc1,0xb6,0x0a,0x7e] +v_floor_f16 v5.l, -1 +// GFX12: v_floor_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb6,0x0a,0x7e] -v_floor_f16 v5, 0.5 -// GFX12: v_floor_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb6,0x0a,0x7e] +v_floor_f16 v5.l, 0.5 +// GFX12: v_floor_f16_e32 v5.l, 0.5 ; encoding: 
[0xf0,0xb6,0x0a,0x7e] -v_floor_f16 v5, src_scc -// GFX12: v_floor_f16_e32 v5, src_scc ; encoding: [0xfd,0xb6,0x0a,0x7e] +v_floor_f16 v5.l, src_scc +// GFX12: v_floor_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb6,0x0a,0x7e] -v_floor_f16 v127, 0xfe0b -// GFX12: v_floor_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb6,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_floor_f16 v127.l, 0xfe0b +// GFX12: v_floor_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xb6,0xfe,0x7e,0x0b,0xfe,0x00,0x00] v_floor_f32 v5, v1 // GFX12: v_floor_f32_e32 v5, v1 ; encoding: [0x01,0x49,0x0a,0x7e] @@ -2457,50 +2457,50 @@ v_frexp_mant_f64 v[5:6], src_scc v_frexp_mant_f64 v[254:255], 0xaf123456 // GFX12: v_frexp_mant_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x7a,0xfc,0x7f,0x56,0x34,0x12,0xaf] -v_log_f16 v5, v1 -// GFX12: v_log_f16_e32 v5, v1 ; encoding: [0x01,0xaf,0x0a,0x7e] +v_log_f16 v5.l, v1.l +// GFX12: v_log_f16_e32 v5.l, v1.l ; encoding: [0x01,0xaf,0x0a,0x7e] -v_log_f16 v5, v127 -// GFX12: v_log_f16_e32 v5, v127 ; encoding: [0x7f,0xaf,0x0a,0x7e] +v_log_f16 v5.l, v127.l +// GFX12: v_log_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xaf,0x0a,0x7e] -v_log_f16 v5, s1 -// GFX12: v_log_f16_e32 v5, s1 ; encoding: [0x01,0xae,0x0a,0x7e] +v_log_f16 v5.l, s1 +// GFX12: v_log_f16_e32 v5.l, s1 ; encoding: [0x01,0xae,0x0a,0x7e] -v_log_f16 v5, s105 -// GFX12: v_log_f16_e32 v5, s105 ; encoding: [0x69,0xae,0x0a,0x7e] +v_log_f16 v5.l, s105 +// GFX12: v_log_f16_e32 v5.l, s105 ; encoding: [0x69,0xae,0x0a,0x7e] -v_log_f16 v5, vcc_lo -// GFX12: v_log_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xae,0x0a,0x7e] +v_log_f16 v5.l, vcc_lo +// GFX12: v_log_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xae,0x0a,0x7e] -v_log_f16 v5, vcc_hi -// GFX12: v_log_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xae,0x0a,0x7e] +v_log_f16 v5.l, vcc_hi +// GFX12: v_log_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xae,0x0a,0x7e] -v_log_f16 v5, ttmp15 -// GFX12: v_log_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xae,0x0a,0x7e] +v_log_f16 v5.l, ttmp15 +// GFX12: v_log_f16_e32 v5.l, ttmp15 
; encoding: [0x7b,0xae,0x0a,0x7e] -v_log_f16 v5, m0 -// GFX12: v_log_f16_e32 v5, m0 ; encoding: [0x7d,0xae,0x0a,0x7e] +v_log_f16 v5.l, m0 +// GFX12: v_log_f16_e32 v5.l, m0 ; encoding: [0x7d,0xae,0x0a,0x7e] -v_log_f16 v5, exec_lo -// GFX12: v_log_f16_e32 v5, exec_lo ; encoding: [0x7e,0xae,0x0a,0x7e] +v_log_f16 v5.l, exec_lo +// GFX12: v_log_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xae,0x0a,0x7e] -v_log_f16 v5, exec_hi -// GFX12: v_log_f16_e32 v5, exec_hi ; encoding: [0x7f,0xae,0x0a,0x7e] +v_log_f16 v5.l, exec_hi +// GFX12: v_log_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xae,0x0a,0x7e] -v_log_f16 v5, null -// GFX12: v_log_f16_e32 v5, null ; encoding: [0x7c,0xae,0x0a,0x7e] +v_log_f16 v5.l, null +// GFX12: v_log_f16_e32 v5.l, null ; encoding: [0x7c,0xae,0x0a,0x7e] -v_log_f16 v5, -1 -// GFX12: v_log_f16_e32 v5, -1 ; encoding: [0xc1,0xae,0x0a,0x7e] +v_log_f16 v5.l, -1 +// GFX12: v_log_f16_e32 v5.l, -1 ; encoding: [0xc1,0xae,0x0a,0x7e] -v_log_f16 v5, 0.5 -// GFX12: v_log_f16_e32 v5, 0.5 ; encoding: [0xf0,0xae,0x0a,0x7e] +v_log_f16 v5.l, 0.5 +// GFX12: v_log_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xae,0x0a,0x7e] -v_log_f16 v5, src_scc -// GFX12: v_log_f16_e32 v5, src_scc ; encoding: [0xfd,0xae,0x0a,0x7e] +v_log_f16 v5.l, src_scc +// GFX12: v_log_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xae,0x0a,0x7e] -v_log_f16 v127, 0xfe0b -// GFX12: v_log_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xae,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_log_f16 v127.l, 0xfe0b +// GFX12: v_log_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xae,0xfe,0x7e,0x0b,0xfe,0x00,0x00] v_log_f32 v5, v1 // GFX12: v_log_f32_e32 v5, v1 ; encoding: [0x01,0x4f,0x0a,0x7e] @@ -2758,50 +2758,50 @@ v_permlane64_b32 v255, v255 v_pipeflush // GFX12: v_pipeflush ; encoding: [0x00,0x36,0x00,0x7e] -v_rcp_f16 v5, v1 -// GFX12: v_rcp_f16_e32 v5, v1 ; encoding: [0x01,0xa9,0x0a,0x7e] +v_rcp_f16 v5.l, v1.l +// GFX12: v_rcp_f16_e32 v5.l, v1.l ; encoding: [0x01,0xa9,0x0a,0x7e] -v_rcp_f16 v5, v127 -// GFX12: v_rcp_f16_e32 v5, v127 ; encoding: 
[0x7f,0xa9,0x0a,0x7e] +v_rcp_f16 v5.l, v127.l +// GFX12: v_rcp_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xa9,0x0a,0x7e] -v_rcp_f16 v5, s1 -// GFX12: v_rcp_f16_e32 v5, s1 ; encoding: [0x01,0xa8,0x0a,0x7e] +v_rcp_f16 v5.l, s1 +// GFX12: v_rcp_f16_e32 v5.l, s1 ; encoding: [0x01,0xa8,0x0a,0x7e] -v_rcp_f16 v5, s105 -// GFX12: v_rcp_f16_e32 v5, s105 ; encoding: [0x69,0xa8,0x0a,0x7e] +v_rcp_f16 v5.l, s105 +// GFX12: v_rcp_f16_e32 v5.l, s105 ; encoding: [0x69,0xa8,0x0a,0x7e] -v_rcp_f16 v5, vcc_lo -// GFX12: v_rcp_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xa8,0x0a,0x7e] +v_rcp_f16 v5.l, vcc_lo +// GFX12: v_rcp_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xa8,0x0a,0x7e] -v_rcp_f16 v5, vcc_hi -// GFX12: v_rcp_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xa8,0x0a,0x7e] +v_rcp_f16 v5.l, vcc_hi +// GFX12: v_rcp_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xa8,0x0a,0x7e] -v_rcp_f16 v5, ttmp15 -// GFX12: v_rcp_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xa8,0x0a,0x7e] +v_rcp_f16 v5.l, ttmp15 +// GFX12: v_rcp_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xa8,0x0a,0x7e] -v_rcp_f16 v5, m0 -// GFX12: v_rcp_f16_e32 v5, m0 ; encoding: [0x7d,0xa8,0x0a,0x7e] +v_rcp_f16 v5.l, m0 +// GFX12: v_rcp_f16_e32 v5.l, m0 ; encoding: [0x7d,0xa8,0x0a,0x7e] -v_rcp_f16 v5, exec_lo -// GFX12: v_rcp_f16_e32 v5, exec_lo ; encoding: [0x7e,0xa8,0x0a,0x7e] +v_rcp_f16 v5.l, exec_lo +// GFX12: v_rcp_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xa8,0x0a,0x7e] -v_rcp_f16 v5, exec_hi -// GFX12: v_rcp_f16_e32 v5, exec_hi ; encoding: [0x7f,0xa8,0x0a,0x7e] +v_rcp_f16 v5.l, exec_hi +// GFX12: v_rcp_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xa8,0x0a,0x7e] -v_rcp_f16 v5, null -// GFX12: v_rcp_f16_e32 v5, null ; encoding: [0x7c,0xa8,0x0a,0x7e] +v_rcp_f16 v5.l, null +// GFX12: v_rcp_f16_e32 v5.l, null ; encoding: [0x7c,0xa8,0x0a,0x7e] -v_rcp_f16 v5, -1 -// GFX12: v_rcp_f16_e32 v5, -1 ; encoding: [0xc1,0xa8,0x0a,0x7e] +v_rcp_f16 v5.l, -1 +// GFX12: v_rcp_f16_e32 v5.l, -1 ; encoding: [0xc1,0xa8,0x0a,0x7e] -v_rcp_f16 v5, 0.5 -// GFX12: v_rcp_f16_e32 v5, 0.5 ; 
encoding: [0xf0,0xa8,0x0a,0x7e] +v_rcp_f16 v5.l, 0.5 +// GFX12: v_rcp_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xa8,0x0a,0x7e] -v_rcp_f16 v5, src_scc -// GFX12: v_rcp_f16_e32 v5, src_scc ; encoding: [0xfd,0xa8,0x0a,0x7e] +v_rcp_f16 v5.l, src_scc +// GFX12: v_rcp_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xa8,0x0a,0x7e] -v_rcp_f16 v127, 0xfe0b -// GFX12: v_rcp_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xa8,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_rcp_f16 v127.l, 0xfe0b +// GFX12: v_rcp_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xa8,0xfe,0x7e,0x0b,0xfe,0x00,0x00] v_rcp_f32 v5, v1 // GFX12: v_rcp_f32_e32 v5, v1 ; encoding: [0x01,0x55,0x0a,0x7e] @@ -3073,50 +3073,50 @@ v_rndne_f64 v[5:6], src_scc v_rndne_f64 v[254:255], 0xaf123456 // GFX12: v_rndne_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x32,0xfc,0x7f,0x56,0x34,0x12,0xaf] -v_rsq_f16 v5, v1 -// GFX12: v_rsq_f16_e32 v5, v1 ; encoding: [0x01,0xad,0x0a,0x7e] +v_rsq_f16 v5.l, v1.l +// GFX12: v_rsq_f16_e32 v5.l, v1.l ; encoding: [0x01,0xad,0x0a,0x7e] -v_rsq_f16 v5, v127 -// GFX12: v_rsq_f16_e32 v5, v127 ; encoding: [0x7f,0xad,0x0a,0x7e] +v_rsq_f16 v5.l, v127.l +// GFX12: v_rsq_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xad,0x0a,0x7e] -v_rsq_f16 v5, s1 -// GFX12: v_rsq_f16_e32 v5, s1 ; encoding: [0x01,0xac,0x0a,0x7e] +v_rsq_f16 v5.l, s1 +// GFX12: v_rsq_f16_e32 v5.l, s1 ; encoding: [0x01,0xac,0x0a,0x7e] -v_rsq_f16 v5, s105 -// GFX12: v_rsq_f16_e32 v5, s105 ; encoding: [0x69,0xac,0x0a,0x7e] +v_rsq_f16 v5.l, s105 +// GFX12: v_rsq_f16_e32 v5.l, s105 ; encoding: [0x69,0xac,0x0a,0x7e] -v_rsq_f16 v5, vcc_lo -// GFX12: v_rsq_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xac,0x0a,0x7e] +v_rsq_f16 v5.l, vcc_lo +// GFX12: v_rsq_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xac,0x0a,0x7e] -v_rsq_f16 v5, vcc_hi -// GFX12: v_rsq_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xac,0x0a,0x7e] +v_rsq_f16 v5.l, vcc_hi +// GFX12: v_rsq_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xac,0x0a,0x7e] -v_rsq_f16 v5, ttmp15 -// GFX12: v_rsq_f16_e32 v5, ttmp15 ; encoding: 
[0x7b,0xac,0x0a,0x7e] +v_rsq_f16 v5.l, ttmp15 +// GFX12: v_rsq_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xac,0x0a,0x7e] -v_rsq_f16 v5, m0 -// GFX12: v_rsq_f16_e32 v5, m0 ; encoding: [0x7d,0xac,0x0a,0x7e] +v_rsq_f16 v5.l, m0 +// GFX12: v_rsq_f16_e32 v5.l, m0 ; encoding: [0x7d,0xac,0x0a,0x7e] -v_rsq_f16 v5, exec_lo -// GFX12: v_rsq_f16_e32 v5, exec_lo ; encoding: [0x7e,0xac,0x0a,0x7e] +v_rsq_f16 v5.l, exec_lo +// GFX12: v_rsq_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xac,0x0a,0x7e] -v_rsq_f16 v5, exec_hi -// GFX12: v_rsq_f16_e32 v5, exec_hi ; encoding: [0x7f,0xac,0x0a,0x7e] +v_rsq_f16 v5.l, exec_hi +// GFX12: v_rsq_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xac,0x0a,0x7e] -v_rsq_f16 v5, null -// GFX12: v_rsq_f16_e32 v5, null ; encoding: [0x7c,0xac,0x0a,0x7e] +v_rsq_f16 v5.l, null +// GFX12: v_rsq_f16_e32 v5.l, null ; encoding: [0x7c,0xac,0x0a,0x7e] -v_rsq_f16 v5, -1 -// GFX12: v_rsq_f16_e32 v5, -1 ; encoding: [0xc1,0xac,0x0a,0x7e] +v_rsq_f16 v5.l, -1 +// GFX12: v_rsq_f16_e32 v5.l, -1 ; encoding: [0xc1,0xac,0x0a,0x7e] -v_rsq_f16 v5, 0.5 -// GFX12: v_rsq_f16_e32 v5, 0.5 ; encoding: [0xf0,0xac,0x0a,0x7e] +v_rsq_f16 v5.l, 0.5 +// GFX12: v_rsq_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xac,0x0a,0x7e] -v_rsq_f16 v5, src_scc -// GFX12: v_rsq_f16_e32 v5, src_scc ; encoding: [0xfd,0xac,0x0a,0x7e] +v_rsq_f16 v5.l, src_scc +// GFX12: v_rsq_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xac,0x0a,0x7e] -v_rsq_f16 v127, 0xfe0b -// GFX12: v_rsq_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xac,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_rsq_f16 v127.l, 0xfe0b +// GFX12: v_rsq_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xac,0xfe,0x7e,0x0b,0xfe,0x00,0x00] v_rsq_f32 v5, v1 // GFX12: v_rsq_f32_e32 v5, v1 ; encoding: [0x01,0x5d,0x0a,0x7e] @@ -3334,50 +3334,50 @@ v_sin_f32 v5, src_scc v_sin_f32 v255, 0xaf123456 // GFX12: v_sin_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x6a,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_sqrt_f16 v5, v1 -// GFX12: v_sqrt_f16_e32 v5, v1 ; encoding: [0x01,0xab,0x0a,0x7e] +v_sqrt_f16 v5.l, v1.l +// GFX12: 
v_sqrt_f16_e32 v5.l, v1.l ; encoding: [0x01,0xab,0x0a,0x7e] -v_sqrt_f16 v5, v127 -// GFX12: v_sqrt_f16_e32 v5, v127 ; encoding: [0x7f,0xab,0x0a,0x7e] +v_sqrt_f16 v5.l, v127.l +// GFX12: v_sqrt_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xab,0x0a,0x7e] -v_sqrt_f16 v5, s1 -// GFX12: v_sqrt_f16_e32 v5, s1 ; encoding: [0x01,0xaa,0x0a,0x7e] +v_sqrt_f16 v5.l, s1 +// GFX12: v_sqrt_f16_e32 v5.l, s1 ; encoding: [0x01,0xaa,0x0a,0x7e] -v_sqrt_f16 v5, s105 -// GFX12: v_sqrt_f16_e32 v5, s105 ; encoding: [0x69,0xaa,0x0a,0x7e] +v_sqrt_f16 v5.l, s105 +// GFX12: v_sqrt_f16_e32 v5.l, s105 ; encoding: [0x69,0xaa,0x0a,0x7e] -v_sqrt_f16 v5, vcc_lo -// GFX12: v_sqrt_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xaa,0x0a,0x7e] +v_sqrt_f16 v5.l, vcc_lo +// GFX12: v_sqrt_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xaa,0x0a,0x7e] -v_sqrt_f16 v5, vcc_hi -// GFX12: v_sqrt_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xaa,0x0a,0x7e] +v_sqrt_f16 v5.l, vcc_hi +// GFX12: v_sqrt_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xaa,0x0a,0x7e] -v_sqrt_f16 v5, ttmp15 -// GFX12: v_sqrt_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xaa,0x0a,0x7e] +v_sqrt_f16 v5.l, ttmp15 +// GFX12: v_sqrt_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xaa,0x0a,0x7e] -v_sqrt_f16 v5, m0 -// GFX12: v_sqrt_f16_e32 v5, m0 ; encoding: [0x7d,0xaa,0x0a,0x7e] +v_sqrt_f16 v5.l, m0 +// GFX12: v_sqrt_f16_e32 v5.l, m0 ; encoding: [0x7d,0xaa,0x0a,0x7e] -v_sqrt_f16 v5, exec_lo -// GFX12: v_sqrt_f16_e32 v5, exec_lo ; encoding: [0x7e,0xaa,0x0a,0x7e] +v_sqrt_f16 v5.l, exec_lo +// GFX12: v_sqrt_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0xaa,0x0a,0x7e] -v_sqrt_f16 v5, exec_hi -// GFX12: v_sqrt_f16_e32 v5, exec_hi ; encoding: [0x7f,0xaa,0x0a,0x7e] +v_sqrt_f16 v5.l, exec_hi +// GFX12: v_sqrt_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0xaa,0x0a,0x7e] -v_sqrt_f16 v5, null -// GFX12: v_sqrt_f16_e32 v5, null ; encoding: [0x7c,0xaa,0x0a,0x7e] +v_sqrt_f16 v5.l, null +// GFX12: v_sqrt_f16_e32 v5.l, null ; encoding: [0x7c,0xaa,0x0a,0x7e] -v_sqrt_f16 v5, -1 -// GFX12: v_sqrt_f16_e32 v5, -1 ; encoding: 
[0xc1,0xaa,0x0a,0x7e] +v_sqrt_f16 v5.l, -1 +// GFX12: v_sqrt_f16_e32 v5.l, -1 ; encoding: [0xc1,0xaa,0x0a,0x7e] -v_sqrt_f16 v5, 0.5 -// GFX12: v_sqrt_f16_e32 v5, 0.5 ; encoding: [0xf0,0xaa,0x0a,0x7e] +v_sqrt_f16 v5.l, 0.5 +// GFX12: v_sqrt_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xaa,0x0a,0x7e] -v_sqrt_f16 v5, src_scc -// GFX12: v_sqrt_f16_e32 v5, src_scc ; encoding: [0xfd,0xaa,0x0a,0x7e] +v_sqrt_f16 v5.l, src_scc +// GFX12: v_sqrt_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xaa,0x0a,0x7e] -v_sqrt_f16 v127, 0xfe0b -// GFX12: v_sqrt_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xaa,0xfe,0x7e,0x0b,0xfe,0x00,0x00] +v_sqrt_f16 v127.l, 0xfe0b +// GFX12: v_sqrt_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xaa,0xfe,0x7e,0x0b,0xfe,0x00,0x00] v_sqrt_f32 v5, v1 // GFX12: v_sqrt_f32_e32 v5, v1 ; encoding: [0x01,0x67,0x0a,0x7e] From 8ac140f390847e4e85e0a4fd910baaf46e5d115b Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Wed, 21 Aug 2024 17:01:05 +0800 Subject: [PATCH 037/426] [Clang][NFCI] Cleanup the fix for default function argument substitution (#104911) (This is one step towards tweaking `getTemplateInstantiationArgs()` as discussed in https://github.com/llvm/llvm-project/pull/102922) We don't always substitute into default arguments while transforming a function parameter. In that case, we would preserve the uninstantiated expression until after, e.g. building up a CXXDefaultArgExpr and instantiate the expression there. For member function instantiation, this algorithm used to cause a problem in that the default argument of an out-of-line member function specialization couldn't get properly instantiated. This is because, in `getTemplateInstantiationArgs()`, we would give up visiting a function's declaration context if the function is a specialization of a member template. 
For example, ```cpp template struct S { template void f(T = sizeof(T)); }; template <> template void S::f(int) {} ``` The default argument `sizeof(U)` that lexically appears inside the declaration would be copied to the function declaration in the class template specialization `S`, as well as to the function's out-of-line definition. We use template arguments collected from the out-of-line function definition when substituting into the default arguments. We would therefore give up the traversal after the function, resulting in a single-level template argument of the `f` itself. However the default argument here could still reference the template parameters of the primary template, hence the error. In fact, this is similar to constraint checking in some respects: we actually want the "whole" template arguments relative to the primary template, not those relative to the function definition. So this patch adds another flag to indicate `getTemplateInstantiationArgs()` for that. This patch also consolidates the tests for default arguments and removes some unnecessary tests. --- clang/include/clang/Sema/Sema.h | 9 +++++++- clang/lib/Sema/SemaTemplateInstantiate.cpp | 23 +++++++------------ .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 10 ++++---- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 299a916b9abf8d..1f7e555d1b8717 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -13071,12 +13071,19 @@ class Sema final : public SemaBase { /// ForConstraintInstantiation indicates we should continue looking when /// encountering a lambda generic call operator, and continue looking for /// arguments on an enclosing class template. + /// + /// \param SkipForSpecialization when specified, any template specializations + /// in a traversal would be ignored. 
+ /// \param ForDefaultArgumentSubstitution indicates we should continue looking + /// when encountering a specialized member function template, rather than + /// returning immediately. MultiLevelTemplateArgumentList getTemplateInstantiationArgs( const NamedDecl *D, const DeclContext *DC = nullptr, bool Final = false, std::optional> Innermost = std::nullopt, bool RelativeToPrimary = false, const FunctionDecl *Pattern = nullptr, bool ForConstraintInstantiation = false, - bool SkipForSpecialization = false); + bool SkipForSpecialization = false, + bool ForDefaultArgumentSubstitution = false); /// RAII object to handle the state changes required to synthesize /// a function body. diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 9a6cd2cd0ab751..fd90f83f3976ca 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -256,7 +256,8 @@ HandleClassTemplateSpec(const ClassTemplateSpecializationDecl *ClassTemplSpec, Response HandleFunction(Sema &SemaRef, const FunctionDecl *Function, MultiLevelTemplateArgumentList &Result, const FunctionDecl *Pattern, bool RelativeToPrimary, - bool ForConstraintInstantiation) { + bool ForConstraintInstantiation, + bool ForDefaultArgumentSubstitution) { // Add template arguments from a function template specialization. if (!RelativeToPrimary && Function->getTemplateSpecializationKindForInstantiation() == @@ -286,7 +287,8 @@ Response HandleFunction(Sema &SemaRef, const FunctionDecl *Function, // If this function was instantiated from a specialized member that is // a function template, we're done. assert(Function->getPrimaryTemplate() && "No function template?"); - if (Function->getPrimaryTemplate()->isMemberSpecialization()) + if (!ForDefaultArgumentSubstitution && + Function->getPrimaryTemplate()->isMemberSpecialization()) return Response::Done(); // If this function is a generic lambda specialization, we are done. 
@@ -468,7 +470,7 @@ MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs( const NamedDecl *ND, const DeclContext *DC, bool Final, std::optional> Innermost, bool RelativeToPrimary, const FunctionDecl *Pattern, bool ForConstraintInstantiation, - bool SkipForSpecialization) { + bool SkipForSpecialization, bool ForDefaultArgumentSubstitution) { assert((ND || DC) && "Can't find arguments for a decl if one isn't provided"); // Accumulate the set of template argument lists in this structure. MultiLevelTemplateArgumentList Result; @@ -510,7 +512,8 @@ MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs( SkipForSpecialization); } else if (const auto *Function = dyn_cast(CurDecl)) { R = HandleFunction(*this, Function, Result, Pattern, RelativeToPrimary, - ForConstraintInstantiation); + ForConstraintInstantiation, + ForDefaultArgumentSubstitution); } else if (const auto *Rec = dyn_cast(CurDecl)) { R = HandleRecordDecl(*this, Rec, Result, Context, ForConstraintInstantiation); @@ -3227,7 +3230,6 @@ bool Sema::SubstDefaultArgument( // default argument expression appears. 
ContextRAII SavedContext(*this, FD); std::unique_ptr LIS; - MultiLevelTemplateArgumentList NewTemplateArgs = TemplateArgs; if (ForCallExpr) { // When instantiating a default argument due to use in a call expression, @@ -3240,19 +3242,10 @@ bool Sema::SubstDefaultArgument( /*ForDefinition*/ false); if (addInstantiatedParametersToScope(FD, PatternFD, *LIS, TemplateArgs)) return true; - const FunctionTemplateDecl *PrimaryTemplate = FD->getPrimaryTemplate(); - if (PrimaryTemplate && PrimaryTemplate->isOutOfLine()) { - TemplateArgumentList *CurrentTemplateArgumentList = - TemplateArgumentList::CreateCopy(getASTContext(), - TemplateArgs.getInnermost()); - NewTemplateArgs = getTemplateInstantiationArgs( - FD, FD->getDeclContext(), /*Final=*/false, - CurrentTemplateArgumentList->asArray(), /*RelativeToPrimary=*/true); - } } runWithSufficientStackSpace(Loc, [&] { - Result = SubstInitializer(PatternExpr, NewTemplateArgs, + Result = SubstInitializer(PatternExpr, TemplateArgs, /*DirectInit*/ false); }); } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index a58854acb21fa5..0e064be2391838 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -4699,10 +4699,12 @@ bool Sema::InstantiateDefaultArgument(SourceLocation CallLoc, FunctionDecl *FD, // // template // A Foo(int a = A::FooImpl()); - MultiLevelTemplateArgumentList TemplateArgs = - getTemplateInstantiationArgs(FD, FD->getLexicalDeclContext(), - /*Final=*/false, /*Innermost=*/std::nullopt, - /*RelativeToPrimary=*/true); + MultiLevelTemplateArgumentList TemplateArgs = getTemplateInstantiationArgs( + FD, FD->getLexicalDeclContext(), + /*Final=*/false, /*Innermost=*/std::nullopt, + /*RelativeToPrimary=*/true, /*Pattern=*/nullptr, + /*ForConstraintInstantiation=*/false, /*SkipForSpecialization=*/false, + /*ForDefaultArgumentSubstitution=*/true); if (SubstDefaultArgument(CallLoc, Param, TemplateArgs, 
/*ForCallExpr*/ true)) return true; From 6b8c194f8587945e063691992068f1f821837769 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 21 Aug 2024 09:03:31 +0000 Subject: [PATCH 038/426] [gn build] Port 6c189eaea994 --- llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn index 2ffe83da90eed7..57570de8813751 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn @@ -165,6 +165,7 @@ static_library("LLVMAArch64CodeGen") { "GISel/AArch64PreLegalizerCombiner.cpp", "GISel/AArch64RegisterBankInfo.cpp", "SMEABIPass.cpp", + "SMEPeepholeOpt.cpp", "SVEIntrinsicOpts.cpp", ] } From 3083459c1d7a723e946db99a5794f33242ba1402 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 21 Aug 2024 11:47:48 +0200 Subject: [PATCH 039/426] [bazel] Port a3d41879ecf5690a73f9226951d3856c7faa34a4 --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index de069daf603f1e..57b08448ae9294 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -257,6 +257,14 @@ gentbl_cc_library( ["--gen-typedef-defs"], "include/mlir/IR/BuiltinTypes.cpp.inc", ), + ( + ["-gen-type-constraint-decls"], + "include/mlir/IR/BuiltinTypeConstraints.h.inc", + ), + ( + ["-gen-type-constraint-defs"], + "include/mlir/IR/BuiltinTypeConstraints.cpp.inc", + ), ], tblgen = ":mlir-tblgen", td_file = "include/mlir/IR/BuiltinTypes.td", From 297bb467acd31447d64f0540835127d50408e87d Mon Sep 17 00:00:00 2001 From: Edd Dawson Date: Wed, 21 Aug 2024 10:53:45 +0100 Subject: [PATCH 040/426] [PS5][Driver] Link main components with -pie by default (#102901) The PS5 
linker currently forces `-pie` for typical link jobs. Have the driver pass `pie` under the same conditions. With this change we can remove our private linker patch and also allow `-no-pie` to have an effect. SIE tracker: TOOLCHAIN-16704 --- clang/lib/Driver/ToolChains/PS4CPU.cpp | 5 ++++- clang/test/Driver/ps5-linker.c | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp index 0175b5eb63e657..22103eb50803a5 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -237,7 +237,10 @@ void tools::PS5cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (!D.SysRoot.empty()) CmdArgs.push_back(Args.MakeArgString("--sysroot=" + D.SysRoot)); - if (Args.hasArg(options::OPT_pie)) + // Default to PIE for non-static executables. + const bool PIE = + !Args.hasArg(options::OPT_r, options::OPT_shared, options::OPT_static); + if (Args.hasFlag(options::OPT_pie, options::OPT_no_pie, PIE)) CmdArgs.push_back("-pie"); if (Args.hasArg(options::OPT_static)) diff --git a/clang/test/Driver/ps5-linker.c b/clang/test/Driver/ps5-linker.c index 95d64d9017be04..c462e5a178e4a6 100644 --- a/clang/test/Driver/ps5-linker.c +++ b/clang/test/Driver/ps5-linker.c @@ -1,3 +1,19 @@ +// Test that PIE is the default for main components + +// RUN: %clang --target=x86_64-scei-ps5 %s -### 2>&1 | FileCheck --check-prefixes=CHECK-PIE %s + +// CHECK-PIE: {{ld(\.exe)?}}" +// CHECK-PIE-SAME: "-pie" + +// RUN: %clang --target=x86_64-scei-ps5 -no-pie %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-PIE %s +// RUN: %clang --target=x86_64-scei-ps5 -r %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-PIE %s +// RUN: %clang --target=x86_64-scei-ps5 -shared %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-PIE,CHECK-SHARED %s +// RUN: %clang --target=x86_64-scei-ps5 -static %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-PIE %s + +// CHECK-NO-PIE: 
{{ld(\.exe)?}}" +// CHECK-NO-PIE-NOT: "-pie" +// CHECK-SHARED: "--shared" + // Test that -static is forwarded to the linker // RUN: %clang --target=x86_64-scei-ps5 -static %s -### 2>&1 | FileCheck --check-prefixes=CHECK-STATIC %s From a105877646d68e48cdeeeadd9d1e075dc3c5d68d Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 21 Aug 2024 12:02:54 +0200 Subject: [PATCH 041/426] [InstCombine] Remove some of the complexity-based canonicalization (#91185) The idea behind this canonicalization is that it allows us to handle less patterns, because we know that some will be canonicalized away. This is indeed very useful to e.g. know that constants are always on the right. However, this is only useful if the canonicalization is actually reliable. This is the case for constants, but not for arguments: Moving these to the right makes it look like the "more complex" expression is guaranteed to be on the left, but this is not actually the case in practice. It fails as soon as you replace the argument with another instruction. The end result is that it looks like things correctly work in tests, while they actually don't. We use the "thwart complexity-based canonicalization" trick to handle this in tests, but it's often a challenge for new contributors to get this right, and based on the regressions this PR originally exposed, we clearly don't get this right in many cases. For this reason, I think that it's better to remove this complexity canonicalization. It will make it much easier to write tests for commuted cases and make sure that they are handled. 
--- .../acle_sve_ld1-bfloat.c | 4 +- .../aarch64-sve-intrinsics/acle_sve_ld1.c | 44 +-- .../aarch64-sve-intrinsics/acle_sve_ld1sb.c | 26 +- .../aarch64-sve-intrinsics/acle_sve_ld1sh.c | 18 +- .../aarch64-sve-intrinsics/acle_sve_ld1sw.c | 10 +- .../aarch64-sve-intrinsics/acle_sve_ld1ub.c | 26 +- .../aarch64-sve-intrinsics/acle_sve_ld1uh.c | 18 +- .../aarch64-sve-intrinsics/acle_sve_ld1uw.c | 10 +- .../acle_sve_st1-bfloat.c | 4 +- .../aarch64-sve-intrinsics/acle_sve_st1.c | 46 +-- .../aarch64-sve-intrinsics/acle_sve_st1b.c | 14 +- .../aarch64-sve-intrinsics/acle_sve_st1h.c | 10 +- .../aarch64-sve-intrinsics/acle_sve_st1w.c | 6 +- .../acle_sve2p1_loads.c | 144 +++++----- .../acle_sve2p1_store.c | 144 +++++----- clang/test/CodeGen/attr-counted-by.c | 8 +- clang/test/CodeGen/fp-reassoc-pragma.cpp | 6 +- clang/test/CodeGen/fp-reciprocal-pragma.cpp | 12 +- clang/test/CodeGen/ms-mixed-ptr-sizes.c | 16 +- clang/test/Headers/wasm.c | 2 +- .../Transforms/InstCombine/InstCombiner.h | 25 +- .../InstCombine/InstCombineAddSub.cpp | 4 +- .../ValueTracking/known-power-of-two-urem.ll | 22 +- .../ValueTracking/known-power-of-two.ll | 60 ++-- .../knownbits-and-or-xor-lowbit.ll | 16 +- .../ValueTracking/knownbits-bmi-pattern.ll | 16 +- .../Analysis/ValueTracking/phi-known-bits.ll | 2 +- .../amdgpu-simplify-libcall-pow-codegen.ll | 8 +- .../AMDGPU/amdgpu-simplify-libcall-pow.ll | 46 +-- .../AMDGPU/amdgpu-simplify-libcall-pown.ll | 18 +- .../AMDGPU/amdgpu-simplify-libcall-powr.ll | 8 +- .../IndVarSimplify/rewrite-loop-exit-value.ll | 6 +- ...004-11-27-SetCCForCastLargerAndConstant.ll | 26 +- .../InstCombine/2010-11-23-Distributed.ll | 2 +- llvm/test/Transforms/InstCombine/abs-1.ll | 4 +- .../Transforms/InstCombine/add-mask-neg.ll | 6 +- llvm/test/Transforms/InstCombine/add.ll | 44 +-- llvm/test/Transforms/InstCombine/add2.ll | 2 +- .../test/Transforms/InstCombine/add_or_sub.ll | 8 +- .../InstCombine/and-or-icmp-const-icmp.ll | 2 +- .../Transforms/InstCombine/and-or-icmps.ll | 110 
++++---- .../test/Transforms/InstCombine/and-or-not.ll | 10 +- llvm/test/Transforms/InstCombine/and-or.ll | 26 +- .../Transforms/InstCombine/and-xor-merge.ll | 2 +- .../test/Transforms/InstCombine/and-xor-or.ll | 222 +++++++-------- llvm/test/Transforms/InstCombine/and.ll | 54 ++-- .../InstCombine/apint-and-xor-merge.ll | 2 +- llvm/test/Transforms/InstCombine/apint-or.ll | 4 +- .../Transforms/InstCombine/apint-shift.ll | 2 +- llvm/test/Transforms/InstCombine/apint-sub.ll | 2 +- llvm/test/Transforms/InstCombine/ashr-lshr.ll | 24 +- .../Transforms/InstCombine/assume-align.ll | 2 +- .../InstCombine/assume-separate_storage.ll | 2 +- llvm/test/Transforms/InstCombine/avg-lsb.ll | 4 +- .../InstCombine/binop-and-shifts.ll | 38 +-- .../test/Transforms/InstCombine/binop-cast.ll | 8 +- .../test/Transforms/InstCombine/bit-checks.ll | 44 +-- .../InstCombine/bitcast-inseltpoison.ll | 8 +- llvm/test/Transforms/InstCombine/bitcast.ll | 12 +- .../test/Transforms/InstCombine/bitreverse.ll | 6 +- .../test/Transforms/InstCombine/bswap-fold.ll | 10 +- .../test/Transforms/InstCombine/call-guard.ll | 2 +- ...nt-low-bit-mask-and-icmp-eq-to-icmp-ule.ll | 2 +- ...nt-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll | 2 +- ...t-low-bit-mask-and-icmp-uge-to-icmp-ule.ll | 2 +- ...t-low-bit-mask-and-icmp-ult-to-icmp-ugt.ll | 2 +- ...ze-low-bit-mask-and-icmp-eq-to-icmp-ule.ll | 12 +- ...ze-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll | 12 +- ...low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll | 24 +- ...low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll | 24 +- ...low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll | 10 +- ...low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll | 10 +- ...low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll | 12 +- ...low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll | 12 +- .../Transforms/InstCombine/cast-mul-select.ll | 4 +- llvm/test/Transforms/InstCombine/cast.ll | 6 +- llvm/test/Transforms/InstCombine/cast_phi.ll | 2 +- llvm/test/Transforms/InstCombine/cast_ptr.ll | 10 +- .../Transforms/InstCombine/cmp-x-vs-neg-x.ll | 4 
+- .../InstCombine/conditional-negation.ll | 18 +- .../test/Transforms/InstCombine/ctpop-cttz.ll | 2 +- .../test/Transforms/InstCombine/ctpop-pow2.ll | 6 +- llvm/test/Transforms/InstCombine/cttz.ll | 4 +- llvm/test/Transforms/InstCombine/demorgan.ll | 20 +- .../Transforms/InstCombine/dependent-ivs.ll | 4 +- .../InstCombine/fadd-fsub-factor.ll | 38 +-- llvm/test/Transforms/InstCombine/fadd.ll | 10 +- .../Transforms/InstCombine/fast-basictest.ll | 20 +- llvm/test/Transforms/InstCombine/fast-math.ll | 6 +- llvm/test/Transforms/InstCombine/fcmp.ll | 2 +- llvm/test/Transforms/InstCombine/fdiv-sqrt.ll | 6 +- llvm/test/Transforms/InstCombine/fdiv.ll | 8 +- .../InstCombine/float-shrink-compare.ll | 28 +- llvm/test/Transforms/InstCombine/fmul.ll | 20 +- .../InstCombine/fold-ext-eq-c-with-op.ll | 4 +- ...c-of-add-of-not-x-and-y-to-sub-x-from-y.ll | 6 +- .../InstCombine/fold-select-fmul-if-zero.ll | 32 +-- .../InstCombine/fold-signbit-test-power2.ll | 2 +- llvm/test/Transforms/InstCombine/fpextend.ll | 8 +- llvm/test/Transforms/InstCombine/fptrunc.ll | 2 +- .../Transforms/InstCombine/free-inversion.ll | 10 +- llvm/test/Transforms/InstCombine/fsh.ll | 4 +- llvm/test/Transforms/InstCombine/fsub.ll | 46 +-- llvm/test/Transforms/InstCombine/funnel.ll | 4 +- .../Transforms/InstCombine/getelementptr.ll | 4 +- .../hoist-negation-out-of-bias-calculation.ll | 16 +- ...hoist-xor-by-constant-from-xor-by-value.ll | 2 +- llvm/test/Transforms/InstCombine/icmp-add.ll | 66 ++--- .../InstCombine/icmp-and-add-sub-xor-p2.ll | 24 +- .../InstCombine/icmp-and-lowbit-mask.ll | 44 +-- .../Transforms/InstCombine/icmp-and-shift.ll | 10 +- .../Transforms/InstCombine/icmp-custom-dl.ll | 2 +- .../InstCombine/icmp-equality-rotate.ll | 8 +- .../InstCombine/icmp-equality-xor.ll | 2 +- .../Transforms/InstCombine/icmp-ext-ext.ll | 26 +- llvm/test/Transforms/InstCombine/icmp-gep.ll | 12 +- .../Transforms/InstCombine/icmp-mul-zext.ll | 10 +- llvm/test/Transforms/InstCombine/icmp-mul.ll | 8 +- 
.../Transforms/InstCombine/icmp-of-and-x.ll | 8 +- .../Transforms/InstCombine/icmp-of-or-x.ll | 10 +- .../InstCombine/icmp-of-trunc-ext.ll | 46 +-- .../Transforms/InstCombine/icmp-of-xor-x.ll | 42 +-- .../icmp-or-of-select-with-zero.ll | 2 +- llvm/test/Transforms/InstCombine/icmp-or.ll | 2 +- .../test/Transforms/InstCombine/icmp-range.ll | 66 ++--- .../Transforms/InstCombine/icmp-rotate.ll | 2 +- .../icmp-select-implies-common-op.ll | 40 +-- .../Transforms/InstCombine/icmp-select.ll | 8 +- llvm/test/Transforms/InstCombine/icmp-sub.ll | 4 +- ...al-to-icmp-eq-of-lshr-val-by-bits-and-0.ll | 10 +- ...al-to-icmp-ne-of-lshr-val-by-bits-and-0.ll | 10 +- llvm/test/Transforms/InstCombine/icmp.ll | 56 ++-- llvm/test/Transforms/InstCombine/implies.ll | 4 +- ...rt-variable-mask-in-masked-merge-scalar.ll | 2 +- ...rt-variable-mask-in-masked-merge-vector.ll | 2 +- llvm/test/Transforms/InstCombine/ispow2.ll | 14 +- .../test/Transforms/InstCombine/known-bits.ll | 6 +- .../Transforms/InstCombine/known-never-nan.ll | 2 +- llvm/test/Transforms/InstCombine/ldexp-ext.ll | 16 +- llvm/test/Transforms/InstCombine/log-pow.ll | 6 +- .../logical-select-inseltpoison.ll | 22 +- .../Transforms/InstCombine/logical-select.ll | 50 ++-- .../InstCombine/lshr-and-negC-icmpeq-zero.ll | 2 +- llvm/test/Transforms/InstCombine/lshr.ll | 4 +- .../InstCombine/masked-merge-add.ll | 16 +- .../InstCombine/masked-merge-and-of-ors.ll | 42 +-- .../Transforms/InstCombine/masked-merge-or.ll | 16 +- .../InstCombine/masked-merge-xor.ll | 40 +-- .../Transforms/InstCombine/minmax-fold.ll | 10 +- .../Transforms/InstCombine/minmax-of-xor-x.ll | 20 +- .../Transforms/InstCombine/mul-masked-bits.ll | 6 +- llvm/test/Transforms/InstCombine/mul-pow2.ll | 2 +- llvm/test/Transforms/InstCombine/mul.ll | 8 +- llvm/test/Transforms/InstCombine/mul_fold.ll | 12 +- .../Transforms/InstCombine/mul_full_64.ll | 4 +- llvm/test/Transforms/InstCombine/not-add.ll | 8 +- llvm/test/Transforms/InstCombine/not.ll | 48 ++-- 
.../Transforms/InstCombine/onehot_merge.ll | 48 ++-- .../test/Transforms/InstCombine/or-xor-xor.ll | 4 +- llvm/test/Transforms/InstCombine/or-xor.ll | 38 +-- llvm/test/Transforms/InstCombine/or.ll | 20 +- ...nput-masking-after-truncation-variant-b.ll | 6 +- ...dant-left-shift-input-masking-variant-b.ll | 2 +- llvm/test/Transforms/InstCombine/phi.ll | 10 +- llvm/test/Transforms/InstCombine/pr44242.ll | 8 +- llvm/test/Transforms/InstCombine/pr49688.ll | 4 +- llvm/test/Transforms/InstCombine/pr75369.ll | 2 +- .../InstCombine/ptr-int-ptr-icmp.ll | 14 +- llvm/test/Transforms/InstCombine/ptrmask.ll | 22 +- .../Transforms/InstCombine/range-check.ll | 44 +-- .../Transforms/InstCombine/reassociate-nuw.ll | 8 +- ...nput-masking-after-truncation-variant-b.ll | 10 +- ...dant-left-shift-input-masking-variant-b.ll | 20 +- llvm/test/Transforms/InstCombine/rem.ll | 22 +- ...f-negative-is-non-zero-and-no-underflow.ll | 36 +-- ...ve-or-zero-is-non-zero-and-no-underflow.ll | 32 +-- ...ult-of-usub-is-non-zero-and-no-overflow.ll | 56 ++-- .../InstCombine/saturating-add-sub.ll | 26 +- .../InstCombine/scalarization-inseltpoison.ll | 12 +- .../Transforms/InstCombine/scalarization.ll | 12 +- .../Transforms/InstCombine/select-and-or.ll | 26 +- .../InstCombine/select-binop-cmp.ll | 2 +- .../select-binop-foldable-floating-point.ll | 24 +- .../InstCombine/select-cmp-eq-op-fold.ll | 2 +- .../test/Transforms/InstCombine/select-cmp.ll | 58 ++-- .../InstCombine/select-ctlz-to-cttz.ll | 12 +- .../Transforms/InstCombine/select-divrem.ll | 2 +- .../InstCombine/select-factorize.ll | 24 +- .../InstCombine/select-masked_gather.ll | 2 +- .../InstCombine/select-masked_load.ll | 2 +- .../InstCombine/select-of-bittest.ll | 18 +- .../InstCombine/select-safe-transforms.ll | 4 +- .../InstCombine/select-with-bitwise-ops.ll | 86 +++--- llvm/test/Transforms/InstCombine/select.ll | 38 +-- .../Transforms/InstCombine/select_meta.ll | 10 +- llvm/test/Transforms/InstCombine/set.ll | 4 +- 
llvm/test/Transforms/InstCombine/shift-add.ll | 12 +- ...ciation-in-bittest-with-truncation-lshr.ll | 2 +- ...ociation-in-bittest-with-truncation-shl.ll | 8 +- .../shift-direction-in-bit-test.ll | 4 +- .../Transforms/InstCombine/shift-logic.ll | 6 +- llvm/test/Transforms/InstCombine/shift.ll | 10 +- llvm/test/Transforms/InstCombine/shl-bo.ll | 32 +-- .../Transforms/InstCombine/shuffle-binop.ll | 4 +- .../InstCombine/signed-truncation-check.ll | 4 +- .../InstCombine/simplify-demanded-fpclass.ll | 2 +- .../InstCombine/sink-not-into-and.ll | 2 +- .../InstCombine/sink-not-into-or.ll | 2 +- llvm/test/Transforms/InstCombine/smax-icmp.ll | 8 +- llvm/test/Transforms/InstCombine/smin-icmp.ll | 8 +- .../InstCombine/sub-ashr-or-to-icmp-select.ll | 4 +- llvm/test/Transforms/InstCombine/sub-gep.ll | 2 +- .../InstCombine/sub-lshr-or-to-icmp-select.ll | 2 +- .../test/Transforms/InstCombine/sub-minmax.ll | 10 +- llvm/test/Transforms/InstCombine/sub-not.ll | 16 +- .../sub-of-negatible-inseltpoison.ll | 16 +- .../InstCombine/sub-of-negatible.ll | 18 +- .../Transforms/InstCombine/sub-xor-cmp.ll | 8 +- llvm/test/Transforms/InstCombine/sub.ll | 26 +- .../Transforms/InstCombine/trunc-binop-ext.ll | 40 +-- llvm/test/Transforms/InstCombine/uaddo.ll | 20 +- llvm/test/Transforms/InstCombine/umax-icmp.ll | 8 +- llvm/test/Transforms/InstCombine/umin-icmp.ll | 8 +- .../unordered-compare-and-ordered.ll | 8 +- ...gned-add-lack-of-overflow-check-via-add.ll | 2 +- ...gned-add-lack-of-overflow-check-via-xor.ll | 22 +- .../unsigned-add-lack-of-overflow-check.ll | 12 +- .../unsigned-add-overflow-check-via-add.ll | 4 +- .../unsigned-add-overflow-check-via-xor.ll | 22 +- .../unsigned-add-overflow-check.ll | 12 +- .../unsigned-sub-lack-of-overflow-check.ll | 2 +- .../unsigned-sub-overflow-check.ll | 2 +- .../InstCombine/vec_demanded_elts.ll | 6 +- .../InstCombine/vec_shuffle-inseltpoison.ll | 14 +- .../Transforms/InstCombine/vec_shuffle.ll | 14 +- .../Transforms/InstCombine/vector-reverse.ll | 2 +- 
.../test/Transforms/InstCombine/vector-xor.ll | 8 +- .../InstCombine/widenable-conditions.ll | 16 +- llvm/test/Transforms/InstCombine/xor.ll | 34 +-- llvm/test/Transforms/InstCombine/xor2.ll | 32 +-- .../InstCombine/zext-bool-add-sub.ll | 16 +- .../Transforms/InstCombine/zext-or-icmp.ll | 2 +- llvm/test/Transforms/InstCombine/zext.ll | 8 +- .../AArch64/deterministic-type-shrinkage.ll | 2 +- .../AArch64/sve-cond-inv-loads.ll | 18 +- .../AArch64/sve-gather-scatter.ll | 30 +- .../LoopVectorize/AArch64/sve-inductions.ll | 2 +- .../AArch64/sve-interleaved-accesses.ll | 2 +- .../AArch64/sve-vector-reverse.ll | 136 ++++----- .../LoopVectorize/AArch64/sve-widen-phi.ll | 26 +- .../AArch64/vector-reverse-mask4.ll | 10 +- .../Transforms/LoopVectorize/ARM/mve-qabs.ll | 74 ++--- .../LoopVectorize/ARM/mve-reductions.ll | 14 +- .../LoopVectorize/ARM/mve-selectandorcost.ll | 64 ++--- .../LoopVectorize/ARM/pointer_iv.ll | 264 +++++++++--------- .../ARM/tail-fold-multiple-icmps.ll | 36 +-- .../X86/invariant-load-gather.ll | 4 +- .../X86/invariant-store-vectorization.ll | 20 +- .../Transforms/LoopVectorize/X86/pr23997.ll | 2 +- .../LoopVectorize/extract-last-veclane.ll | 4 +- .../LoopVectorize/float-induction.ll | 60 ++-- .../LoopVectorize/if-conversion-nest.ll | 6 +- .../Transforms/LoopVectorize/induction.ll | 114 ++++---- .../LoopVectorize/interleaved-accesses.ll | 2 +- .../invariant-store-vectorization-2.ll | 14 +- .../invariant-store-vectorization.ll | 28 +- .../LoopVectorize/reduction-inloop-cond.ll | 18 +- .../LoopVectorize/reduction-inloop.ll | 26 +- .../Transforms/LoopVectorize/reduction.ll | 22 +- .../Transforms/LoopVectorize/runtime-check.ll | 8 +- .../LoopVectorize/scalable-inductions.ll | 8 +- .../uniform-args-call-variants.ll | 4 +- llvm/test/Transforms/PGOProfile/chr.ll | 14 +- .../AArch64/hoist-runtime-checks.ll | 8 +- ...ting-sinking-required-for-vectorization.ll | 14 +- .../AArch64/matrix-extract-insert.ll | 16 +- ...ple-unreachable-exits-for-vectorization.ll | 2 +- 
.../PhaseOrdering/AArch64/quant_4x4.ll | 8 +- .../PhaseOrdering/ARM/arm_mult_q15.ll | 2 +- .../X86/hoist-load-of-baseptr.ll | 4 +- .../PhaseOrdering/X86/speculation-vs-tbaa.ll | 2 +- .../X86/vector-reductions-logical.ll | 4 +- .../PhaseOrdering/fast-basictest.ll | 2 +- .../PhaseOrdering/reassociate-instcombine.ll | 4 +- .../PhaseOrdering/runtime-check-removal.ll | 2 +- .../Reassociate/fast-ArrayOutOfBounds.ll | 12 +- .../Reassociate/fast-SubReassociate.ll | 6 +- .../X86/cmp_commute-inseltpoison.ll | 52 ++-- .../SLPVectorizer/X86/cmp_commute.ll | 52 ++-- 289 files changed, 2680 insertions(+), 2681 deletions(-) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1-bfloat.c index cbc645d429e5cb..aaf4e652cd1456 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1-bfloat.c @@ -44,7 +44,7 @@ svbfloat16_t test_svld1_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] @@ -54,7 +54,7 @@ svbfloat16_t test_svld1_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], 
[[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1.c index 0c5ab6c9aea9f9..276ef64736bc33 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1.c @@ -209,7 +209,7 @@ svfloat64_t test_svld1_f64(svbool_t pg, const float64_t *base) MODE_ATTR // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, [[PG:%.*]], zeroinitializer) // CHECK-NEXT: ret [[TMP3]] @@ -218,7 +218,7 @@ svfloat64_t test_svld1_f64(svbool_t pg, const float64_t *base) MODE_ATTR // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, [[PG:%.*]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP3]] @@ -233,7 +233,7 @@ svint8_t test_svld1_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum) MODE_ // CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] @@ -243,7 +243,7 @@ svint8_t test_svld1_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum) MODE_ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] @@ -258,7 +258,7 @@ svint16_t test_svld1_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum) MO // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] @@ -268,7 +268,7 @@ svint16_t test_svld1_vnum_s16(svbool_t pg, const int16_t *base, 
int64_t vnum) MO // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] @@ -283,7 +283,7 @@ svint32_t test_svld1_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum) MO // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] @@ -293,7 +293,7 @@ svint32_t test_svld1_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum) MO // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret 
[[TMP4]] @@ -307,7 +307,7 @@ svint64_t test_svld1_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum) MO // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, [[PG:%.*]], zeroinitializer) // CHECK-NEXT: ret [[TMP3]] @@ -316,7 +316,7 @@ svint64_t test_svld1_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum) MO // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, [[PG:%.*]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP3]] @@ -331,7 +331,7 @@ svuint8_t test_svld1_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum) MOD // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] @@ -341,7 +341,7 @@ svuint8_t test_svld1_vnum_u8(svbool_t pg, const uint8_t *base, 
int64_t vnum) MOD // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] @@ -356,7 +356,7 @@ svuint16_t test_svld1_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] @@ -366,7 +366,7 @@ svuint16_t test_svld1_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret 
[[TMP4]] @@ -381,7 +381,7 @@ svuint32_t test_svld1_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] @@ -391,7 +391,7 @@ svuint32_t test_svld1_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] @@ -406,7 +406,7 @@ svuint64_t test_svld1_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8f16.p0(ptr [[TMP3]], 
i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] @@ -416,7 +416,7 @@ svuint64_t test_svld1_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8f16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] @@ -431,7 +431,7 @@ svfloat16_t test_svld1_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4f32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] @@ -441,7 +441,7 @@ svfloat16_t test_svld1_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // 
CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4f32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] @@ -456,7 +456,7 @@ svfloat32_t test_svld1_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] @@ -466,7 +466,7 @@ svfloat32_t test_svld1_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sb.c index 59d1e103db389b..2757f2873cc83a 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sb.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sb.c @@ -141,7 +141,7 @@ svuint64_t test_svld1sb_u64(svbool_t pg, const int8_t *base) MODE_ATTR // CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -152,7 +152,7 @@ svuint64_t test_svld1sb_u64(svbool_t pg, const int8_t *base) MODE_ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -168,7 +168,7 @@ svint16_t test_svld1sb_vnum_s16(svbool_t pg, const int8_t *base, int64_t vnum) M // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -179,7 +179,7 @@ svint16_t 
test_svld1sb_vnum_s16(svbool_t pg, const int8_t *base, int64_t vnum) M // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -195,7 +195,7 @@ svint32_t test_svld1sb_vnum_s32(svbool_t pg, const int8_t *base, int64_t vnum) M // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -206,7 +206,7 @@ svint32_t test_svld1sb_vnum_s32(svbool_t pg, const int8_t *base, int64_t vnum) M // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -222,7 +222,7 @@ svint64_t test_svld1sb_vnum_s64(svbool_t pg, const int8_t *base, int64_t vnum) M // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -233,7 +233,7 @@ svint64_t test_svld1sb_vnum_s64(svbool_t pg, const int8_t *base, int64_t vnum) M // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -249,7 +249,7 @@ svuint16_t test_svld1sb_vnum_u16(svbool_t pg, const int8_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // 
CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -260,7 +260,7 @@ svuint16_t test_svld1sb_vnum_u16(svbool_t pg, const int8_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -276,7 +276,7 @@ svuint32_t test_svld1sb_vnum_u32(svbool_t pg, const int8_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -287,7 +287,7 @@ svuint32_t test_svld1sb_vnum_u32(svbool_t pg, const int8_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 -// 
CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -298,7 +298,7 @@ svuint64_t test_svld1sb_vnum_u64(svbool_t pg, const int8_t *base, int64_t vnum) return svld1sb_vnum_u64(pg, base, vnum); } -#ifndef __ARM_FEATURE_SME +#ifndef __ARM_FEATURE_SME // CHECK-LABEL: @test_svld1sb_gather_u32base_s32( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c index 1df3f6adbc1c65..dbc762fb8632a0 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c @@ -103,7 +103,7 @@ svuint64_t test_svld1sh_u64(svbool_t pg, const int16_t *base) MODE_ATTR // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -114,7 +114,7 @@ svuint64_t test_svld1sh_u64(svbool_t pg, const int16_t *base) MODE_ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CPP-CHECK-NEXT: 
[[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -130,7 +130,7 @@ svint32_t test_svld1sh_vnum_s32(svbool_t pg, const int16_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -141,7 +141,7 @@ svint32_t test_svld1sh_vnum_s32(svbool_t pg, const int16_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -157,7 +157,7 @@ svint64_t test_svld1sh_vnum_s64(svbool_t pg, const int16_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // 
CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -168,7 +168,7 @@ svint64_t test_svld1sh_vnum_s64(svbool_t pg, const int16_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -184,7 +184,7 @@ svuint32_t test_svld1sh_vnum_u32(svbool_t pg, const int16_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -195,7 +195,7 @@ svuint32_t test_svld1sh_vnum_u32(svbool_t pg, const int16_t *base, 
int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -206,7 +206,7 @@ svuint64_t test_svld1sh_vnum_u64(svbool_t pg, const int16_t *base, int64_t vnum) return svld1sh_vnum_u64(pg, base, vnum); } -#ifndef __ARM_FEATURE_SME +#ifndef __ARM_FEATURE_SME // CHECK-LABEL: @test_svld1sh_gather_u32base_s32( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sw.c index e7d77e62d44c1b..575d2141d28152 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sw.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sw.c @@ -65,7 +65,7 @@ svuint64_t test_svld1sw_u64(svbool_t pg, const int32_t *base) MODE_ATTR // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -76,7 +76,7 @@ svuint64_t test_svld1sw_u64(svbool_t pg, const int32_t *base) MODE_ATTR // 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -92,7 +92,7 @@ svint64_t test_svld1sw_vnum_s64(svbool_t pg, const int32_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -103,7 +103,7 @@ svint64_t test_svld1sw_vnum_s64(svbool_t pg, const int32_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // 
CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to @@ -114,7 +114,7 @@ svuint64_t test_svld1sw_vnum_u64(svbool_t pg, const int32_t *base, int64_t vnum) return svld1sw_vnum_u64(pg, base, vnum); } -#ifndef __ARM_FEATURE_SME +#ifndef __ARM_FEATURE_SME // CHECK-LABEL: @test_svld1sw_gather_u64base_s64( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ub.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ub.c index 31906b4e5f646a..07e88152a6f535 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ub.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ub.c @@ -141,7 +141,7 @@ svuint64_t test_svld1ub_u64(svbool_t pg, const uint8_t *base) MODE_ATTR // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -152,7 +152,7 @@ svuint64_t test_svld1ub_u64(svbool_t pg, const uint8_t *base) MODE_ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: 
[[TMP5:%.*]] = zext [[TMP4]] to @@ -168,7 +168,7 @@ svint16_t test_svld1ub_vnum_s16(svbool_t pg, const uint8_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -179,7 +179,7 @@ svint16_t test_svld1ub_vnum_s16(svbool_t pg, const uint8_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -195,7 +195,7 @@ svint32_t test_svld1ub_vnum_s32(svbool_t pg, const uint8_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: 
[[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -206,7 +206,7 @@ svint32_t test_svld1ub_vnum_s32(svbool_t pg, const uint8_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -222,7 +222,7 @@ svint64_t test_svld1ub_vnum_s64(svbool_t pg, const uint8_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -233,7 +233,7 @@ svint64_t test_svld1ub_vnum_s64(svbool_t pg, const uint8_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = 
mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -249,7 +249,7 @@ svuint16_t test_svld1ub_vnum_u16(svbool_t pg, const uint8_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -260,7 +260,7 @@ svuint16_t test_svld1ub_vnum_u16(svbool_t pg, const uint8_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -276,7 +276,7 @@ svuint32_t test_svld1ub_vnum_u32(svbool_t pg, const uint8_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl 
nuw nsw i64 [[TMP1]], 1 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -287,7 +287,7 @@ svuint32_t test_svld1ub_vnum_u32(svbool_t pg, const uint8_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -298,7 +298,7 @@ svuint64_t test_svld1ub_vnum_u64(svbool_t pg, const uint8_t *base, int64_t vnum) return svld1ub_vnum_u64(pg, base, vnum); } -#ifndef __ARM_FEATURE_SME +#ifndef __ARM_FEATURE_SME // CHECK-LABEL: @test_svld1ub_gather_u32base_s32( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uh.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uh.c index e6553e193109f8..6d91c1ecd7c7ae 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uh.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uh.c @@ -103,7 +103,7 @@ svuint64_t test_svld1uh_u64(svbool_t pg, const uint16_t *base) MODE_ATTR // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 
[[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -114,7 +114,7 @@ svuint64_t test_svld1uh_u64(svbool_t pg, const uint16_t *base) MODE_ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -130,7 +130,7 @@ svint32_t test_svld1uh_vnum_s32(svbool_t pg, const uint16_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -141,7 +141,7 @@ svint32_t test_svld1uh_vnum_s32(svbool_t pg, const uint16_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) 
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -157,7 +157,7 @@ svint64_t test_svld1uh_vnum_s64(svbool_t pg, const uint16_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -168,7 +168,7 @@ svint64_t test_svld1uh_vnum_s64(svbool_t pg, const uint16_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -184,7 +184,7 @@ svuint32_t 
test_svld1uh_vnum_u32(svbool_t pg, const uint16_t *base, int64_t vnum // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -195,7 +195,7 @@ svuint32_t test_svld1uh_vnum_u32(svbool_t pg, const uint16_t *base, int64_t vnum // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -206,7 +206,7 @@ svuint64_t test_svld1uh_vnum_u64(svbool_t pg, const uint16_t *base, int64_t vnum return svld1uh_vnum_u64(pg, base, vnum); } -#ifndef __ARM_FEATURE_SME +#ifndef __ARM_FEATURE_SME // CHECK-LABEL: @test_svld1uh_gather_u32base_s32( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uw.c index b7ffb86daac235..7be23987aedf50 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uw.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uw.c @@ -65,7 +65,7 @@ svuint64_t 
test_svld1uw_u64(svbool_t pg, const uint32_t *base) MODE_ATTR // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -76,7 +76,7 @@ svuint64_t test_svld1uw_u64(svbool_t pg, const uint32_t *base) MODE_ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -92,7 +92,7 @@ svint64_t test_svld1uw_vnum_s64(svbool_t pg, const uint32_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], 
zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -103,7 +103,7 @@ svint64_t test_svld1uw_vnum_s64(svbool_t pg, const uint32_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP4]] to @@ -114,7 +114,7 @@ svuint64_t test_svld1uw_vnum_u64(svbool_t pg, const uint32_t *base, int64_t vnum return svld1uw_vnum_u64(pg, base, vnum); } -#ifndef __ARM_FEATURE_SME +#ifndef __ARM_FEATURE_SME // CHECK-LABEL: @test_svld1uw_gather_u64base_s64( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c index c1254e03102d72..1d194626418a22 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c @@ -45,7 +45,7 @@ void test_svst1_bf16(svbool_t pg, bfloat16_t *base, svbfloat16_t data) MODE_ATTR // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0( [[DATA:%.*]], ptr 
[[TMP3]], i32 1, [[TMP0]]) // CHECK-NEXT: ret void @@ -55,7 +55,7 @@ void test_svst1_bf16(svbool_t pg, bfloat16_t *base, svbfloat16_t data) MODE_ATTR // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CPP-CHECK-NEXT: ret void diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1.c index 519f0c90614a54..29afdaf3eb0c7a 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1.c @@ -209,7 +209,7 @@ void test_svst1_f64(svbool_t pg, float64_t *base, svfloat64_t data) MODE_ATTR // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[DATA:%.*]], ptr [[TMP2]], i32 1, [[PG:%.*]]) // CHECK-NEXT: ret void @@ -218,7 +218,7 @@ void test_svst1_f64(svbool_t pg, float64_t *base, svfloat64_t data) MODE_ATTR // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 
[[VNUM:%.*]], [[TMP1]] // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[DATA:%.*]], ptr [[TMP2]], i32 1, [[PG:%.*]]) // CPP-CHECK-NEXT: ret void @@ -233,7 +233,7 @@ void test_svst1_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8_t data) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CHECK-NEXT: ret void @@ -243,7 +243,7 @@ void test_svst1_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8_t data) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CPP-CHECK-NEXT: ret void @@ -258,7 +258,7 @@ void test_svst1_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16_t dat // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: 
[[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CHECK-NEXT: ret void @@ -268,7 +268,7 @@ void test_svst1_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16_t dat // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CPP-CHECK-NEXT: ret void @@ -283,7 +283,7 @@ void test_svst1_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32_t dat // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CHECK-NEXT: ret void @@ -293,7 +293,7 @@ void test_svst1_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32_t dat // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], 
[[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CPP-CHECK-NEXT: ret void @@ -307,7 +307,7 @@ void test_svst1_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64_t dat // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[DATA:%.*]], ptr [[TMP2]], i32 1, [[PG:%.*]]) // CHECK-NEXT: ret void @@ -316,7 +316,7 @@ void test_svst1_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64_t dat // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0( [[DATA:%.*]], ptr [[TMP2]], i32 1, [[PG:%.*]]) // CPP-CHECK-NEXT: ret void @@ -331,7 +331,7 @@ void test_svst1_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8_t data // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr 
i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CHECK-NEXT: ret void @@ -341,7 +341,7 @@ void test_svst1_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8_t data // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CPP-CHECK-NEXT: ret void @@ -356,7 +356,7 @@ void test_svst1_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16_t d // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CHECK-NEXT: ret void @@ -366,7 +366,7 @@ void test_svst1_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16_t d // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // 
CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CPP-CHECK-NEXT: ret void @@ -381,7 +381,7 @@ void test_svst1_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32_t d // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CHECK-NEXT: ret void @@ -391,7 +391,7 @@ void test_svst1_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32_t d // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CPP-CHECK-NEXT: ret void @@ -406,7 +406,7 @@ void test_svst1_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64_t d // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 
[[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.masked.store.nxv8f16.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CHECK-NEXT: ret void @@ -416,7 +416,7 @@ void test_svst1_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64_t d // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8f16.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CPP-CHECK-NEXT: ret void @@ -431,7 +431,7 @@ void test_svst1_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16_t // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.masked.store.nxv4f32.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CHECK-NEXT: ret void @@ -441,7 +441,7 @@ void test_svst1_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16_t // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// 
CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv4f32.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CPP-CHECK-NEXT: ret void @@ -456,7 +456,7 @@ void test_svst1_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32_t // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CHECK-NEXT: ret void @@ -466,7 +466,7 @@ void test_svst1_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32_t // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv2f64.p0( [[DATA:%.*]], ptr [[TMP3]], i32 1, [[TMP0]]) // CPP-CHECK-NEXT: ret void @@ -476,7 +476,7 @@ void test_svst1_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64_t return SVE_ACLE_FUNC(svst1_vnum,_f64,,)(pg, base, vnum, data); } -#ifndef __ARM_FEATURE_SME +#ifndef __ARM_FEATURE_SME // CHECK-LABEL: @test_svst1_scatter_u32base_s32( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1b.c 
b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1b.c index 152f01aab7405b..c908bc2a483cec 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1b.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1b.c @@ -97,7 +97,7 @@ void test_svst1b_u64(svbool_t pg, uint8_t *base, svuint64_t data) MODE_ATTR // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = trunc [[DATA:%.*]] to // CHECK-NEXT: tail call void @llvm.masked.store.nxv8i8.p0( [[TMP4]], ptr [[TMP3]], i32 1, [[TMP0]]) @@ -113,7 +113,7 @@ void test_svst1b_vnum_s16(svbool_t pg, int8_t *base, int64_t vnum, svint16_t dat // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = trunc [[DATA:%.*]] to // CHECK-NEXT: tail call void @llvm.masked.store.nxv4i8.p0( [[TMP4]], ptr [[TMP3]], i32 1, [[TMP0]]) @@ -129,7 +129,7 @@ void test_svst1b_vnum_s32(svbool_t pg, int8_t *base, int64_t vnum, svint32_t dat // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: 
[[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = trunc [[DATA:%.*]] to // CHECK-NEXT: tail call void @llvm.masked.store.nxv2i8.p0( [[TMP4]], ptr [[TMP3]], i32 1, [[TMP0]]) @@ -145,7 +145,7 @@ void test_svst1b_vnum_s64(svbool_t pg, int8_t *base, int64_t vnum, svint64_t dat // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = trunc [[DATA:%.*]] to // CHECK-NEXT: tail call void @llvm.masked.store.nxv8i8.p0( [[TMP4]], ptr [[TMP3]], i32 1, [[TMP0]]) @@ -161,7 +161,7 @@ void test_svst1b_vnum_u16(svbool_t pg, uint8_t *base, int64_t vnum, svuint16_t d // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = trunc [[DATA:%.*]] to // CHECK-NEXT: tail call void @llvm.masked.store.nxv4i8.p0( [[TMP4]], ptr [[TMP3]], i32 1, [[TMP0]]) @@ -177,7 +177,7 @@ void test_svst1b_vnum_u32(svbool_t pg, uint8_t *base, int64_t vnum, svuint32_t d // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 -// CHECK-NEXT: [[DOTIDX:%.*]] = 
mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = trunc [[DATA:%.*]] to // CHECK-NEXT: tail call void @llvm.masked.store.nxv2i8.p0( [[TMP4]], ptr [[TMP3]], i32 1, [[TMP0]]) @@ -188,7 +188,7 @@ void test_svst1b_vnum_u64(svbool_t pg, uint8_t *base, int64_t vnum, svuint64_t d return SVE_ACLE_FUNC(svst1b_vnum,_u64,,)(pg, base, vnum, data); } -#ifndef __ARM_FEATURE_SME +#ifndef __ARM_FEATURE_SME // CHECK-LABEL: @test_svst1b_scatter_u32base_s32( // CHECK-NEXT: entry: diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1h.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1h.c index 9aa450f2e5457d..959b658425f01d 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1h.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1h.c @@ -73,7 +73,7 @@ void test_svst1h_u64(svbool_t pg, uint16_t *base, svuint64_t data) MODE_ATTR // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = trunc [[DATA:%.*]] to // CHECK-NEXT: tail call void @llvm.masked.store.nxv4i16.p0( [[TMP4]], ptr [[TMP3]], i32 1, [[TMP0]]) @@ -89,7 +89,7 @@ void test_svst1h_vnum_s32(svbool_t pg, int16_t *base, int64_t vnum, svint32_t da // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: 
[[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = trunc [[DATA:%.*]] to // CHECK-NEXT: tail call void @llvm.masked.store.nxv2i16.p0( [[TMP4]], ptr [[TMP3]], i32 1, [[TMP0]]) @@ -105,7 +105,7 @@ void test_svst1h_vnum_s64(svbool_t pg, int16_t *base, int64_t vnum, svint64_t da // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = trunc [[DATA:%.*]] to // CHECK-NEXT: tail call void @llvm.masked.store.nxv4i16.p0( [[TMP4]], ptr [[TMP3]], i32 1, [[TMP0]]) @@ -121,7 +121,7 @@ void test_svst1h_vnum_u32(svbool_t pg, uint16_t *base, int64_t vnum, svuint32_t // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = trunc [[DATA:%.*]] to // CHECK-NEXT: tail call void @llvm.masked.store.nxv2i16.p0( [[TMP4]], ptr [[TMP3]], i32 1, [[TMP0]]) @@ -132,7 +132,7 @@ void test_svst1h_vnum_u64(svbool_t pg, uint16_t *base, int64_t vnum, svuint64_t return SVE_ACLE_FUNC(svst1h_vnum,_u64,,)(pg, base, vnum, data); } -#ifndef __ARM_FEATURE_SME +#ifndef __ARM_FEATURE_SME // CHECK-LABEL: @test_svst1h_scatter_u32base_s32( // CHECK-NEXT: entry: diff --git 
a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1w.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1w.c index f22190b3583ed9..3d9e45bda7b3f6 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1w.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1w.c @@ -49,7 +49,7 @@ void test_svst1w_u64(svbool_t pg, uint32_t *base, svuint64_t data) MODE_ATTR // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = trunc [[DATA:%.*]] to // CHECK-NEXT: tail call void @llvm.masked.store.nxv2i32.p0( [[TMP4]], ptr [[TMP3]], i32 1, [[TMP0]]) @@ -65,7 +65,7 @@ void test_svst1w_vnum_s64(svbool_t pg, int32_t *base, int64_t vnum, svint64_t da // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = trunc [[DATA:%.*]] to // CHECK-NEXT: tail call void @llvm.masked.store.nxv2i32.p0( [[TMP4]], ptr [[TMP3]], i32 1, [[TMP0]]) @@ -76,7 +76,7 @@ void test_svst1w_vnum_u64(svbool_t pg, uint32_t *base, int64_t vnum, svuint64_t return SVE_ACLE_FUNC(svst1w_vnum,_u64,,)(pg, base, vnum, data); } -#ifndef __ARM_FEATURE_SME +#ifndef __ARM_FEATURE_SME // CHECK-LABEL: @test_svst1w_scatter_u64base_s64( // CHECK-NEXT: entry: diff --git 
a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c index 877e24411bb9b5..467161ccc238da 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c @@ -314,7 +314,7 @@ svfloat64x2_t test_svld2q_f64(svbool_t pg, const float64_t *base) // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP3:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP3]], 0 @@ -327,7 +327,7 @@ svfloat64x2_t test_svld2q_f64(svbool_t pg, const float64_t *base) // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP3]], 0 @@ -345,7 +345,7 @@ svuint8x2_t test_svld2q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum) // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr 
[[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP3:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP3]], 0 @@ -358,7 +358,7 @@ svuint8x2_t test_svld2q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum) // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP3]], 0 @@ -376,7 +376,7 @@ svint8x2_t test_svld2q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -390,7 +390,7 @@ svint8x2_t test_svld2q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 
[[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -409,7 +409,7 @@ svuint16x2_t test_svld2q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnu // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -423,7 +423,7 @@ svuint16x2_t test_svld2q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnu // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8i16( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -442,7 +442,7 @@ svint16x2_t test_svld2q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] 
= shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -456,7 +456,7 @@ svint16x2_t test_svld2q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -475,7 +475,7 @@ svuint32x2_t test_svld2q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnu // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -489,7 +489,7 @@ svuint32x2_t test_svld2q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnu // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4i32( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -508,7 +508,7 @@ svint32x2_t test_svld2q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -522,7 +522,7 @@ svint32x2_t test_svld2q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } 
[[TMP4]], 0 @@ -541,7 +541,7 @@ svuint64x2_t test_svld2q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnu // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -555,7 +555,7 @@ svuint64x2_t test_svld2q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnu // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2i64( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -574,7 +574,7 @@ svint64x2_t test_svld2q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = 
tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8f16( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -588,7 +588,7 @@ svint64x2_t test_svld2q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8f16( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -607,7 +607,7 @@ svfloat16x2_t test_svld2q_vnum_f16(svbool_t pg, const float16_t *base, int64_t v // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8bf16( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -621,7 +621,7 @@ svfloat16x2_t test_svld2q_vnum_f16(svbool_t pg, const float16_t *base, int64_t v // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] 
= mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv8bf16( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -640,7 +640,7 @@ svbfloat16x2_t test_svld2q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_ // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4f32( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -654,7 +654,7 @@ svbfloat16x2_t test_svld2q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv4f32( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -673,7 +673,7 @@ svfloat32x2_t test_svld2q_vnum_f32(svbool_t pg, const float32_t *base, int64_t v // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: 
[[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2f64( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -687,7 +687,7 @@ svfloat32x2_t test_svld2q_vnum_f32(svbool_t pg, const float32_t *base, int64_t v // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , } @llvm.aarch64.sve.ld2q.sret.nxv2f64( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP4]], 0 @@ -1049,7 +1049,7 @@ svfloat64x3_t test_svld3q_f64(svbool_t pg, const float64_t *base) // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP3:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP3]], 0 @@ -1064,7 +1064,7 @@ svfloat64x3_t test_svld3q_f64(svbool_t pg, const float64_t *base) // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 
[[TMP0]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP3]], 0 @@ -1084,7 +1084,7 @@ svuint8x3_t test_svld3q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum) // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP3:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP3]], 0 @@ -1099,7 +1099,7 @@ svuint8x3_t test_svld3q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum) // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[TMP3]], 0 @@ -1120,7 +1120,7 @@ svint8x3_t test_svld3q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl 
nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1136,7 +1136,7 @@ svint8x3_t test_svld3q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1157,7 +1157,7 @@ svuint16x3_t test_svld3q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnu // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1173,7 +1173,7 @@ svuint16x3_t test_svld3q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnu // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8i16( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1194,7 +1194,7 @@ svint16x3_t test_svld3q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1210,7 +1210,7 @@ svint16x3_t test_svld3q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = 
extractvalue { , , } [[TMP4]], 0 @@ -1231,7 +1231,7 @@ svuint32x3_t test_svld3q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnu // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1247,7 +1247,7 @@ svuint32x3_t test_svld3q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnu // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4i32( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1268,7 +1268,7 @@ svint32x3_t test_svld3q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 
[[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1284,7 +1284,7 @@ svint32x3_t test_svld3q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1305,7 +1305,7 @@ svuint64x3_t test_svld3q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnu // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1321,7 +1321,7 @@ svuint64x3_t test_svld3q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnu // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 
[[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2i64( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1342,7 +1342,7 @@ svint64x3_t test_svld3q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8f16( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1358,7 +1358,7 @@ svint64x3_t test_svld3q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8f16( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1379,7 +1379,7 @@ svfloat16x3_t test_svld3q_vnum_f16(svbool_t pg, const float16_t *base, int64_t v // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // 
CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8bf16( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1395,7 +1395,7 @@ svfloat16x3_t test_svld3q_vnum_f16(svbool_t pg, const float16_t *base, int64_t v // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv8bf16( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1416,7 +1416,7 @@ svbfloat16x3_t test_svld3q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_ // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4f32( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1432,7 +1432,7 @@ svbfloat16x3_t 
test_svld3q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv4f32( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1453,7 +1453,7 @@ svfloat32x3_t test_svld3q_vnum_f32(svbool_t pg, const float32_t *base, int64_t v // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , } @llvm.aarch64.sve.ld3q.sret.nxv2f64( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1469,7 +1469,7 @@ svfloat32x3_t test_svld3q_vnum_f32(svbool_t pg, const float32_t *base, int64_t v // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
{ , , } @llvm.aarch64.sve.ld3q.sret.nxv2f64( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP4]], 0 @@ -1850,7 +1850,7 @@ svfloat64x4_t test_svld4q_f64(svbool_t pg, const float64_t *base) // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP3:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 @@ -1867,7 +1867,7 @@ svfloat64x4_t test_svld4q_f64(svbool_t pg, const float64_t *base) // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 @@ -1889,7 +1889,7 @@ svuint8x4_t test_svld4q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum) // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP3:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] 
= extractvalue { , , , } [[TMP3]], 0 @@ -1906,7 +1906,7 @@ svuint8x4_t test_svld4q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum) // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv16i8( [[PG:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 @@ -1928,7 +1928,7 @@ svint8x4_t test_svld4q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -1946,7 +1946,7 @@ svint8x4_t test_svld4q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail 
call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -1969,7 +1969,7 @@ svuint16x4_t test_svld4q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnu // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -1987,7 +1987,7 @@ svuint16x4_t test_svld4q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnu // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8i16( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2010,7 +2010,7 @@ svint16x4_t test_svld4q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: 
[[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2028,7 +2028,7 @@ svint16x4_t test_svld4q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2051,7 +2051,7 @@ svuint32x4_t test_svld4q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnu // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2069,7 +2069,7 @@ svuint32x4_t test_svld4q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnu // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4i32( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2092,7 +2092,7 @@ svint32x4_t test_svld4q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2110,7 +2110,7 @@ svint32x4_t test_svld4q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2133,7 +2133,7 @@ svuint64x4_t 
test_svld4q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnu // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2151,7 +2151,7 @@ svuint64x4_t test_svld4q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnu // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2i64( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2174,7 +2174,7 @@ svint64x4_t test_svld4q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } 
@llvm.aarch64.sve.ld4q.sret.nxv8f16( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2192,7 +2192,7 @@ svint64x4_t test_svld4q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8f16( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2215,7 +2215,7 @@ svfloat16x4_t test_svld4q_vnum_f16(svbool_t pg, const float16_t *base, int64_t v // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8bf16( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2233,7 +2233,7 @@ svfloat16x4_t test_svld4q_vnum_f16(svbool_t pg, const float16_t *base, int64_t v // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: 
[[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv8bf16( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2256,7 +2256,7 @@ svbfloat16x4_t test_svld4q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_ // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4f32( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2274,7 +2274,7 @@ svbfloat16x4_t test_svld4q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv4f32( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2297,7 +2297,7 @@ svfloat32x4_t test_svld4q_vnum_f32(svbool_t pg, const float32_t *base, int64_t v // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail 
call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2f64( [[TMP0]], ptr [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 @@ -2315,7 +2315,7 @@ svfloat32x4_t test_svld4q_vnum_f32(svbool_t pg, const float32_t *base, int64_t v // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ld4q.sret.nxv2f64( [[TMP0]], ptr [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c index bc028eeba624cc..1def0289c12ae4 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c @@ -267,7 +267,7 @@ void test_svst2q_f64(svbool_t pg, const float64_t *base, svfloat64x2_t zt) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT]], i64 16) // CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP3]] // CHECK-NEXT: 
[[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP4]]) // CHECK-NEXT: ret void @@ -278,7 +278,7 @@ void test_svst2q_f64(svbool_t pg, const float64_t *base, svfloat64x2_t zt) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT]], i64 16) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP3]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void @@ -294,7 +294,7 @@ void test_svst2q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8 // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT]], i64 16) // CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP3]] // CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP4]]) // CHECK-NEXT: ret void @@ -305,7 +305,7 @@ void test_svst2q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8 // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[ZT]], i64 16) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], 
[[TMP3]] // CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void @@ -322,7 +322,7 @@ void test_svst2q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x2 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CHECK-NEXT: ret void @@ -334,7 +334,7 @@ void test_svst2q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x2 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void @@ -351,7 +351,7 @@ void test_svst2q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuin // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] 
= mul i64 [[VNUM:%.*]], [[TMP4]] // CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CHECK-NEXT: ret void @@ -363,7 +363,7 @@ void test_svst2q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuin // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void @@ -380,7 +380,7 @@ void test_svst2q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint1 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CHECK-NEXT: ret void @@ -392,7 +392,7 @@ void test_svst2q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint1 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// 
CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void @@ -409,7 +409,7 @@ void test_svst2q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuin // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CHECK-NEXT: ret void @@ -421,7 +421,7 @@ void test_svst2q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuin // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void @@ -438,7 +438,7 @@ void test_svst2q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint3 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 
[[TMP4]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CHECK-NEXT: ret void @@ -450,7 +450,7 @@ void test_svst2q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint3 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void @@ -467,7 +467,7 @@ void test_svst2q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuin // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CHECK-NEXT: ret void @@ -479,7 +479,7 @@ void test_svst2q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuin // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CPP-CHECK-NEXT: 
[[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void @@ -496,7 +496,7 @@ void test_svst2q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint6 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CHECK-NEXT: ret void @@ -508,7 +508,7 @@ void test_svst2q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint6 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void @@ -525,7 +525,7 @@ void test_svst2q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfl // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 
[[TMP3]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CHECK-NEXT: ret void @@ -537,7 +537,7 @@ void test_svst2q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfl // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void @@ -554,7 +554,7 @@ void test_svst2q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, sv // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CHECK-NEXT: ret void @@ -566,7 +566,7 @@ void test_svst2q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, sv // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: 
[[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void @@ -583,7 +583,7 @@ void test_svst2q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfl // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CHECK-NEXT: ret void @@ -595,7 +595,7 @@ void test_svst2q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfl // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void @@ -885,7 +885,7 @@ void test_svst3q_f64(svbool_t pg, const float64_t *base, svfloat64x3_t zt) // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 32) // CHECK-NEXT: [[TMP3:%.*]] = tail call i64 
@llvm.vscale.i64() // CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP5]]) // CHECK-NEXT: ret void @@ -897,7 +897,7 @@ void test_svst3q_f64(svbool_t pg, const float64_t *base, svfloat64x3_t zt) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 32) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void @@ -914,7 +914,7 @@ void test_svst3q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8 // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 32) // CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP5]]) // CHECK-NEXT: ret void @@ -926,7 +926,7 @@ void test_svst3q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8 // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[ZT]], i64 32) // 
CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP4]] // CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void @@ -944,7 +944,7 @@ void test_svst3q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x3 // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CHECK-NEXT: ret void @@ -957,7 +957,7 @@ void test_svst3q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x3 // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CPP-CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CPP-CHECK-NEXT: ret void @@ -975,7 +975,7 @@ void test_svst3q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuin // CHECK-NEXT: [[TMP3:%.*]] = 
tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CHECK-NEXT: ret void @@ -988,7 +988,7 @@ void test_svst3q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuin // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CPP-CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CPP-CHECK-NEXT: ret void @@ -1006,7 +1006,7 @@ void test_svst3q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint1 // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CHECK-NEXT: ret void @@ -1019,7 +1019,7 @@ void test_svst3q_vnum_s16(svbool_t pg, const int16_t *base, int64_t 
vnum, svint1 // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CPP-CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CPP-CHECK-NEXT: ret void @@ -1037,7 +1037,7 @@ void test_svst3q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuin // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CHECK-NEXT: ret void @@ -1050,7 +1050,7 @@ void test_svst3q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuin // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CPP-CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CPP-CHECK-NEXT: ret void @@ -1068,7 
+1068,7 @@ void test_svst3q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint3 // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CHECK-NEXT: ret void @@ -1081,7 +1081,7 @@ void test_svst3q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint3 // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CPP-CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CPP-CHECK-NEXT: ret void @@ -1099,7 +1099,7 @@ void test_svst3q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuin // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr 
[[TMP6]]) // CHECK-NEXT: ret void @@ -1112,7 +1112,7 @@ void test_svst3q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuin // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CPP-CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CPP-CHECK-NEXT: ret void @@ -1130,7 +1130,7 @@ void test_svst3q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint6 // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CHECK-NEXT: ret void @@ -1143,7 +1143,7 @@ void test_svst3q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint6 // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CPP-CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st3q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CPP-CHECK-NEXT: ret void @@ -1161,7 +1161,7 @@ void test_svst3q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfl // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CHECK-NEXT: ret void @@ -1174,7 +1174,7 @@ void test_svst3q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfl // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CPP-CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CPP-CHECK-NEXT: ret void @@ -1192,7 +1192,7 @@ void test_svst3q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, sv // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr 
[[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CHECK-NEXT: ret void @@ -1205,7 +1205,7 @@ void test_svst3q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, sv // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CPP-CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CPP-CHECK-NEXT: ret void @@ -1223,7 +1223,7 @@ void test_svst3q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfl // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CHECK-NEXT: ret void @@ -1236,7 +1236,7 @@ void test_svst3q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfl // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 
[[VNUM:%.*]], [[TMP5]] // CPP-CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP6]]) // CPP-CHECK-NEXT: ret void @@ -1551,7 +1551,7 @@ void test_svst4q_f64(svbool_t pg, const float64_t *base, svfloat64x4_t zt) // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 48) // CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP6]]) // CHECK-NEXT: ret void @@ -1564,7 +1564,7 @@ void test_svst4q_f64(svbool_t pg, const float64_t *base, svfloat64x4_t zt) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 48) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CPP-CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP6]]) // CPP-CHECK-NEXT: ret void @@ -1582,7 +1582,7 @@ void test_svst4q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8 // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 48) // CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 
[[TMP5]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP6]]) // CHECK-NEXT: ret void @@ -1595,7 +1595,7 @@ void test_svst4q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8 // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[ZT]], i64 48) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP5]] // CPP-CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP6]]) // CPP-CHECK-NEXT: ret void @@ -1614,7 +1614,7 @@ void test_svst4q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x4 // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CHECK-NEXT: ret void @@ -1628,7 +1628,7 @@ void test_svst4q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x4 // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // 
CPP-CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CPP-CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CPP-CHECK-NEXT: ret void @@ -1647,7 +1647,7 @@ void test_svst4q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuin // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CHECK-NEXT: ret void @@ -1661,7 +1661,7 @@ void test_svst4q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuin // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CPP-CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CPP-CHECK-NEXT: ret void @@ -1680,7 +1680,7 @@ void test_svst4q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint1 // CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CHECK-NEXT: ret void @@ -1694,7 +1694,7 @@ void test_svst4q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint1 // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CPP-CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CPP-CHECK-NEXT: ret void @@ -1713,7 +1713,7 @@ void test_svst4q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuin // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CHECK-NEXT: ret void @@ -1727,7 +1727,7 @@ void test_svst4q_vnum_u32(svbool_t pg, const 
uint32_t *base, int64_t vnum, svuin // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CPP-CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CPP-CHECK-NEXT: ret void @@ -1746,7 +1746,7 @@ void test_svst4q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint3 // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CHECK-NEXT: ret void @@ -1760,7 +1760,7 @@ void test_svst4q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint3 // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CPP-CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], 
ptr [[TMP7]]) // CPP-CHECK-NEXT: ret void @@ -1779,7 +1779,7 @@ void test_svst4q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuin // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CHECK-NEXT: ret void @@ -1793,7 +1793,7 @@ void test_svst4q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuin // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CPP-CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CPP-CHECK-NEXT: ret void @@ -1812,7 +1812,7 @@ void test_svst4q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint6 // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st4q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CHECK-NEXT: ret void @@ -1826,7 +1826,7 @@ void test_svst4q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint6 // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CPP-CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CPP-CHECK-NEXT: ret void @@ -1845,7 +1845,7 @@ void test_svst4q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfl // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CHECK-NEXT: ret void @@ -1859,7 +1859,7 @@ void test_svst4q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfl // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // 
CPP-CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CPP-CHECK-NEXT: ret void @@ -1878,7 +1878,7 @@ void test_svst4q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, sv // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CHECK-NEXT: ret void @@ -1892,7 +1892,7 @@ void test_svst4q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, sv // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CPP-CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CPP-CHECK-NEXT: ret void @@ -1911,7 +1911,7 @@ void test_svst4q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfl // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 
[[TMP6]], [[VNUM:%.*]] +// CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CHECK-NEXT: ret void @@ -1925,7 +1925,7 @@ void test_svst4q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfl // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64() // CPP-CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 -// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]] +// CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP6]] // CPP-CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4q.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP7]]) // CPP-CHECK-NEXT: ret void diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c index 9acc896c0f0e9d..3ed8b6f0c71861 100644 --- a/clang/test/CodeGen/attr-counted-by.c +++ b/clang/test/CodeGen/attr-counted-by.c @@ -111,7 +111,7 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], [[INDEX]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: 
handler.out_of_bounds: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]] @@ -200,7 +200,7 @@ size_t test2_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], [[INDEX]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]] @@ -1185,7 +1185,7 @@ struct test13_bar { // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP1]], [[INDEX]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[INDEX]], [[TMP1]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB24:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize 
[[META2]] @@ -1212,7 +1212,7 @@ struct test13_bar { // SANITIZE-WITHOUT-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 // SANITIZE-WITHOUT-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META9]] -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP1]], [[INDEX]], !nosanitize [[META9]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[INDEX]], [[TMP1]], !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP2]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8]], !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR: handler.out_of_bounds: // SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META9]] diff --git a/clang/test/CodeGen/fp-reassoc-pragma.cpp b/clang/test/CodeGen/fp-reassoc-pragma.cpp index 0cf2f812e66e96..8b9329c40174b7 100644 --- a/clang/test/CodeGen/fp-reassoc-pragma.cpp +++ b/clang/test/CodeGen/fp-reassoc-pragma.cpp @@ -3,8 +3,8 @@ float fp_reassoc_simple(float a, float b, float c) { // CHECK: _Z17fp_reassoc_simplefff // CHECK: %[[A:.+]] = fadd reassoc float %b, %c -// CHECK: %[[M:.+]] = fmul reassoc float %[[A]], %b -// CHECK-NEXT: fadd reassoc float %[[M]], %c +// CHECK: %[[M:.+]] = fmul reassoc float %b, %[[A]] +// CHECK-NEXT: fadd reassoc float %c, %[[M]] #pragma clang fp reassociate(on) a = b + c; return a * b + c; @@ -34,7 +34,7 @@ float fp_reassoc_template(float a, float b, float c) { // CHECK: _Z19fp_reassoc_templatefff // CHECK: %[[A1:.+]] = fadd reassoc float %a, %b // CHECK-NEXT: %[[A2:.+]] = fsub reassoc float %[[A1]], %c - // CHECK-NEXT: fadd reassoc float %[[A2]], %c + // CHECK-NEXT: fadd reassoc float %c, %[[A2]] return template_reassoc(a, b, c); } diff --git a/clang/test/CodeGen/fp-reciprocal-pragma.cpp 
b/clang/test/CodeGen/fp-reciprocal-pragma.cpp index db93550301bf23..8398e48410e333 100644 --- a/clang/test/CodeGen/fp-reciprocal-pragma.cpp +++ b/clang/test/CodeGen/fp-reciprocal-pragma.cpp @@ -5,11 +5,11 @@ float base(float a, float b, float c) { // CHECK-LABEL: _Z4basefff // FLAG: %[[A:.+]] = fdiv arcp float %b, %c // FLAG: %[[M:.+]] = fdiv arcp float %[[A]], %b -// FLAG-NEXT: fadd arcp float %[[M]], %c +// FLAG-NEXT: fadd arcp float %c, %[[M]] // DEFAULT: %[[A:.+]] = fdiv float %b, %c // DEFAULT: %[[M:.+]] = fdiv float %[[A]], %b -// DEFAULT-NEXT: fadd float %[[M]], %c +// DEFAULT-NEXT: fadd float %c, %[[M]] a = b / c; return a / b + c; } @@ -19,7 +19,7 @@ float fp_recip_simple(float a, float b, float c) { // CHECK-LABEL: _Z15fp_recip_simplefff // CHECK: %[[A:.+]] = fdiv arcp float %b, %c // CHECK: %[[M:.+]] = fdiv arcp float %[[A]], %b -// CHECK-NEXT: fadd arcp float %[[M]], %c +// CHECK-NEXT: fadd arcp float %c, %[[M]] #pragma clang fp reciprocal(on) a = b / c; return a / b + c; @@ -30,7 +30,7 @@ float fp_recip_disable(float a, float b, float c) { // CHECK-LABEL: _Z16fp_recip_disablefff // CHECK: %[[A:.+]] = fdiv float %b, %c // CHECK: %[[M:.+]] = fdiv float %[[A]], %b -// CHECK-NEXT: fadd float %[[M]], %c +// CHECK-NEXT: fadd float %c, %[[M]] #pragma clang fp reciprocal(off) a = b / c; return a / b + c; @@ -40,7 +40,7 @@ float fp_recip_with_reassoc_simple(float a, float b, float c) { // CHECK-LABEL: _Z28fp_recip_with_reassoc_simplefff // CHECK: %[[A:.+]] = fmul reassoc arcp float %b, %c // CHECK: %[[M:.+]] = fdiv reassoc arcp float %b, %[[A]] -// CHECK-NEXT: fadd reassoc arcp float %[[M]], %c +// CHECK-NEXT: fadd reassoc arcp float %c, %[[M]] #pragma clang fp reciprocal(on) reassociate(on) a = b / c; return a / b + c; @@ -72,7 +72,7 @@ float fp_recip_template(float a, float b, float c) { // CHECK-LABEL: _Z17fp_recip_templatefff // CHECK: %[[A1:.+]] = fdiv arcp float %a, %b // CHECK-NEXT: %[[A2:.+]] = fsub arcp float %[[A1]], %c - // CHECK-NEXT: fadd arcp 
float %[[A2]], %c + // CHECK-NEXT: fadd arcp float %c, %[[A2]] return template_recip(a, b, c); } diff --git a/clang/test/CodeGen/ms-mixed-ptr-sizes.c b/clang/test/CodeGen/ms-mixed-ptr-sizes.c index 51bea60eb39dce..0bc1925b13dbc6 100644 --- a/clang/test/CodeGen/ms-mixed-ptr-sizes.c +++ b/clang/test/CodeGen/ms-mixed-ptr-sizes.c @@ -51,35 +51,35 @@ void test_other(struct Foo *f, __attribute__((address_space(10))) int *i) { int test_compare1(int *__ptr32 __uptr i, int *__ptr64 j) { // ALL-LABEL: define dso_local range(i32 0, 2) i32 @test_compare1 // X64: %{{.+}} = addrspacecast ptr %j to ptr addrspace(271) - // X64: %cmp = icmp eq ptr addrspace(271) %{{.+}}, %i + // X64: %cmp = icmp eq ptr addrspace(271) %i, %{{.+}} // X86: %{{.+}} = addrspacecast ptr addrspace(272) %j to ptr addrspace(271) - // X86: %cmp = icmp eq ptr addrspace(271) %{{.+}}, %i + // X86: %cmp = icmp eq ptr addrspace(271) %i, %{{.+}} return (i == j); } int test_compare2(int *__ptr32 __sptr i, int *__ptr64 j) { // ALL-LABEL: define dso_local range(i32 0, 2) i32 @test_compare2 // X64: %{{.+}} = addrspacecast ptr %j to ptr addrspace(270) - // X64: %cmp = icmp eq ptr addrspace(270) %{{.+}}, %i + // X64: %cmp = icmp eq ptr addrspace(270) %i, %{{.+}} // X86: %{{.+}} = addrspacecast ptr addrspace(272) %j to ptr - // X86: %cmp = icmp eq ptr %{{.+}}, %i + // X86: %cmp = icmp eq ptr %i, %{{.+}} return (i == j); } int test_compare3(int *__ptr32 __uptr i, int *__ptr64 j) { // ALL-LABEL: define dso_local range(i32 0, 2) i32 @test_compare3 // X64: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr - // X64: %cmp = icmp eq ptr %{{.+}}, %j + // X64: %cmp = icmp eq ptr %j, %{{.+}} // X86: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr addrspace(272) - // X86: %cmp = icmp eq ptr addrspace(272) %{{.+}}, %j + // X86: %cmp = icmp eq ptr addrspace(272) %j, %{{.+}} return (j == i); } int test_compare4(int *__ptr32 __sptr i, int *__ptr64 j) { // ALL-LABEL: define dso_local range(i32 0, 2) i32 @test_compare4 // X64: 
%{{.+}} = addrspacecast ptr addrspace(270) %i to ptr - // X64: %cmp = icmp eq ptr %{{.+}}, %j + // X64: %cmp = icmp eq ptr %j, %{{.+}} // X86: %{{.+}} = addrspacecast ptr %i to ptr addrspace(272) - // X86: %cmp = icmp eq ptr addrspace(272) %{{.+}}, %j + // X86: %cmp = icmp eq ptr addrspace(272) %j, %{{.+}} return (j == i); } diff --git a/clang/test/Headers/wasm.c b/clang/test/Headers/wasm.c index b22d87a5f8b700..0fae8557a066dd 100644 --- a/clang/test/Headers/wasm.c +++ b/clang/test/Headers/wasm.c @@ -1499,7 +1499,7 @@ v128_t test_v128_xor(v128_t a, v128_t b) { // CHECK-LABEL: @test_v128_andnot( // CHECK-NEXT: entry: // CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[B:%.*]], -// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[NOT_I]], [[A:%.*]] +// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[A:%.*]], [[NOT_I]] // CHECK-NEXT: ret <4 x i32> [[AND_I]] // v128_t test_v128_andnot(v128_t a, v128_t b) { diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h index ebcbd5d9e88800..ed2e7f58ca853c 100644 --- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h +++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h @@ -132,21 +132,18 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner { /// This routine maps IR values to various complexity ranks: /// 0 -> undef /// 1 -> Constants - /// 2 -> Other non-instructions - /// 3 -> Arguments - /// 4 -> Cast and (f)neg/not instructions - /// 5 -> Other instructions + /// 2 -> Cast and (f)neg/not instructions + /// 3 -> Other instructions and arguments static unsigned getComplexity(Value *V) { - if (isa(V)) { - if (isa(V) || match(V, m_Neg(PatternMatch::m_Value())) || - match(V, m_Not(PatternMatch::m_Value())) || - match(V, m_FNeg(PatternMatch::m_Value()))) - return 4; - return 5; - } - if (isa(V)) - return 3; - return isa(V) ? (isa(V) ? 0 : 1) : 2; + if (isa(V)) + return isa(V) ? 
0 : 1; + + if (isa(V) || match(V, m_Neg(PatternMatch::m_Value())) || + match(V, m_Not(PatternMatch::m_Value())) || + match(V, m_FNeg(PatternMatch::m_Value()))) + return 2; + + return 3; } /// Predicate canonicalization reduces the number of patterns that need to be diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 123f810bacfb6e..dd4a64050f878a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -2719,8 +2719,10 @@ Instruction *InstCombinerImpl::hoistFNegAboveFMulFDiv(Value *FNegOp, Instruction &FMFSource) { Value *X, *Y; if (match(FNegOp, m_FMul(m_Value(X), m_Value(Y)))) { + // Push into RHS which is more likely to simplify (const or another fneg). + // FIXME: It would be better to invert the transform. return cast(Builder.CreateFMulFMF( - Builder.CreateFNegFMF(X, &FMFSource), Y, &FMFSource)); + X, Builder.CreateFNegFMF(Y, &FMFSource), &FMFSource)); } if (match(FNegOp, m_FDiv(m_Value(X), m_Value(Y)))) { diff --git a/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll b/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll index ba3a484441e9e3..55c3e7779478ef 100644 --- a/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll +++ b/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll @@ -19,7 +19,7 @@ define i64 @known_power_of_two_urem_phi(i64 %size, i1 %cmp, i1 %cmp1) { ; CHECK-NEXT: br label [[COND_END]] ; CHECK: cond.end: ; CHECK-NEXT: [[PHI1:%.*]] = phi i64 [ 4095, [[ENTRY:%.*]] ], [ [[PHI]], [[COND_TRUE_END]] ] -; CHECK-NEXT: [[UREM:%.*]] = and i64 [[PHI1]], [[SIZE:%.*]] +; CHECK-NEXT: [[UREM:%.*]] = and i64 [[SIZE:%.*]], [[PHI1]] ; CHECK-NEXT: ret i64 [[UREM]] ; entry: @@ -57,7 +57,7 @@ define i64 @known_power_of_two_urem_nested_expr(i64 %size, i1 %cmp, i1 %cmp1, i6 ; CHECK: cond.end: ; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[SELECT]], [[COND_FALSE]] ], [ [[TMP1]], [[COND_TRUE]] 
], [ [[PHI]], [[COND_END]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[PHI]], -1 -; CHECK-NEXT: [[UREM:%.*]] = and i64 [[TMP2]], [[SIZE:%.*]] +; CHECK-NEXT: [[UREM:%.*]] = and i64 [[SIZE:%.*]], [[TMP2]] ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[UREM]], 10 ; CHECK-NEXT: br i1 [[CMP2]], label [[COND_END]], label [[END:%.*]] ; CHECK: end: @@ -119,7 +119,7 @@ define i64 @known_power_of_two_urem_loop_mul(i64 %size, i64 %a) { ; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[START]], [[ENTRY:%.*]] ], [ [[I:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[PHI]], -1 -; CHECK-NEXT: [[UREM:%.*]] = and i64 [[TMP0]], [[SIZE:%.*]] +; CHECK-NEXT: [[UREM:%.*]] = and i64 [[SIZE:%.*]], [[TMP0]] ; CHECK-NEXT: [[ADD]] = add nuw i64 [[SUM]], [[UREM]] ; CHECK-NEXT: [[I]] = shl nuw i64 [[PHI]], 2 ; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 25000000 @@ -190,7 +190,7 @@ define i64 @known_power_of_two_urem_loop_shl(i64 %size, i64 %a) { ; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[START]], [[ENTRY:%.*]] ], [ [[I:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[PHI]], -1 -; CHECK-NEXT: [[UREM:%.*]] = and i64 [[TMP0]], [[SIZE:%.*]] +; CHECK-NEXT: [[UREM:%.*]] = and i64 [[SIZE:%.*]], [[TMP0]] ; CHECK-NEXT: [[ADD]] = add nuw i64 [[SUM]], [[UREM]] ; CHECK-NEXT: [[I]] = shl nuw i64 [[PHI]], 1 ; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 50000000 @@ -225,7 +225,7 @@ define i64 @known_power_of_two_urem_loop_lshr(i64 %size, i64 %a) { ; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[START]], [[ENTRY:%.*]] ], [ [[I:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[PHI]], -1 -; CHECK-NEXT: [[UREM:%.*]] = and i64 [[TMP0]], [[SIZE:%.*]] +; CHECK-NEXT: [[UREM:%.*]] = and i64 [[SIZE:%.*]], [[TMP0]] ; CHECK-NEXT: [[ADD]] = add nuw i64 
[[SUM]], [[UREM]] ; CHECK-NEXT: [[I]] = lshr i64 [[PHI]], 1 ; CHECK-NEXT: [[ICMP_NOT:%.*]] = icmp ult i64 [[PHI]], 2 @@ -260,7 +260,7 @@ define i64 @known_power_of_two_urem_loop_ashr(i64 %size, i64 %a) { ; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ 4096, [[ENTRY:%.*]] ], [ [[I:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[PHI]], -1 -; CHECK-NEXT: [[UREM:%.*]] = and i64 [[TMP0]], [[SIZE:%.*]] +; CHECK-NEXT: [[UREM:%.*]] = and i64 [[SIZE:%.*]], [[TMP0]] ; CHECK-NEXT: [[ADD]] = add nsw i64 [[SUM]], [[UREM]] ; CHECK-NEXT: [[I]] = lshr i64 [[PHI]], [[A:%.*]] ; CHECK-NEXT: [[ICMP_NOT:%.*]] = icmp eq i64 [[I]], 0 @@ -396,7 +396,7 @@ define i8 @known_power_of_two_rust_next_power_of_two(i8 %x, i8 %y) { ; CHECK-NEXT: [[TMP3:%.*]] = lshr i8 -1, [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i8 [[X]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i8 [[TMP3]], i8 0 -; CHECK-NEXT: [[R:%.*]] = and i8 [[TMP5]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i8 [[Y:%.*]], [[TMP5]] ; CHECK-NEXT: ret i8 [[R]] ; %2 = add i8 %x, -1 @@ -414,7 +414,7 @@ define i8 @known_power_of_two_rust_next_power_of_two(i8 %x, i8 %y) { define i8 @known_power_of_two_lshr_add_one_allow_zero(i8 %x, i8 %y) { ; CHECK-LABEL: @known_power_of_two_lshr_add_one_allow_zero( ; CHECK-NEXT: [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]] -; CHECK-NEXT: [[R:%.*]] = and i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[R]] ; %4 = lshr i8 -1, %x @@ -429,7 +429,7 @@ define i1 @known_power_of_two_lshr_add_one_nuw_deny_zero(i8 %x, i8 %y) { ; CHECK-LABEL: @known_power_of_two_lshr_add_one_nuw_deny_zero( ; CHECK-NEXT: [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i8 -2, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = or i8 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = or i8 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[TMP3]], -1 ; CHECK-NEXT: ret i1 [[R]] ; @@ 
-446,7 +446,7 @@ define i1 @negative_known_power_of_two_lshr_add_one_deny_zero(i8 %x, i8 %y) { ; CHECK-LABEL: @negative_known_power_of_two_lshr_add_one_deny_zero( ; CHECK-NEXT: [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i8 -2, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = or i8 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = or i8 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[TMP3]], -1 ; CHECK-NEXT: ret i1 [[R]] ; @@ -463,7 +463,7 @@ define i1 @negative_known_power_of_two_lshr_add_one_nsw_deny_zero(i8 %x, i8 %y) ; CHECK-LABEL: @negative_known_power_of_two_lshr_add_one_nsw_deny_zero( ; CHECK-NEXT: [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i8 -2, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = or i8 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = or i8 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[TMP3]], -1 ; CHECK-NEXT: ret i1 [[R]] ; diff --git a/llvm/test/Analysis/ValueTracking/known-power-of-two.ll b/llvm/test/Analysis/ValueTracking/known-power-of-two.ll index 7bcf96065a69d9..7cfb6af0d7b95d 100644 --- a/llvm/test/Analysis/ValueTracking/known-power-of-two.ll +++ b/llvm/test/Analysis/ValueTracking/known-power-of-two.ll @@ -16,8 +16,8 @@ declare i16 @llvm.umax.i16(i16, i16) define i32 @pr25900(i32 %d) { ; CHECK-LABEL: define i32 @pr25900 ; CHECK-SAME: (i32 [[D:%.*]]) { -; CHECK-NEXT: [[AND:%.*]] = ashr i32 [[D]], 31 -; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 4, [[AND]] +; CHECK-NEXT: [[ASHR:%.*]] = ashr i32 [[D]], 31 +; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 4, [[ASHR]] ; CHECK-NEXT: ret i32 [[DIV]] ; %and = and i32 %d, -2147483648 @@ -37,7 +37,7 @@ define i8 @trunc_is_pow2_or_zero(i16 %x, i8 %y) { ; CHECK-NEXT: [[XP2:%.*]] = shl i16 4, [[X]] ; CHECK-NEXT: [[XX:%.*]] = trunc i16 [[XP2]] to i8 ; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[XX]], -1 -; CHECK-NEXT: [[R:%.*]] = and i8 [[TMP1]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = and i8 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i8 [[R]] ; %xp2 = shl i16 4, %x @@ -67,7 +67,7 @@ 
define i1 @trunc_is_pow2_fail(i16 %x, i8 %y) { ; CHECK-SAME: (i16 [[X:%.*]], i8 [[Y:%.*]]) { ; CHECK-NEXT: [[XP2:%.*]] = shl i16 4, [[X]] ; CHECK-NEXT: [[XX:%.*]] = trunc i16 [[XP2]] to i8 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i8 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[AND]], [[XX]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -85,7 +85,7 @@ define i16 @bswap_is_pow2_or_zero(i16 %x, i16 %y) { ; CHECK-NEXT: [[XP2:%.*]] = shl i16 4, [[X]] ; CHECK-NEXT: [[XX:%.*]] = call i16 @llvm.bswap.i16(i16 [[XP2]]) ; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[XX]], -1 -; CHECK-NEXT: [[R:%.*]] = and i16 [[TMP1]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = and i16 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i16 [[R]] ; %xp2 = shl i16 4, %x @@ -115,7 +115,7 @@ define i1 @bswap_is_pow2(i16 %x, i16 %y) { ; CHECK-SAME: (i16 [[X:%.*]], i16 [[Y:%.*]]) { ; CHECK-NEXT: [[XP2:%.*]] = shl nuw i16 1, [[X]] ; CHECK-NEXT: [[XX:%.*]] = call i16 @llvm.bswap.i16(i16 [[XP2]]) -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i16 [[AND]], 0 ; CHECK-NEXT: ret i1 [[R]] ; @@ -132,7 +132,7 @@ define i1 @bswap_is_pow2_fail(i16 %x, i16 %y) { ; CHECK-SAME: (i16 [[X:%.*]], i16 [[Y:%.*]]) { ; CHECK-NEXT: [[XP2:%.*]] = shl i16 2, [[X]] ; CHECK-NEXT: [[XX:%.*]] = call i16 @llvm.bswap.i16(i16 [[XP2]]) -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i16 [[AND]], [[XX]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -150,7 +150,7 @@ define i16 @bitreverse_is_pow2_or_zero(i16 %x, i16 %y) { ; CHECK-NEXT: [[XP2:%.*]] = shl i16 4, [[X]] ; CHECK-NEXT: [[XX:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[XP2]]) ; CHECK-NEXT: [[TMP1:%.*]] = add nsw i16 [[XX]], -1 -; CHECK-NEXT: [[R:%.*]] = and i16 [[TMP1]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = and i16 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i16 [[R]] ; %xp2 = shl i16 4, %x @@ -180,7 +180,7 @@ define i1 
@bitreverse_is_pow2(i16 %x, i16 %y) { ; CHECK-SAME: (i16 [[X:%.*]], i16 [[Y:%.*]]) { ; CHECK-NEXT: [[XP2:%.*]] = shl nuw i16 1, [[X]] ; CHECK-NEXT: [[XX:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[XP2]]) -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i16 [[AND]], 0 ; CHECK-NEXT: ret i1 [[R]] ; @@ -197,7 +197,7 @@ define i1 @bitreverse_is_pow2_fail(i16 %x, i16 %y) { ; CHECK-SAME: (i16 [[X:%.*]], i16 [[Y:%.*]]) { ; CHECK-NEXT: [[XP2:%.*]] = shl i16 2, [[X]] ; CHECK-NEXT: [[XX:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[XP2]]) -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i16 [[AND]], [[XX]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -215,7 +215,7 @@ define i16 @fshl_is_pow2_or_zero(i16 %x, i16 %y, i16 %z) { ; CHECK-NEXT: [[XP2:%.*]] = shl i16 4, [[X]] ; CHECK-NEXT: [[XX:%.*]] = call i16 @llvm.fshl.i16(i16 [[XP2]], i16 [[XP2]], i16 [[Z]]) ; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[XX]], -1 -; CHECK-NEXT: [[R:%.*]] = and i16 [[TMP1]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = and i16 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i16 [[R]] ; %xp2 = shl i16 4, %x @@ -262,7 +262,7 @@ define i1 @fshl_is_pow2(i16 %x, i16 %y, i16 %z) { ; CHECK-SAME: (i16 [[X:%.*]], i16 [[Y:%.*]], i16 [[Z:%.*]]) { ; CHECK-NEXT: [[XP2:%.*]] = shl nuw i16 1, [[X]] ; CHECK-NEXT: [[XX:%.*]] = call i16 @llvm.fshl.i16(i16 [[XP2]], i16 [[XP2]], i16 [[Z]]) -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i16 [[AND]], 0 ; CHECK-NEXT: ret i1 [[R]] ; @@ -279,7 +279,7 @@ define i1 @fshl_is_pow2_fail(i16 %x, i16 %y, i16 %z) { ; CHECK-SAME: (i16 [[X:%.*]], i16 [[Y:%.*]], i16 [[Z:%.*]]) { ; CHECK-NEXT: [[XP2:%.*]] = shl i16 2, [[X]] ; CHECK-NEXT: [[XX:%.*]] = call i16 @llvm.fshl.i16(i16 [[XP2]], i16 [[XP2]], i16 [[Z]]) -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; 
CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i16 [[AND]], [[XX]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -297,7 +297,7 @@ define i16 @fshr_is_pow2_or_zero(i16 %x, i16 %y, i16 %z) { ; CHECK-NEXT: [[XP2:%.*]] = shl i16 4, [[X]] ; CHECK-NEXT: [[XX:%.*]] = call i16 @llvm.fshr.i16(i16 [[XP2]], i16 [[XP2]], i16 [[Z]]) ; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[XX]], -1 -; CHECK-NEXT: [[R:%.*]] = and i16 [[TMP1]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = and i16 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i16 [[R]] ; %xp2 = shl i16 4, %x @@ -344,7 +344,7 @@ define i1 @fshr_is_pow2(i16 %x, i16 %y, i16 %z) { ; CHECK-SAME: (i16 [[X:%.*]], i16 [[Y:%.*]], i16 [[Z:%.*]]) { ; CHECK-NEXT: [[XP2:%.*]] = shl nuw i16 1, [[X]] ; CHECK-NEXT: [[XX:%.*]] = call i16 @llvm.fshr.i16(i16 [[XP2]], i16 [[XP2]], i16 [[Z]]) -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i16 [[AND]], 0 ; CHECK-NEXT: ret i1 [[R]] ; @@ -361,7 +361,7 @@ define i1 @fshr_is_pow2_fail(i16 %x, i16 %y, i16 %z) { ; CHECK-SAME: (i16 [[X:%.*]], i16 [[Y:%.*]], i16 [[Z:%.*]]) { ; CHECK-NEXT: [[XP2:%.*]] = shl i16 2, [[X]] ; CHECK-NEXT: [[XX:%.*]] = call i16 @llvm.fshr.i16(i16 [[XP2]], i16 [[XP2]], i16 [[Z]]) -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i16 [[AND]], [[XX]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -380,7 +380,7 @@ define i16 @mul_is_pow2_or_zero(i16 %x, i16 %y, i16 %z) { ; CHECK-NEXT: [[ZP2:%.*]] = shl i16 2, [[Z]] ; CHECK-NEXT: [[XX:%.*]] = mul i16 [[XP2]], [[ZP2]] ; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[XX]], -1 -; CHECK-NEXT: [[R:%.*]] = and i16 [[TMP1]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = and i16 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i16 [[R]] ; %xp2 = shl i16 4, %x @@ -416,7 +416,7 @@ define i1 @mul_is_pow2(i16 %x, i16 %y, i16 %z) { ; CHECK-NEXT: [[ZP2:%.*]] = shl nuw nsw i16 2, [[ZSMALL]] ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i16 
[[XSMALL]], 2 ; CHECK-NEXT: [[XX:%.*]] = shl nuw nsw i16 [[ZP2]], [[TMP1]] -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i16 [[AND]], 0 ; CHECK-NEXT: ret i1 [[R]] ; @@ -439,7 +439,7 @@ define i1 @mul_is_pow2_fail(i16 %x, i16 %y, i16 %z) { ; CHECK-NEXT: [[ZP2:%.*]] = shl nuw nsw i16 2, [[ZSMALL]] ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i16 [[XSMALL]], 2 ; CHECK-NEXT: [[XX:%.*]] = shl i16 [[ZP2]], [[TMP1]] -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i16 [[AND]], [[XX]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -462,7 +462,7 @@ define i1 @mul_is_pow2_fail2(i16 %x, i16 %y, i16 %z) { ; CHECK-NEXT: [[XP2:%.*]] = shl nuw nsw i16 3, [[XSMALL]] ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i16 [[ZSMALL]], 1 ; CHECK-NEXT: [[XX:%.*]] = shl nuw nsw i16 [[XP2]], [[TMP1]] -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i16 [[AND]], [[XX]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -482,7 +482,7 @@ define i1 @shl_is_pow2(i16 %x, i16 %y) { ; CHECK-SAME: (i16 [[X:%.*]], i16 [[Y:%.*]]) { ; CHECK-NEXT: [[XSMALL:%.*]] = and i16 [[X]], 7 ; CHECK-NEXT: [[XX:%.*]] = shl nuw nsw i16 4, [[XSMALL]] -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i16 [[AND]], 0 ; CHECK-NEXT: ret i1 [[R]] ; @@ -499,7 +499,7 @@ define i1 @shl_is_pow2_fail(i16 %x, i16 %y) { ; CHECK-SAME: (i16 [[X:%.*]], i16 [[Y:%.*]]) { ; CHECK-NEXT: [[XSMALL:%.*]] = and i16 [[X]], 7 ; CHECK-NEXT: [[XX:%.*]] = shl i16 512, [[XSMALL]] -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i16 [[AND]], [[XX]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -516,7 +516,7 @@ define i1 @shl_is_pow2_fail2(i16 %x, i16 %y) { ; CHECK-SAME: (i16 
[[X:%.*]], i16 [[Y:%.*]]) { ; CHECK-NEXT: [[XSMALL:%.*]] = and i16 [[X]], 7 ; CHECK-NEXT: [[XX:%.*]] = shl nuw nsw i16 5, [[XSMALL]] -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i16 [[AND]], [[XX]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -533,7 +533,7 @@ define i1 @lshr_is_pow2(i16 %x, i16 %y) { ; CHECK-SAME: (i16 [[X:%.*]], i16 [[Y:%.*]]) { ; CHECK-NEXT: [[XSMALL:%.*]] = and i16 [[X]], 7 ; CHECK-NEXT: [[XX:%.*]] = lshr exact i16 512, [[XSMALL]] -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i16 [[AND]], 0 ; CHECK-NEXT: ret i1 [[R]] ; @@ -550,7 +550,7 @@ define i1 @lshr_is_pow2_fail(i16 %x, i16 %y) { ; CHECK-SAME: (i16 [[X:%.*]], i16 [[Y:%.*]]) { ; CHECK-NEXT: [[XSMALL:%.*]] = and i16 [[X]], 7 ; CHECK-NEXT: [[XX:%.*]] = lshr i16 4, [[XSMALL]] -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i16 [[AND]], [[XX]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -567,7 +567,7 @@ define i1 @lshr_is_pow2_fail2(i16 %x, i16 %y) { ; CHECK-SAME: (i16 [[X:%.*]], i16 [[Y:%.*]]) { ; CHECK-NEXT: [[XSMALL:%.*]] = and i16 [[X]], 7 ; CHECK-NEXT: [[XX:%.*]] = lshr i16 513, [[XSMALL]] -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i16 [[AND]], [[XX]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -584,7 +584,7 @@ define i1 @and_is_pow2(i16 %x, i16 %y) { ; CHECK-SAME: (i16 [[X:%.*]], i16 [[Y:%.*]]) { ; CHECK-NEXT: [[XNZ:%.*]] = or i16 [[X]], 4 ; CHECK-NEXT: [[X_NEG:%.*]] = sub nsw i16 0, [[XNZ]] -; CHECK-NEXT: [[TMP1:%.*]] = and i16 [[X_NEG]], [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = and i16 [[Y]], [[X_NEG]] ; CHECK-NEXT: [[AND:%.*]] = and i16 [[TMP1]], [[XNZ]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i16 [[AND]], 0 ; CHECK-NEXT: ret i1 [[R]] @@ -602,8 +602,8 @@ define i1 
@and_is_pow2_fail(i16 %x, i16 %y) { ; CHECK-LABEL: define i1 @and_is_pow2_fail ; CHECK-SAME: (i16 [[X:%.*]], i16 [[Y:%.*]]) { ; CHECK-NEXT: [[X_NEG:%.*]] = sub i16 0, [[X]] -; CHECK-NEXT: [[XX:%.*]] = and i16 [[X_NEG]], [[X]] -; CHECK-NEXT: [[AND:%.*]] = and i16 [[XX]], [[Y]] +; CHECK-NEXT: [[XX:%.*]] = and i16 [[X]], [[X_NEG]] +; CHECK-NEXT: [[AND:%.*]] = and i16 [[Y]], [[XX]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i16 [[AND]], [[XX]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -619,7 +619,7 @@ define i16 @i1_is_pow2_or_zero(i1 %x, i16 %y) { ; CHECK-LABEL: define i16 @i1_is_pow2_or_zero ; CHECK-SAME: (i1 [[X:%.*]], i16 [[Y:%.*]]) { ; CHECK-NEXT: [[XX:%.*]] = zext i1 [[X]] to i16 -; CHECK-NEXT: [[R:%.*]] = or i16 [[XX]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = or i16 [[Y]], [[XX]] ; CHECK-NEXT: ret i16 [[R]] ; %xx = zext i1 %x to i16 diff --git a/llvm/test/Analysis/ValueTracking/knownbits-and-or-xor-lowbit.ll b/llvm/test/Analysis/ValueTracking/knownbits-and-or-xor-lowbit.ll index 4ca7ed9eda7bbc..fba907ab731b0b 100644 --- a/llvm/test/Analysis/ValueTracking/knownbits-and-or-xor-lowbit.ll +++ b/llvm/test/Analysis/ValueTracking/knownbits-and-or-xor-lowbit.ll @@ -93,7 +93,7 @@ define <2 x i1> @sub_XY_and_bit0_is_zero_fail(<2 x i8> %x, <2 x i8> %C) nounwind ; CHECK-LABEL: @sub_XY_and_bit0_is_zero_fail( ; CHECK-NEXT: [[C1:%.*]] = or <2 x i8> [[C:%.*]], ; CHECK-NEXT: [[Y:%.*]] = sub <2 x i8> [[X:%.*]], [[C1]] -; CHECK-NEXT: [[W:%.*]] = and <2 x i8> [[Y]], [[X]] +; CHECK-NEXT: [[W:%.*]] = and <2 x i8> [[X]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i8> [[W]], ; CHECK-NEXT: ret <2 x i1> [[R]] ; @@ -108,7 +108,7 @@ define i1 @sub_XY_xor_bit0_is_one_fail(i8 %x, i8 %C) nounwind { ; CHECK-LABEL: @sub_XY_xor_bit0_is_one_fail( ; CHECK-NEXT: [[C1:%.*]] = xor i8 [[C:%.*]], 1 ; CHECK-NEXT: [[Y:%.*]] = sub i8 [[X:%.*]], [[C1]] -; CHECK-NEXT: [[W:%.*]] = xor i8 [[Y]], [[X]] +; CHECK-NEXT: [[W:%.*]] = xor i8 [[X]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[W]], 10 ; CHECK-NEXT: ret i1 [[R]] ; @@ -122,7 
+122,7 @@ define i1 @sub_XY_xor_bit0_is_one_fail(i8 %x, i8 %C) nounwind { define i1 @sub_XY_or_bit0_is_one_fail(i8 %x, i8 %C) nounwind { ; CHECK-LABEL: @sub_XY_or_bit0_is_one_fail( ; CHECK-NEXT: [[Y:%.*]] = sub i8 [[X:%.*]], [[C:%.*]] -; CHECK-NEXT: [[W:%.*]] = or i8 [[Y]], [[X]] +; CHECK-NEXT: [[W:%.*]] = or i8 [[X]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[W]], 10 ; CHECK-NEXT: ret i1 [[R]] ; @@ -135,7 +135,7 @@ define i1 @sub_XY_or_bit0_is_one_fail(i8 %x, i8 %C) nounwind { define i1 @sub_YX_and_bit0_is_zero_fail(i8 %x, i8 %C) nounwind { ; CHECK-LABEL: @sub_YX_and_bit0_is_zero_fail( ; CHECK-NEXT: [[Y:%.*]] = sub i8 [[C:%.*]], [[X:%.*]] -; CHECK-NEXT: [[W:%.*]] = and i8 [[Y]], [[X]] +; CHECK-NEXT: [[W:%.*]] = and i8 [[X]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[W]], -1 ; CHECK-NEXT: ret i1 [[R]] ; @@ -148,7 +148,7 @@ define i1 @sub_YX_and_bit0_is_zero_fail(i8 %x, i8 %C) nounwind { define <2 x i1> @sub_YX_xor_bit0_is_one_fail(<2 x i8> %x, <2 x i8> %C) nounwind { ; CHECK-LABEL: @sub_YX_xor_bit0_is_one_fail( ; CHECK-NEXT: [[TMP1:%.*]] = sub <2 x i8> [[X:%.*]], [[C:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i8> [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i8> [[X]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i8> [[TMP2]], ; CHECK-NEXT: ret <2 x i1> [[R]] ; @@ -163,7 +163,7 @@ define i1 @sub_YX_or_bit0_is_one_fail(i8 %x, i8 %C) nounwind { ; CHECK-LABEL: @sub_YX_or_bit0_is_one_fail( ; CHECK-NEXT: [[C1:%.*]] = xor i8 [[C:%.*]], 1 ; CHECK-NEXT: [[Y:%.*]] = sub i8 [[C1]], [[X:%.*]] -; CHECK-NEXT: [[W:%.*]] = or i8 [[Y]], [[X]] +; CHECK-NEXT: [[W:%.*]] = or i8 [[X]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[W]], 32 ; CHECK-NEXT: ret i1 [[R]] ; @@ -178,7 +178,7 @@ define i1 @add_YX_xor_bit0_is_one_fail(i8 %x, i8 %C) nounwind { ; CHECK-LABEL: @add_YX_xor_bit0_is_one_fail( ; CHECK-NEXT: [[C1:%.*]] = and i8 [[C:%.*]], 1 ; CHECK-NEXT: [[Y:%.*]] = add i8 [[C1]], [[X:%.*]] -; CHECK-NEXT: [[W:%.*]] = xor i8 [[Y]], [[X]] +; CHECK-NEXT: [[W:%.*]] = xor 
i8 [[X]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[W]], 32 ; CHECK-NEXT: ret i1 [[R]] ; @@ -193,7 +193,7 @@ define <2 x i1> @add_XY_or_bit0_is_one_fail(<2 x i8> %x, <2 x i8> %C) nounwind { ; CHECK-LABEL: @add_XY_or_bit0_is_one_fail( ; CHECK-NEXT: [[C1:%.*]] = add <2 x i8> [[C:%.*]], ; CHECK-NEXT: [[Y:%.*]] = add <2 x i8> [[C1]], [[X:%.*]] -; CHECK-NEXT: [[W:%.*]] = or <2 x i8> [[Y]], [[X]] +; CHECK-NEXT: [[W:%.*]] = or <2 x i8> [[X]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i8> [[W]], ; CHECK-NEXT: ret <2 x i1> [[R]] ; diff --git a/llvm/test/Analysis/ValueTracking/knownbits-bmi-pattern.ll b/llvm/test/Analysis/ValueTracking/knownbits-bmi-pattern.ll index 793d6ffa3e34e4..407100dec1201e 100644 --- a/llvm/test/Analysis/ValueTracking/knownbits-bmi-pattern.ll +++ b/llvm/test/Analysis/ValueTracking/knownbits-bmi-pattern.ll @@ -161,7 +161,7 @@ define i32 @blsmsk_and_eval2(i32 %x) { ; CHECK-LABEL: @blsmsk_and_eval2( ; CHECK-NEXT: [[X1:%.*]] = or i32 [[X:%.*]], 10 ; CHECK-NEXT: [[X2:%.*]] = add i32 [[X1]], 63 -; CHECK-NEXT: [[X3:%.*]] = xor i32 [[X2]], [[X]] +; CHECK-NEXT: [[X3:%.*]] = xor i32 [[X]], [[X2]] ; CHECK-NEXT: [[Z:%.*]] = and i32 [[X3]], 32 ; CHECK-NEXT: ret i32 [[Z]] ; @@ -337,7 +337,7 @@ define <2 x i1> @blsi_ge_is_false_vec(<2 x i32> %x) { ; CHECK-LABEL: @blsi_ge_is_false_vec( ; CHECK-NEXT: [[X1:%.*]] = or <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[X2:%.*]] = sub nsw <2 x i32> zeroinitializer, [[X1]] -; CHECK-NEXT: [[X3:%.*]] = and <2 x i32> [[X2]], [[X]] +; CHECK-NEXT: [[X3:%.*]] = and <2 x i32> [[X]], [[X2]] ; CHECK-NEXT: [[Z:%.*]] = icmp ugt <2 x i32> [[X3]], ; CHECK-NEXT: ret <2 x i1> [[Z]] ; @@ -352,7 +352,7 @@ define <2 x i1> @blsi_ge_is_false_diff_vec(<2 x i32> %x) { ; CHECK-LABEL: @blsi_ge_is_false_diff_vec( ; CHECK-NEXT: [[X1:%.*]] = or <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[X2:%.*]] = sub nsw <2 x i32> zeroinitializer, [[X1]] -; CHECK-NEXT: [[X3:%.*]] = and <2 x i32> [[X2]], [[X]] +; CHECK-NEXT: [[X3:%.*]] = and <2 x i32> [[X]], [[X2]] ; CHECK-NEXT: 
[[Z:%.*]] = icmp ugt <2 x i32> [[X3]], ; CHECK-NEXT: ret <2 x i1> [[Z]] ; @@ -445,7 +445,7 @@ define <2 x i32> @blsi_and_eval2_vec(<2 x i32> %x) { ; CHECK-LABEL: @blsi_and_eval2_vec( ; CHECK-NEXT: [[X1:%.*]] = or <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[X2:%.*]] = sub nsw <2 x i32> zeroinitializer, [[X1]] -; CHECK-NEXT: [[X3:%.*]] = and <2 x i32> [[X2]], [[X]] +; CHECK-NEXT: [[X3:%.*]] = and <2 x i32> [[X]], [[X2]] ; CHECK-NEXT: [[Z:%.*]] = and <2 x i32> [[X3]], ; CHECK-NEXT: ret <2 x i32> [[Z]] ; @@ -460,7 +460,7 @@ define i32 @blsi_and_eval3(i32 %x) { ; CHECK-LABEL: @blsi_and_eval3( ; CHECK-NEXT: [[X1:%.*]] = or i32 [[X:%.*]], 34 ; CHECK-NEXT: [[X2:%.*]] = sub nsw i32 0, [[X1]] -; CHECK-NEXT: [[X3:%.*]] = and i32 [[X2]], [[X]] +; CHECK-NEXT: [[X3:%.*]] = and i32 [[X]], [[X2]] ; CHECK-NEXT: [[Z:%.*]] = and i32 [[X3]], 208 ; CHECK-NEXT: ret i32 [[Z]] ; @@ -480,7 +480,7 @@ define <2 x i1> @blsi_eq_is_false_assume_vec(<2 x i32> %x) { ; CHECK-NEXT: [[CMP1:%.*]] = extractelement <2 x i1> [[CMP]], i64 1 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP1]]) ; CHECK-NEXT: [[X2:%.*]] = sub <2 x i32> zeroinitializer, [[X]] -; CHECK-NEXT: [[X3:%.*]] = and <2 x i32> [[X2]], [[X]] +; CHECK-NEXT: [[X3:%.*]] = and <2 x i32> [[X]], [[X2]] ; CHECK-NEXT: [[Z:%.*]] = icmp eq <2 x i32> [[X3]], ; CHECK-NEXT: ret <2 x i1> [[Z]] ; @@ -668,7 +668,7 @@ define i32 @blsmsk_xor_no_eval_assume(i32 %x) { ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LB]], 0 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) ; CHECK-NEXT: [[X2:%.*]] = add i32 [[X]], -1 -; CHECK-NEXT: [[X3:%.*]] = xor i32 [[X2]], [[X]] +; CHECK-NEXT: [[X3:%.*]] = xor i32 [[X]], [[X2]] ; CHECK-NEXT: [[Z:%.*]] = xor i32 [[X3]], 32 ; CHECK-NEXT: ret i32 [[Z]] ; @@ -687,7 +687,7 @@ define i32 @blsmsk_xor_no_eval_assume2(i32 %x) { ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[LB]], 0 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) ; CHECK-NEXT: [[X2:%.*]] = add nsw i32 [[X]], -1 -; CHECK-NEXT: [[X3:%.*]] = xor i32 [[X2]], [[X]] +; CHECK-NEXT: 
[[X3:%.*]] = xor i32 [[X]], [[X2]] ; CHECK-NEXT: [[Z:%.*]] = xor i32 [[X3]], 32 ; CHECK-NEXT: ret i32 [[Z]] ; diff --git a/llvm/test/Analysis/ValueTracking/phi-known-bits.ll b/llvm/test/Analysis/ValueTracking/phi-known-bits.ll index 3728e4177dd998..8691e63a4f3ee1 100644 --- a/llvm/test/Analysis/ValueTracking/phi-known-bits.ll +++ b/llvm/test/Analysis/ValueTracking/phi-known-bits.ll @@ -401,7 +401,7 @@ define i8 @phi_ugt_high_bits_and_known_todo_high_depths(i8 %xx, i8 %y, i8 %z) { ; CHECK-LABEL: @phi_ugt_high_bits_and_known_todo_high_depths( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[YY:%.*]] = and i8 [[Y:%.*]], -2 -; CHECK-NEXT: [[XXX:%.*]] = and i8 [[YY]], [[XX:%.*]] +; CHECK-NEXT: [[XXX:%.*]] = and i8 [[XX:%.*]], [[YY]] ; CHECK-NEXT: [[ZZ:%.*]] = or i8 [[Z:%.*]], 1 ; CHECK-NEXT: [[X:%.*]] = add i8 [[XXX]], [[ZZ]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[X]], -65 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index ebbab5c2b9508a..0025d23b108038 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -215,7 +215,7 @@ define half @test_powr_fast_f16(half %x, half %y) { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_log_f16_e32 v0, v0 -; CHECK-NEXT: v_mul_f16_e32 v0, v0, v1 +; CHECK-NEXT: v_mul_f16_e32 v0, v1, v0 ; CHECK-NEXT: v_exp_f16_e32 v0, v0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %powr = tail call fast half @_Z4powrDhDh(half %x, half %y) @@ -236,11 +236,11 @@ define float @test_powr_fast_f32(float %x, float %y) { ; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000 ; CHECK-NEXT: v_sub_f32_e32 v0, v0, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, v0, v1 +; CHECK-NEXT: v_mul_f32_e32 v2, v1, v0 ; CHECK-NEXT: v_mov_b32_e32 v3, 0x42800000 ; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v3, 
vcc -; CHECK-NEXT: v_fma_f32 v0, v0, v1, v2 +; CHECK-NEXT: v_fma_f32 v0, v1, v0, v2 ; CHECK-NEXT: v_exp_f32_e32 v0, v0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; CHECK-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc @@ -296,7 +296,7 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[40:41] +; CHECK-NEXT: v_mul_f64 v[0:1], v[40:41], v[0:1] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll index 6b4b0f881f3beb..acdab29e85b91a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll @@ -1074,7 +1074,7 @@ define float @test_pow_afn_f32_nnan_ninf_x_known_positive(float nofpclass(ninf n ; CHECK-LABEL: define float @test_pow_afn_f32_nnan_ninf_x_known_positive ; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[X:%.*]], float [[Y:%.*]]) { ; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[X]]) -; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y]] +; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[Y]], [[__LOG2]] ; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: ret float [[__EXP2]] ; @@ -1096,7 +1096,7 @@ define <2 x float> @test_pow_afn_v2f32_nnan_ninf_x_known_positive(<2 x float> no ; CHECK-LABEL: define <2 x float> @test_pow_afn_v2f32_nnan_ninf_x_known_positive ; CHECK-SAME: (<2 x float> nofpclass(ninf nsub nnorm) [[X:%.*]], <2 x float> [[Y:%.*]]) { ; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[X]]) -; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 
x float> [[__LOG2]], [[Y]] +; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[Y]], [[__LOG2]] ; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]]) ; CHECK-NEXT: ret <2 x float> [[__EXP2]] ; @@ -1158,7 +1158,7 @@ define double @test_pow_afn_f64_nnan_ninf_x_known_positive(double nofpclass(ninf ; CHECK-LABEL: define double @test_pow_afn_f64_nnan_ninf_x_known_positive ; CHECK-SAME: (double nofpclass(ninf nsub nnorm) [[X:%.*]], double [[Y:%.*]]) { ; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn double @_Z4log2d(double [[X]]) -; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn double [[__LOG2]], [[Y]] +; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn double [[Y]], [[__LOG2]] ; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn double @_Z4exp2d(double [[__YLOGX]]) ; CHECK-NEXT: ret double [[__EXP2]] ; @@ -1180,7 +1180,7 @@ define <2 x double> @test_pow_afn_v2f64_nnan_ninf_x_known_positive(<2 x double> ; CHECK-LABEL: define <2 x double> @test_pow_afn_v2f64_nnan_ninf_x_known_positive ; CHECK-SAME: (<2 x double> nofpclass(ninf nsub nnorm) [[X:%.*]], <2 x double> [[Y:%.*]]) { ; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x double> @_Z4log2Dv2_d(<2 x double> [[X]]) -; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x double> [[__LOG2]], [[Y]] +; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x double> [[Y]], [[__LOG2]] ; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x double> @_Z4exp2Dv2_d(<2 x double> [[__YLOGX]]) ; CHECK-NEXT: ret <2 x double> [[__EXP2]] ; @@ -1242,7 +1242,7 @@ define half @test_pow_afn_f16_nnan_ninf_x_known_positive(half nofpclass(ninf nno ; CHECK-LABEL: define half @test_pow_afn_f16_nnan_ninf_x_known_positive ; CHECK-SAME: (half nofpclass(ninf nsub nnorm) [[X:%.*]], half [[Y:%.*]]) { ; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn half @llvm.log2.f16(half [[X]]) -; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn half [[__LOG2]], [[Y]] +; CHECK-NEXT: [[__YLOGX:%.*]] = 
fmul nnan ninf afn half [[Y]], [[__LOG2]] ; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn half @llvm.exp2.f16(half [[__YLOGX]]) ; CHECK-NEXT: ret half [[__EXP2]] ; @@ -1264,7 +1264,7 @@ define <2 x half> @test_pow_afn_v2f16_nnan_ninf_x_known_positive(<2 x half> nofp ; CHECK-LABEL: define <2 x half> @test_pow_afn_v2f16_nnan_ninf_x_known_positive ; CHECK-SAME: (<2 x half> nofpclass(ninf nsub nnorm) [[X:%.*]], <2 x half> [[Y:%.*]]) { ; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x half> @llvm.log2.v2f16(<2 x half> [[X]]) -; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x half> [[__LOG2]], [[Y]] +; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x half> [[Y]], [[__LOG2]] ; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x half> @llvm.exp2.v2f16(<2 x half> [[__YLOGX]]) ; CHECK-NEXT: ret <2 x half> [[__EXP2]] ; @@ -1684,7 +1684,7 @@ define float @test_pow_afn_f32_nnan_ninf__y_3(float %x) { ; CHECK-LABEL: define float @test_pow_afn_f32_nnan_ninf__y_3 ; CHECK-SAME: (float [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn float [[X]], [[X]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[__POWX2]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[X]], [[__POWX2]] ; CHECK-NEXT: ret float [[__POWPROD]] ; %pow = tail call afn nnan ninf float @_Z3powff(float %x, float 3.0) @@ -1737,7 +1737,7 @@ define float @test_pow_afn_f32_nnan_ninf__y_5(float %x) { ; CHECK-SAME: (float [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn float [[X]], [[X]] ; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn float [[__POWX2]], [[__POWX2]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[__POWX21]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[X]], [[__POWX21]] ; CHECK-NEXT: ret float [[__POWPROD]] ; %pow = tail call afn nnan ninf float @_Z3powff(float %x, float 5.0) @@ -1759,7 +1759,7 @@ define float @test_pow_afn_f32_nnan_ninf__y_neg5(float %x) { ; CHECK-SAME: (float [[X:%.*]]) 
{ ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn float [[X]], [[X]] ; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn float [[__POWX2]], [[__POWX2]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[__POWX21]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[X]], [[__POWX21]] ; CHECK-NEXT: [[__1POWPROD:%.*]] = fdiv nnan ninf afn float 1.000000e+00, [[__POWPROD]] ; CHECK-NEXT: ret float [[__1POWPROD]] ; @@ -1793,7 +1793,7 @@ define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_3(<2 x float> %x) { ; CHECK-LABEL: define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_3 ; CHECK-SAME: (<2 x float> [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn <2 x float> [[X]], [[X]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x float> [[__POWX2]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x float> [[X]], [[__POWX2]] ; CHECK-NEXT: ret <2 x float> [[__POWPROD]] ; %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> ) @@ -1836,7 +1836,7 @@ define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_5(<2 x float> %x) { ; CHECK-SAME: (<2 x float> [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn <2 x float> [[X]], [[X]] ; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn <2 x float> [[__POWX2]], [[__POWX2]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x float> [[__POWX21]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x float> [[X]], [[__POWX21]] ; CHECK-NEXT: ret <2 x float> [[__POWPROD]] ; %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> ) @@ -1848,7 +1848,7 @@ define float @test_pow_afn_f32_nnan_ninf__y_5_known_positive(float nofpclass(nin ; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn float [[X]], [[X]] ; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn float [[__POWX2]], [[__POWX2]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf 
afn float [[__POWX21]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[X]], [[__POWX21]] ; CHECK-NEXT: ret float [[__POWPROD]] ; %pow = tail call afn nnan ninf float @_Z3powff(float %x, float 5.0) @@ -1861,7 +1861,7 @@ define float @test_pow_afn_f32_nnan_ninf__y_5_known_positive_with_ninf_flag(floa ; CHECK-SAME: (float nofpclass(nsub nnorm) [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn float [[X]], [[X]] ; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn float [[__POWX2]], [[__POWX2]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[__POWX21]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[X]], [[__POWX21]] ; CHECK-NEXT: ret float [[__POWPROD]] ; %pow = tail call afn nnan ninf float @_Z3powff(float %x, float 5.0) @@ -1882,7 +1882,7 @@ define double @test_pow_afn_f64_nnan_ninf__y_3(double %x) { ; CHECK-LABEL: define double @test_pow_afn_f64_nnan_ninf__y_3 ; CHECK-SAME: (double [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn double [[X]], [[X]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn double [[__POWX2]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn double [[X]], [[__POWX2]] ; CHECK-NEXT: ret double [[__POWPROD]] ; %pow = tail call afn nnan ninf double @_Z3powdd(double %x, double 3.0) @@ -1935,7 +1935,7 @@ define double @test_pow_afn_f64_nnan_ninf__y_5(double %x) { ; CHECK-SAME: (double [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn double [[X]], [[X]] ; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn double [[__POWX2]], [[__POWX2]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn double [[__POWX21]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn double [[X]], [[__POWX21]] ; CHECK-NEXT: ret double [[__POWPROD]] ; %pow = tail call afn nnan ninf double @_Z3powdd(double %x, double 5.0) @@ -1957,7 +1957,7 @@ define double @test_pow_afn_f64_nnan_ninf__y_neg5(double %x) { ; CHECK-SAME: (double [[X:%.*]]) { ; CHECK-NEXT: 
[[__POWX2:%.*]] = fmul nnan ninf afn double [[X]], [[X]] ; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn double [[__POWX2]], [[__POWX2]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn double [[__POWX21]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn double [[X]], [[__POWX21]] ; CHECK-NEXT: [[__1POWPROD:%.*]] = fdiv nnan ninf afn double 1.000000e+00, [[__POWPROD]] ; CHECK-NEXT: ret double [[__1POWPROD]] ; @@ -1982,7 +1982,7 @@ define <2 x double> @test_pow_afn_v2f64_nnan_ninf__y_3(<2 x double> %x) { ; CHECK-LABEL: define <2 x double> @test_pow_afn_v2f64_nnan_ninf__y_3 ; CHECK-SAME: (<2 x double> [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn <2 x double> [[X]], [[X]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x double> [[__POWX2]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x double> [[X]], [[__POWX2]] ; CHECK-NEXT: ret <2 x double> [[__POWPROD]] ; %pow = tail call afn nnan ninf <2 x double> @_Z3powDv2_dS_(<2 x double> %x, <2 x double> ) @@ -2015,7 +2015,7 @@ define <2 x double> @test_pow_afn_v2f64_nnan_ninf__y_5(<2 x double> %x) { ; CHECK-SAME: (<2 x double> [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn <2 x double> [[X]], [[X]] ; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn <2 x double> [[__POWX2]], [[__POWX2]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x double> [[__POWX21]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x double> [[X]], [[__POWX21]] ; CHECK-NEXT: ret <2 x double> [[__POWPROD]] ; %pow = tail call afn nnan ninf <2 x double> @_Z3powDv2_dS_(<2 x double> %x, <2 x double> ) @@ -2036,7 +2036,7 @@ define half @test_pow_afn_f16_nnan_ninf__y_3(half %x) { ; CHECK-LABEL: define half @test_pow_afn_f16_nnan_ninf__y_3 ; CHECK-SAME: (half [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn half [[X]], [[X]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn half [[__POWX2]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = 
fmul nnan ninf afn half [[X]], [[__POWX2]] ; CHECK-NEXT: ret half [[__POWPROD]] ; %pow = tail call afn nnan ninf half @_Z3powDhDh(half %x, half 3.0) @@ -2089,7 +2089,7 @@ define half @test_pow_afn_f16_nnan_ninf__y_5(half %x) { ; CHECK-SAME: (half [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn half [[X]], [[X]] ; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn half [[__POWX2]], [[__POWX2]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn half [[__POWX21]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn half [[X]], [[__POWX21]] ; CHECK-NEXT: ret half [[__POWPROD]] ; %pow = tail call afn nnan ninf half @_Z3powDhDh(half %x, half 5.0) @@ -2111,7 +2111,7 @@ define half @test_pow_afn_f16_nnan_ninf__y_neg5(half %x) { ; CHECK-SAME: (half [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn half [[X]], [[X]] ; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn half [[__POWX2]], [[__POWX2]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn half [[__POWX21]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn half [[X]], [[__POWX21]] ; CHECK-NEXT: [[__1POWPROD:%.*]] = fdiv nnan ninf afn half 0xH3C00, [[__POWPROD]] ; CHECK-NEXT: ret half [[__1POWPROD]] ; @@ -2136,7 +2136,7 @@ define <2 x half> @test_pow_afn_v2f16_nnan_ninf__y_3(<2 x half> %x) { ; CHECK-LABEL: define <2 x half> @test_pow_afn_v2f16_nnan_ninf__y_3 ; CHECK-SAME: (<2 x half> [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn <2 x half> [[X]], [[X]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x half> [[__POWX2]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x half> [[X]], [[__POWX2]] ; CHECK-NEXT: ret <2 x half> [[__POWPROD]] ; %pow = tail call afn nnan ninf <2 x half> @_Z3powDv2_DhS_(<2 x half> %x, <2 x half> ) @@ -2169,7 +2169,7 @@ define <2 x half> @test_pow_afn_v2f16_nnan_ninf__y_5(<2 x half> %x) { ; CHECK-SAME: (<2 x half> [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn <2 x half> [[X]], [[X]] 
; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn <2 x half> [[__POWX2]], [[__POWX2]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x half> [[__POWX21]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x half> [[X]], [[__POWX21]] ; CHECK-NEXT: ret <2 x half> [[__POWPROD]] ; %pow = tail call afn nnan ninf <2 x half> @_Z3powDv2_DhS_(<2 x half> %x, <2 x half> ) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll index 77db224af28902..bd4b86f0387666 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll @@ -850,7 +850,7 @@ define float @test_pown_afn_nnan_ninf_f32__y_3(float %x) { ; CHECK-LABEL: define float @test_pown_afn_nnan_ninf_f32__y_3 ; CHECK-SAME: (float [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn float [[X]], [[X]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[__POWX2]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[X]], [[__POWX2]] ; CHECK-NEXT: ret float [[__POWPROD]] ; %call = tail call nnan ninf afn float @_Z4pownfi(float %x, i32 3) @@ -861,7 +861,7 @@ define float @test_pown_afn_nnan_ninf_f32__y_neg3(float %x) { ; CHECK-LABEL: define float @test_pown_afn_nnan_ninf_f32__y_neg3 ; CHECK-SAME: (float [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn float [[X]], [[X]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[__POWX2]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[X]], [[__POWX2]] ; CHECK-NEXT: [[__1POWPROD:%.*]] = fdiv nnan ninf afn float 1.000000e+00, [[__POWPROD]] ; CHECK-NEXT: ret float [[__1POWPROD]] ; @@ -897,7 +897,7 @@ define float @test_pown_afn_nnan_ninf_f32__y_5(float %x) { ; CHECK-SAME: (float [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn float [[X]], [[X]] ; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn float [[__POWX2]], 
[[__POWX2]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[__POWX21]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[X]], [[__POWX21]] ; CHECK-NEXT: ret float [[__POWPROD]] ; %call = tail call nnan ninf afn float @_Z4pownfi(float %x, i32 5) @@ -909,7 +909,7 @@ define float @test_pown_afn_nnan_ninf_f32__y_neg5(float %x) { ; CHECK-SAME: (float [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn float [[X]], [[X]] ; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn float [[__POWX2]], [[__POWX2]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[__POWX21]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[X]], [[__POWX21]] ; CHECK-NEXT: [[__1POWPROD:%.*]] = fdiv nnan ninf afn float 1.000000e+00, [[__POWPROD]] ; CHECK-NEXT: ret float [[__1POWPROD]] ; @@ -921,7 +921,7 @@ define float @test_pown_afn_nnan_ninf_f32__y_7(float %x) { ; CHECK-LABEL: define float @test_pown_afn_nnan_ninf_f32__y_7 ; CHECK-SAME: (float [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn float [[X]], [[X]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[__POWX2]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[X]], [[__POWX2]] ; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn float [[__POWX2]], [[__POWX2]] ; CHECK-NEXT: [[__POWPROD2:%.*]] = fmul nnan ninf afn float [[__POWPROD]], [[__POWX21]] ; CHECK-NEXT: ret float [[__POWPROD2]] @@ -934,7 +934,7 @@ define float @test_pown_afn_nnan_ninf_f32__y_neg7(float %x) { ; CHECK-LABEL: define float @test_pown_afn_nnan_ninf_f32__y_neg7 ; CHECK-SAME: (float [[X:%.*]]) { ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn float [[X]], [[X]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[__POWX2]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn float [[X]], [[__POWX2]] ; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn float [[__POWX2]], [[__POWX2]] ; CHECK-NEXT: [[__POWPROD2:%.*]] = fmul nnan ninf afn 
float [[__POWPROD]], [[__POWX21]] ; CHECK-NEXT: [[__1POWPROD:%.*]] = fdiv nnan ninf afn float 1.000000e+00, [[__POWPROD2]] @@ -974,7 +974,7 @@ define <2 x float> @test_pown_afn_nnan_ninf_v2f32__y_3(<2 x float> %x) { ; CHECK-SAME: (<2 x float> [[X:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn <2 x float> [[X]], [[X]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x float> [[__POWX2]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x float> [[X]], [[__POWX2]] ; CHECK-NEXT: ret <2 x float> [[__POWPROD]] ; entry: @@ -1000,7 +1000,7 @@ define <2 x float> @test_pown_afn_nnan_ninf_v2f32__y_neg3(<2 x float> %x) { ; CHECK-SAME: (<2 x float> [[X:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn <2 x float> [[X]], [[X]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x float> [[__POWX2]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x float> [[X]], [[__POWX2]] ; CHECK-NEXT: [[__1POWPROD:%.*]] = fdiv nnan ninf afn <2 x float> , [[__POWPROD]] ; CHECK-NEXT: ret <2 x float> [[__1POWPROD]] ; @@ -1029,7 +1029,7 @@ define <2 x float> @test_pown_afn_nnan_ninf_v2f32__y_5(<2 x float> %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__POWX2:%.*]] = fmul nnan ninf afn <2 x float> [[X]], [[X]] ; CHECK-NEXT: [[__POWX21:%.*]] = fmul nnan ninf afn <2 x float> [[__POWX2]], [[__POWX2]] -; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x float> [[__POWX21]], [[X]] +; CHECK-NEXT: [[__POWPROD:%.*]] = fmul nnan ninf afn <2 x float> [[X]], [[__POWX21]] ; CHECK-NEXT: ret <2 x float> [[__POWPROD]] ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-powr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-powr.ll index dc4cf1d067ef18..1a92ca8960a776 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-powr.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-powr.ll @@ -26,7 +26,7 @@ define float @test_powr_fast_f32(float %x, float %y) { ; 
CHECK-LABEL: define float @test_powr_fast_f32 ; CHECK-SAME: (float [[X:%.*]], float [[Y:%.*]]) { ; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[X]]) -; CHECK-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[Y]] +; CHECK-NEXT: [[__YLOGX:%.*]] = fmul fast float [[Y]], [[__LOG2]] ; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: ret float [[__EXP2]] ; @@ -38,7 +38,7 @@ define <2 x float> @test_powr_fast_v2f32(<2 x float> %x, <2 x float> %y) { ; CHECK-LABEL: define <2 x float> @test_powr_fast_v2f32 ; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) { ; CHECK-NEXT: [[__LOG2:%.*]] = call fast <2 x float> @llvm.log2.v2f32(<2 x float> [[X]]) -; CHECK-NEXT: [[__YLOGX:%.*]] = fmul fast <2 x float> [[__LOG2]], [[Y]] +; CHECK-NEXT: [[__YLOGX:%.*]] = fmul fast <2 x float> [[Y]], [[__LOG2]] ; CHECK-NEXT: [[__EXP2:%.*]] = call fast <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]]) ; CHECK-NEXT: ret <2 x float> [[__EXP2]] ; @@ -1011,7 +1011,7 @@ define float @test_powr_afn_f32_nnan_ninf_x_known_positive(float nofpclass(ninf ; CHECK-LABEL: define float @test_powr_afn_f32_nnan_ninf_x_known_positive ; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[X:%.*]], float [[Y:%.*]]) { ; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @llvm.log2.f32(float [[X]]) -; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y]] +; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[Y]], [[__LOG2]] ; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @llvm.exp2.f32(float [[__YLOGX]]) ; CHECK-NEXT: ret float [[__EXP2]] ; @@ -1033,7 +1033,7 @@ define <2 x float> @test_powr_afn_v2f32_nnan_ninf_x_known_positive(<2 x float> n ; CHECK-LABEL: define <2 x float> @test_powr_afn_v2f32_nnan_ninf_x_known_positive ; CHECK-SAME: (<2 x float> nofpclass(ninf nsub nnorm) [[X:%.*]], <2 x float> [[Y:%.*]]) { ; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @llvm.log2.v2f32(<2 x float> [[X]]) 
-; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y]] +; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[Y]], [[__LOG2]] ; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> [[__YLOGX]]) ; CHECK-NEXT: ret <2 x float> [[__EXP2]] ; diff --git a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll index 653970cc34022a..1956f454a52bbf 100644 --- a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll +++ b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll @@ -165,8 +165,8 @@ define i16 @pr57336(i16 %end, i16 %m) mustprogress { ; CHECK: for.body: ; CHECK-NEXT: [[INC8:%.*]] = phi i16 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i16 [[INC8]], 1 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i16 [[INC8]], [[M:%.*]] -; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp sgt i16 [[MUL]], [[END:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i16 [[M:%.*]], [[INC8]] +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp slt i16 [[END:%.*]], [[MUL]] ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[CRIT_EDGE:%.*]], label [[FOR_BODY]] ; CHECK: crit_edge: ; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[END]], 1 @@ -254,7 +254,7 @@ define i32 @vscale_slt_with_vp_umin2(ptr nocapture %A, i32 %n) mustprogress vsca ; CHECK-NEXT: entry: ; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[VF:%.*]] = shl nuw nsw i32 [[VSCALE]], 2 -; CHECK-NEXT: [[CMP4:%.*]] = icmp slt i32 [[VF]], [[N:%.*]] +; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], [[VF]] ; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[EARLY_EXIT:%.*]] ; CHECK: for.body.preheader: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] diff --git a/llvm/test/Transforms/InstCombine/2004-11-27-SetCCForCastLargerAndConstant.ll b/llvm/test/Transforms/InstCombine/2004-11-27-SetCCForCastLargerAndConstant.ll index 0f50ea9dfe3169..7689fc9f682414 100644 
--- a/llvm/test/Transforms/InstCombine/2004-11-27-SetCCForCastLargerAndConstant.ll +++ b/llvm/test/Transforms/InstCombine/2004-11-27-SetCCForCastLargerAndConstant.ll @@ -272,7 +272,7 @@ define i1 @gt_unsigned_to_small_negative(i8 %SB) { define i1 @different_size_zext_zext_ugt(i7 %x, i4 %y) { ; CHECK-LABEL: @different_size_zext_zext_ugt( ; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[Y:%.*]] to i7 -; CHECK-NEXT: [[R:%.*]] = icmp ult i7 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i7 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %zx = zext i7 %x to i25 @@ -284,7 +284,7 @@ define i1 @different_size_zext_zext_ugt(i7 %x, i4 %y) { define <2 x i1> @different_size_zext_zext_ugt_commute(<2 x i4> %x, <2 x i7> %y) { ; CHECK-LABEL: @different_size_zext_zext_ugt_commute( ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i4> [[X:%.*]] to <2 x i7> -; CHECK-NEXT: [[R:%.*]] = icmp ugt <2 x i7> [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ult <2 x i7> [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[R]] ; %zx = zext <2 x i4> %x to <2 x i25> @@ -296,7 +296,7 @@ define <2 x i1> @different_size_zext_zext_ugt_commute(<2 x i4> %x, <2 x i7> %y) define i1 @different_size_zext_zext_ult(i4 %x, i7 %y) { ; CHECK-LABEL: @different_size_zext_zext_ult( ; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[X:%.*]] to i7 -; CHECK-NEXT: [[R:%.*]] = icmp ult i7 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i7 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %zx = zext i4 %x to i25 @@ -308,7 +308,7 @@ define i1 @different_size_zext_zext_ult(i4 %x, i7 %y) { define i1 @different_size_zext_zext_eq(i4 %x, i7 %y) { ; CHECK-LABEL: @different_size_zext_zext_eq( ; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[X:%.*]] to i7 -; CHECK-NEXT: [[R:%.*]] = icmp eq i7 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i7 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %zx = zext i4 %x to i25 @@ -320,7 +320,7 @@ define i1 @different_size_zext_zext_eq(i4 %x, i7 %y) { define i1 @different_size_zext_zext_ne_commute(i7 %x, i4 
%y) { ; CHECK-LABEL: @different_size_zext_zext_ne_commute( ; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[Y:%.*]] to i7 -; CHECK-NEXT: [[R:%.*]] = icmp ne i7 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ne i7 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %zx = zext i7 %x to i25 @@ -332,7 +332,7 @@ define i1 @different_size_zext_zext_ne_commute(i7 %x, i4 %y) { define i1 @different_size_zext_zext_slt(i7 %x, i4 %y) { ; CHECK-LABEL: @different_size_zext_zext_slt( ; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[Y:%.*]] to i7 -; CHECK-NEXT: [[R:%.*]] = icmp ugt i7 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ult i7 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %zx = zext i7 %x to i25 @@ -344,7 +344,7 @@ define i1 @different_size_zext_zext_slt(i7 %x, i4 %y) { define i1 @different_size_zext_zext_sgt(i7 %x, i4 %y) { ; CHECK-LABEL: @different_size_zext_zext_sgt( ; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[Y:%.*]] to i7 -; CHECK-NEXT: [[R:%.*]] = icmp ult i7 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i7 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %zx = zext i7 %x to i25 @@ -356,7 +356,7 @@ define i1 @different_size_zext_zext_sgt(i7 %x, i4 %y) { define i1 @different_size_sext_sext_sgt(i7 %x, i4 %y) { ; CHECK-LABEL: @different_size_sext_sext_sgt( ; CHECK-NEXT: [[TMP1:%.*]] = sext i4 [[Y:%.*]] to i7 -; CHECK-NEXT: [[R:%.*]] = icmp slt i7 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp sgt i7 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %sx = sext i7 %x to i25 @@ -368,7 +368,7 @@ define i1 @different_size_sext_sext_sgt(i7 %x, i4 %y) { define i1 @different_size_sext_sext_sle(i7 %x, i4 %y) { ; CHECK-LABEL: @different_size_sext_sext_sle( ; CHECK-NEXT: [[TMP1:%.*]] = sext i4 [[Y:%.*]] to i7 -; CHECK-NEXT: [[R:%.*]] = icmp sge i7 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp sle i7 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %sx = sext i7 %x to i25 @@ -380,7 +380,7 @@ define i1 @different_size_sext_sext_sle(i7 %x, i4 %y) { define i1 
@different_size_sext_sext_eq(i7 %x, i4 %y) { ; CHECK-LABEL: @different_size_sext_sext_eq( ; CHECK-NEXT: [[TMP1:%.*]] = sext i4 [[Y:%.*]] to i7 -; CHECK-NEXT: [[R:%.*]] = icmp eq i7 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i7 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %sx = sext i7 %x to i25 @@ -392,7 +392,7 @@ define i1 @different_size_sext_sext_eq(i7 %x, i4 %y) { define i1 @different_size_sext_sext_ule(i7 %x, i4 %y) { ; CHECK-LABEL: @different_size_sext_sext_ule( ; CHECK-NEXT: [[TMP1:%.*]] = sext i4 [[Y:%.*]] to i7 -; CHECK-NEXT: [[R:%.*]] = icmp uge i7 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i7 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %sx = sext i7 %x to i25 @@ -423,7 +423,7 @@ define i1 @different_size_sext_sext_ule_extra_use1(i7 %x, i4 %y) { ; CHECK-NEXT: [[SY:%.*]] = sext i4 [[Y:%.*]] to i25 ; CHECK-NEXT: call void @use(i25 [[SY]]) ; CHECK-NEXT: [[TMP1:%.*]] = sext i4 [[Y]] to i7 -; CHECK-NEXT: [[R:%.*]] = icmp uge i7 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i7 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %sx = sext i7 %x to i25 @@ -438,7 +438,7 @@ define i1 @different_size_sext_sext_ule_extra_use2(i7 %x, i4 %y) { ; CHECK-NEXT: [[SX:%.*]] = sext i7 [[X:%.*]] to i25 ; CHECK-NEXT: call void @use(i25 [[SX]]) ; CHECK-NEXT: [[TMP1:%.*]] = sext i4 [[Y:%.*]] to i7 -; CHECK-NEXT: [[R:%.*]] = icmp uge i7 [[TMP1]], [[X]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i7 [[X]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %sx = sext i7 %x to i25 diff --git a/llvm/test/Transforms/InstCombine/2010-11-23-Distributed.ll b/llvm/test/Transforms/InstCombine/2010-11-23-Distributed.ll index 70fd7274f35d49..45564cd9d95f32 100644 --- a/llvm/test/Transforms/InstCombine/2010-11-23-Distributed.ll +++ b/llvm/test/Transforms/InstCombine/2010-11-23-Distributed.ll @@ -16,7 +16,7 @@ define i32 @foo(i32 %x, i32 %y) { define i1 @bar(i64 %x, i64 %y) { ; CHECK-LABEL: @bar( ; CHECK-NEXT: [[Y1:%.*]] = xor i64 [[X:%.*]], -1 -; CHECK-NEXT: [[B:%.*]] = 
and i64 [[Y1]], [[Y:%.*]] +; CHECK-NEXT: [[B:%.*]] = and i64 [[Y:%.*]], [[Y1]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i64 [[B]], 0 ; CHECK-NEXT: ret i1 [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/abs-1.ll b/llvm/test/Transforms/InstCombine/abs-1.ll index 0cf7cd97d8ff4d..63287e59f66346 100644 --- a/llvm/test/Transforms/InstCombine/abs-1.ll +++ b/llvm/test/Transforms/InstCombine/abs-1.ll @@ -306,7 +306,7 @@ define i32 @nabs_canonical_9(i32 %a, i32 %b) { ; CHECK-LABEL: @nabs_canonical_9( ; CHECK-NEXT: [[T1:%.*]] = sub i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.abs.i32(i32 [[T1]], i1 false) -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[A]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[A]], [[TMP1]] ; CHECK-NEXT: [[ADD:%.*]] = sub i32 [[B]], [[TMP2]] ; CHECK-NEXT: ret i32 [[ADD]] ; @@ -417,7 +417,7 @@ declare void @extra_use_i1(i1) define i8 @shifty_abs_too_many_uses(i8 %x) { ; CHECK-LABEL: @shifty_abs_too_many_uses( ; CHECK-NEXT: [[SIGNBIT:%.*]] = ashr i8 [[X:%.*]], 7 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[SIGNBIT]], [[X]] +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[X]], [[SIGNBIT]] ; CHECK-NEXT: [[ABS:%.*]] = xor i8 [[ADD]], [[SIGNBIT]] ; CHECK-NEXT: call void @extra_use(i8 [[SIGNBIT]]) ; CHECK-NEXT: ret i8 [[ABS]] diff --git a/llvm/test/Transforms/InstCombine/add-mask-neg.ll b/llvm/test/Transforms/InstCombine/add-mask-neg.ll index 0e579f30976079..b72f051a0b799b 100644 --- a/llvm/test/Transforms/InstCombine/add-mask-neg.ll +++ b/llvm/test/Transforms/InstCombine/add-mask-neg.ll @@ -49,7 +49,7 @@ define i32 @dec_commute_mask_neg_i32(i32 %X) { define i32 @dec_mask_neg_multiuse_i32(i32 %X) { ; CHECK-LABEL: @dec_mask_neg_multiuse_i32( ; CHECK-NEXT: [[NEG:%.*]] = sub i32 0, [[X:%.*]] -; CHECK-NEXT: [[MASK:%.*]] = and i32 [[NEG]], [[X]] +; CHECK-NEXT: [[MASK:%.*]] = and i32 [[X]], [[NEG]] ; CHECK-NEXT: [[DEC:%.*]] = add i32 [[MASK]], -1 ; CHECK-NEXT: call void @use(i32 [[NEG]]) ; CHECK-NEXT: ret i32 [[DEC]] @@ -64,7 +64,7 @@ define i32 
@dec_mask_neg_multiuse_i32(i32 %X) { define i32 @dec_mask_multiuse_neg_i32(i32 %X) { ; CHECK-LABEL: @dec_mask_multiuse_neg_i32( ; CHECK-NEXT: [[NEG:%.*]] = sub i32 0, [[X:%.*]] -; CHECK-NEXT: [[MASK:%.*]] = and i32 [[NEG]], [[X]] +; CHECK-NEXT: [[MASK:%.*]] = and i32 [[X]], [[NEG]] ; CHECK-NEXT: [[DEC:%.*]] = add i32 [[MASK]], -1 ; CHECK-NEXT: call void @use(i32 [[MASK]]) ; CHECK-NEXT: ret i32 [[DEC]] @@ -105,7 +105,7 @@ define <2 x i32> @dec_mask_neg_v2i32_poison(<2 x i32> %X) { define <2 x i32> @dec_mask_multiuse_neg_multiuse_v2i32(<2 x i32> %X) { ; CHECK-LABEL: @dec_mask_multiuse_neg_multiuse_v2i32( ; CHECK-NEXT: [[NEG:%.*]] = sub <2 x i32> zeroinitializer, [[X:%.*]] -; CHECK-NEXT: [[MASK:%.*]] = and <2 x i32> [[NEG]], [[X]] +; CHECK-NEXT: [[MASK:%.*]] = and <2 x i32> [[X]], [[NEG]] ; CHECK-NEXT: [[DEC:%.*]] = add <2 x i32> [[MASK]], ; CHECK-NEXT: call void @usev(<2 x i32> [[NEG]]) ; CHECK-NEXT: call void @usev(<2 x i32> [[MASK]]) diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll index b1f21e58de1e2a..36da56d8441bf7 100644 --- a/llvm/test/Transforms/InstCombine/add.ll +++ b/llvm/test/Transforms/InstCombine/add.ll @@ -260,7 +260,7 @@ define i32 @test9(i32 %A) { define i1 @test10(i8 %a, i8 %b) { ; CHECK-LABEL: @test10( ; CHECK-NEXT: [[ADD:%.*]] = sub i8 0, [[B:%.*]] -; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[ADD]], [[A:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[A:%.*]], [[ADD]] ; CHECK-NEXT: ret i1 [[C]] ; %add = add i8 %a, %b @@ -271,7 +271,7 @@ define i1 @test10(i8 %a, i8 %b) { define <2 x i1> @test10vec(<2 x i8> %a, <2 x i8> %b) { ; CHECK-LABEL: @test10vec( ; CHECK-NEXT: [[C:%.*]] = sub <2 x i8> zeroinitializer, [[B:%.*]] -; CHECK-NEXT: [[D:%.*]] = icmp ne <2 x i8> [[C]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = icmp ne <2 x i8> [[A:%.*]], [[C]] ; CHECK-NEXT: ret <2 x i1> [[D]] ; %c = add <2 x i8> %a, %b @@ -302,7 +302,7 @@ define <2 x i1> @test11vec(<2 x i8> %a) { define i8 @reassoc_shl1(i8 %x, i8 %y) { ; CHECK-LABEL: 
@reassoc_shl1( ; CHECK-NEXT: [[REASS_ADD:%.*]] = shl i8 [[X:%.*]], 1 -; CHECK-NEXT: [[R:%.*]] = add i8 [[REASS_ADD]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = add i8 [[Y:%.*]], [[REASS_ADD]] ; CHECK-NEXT: ret i8 [[R]] ; %a = add i8 %y, %x @@ -313,7 +313,7 @@ define i8 @reassoc_shl1(i8 %x, i8 %y) { define <2 x i8> @reassoc_shl1_commute1(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @reassoc_shl1_commute1( ; CHECK-NEXT: [[REASS_ADD:%.*]] = shl <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = add <2 x i8> [[REASS_ADD]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = add <2 x i8> [[Y:%.*]], [[REASS_ADD]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %a = add <2 x i8> %x, %y @@ -1274,7 +1274,7 @@ define <2 x i32> @test44_vec_non_splat(<2 x i32> %A) { define i32 @lshr_add(i1 %x, i1 %y) { ; CHECK-LABEL: @lshr_add( ; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[X:%.*]], true -; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = zext i1 [[TMP2]] to i32 ; CHECK-NEXT: ret i32 [[R]] ; @@ -1288,7 +1288,7 @@ define i32 @lshr_add(i1 %x, i1 %y) { define i5 @and_add(i1 %x, i1 %y) { ; CHECK-LABEL: @and_add( ; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[X:%.*]], true -; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP2]], i5 -2, i5 0 ; CHECK-NEXT: ret i5 [[R]] ; @@ -1302,7 +1302,7 @@ define i5 @and_add(i1 %x, i1 %y) { define <2 x i8> @ashr_add_commute(<2 x i1> %x, <2 x i1> %y) { ; CHECK-LABEL: @ashr_add_commute( ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i1> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i8> ; CHECK-NEXT: ret <2 x i8> [[TMP3]] ; @@ -1656,7 +1656,7 @@ define i8 @add_and_xor_wrong_const(i8 %x, i8 %y) { define i8 @add_and_xor_wrong_op(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: 
@add_and_xor_wrong_op( ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[Z:%.*]], -1 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[XOR]], [[Y:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i8 [[Y:%.*]], [[XOR]] ; CHECK-NEXT: [[ADD:%.*]] = add i8 [[AND]], [[X:%.*]] ; CHECK-NEXT: ret i8 [[ADD]] ; @@ -1711,7 +1711,7 @@ define i8 @add_and_xor_extra_use(i8 noundef %x, i8 %y) { ; CHECK-LABEL: @add_and_xor_extra_use( ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X:%.*]], -1 ; CHECK-NEXT: call void @use(i8 [[XOR]]) -; CHECK-NEXT: [[AND:%.*]] = and i8 [[XOR]], [[Y:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i8 [[Y:%.*]], [[XOR]] ; CHECK-NEXT: call void @use(i8 [[AND]]) ; CHECK-NEXT: [[ADD:%.*]] = or i8 [[Y]], [[X]] ; CHECK-NEXT: ret i8 [[ADD]] @@ -1956,7 +1956,7 @@ define i32 @add_add_add_commute1(i32 %A, i32 %B, i32 %C, i32 %D) { define i32 @add_add_add_commute2(i32 %A, i32 %B, i32 %C, i32 %D) { ; CHECK-LABEL: @add_add_add_commute2( ; CHECK-NEXT: [[E:%.*]] = add i32 [[B:%.*]], [[A:%.*]] -; CHECK-NEXT: [[F:%.*]] = add i32 [[E]], [[C:%.*]] +; CHECK-NEXT: [[F:%.*]] = add i32 [[C:%.*]], [[E]] ; CHECK-NEXT: [[G:%.*]] = add i32 [[F]], [[D:%.*]] ; CHECK-NEXT: ret i32 [[G]] ; @@ -1969,8 +1969,8 @@ define i32 @add_add_add_commute2(i32 %A, i32 %B, i32 %C, i32 %D) { define i32 @add_add_add_commute3(i32 %A, i32 %B, i32 %C, i32 %D) { ; CHECK-LABEL: @add_add_add_commute3( ; CHECK-NEXT: [[E:%.*]] = add i32 [[B:%.*]], [[A:%.*]] -; CHECK-NEXT: [[F:%.*]] = add i32 [[E]], [[C:%.*]] -; CHECK-NEXT: [[G:%.*]] = add i32 [[F]], [[D:%.*]] +; CHECK-NEXT: [[F:%.*]] = add i32 [[C:%.*]], [[E]] +; CHECK-NEXT: [[G:%.*]] = add i32 [[D:%.*]], [[F]] ; CHECK-NEXT: ret i32 [[G]] ; %E = add i32 %B, %A @@ -1984,7 +1984,7 @@ define i32 @add_add_add_commute3(i32 %A, i32 %B, i32 %C, i32 %D) { define i8 @mul_add_common_factor_commute1(i8 %x, i8 %y) { ; CHECK-LABEL: @mul_add_common_factor_commute1( ; CHECK-NEXT: [[X1:%.*]] = add i8 [[Y:%.*]], 1 -; CHECK-NEXT: [[A:%.*]] = mul i8 [[X1]], [[X:%.*]] +; CHECK-NEXT: [[A:%.*]] = mul i8 [[X:%.*]], [[X1]] ; 
CHECK-NEXT: ret i8 [[A]] ; %m = mul nsw i8 %x, %y @@ -2078,7 +2078,7 @@ define i8 @not_mul_wrong_op(i8 %x, i8 %y) { ; CHECK-LABEL: @not_mul_wrong_op( ; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[X:%.*]], 42 ; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[MUL]], -1 -; CHECK-NEXT: [[PLUSX:%.*]] = add i8 [[NOT]], [[Y:%.*]] +; CHECK-NEXT: [[PLUSX:%.*]] = add i8 [[Y:%.*]], [[NOT]] ; CHECK-NEXT: ret i8 [[PLUSX]] ; %mul = mul i8 %x, 42 @@ -2094,7 +2094,7 @@ define i8 @not_mul_use1(i8 %x) { ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i8 [[X:%.*]], 42 ; CHECK-NEXT: call void @use(i8 [[MUL]]) ; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[MUL]], -1 -; CHECK-NEXT: [[PLUSX:%.*]] = add nsw i8 [[NOT]], [[X]] +; CHECK-NEXT: [[PLUSX:%.*]] = add nsw i8 [[X]], [[NOT]] ; CHECK-NEXT: ret i8 [[PLUSX]] ; %mul = mul nsw i8 %x, 42 @@ -2111,7 +2111,7 @@ define i8 @not_mul_use2(i8 %x) { ; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[X:%.*]], 42 ; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[MUL]], -1 ; CHECK-NEXT: call void @use(i8 [[NOT]]) -; CHECK-NEXT: [[PLUSX:%.*]] = add i8 [[NOT]], [[X]] +; CHECK-NEXT: [[PLUSX:%.*]] = add i8 [[X]], [[NOT]] ; CHECK-NEXT: ret i8 [[PLUSX]] ; %mul = mul i8 %x, 42 @@ -3395,7 +3395,7 @@ define i32 @add_reduce_sqr_sum_flipped(i32 %a, i32 %b) { define i32 @add_reduce_sqr_sum_flipped2(i32 %a, i32 %bx) { ; CHECK-LABEL: @add_reduce_sqr_sum_flipped2( ; CHECK-NEXT: [[B:%.*]] = xor i32 [[BX:%.*]], 42 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[B]], [[A:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[A:%.*]], [[B]] ; CHECK-NEXT: [[ADD:%.*]] = mul i32 [[TMP1]], [[TMP1]] ; CHECK-NEXT: ret i32 [[ADD]] ; @@ -3455,7 +3455,7 @@ define i32 @add_reduce_sqr_sum_order2_flipped(i32 %a, i32 %b) { define i32 @add_reduce_sqr_sum_order2_flipped2(i32 %a, i32 %bx) { ; CHECK-LABEL: @add_reduce_sqr_sum_order2_flipped2( ; CHECK-NEXT: [[B:%.*]] = xor i32 [[BX:%.*]], 42 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[B]], [[A:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[A:%.*]], [[B]] ; CHECK-NEXT: [[AB2:%.*]] = mul i32 [[TMP1]], [[TMP1]] ; CHECK-NEXT: ret 
i32 [[AB2]] ; @@ -3472,7 +3472,7 @@ define i32 @add_reduce_sqr_sum_order2_flipped2(i32 %a, i32 %bx) { define i32 @add_reduce_sqr_sum_order2_flipped3(i32 %a, i32 %bx) { ; CHECK-LABEL: @add_reduce_sqr_sum_order2_flipped3( ; CHECK-NEXT: [[B:%.*]] = xor i32 [[BX:%.*]], 42 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[B]], [[A:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[A:%.*]], [[B]] ; CHECK-NEXT: [[AB2:%.*]] = mul i32 [[TMP1]], [[TMP1]] ; CHECK-NEXT: ret i32 [[AB2]] ; @@ -3669,7 +3669,7 @@ define i32 @add_reduce_sqr_sum_order5_flipped2(i32 %a, i32 %b) { define i32 @add_reduce_sqr_sum_order5_flipped3(i32 %ax, i32 %b) { ; CHECK-LABEL: @add_reduce_sqr_sum_order5_flipped3( ; CHECK-NEXT: [[A:%.*]] = xor i32 [[AX:%.*]], 42 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[A]], [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[B:%.*]], [[A]] ; CHECK-NEXT: [[AB2:%.*]] = mul i32 [[TMP1]], [[TMP1]] ; CHECK-NEXT: ret i32 [[AB2]] ; @@ -4044,7 +4044,7 @@ define i32 @add_reduce_sqr_sum_varB_invalid3(i32 %a, i32 %b) { ; CHECK-NEXT: [[A_B:%.*]] = mul nsw i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TWOAB:%.*]] = shl i32 [[A_B]], 1 ; CHECK-NEXT: [[B_SQ1:%.*]] = add i32 [[A]], [[B]] -; CHECK-NEXT: [[A2_B2:%.*]] = mul i32 [[B_SQ1]], [[B]] +; CHECK-NEXT: [[A2_B2:%.*]] = mul i32 [[B]], [[B_SQ1]] ; CHECK-NEXT: [[AB2:%.*]] = add i32 [[TWOAB]], [[A2_B2]] ; CHECK-NEXT: ret i32 [[AB2]] ; @@ -4062,7 +4062,7 @@ define i32 @add_reduce_sqr_sum_varB_invalid4(i32 %a, i32 %b) { ; CHECK-NEXT: [[A_B:%.*]] = mul nsw i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TWOAB:%.*]] = shl i32 [[A_B]], 1 ; CHECK-NEXT: [[NOT_B_SQ1:%.*]] = add i32 [[A]], [[B]] -; CHECK-NEXT: [[A2_B2:%.*]] = mul i32 [[NOT_B_SQ1]], [[A]] +; CHECK-NEXT: [[A2_B2:%.*]] = mul i32 [[A]], [[NOT_B_SQ1]] ; CHECK-NEXT: [[AB2:%.*]] = add i32 [[TWOAB]], [[A2_B2]] ; CHECK-NEXT: ret i32 [[AB2]] ; diff --git a/llvm/test/Transforms/InstCombine/add2.ll b/llvm/test/Transforms/InstCombine/add2.ll index 9ebcdac77179ee..ae80ab2e92ad15 100644 --- 
a/llvm/test/Transforms/InstCombine/add2.ll +++ b/llvm/test/Transforms/InstCombine/add2.ll @@ -452,7 +452,7 @@ define i8 @add_of_mul(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @add_of_mul( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MB1:%.*]] = add i8 [[Y:%.*]], [[Z:%.*]] -; CHECK-NEXT: [[SUM:%.*]] = mul i8 [[MB1]], [[X:%.*]] +; CHECK-NEXT: [[SUM:%.*]] = mul i8 [[X:%.*]], [[MB1]] ; CHECK-NEXT: ret i8 [[SUM]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/add_or_sub.ll b/llvm/test/Transforms/InstCombine/add_or_sub.ll index 5f1234618b9a62..ef44f036b71fa4 100644 --- a/llvm/test/Transforms/InstCombine/add_or_sub.ll +++ b/llvm/test/Transforms/InstCombine/add_or_sub.ll @@ -103,7 +103,7 @@ define i12 @add_or_sub_comb_i12_multiuse_only_sub(i12 %p) { define i8 @add_or_sub_comb_i8_negative_y_sub(i8 %x, i8 %y) { ; CHECK-LABEL: @add_or_sub_comb_i8_negative_y_sub( ; CHECK-NEXT: [[SUB:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[OR:%.*]] = or i8 [[SUB]], [[X:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i8 [[X:%.*]], [[SUB]] ; CHECK-NEXT: [[ADD:%.*]] = add i8 [[OR]], [[X]] ; CHECK-NEXT: ret i8 [[ADD]] ; @@ -116,7 +116,7 @@ define i8 @add_or_sub_comb_i8_negative_y_sub(i8 %x, i8 %y) { define i8 @add_or_sub_comb_i8_negative_y_or(i8 %x, i8 %y) { ; CHECK-LABEL: @add_or_sub_comb_i8_negative_y_or( ; CHECK-NEXT: [[SUB:%.*]] = sub i8 0, [[X:%.*]] -; CHECK-NEXT: [[OR:%.*]] = or i8 [[SUB]], [[Y:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], [[SUB]] ; CHECK-NEXT: [[ADD:%.*]] = add i8 [[OR]], [[X]] ; CHECK-NEXT: ret i8 [[ADD]] ; @@ -129,7 +129,7 @@ define i8 @add_or_sub_comb_i8_negative_y_or(i8 %x, i8 %y) { define i8 @add_or_sub_comb_i8_negative_y_add(i8 %x, i8 %y) { ; CHECK-LABEL: @add_or_sub_comb_i8_negative_y_add( ; CHECK-NEXT: [[SUB:%.*]] = sub i8 0, [[X:%.*]] -; CHECK-NEXT: [[OR:%.*]] = or i8 [[SUB]], [[X]] +; CHECK-NEXT: [[OR:%.*]] = or i8 [[X]], [[SUB]] ; CHECK-NEXT: [[ADD:%.*]] = add i8 [[OR]], [[Y:%.*]] ; CHECK-NEXT: ret i8 [[ADD]] ; @@ -142,7 +142,7 @@ define i8 
@add_or_sub_comb_i8_negative_y_add(i8 %x, i8 %y) { define i8 @add_or_sub_comb_i8_negative_xor_instead_or(i8 %x) { ; CHECK-LABEL: @add_or_sub_comb_i8_negative_xor_instead_or( ; CHECK-NEXT: [[SUB:%.*]] = sub i8 0, [[X:%.*]] -; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[SUB]], [[X]] +; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X]], [[SUB]] ; CHECK-NEXT: [[ADD:%.*]] = add i8 [[XOR]], [[X]] ; CHECK-NEXT: ret i8 [[ADD]] ; diff --git a/llvm/test/Transforms/InstCombine/and-or-icmp-const-icmp.ll b/llvm/test/Transforms/InstCombine/and-or-icmp-const-icmp.ll index 9365f8281ccbd3..de5de37fe2df64 100644 --- a/llvm/test/Transforms/InstCombine/and-or-icmp-const-icmp.ll +++ b/llvm/test/Transforms/InstCombine/and-or-icmp-const-icmp.ll @@ -275,7 +275,7 @@ define i1 @ne_commuted_equal_minus_1(i8 %x, i8 %py) { ; CHECK-LABEL: define i1 @ne_commuted_equal_minus_1( ; CHECK-SAME: i8 [[X:%.*]], i8 [[PY:%.*]]) { ; CHECK-NEXT: [[Y:%.*]] = sdiv i8 42, [[PY]] -; CHECK-NEXT: [[AND:%.*]] = icmp ugt i8 [[Y]], [[X]] +; CHECK-NEXT: [[AND:%.*]] = icmp ult i8 [[X]], [[Y]] ; CHECK-NEXT: ret i1 [[AND]] ; %y = sdiv i8 42, %py ; thwart complexity-based canonicalization diff --git a/llvm/test/Transforms/InstCombine/and-or-icmps.ll b/llvm/test/Transforms/InstCombine/and-or-icmps.ll index 7d4fddc1563fed..74ef365db8d225 100644 --- a/llvm/test/Transforms/InstCombine/and-or-icmps.ll +++ b/llvm/test/Transforms/InstCombine/and-or-icmps.ll @@ -1320,7 +1320,7 @@ define i1 @bitwise_and_bitwise_and_icmps(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = and i1 [[C1]], [[TMP3]] ; CHECK-NEXT: ret i1 [[AND2]] @@ -1341,7 +1341,7 @@ define i1 @bitwise_and_bitwise_and_icmps_comm1(i8 %x, i8 %y, i8 %z) { ; 
CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = and i1 [[C1]], [[TMP3]] ; CHECK-NEXT: ret i1 [[AND2]] @@ -1362,7 +1362,7 @@ define i1 @bitwise_and_bitwise_and_icmps_comm2(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = and i1 [[TMP3]], [[C1]] ; CHECK-NEXT: ret i1 [[AND2]] @@ -1383,7 +1383,7 @@ define i1 @bitwise_and_bitwise_and_icmps_comm3(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = and i1 [[TMP3]], [[C1]] ; CHECK-NEXT: ret i1 [[AND2]] @@ -1404,7 +1404,7 @@ define i1 @bitwise_and_logical_and_icmps(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = select i1 [[C1]], i1 [[TMP3]], i1 false ; CHECK-NEXT: ret i1 [[AND2]] @@ -1425,7 +1425,7 @@ define i1 
@bitwise_and_logical_and_icmps_comm1(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = select i1 [[C1]], i1 [[TMP3]], i1 false ; CHECK-NEXT: ret i1 [[AND2]] @@ -1447,7 +1447,7 @@ define i1 @bitwise_and_logical_and_icmps_comm2(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = freeze i8 [[Z_SHIFT]] ; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP2]], [[X:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[X:%.*]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i8 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[AND2:%.*]] = select i1 [[TMP4]], i1 [[C1]], i1 false ; CHECK-NEXT: ret i1 [[AND2]] @@ -1468,7 +1468,7 @@ define i1 @bitwise_and_logical_and_icmps_comm3(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = select i1 [[TMP3]], i1 [[C1]], i1 false ; CHECK-NEXT: ret i1 [[AND2]] @@ -1489,7 +1489,7 @@ define i1 @logical_and_bitwise_and_icmps(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[X_M1:%.*]] = and i8 [[X:%.*]], 1 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] -; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[Z_SHIFT]], [[X]] +; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[X]], [[Z_SHIFT]] ; CHECK-NEXT: [[C2:%.*]] = icmp ne i8 [[X_M1]], 0 ; CHECK-NEXT: [[C3:%.*]] = icmp ne i8 [[X_M2]], 0 ; CHECK-NEXT: 
[[AND1:%.*]] = and i1 [[C1]], [[C2]] @@ -1512,7 +1512,7 @@ define i1 @logical_and_bitwise_and_icmps_comm1(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[X_M1:%.*]] = and i8 [[X:%.*]], 1 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] -; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[Z_SHIFT]], [[X]] +; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[X]], [[Z_SHIFT]] ; CHECK-NEXT: [[C2:%.*]] = icmp ne i8 [[X_M1]], 0 ; CHECK-NEXT: [[C3:%.*]] = icmp ne i8 [[X_M2]], 0 ; CHECK-NEXT: [[AND1:%.*]] = and i1 [[C1]], [[C2]] @@ -1535,7 +1535,7 @@ define i1 @logical_and_bitwise_and_icmps_comm2(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[X_M1:%.*]] = and i8 [[X:%.*]], 1 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] -; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[Z_SHIFT]], [[X]] +; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[X]], [[Z_SHIFT]] ; CHECK-NEXT: [[C2:%.*]] = icmp ne i8 [[X_M1]], 0 ; CHECK-NEXT: [[C3:%.*]] = icmp ne i8 [[X_M2]], 0 ; CHECK-NEXT: [[AND1:%.*]] = and i1 [[C2]], [[C1]] @@ -1558,7 +1558,7 @@ define i1 @logical_and_bitwise_and_icmps_comm3(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[X_M1:%.*]] = and i8 [[X:%.*]], 1 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] -; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[Z_SHIFT]], [[X]] +; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[X]], [[Z_SHIFT]] ; CHECK-NEXT: [[C2:%.*]] = icmp ne i8 [[X_M1]], 0 ; CHECK-NEXT: [[C3:%.*]] = icmp ne i8 [[X_M2]], 0 ; CHECK-NEXT: [[AND1:%.*]] = and i1 [[C2]], [[C1]] @@ -1581,7 +1581,7 @@ define i1 @logical_and_logical_and_icmps(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[X_M1:%.*]] = and i8 [[X:%.*]], 1 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] -; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[Z_SHIFT]], [[X]] +; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[X]], [[Z_SHIFT]] ; CHECK-NEXT: [[C2:%.*]] = icmp ne i8 [[X_M1]], 0 ; CHECK-NEXT: 
[[C3:%.*]] = icmp ne i8 [[X_M2]], 0 ; CHECK-NEXT: [[AND1:%.*]] = select i1 [[C1]], i1 [[C2]], i1 false @@ -1604,7 +1604,7 @@ define i1 @logical_and_logical_and_icmps_comm1(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[X_M1:%.*]] = and i8 [[X:%.*]], 1 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] -; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[Z_SHIFT]], [[X]] +; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[X]], [[Z_SHIFT]] ; CHECK-NEXT: [[C2:%.*]] = icmp ne i8 [[X_M1]], 0 ; CHECK-NEXT: [[C3:%.*]] = icmp ne i8 [[X_M2]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[C3]], i1 [[C1]], i1 false @@ -1627,7 +1627,7 @@ define i1 @logical_and_logical_and_icmps_comm2(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[X_M1:%.*]] = and i8 [[X:%.*]], 1 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] -; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[Z_SHIFT]], [[X]] +; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[X]], [[Z_SHIFT]] ; CHECK-NEXT: [[C2:%.*]] = icmp ne i8 [[X_M1]], 0 ; CHECK-NEXT: [[C3:%.*]] = icmp ne i8 [[X_M2]], 0 ; CHECK-NEXT: [[AND1:%.*]] = select i1 [[C2]], i1 [[C1]], i1 false @@ -1650,7 +1650,7 @@ define i1 @logical_and_logical_and_icmps_comm3(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = select i1 [[TMP3]], i1 [[C1]], i1 false ; CHECK-NEXT: ret i1 [[AND2]] @@ -1671,7 +1671,7 @@ define i1 @bitwise_or_bitwise_or_icmps(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: 
[[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = or i1 [[C1]], [[TMP3]] ; CHECK-NEXT: ret i1 [[OR2]] @@ -1692,7 +1692,7 @@ define i1 @bitwise_or_bitwise_or_icmps_comm1(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = or i1 [[C1]], [[TMP3]] ; CHECK-NEXT: ret i1 [[OR2]] @@ -1713,7 +1713,7 @@ define i1 @bitwise_or_bitwise_or_icmps_comm2(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = or i1 [[TMP3]], [[C1]] ; CHECK-NEXT: ret i1 [[OR2]] @@ -1734,7 +1734,7 @@ define i1 @bitwise_or_bitwise_or_icmps_comm3(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = or i1 [[TMP3]], [[C1]] ; CHECK-NEXT: ret i1 [[OR2]] @@ -1755,7 +1755,7 @@ define i1 @bitwise_or_logical_or_icmps(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: 
[[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = select i1 [[C1]], i1 true, i1 [[TMP3]] ; CHECK-NEXT: ret i1 [[OR2]] @@ -1776,7 +1776,7 @@ define i1 @bitwise_or_logical_or_icmps_comm1(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = select i1 [[C1]], i1 true, i1 [[TMP3]] ; CHECK-NEXT: ret i1 [[OR2]] @@ -1798,7 +1798,7 @@ define i1 @bitwise_or_logical_or_icmps_comm2(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = freeze i8 [[Z_SHIFT]] ; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP2]], [[X:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[X:%.*]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i8 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[OR2:%.*]] = select i1 [[TMP4]], i1 true, i1 [[C1]] ; CHECK-NEXT: ret i1 [[OR2]] @@ -1819,7 +1819,7 @@ define i1 @bitwise_or_logical_or_icmps_comm3(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = select i1 [[TMP3]], i1 true, i1 [[C1]] ; CHECK-NEXT: ret i1 [[OR2]] @@ -1840,7 +1840,7 @@ define i1 @logical_or_bitwise_or_icmps(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[X_M1:%.*]] = and i8 [[X:%.*]], 1 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] -; CHECK-NEXT: 
[[X_M2:%.*]] = and i8 [[Z_SHIFT]], [[X]] +; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[X]], [[Z_SHIFT]] ; CHECK-NEXT: [[C2:%.*]] = icmp eq i8 [[X_M1]], 0 ; CHECK-NEXT: [[C3:%.*]] = icmp eq i8 [[X_M2]], 0 ; CHECK-NEXT: [[OR1:%.*]] = or i1 [[C1]], [[C2]] @@ -1863,7 +1863,7 @@ define i1 @logical_or_bitwise_or_icmps_comm1(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[X_M1:%.*]] = and i8 [[X:%.*]], 1 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] -; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[Z_SHIFT]], [[X]] +; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[X]], [[Z_SHIFT]] ; CHECK-NEXT: [[C2:%.*]] = icmp eq i8 [[X_M1]], 0 ; CHECK-NEXT: [[C3:%.*]] = icmp eq i8 [[X_M2]], 0 ; CHECK-NEXT: [[OR1:%.*]] = or i1 [[C1]], [[C2]] @@ -1886,7 +1886,7 @@ define i1 @logical_or_bitwise_or_icmps_comm2(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[X_M1:%.*]] = and i8 [[X:%.*]], 1 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] -; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[Z_SHIFT]], [[X]] +; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[X]], [[Z_SHIFT]] ; CHECK-NEXT: [[C2:%.*]] = icmp eq i8 [[X_M1]], 0 ; CHECK-NEXT: [[C3:%.*]] = icmp eq i8 [[X_M2]], 0 ; CHECK-NEXT: [[OR1:%.*]] = or i1 [[C2]], [[C1]] @@ -1909,7 +1909,7 @@ define i1 @logical_or_bitwise_or_icmps_comm3(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[X_M1:%.*]] = and i8 [[X:%.*]], 1 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] -; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[Z_SHIFT]], [[X]] +; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[X]], [[Z_SHIFT]] ; CHECK-NEXT: [[C2:%.*]] = icmp eq i8 [[X_M1]], 0 ; CHECK-NEXT: [[C3:%.*]] = icmp eq i8 [[X_M2]], 0 ; CHECK-NEXT: [[OR1:%.*]] = or i1 [[C2]], [[C1]] @@ -1932,7 +1932,7 @@ define i1 @logical_or_logical_or_icmps(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[X_M1:%.*]] = and i8 [[X:%.*]], 1 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 
1, [[Z:%.*]] -; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[Z_SHIFT]], [[X]] +; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[X]], [[Z_SHIFT]] ; CHECK-NEXT: [[C2:%.*]] = icmp eq i8 [[X_M1]], 0 ; CHECK-NEXT: [[C3:%.*]] = icmp eq i8 [[X_M2]], 0 ; CHECK-NEXT: [[OR1:%.*]] = select i1 [[C1]], i1 true, i1 [[C2]] @@ -1955,7 +1955,7 @@ define i1 @logical_or_logical_or_icmps_comm1(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[X_M1:%.*]] = and i8 [[X:%.*]], 1 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] -; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[Z_SHIFT]], [[X]] +; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[X]], [[Z_SHIFT]] ; CHECK-NEXT: [[C2:%.*]] = icmp eq i8 [[X_M1]], 0 ; CHECK-NEXT: [[C3:%.*]] = icmp eq i8 [[X_M2]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[C3]], i1 true, i1 [[C1]] @@ -1978,7 +1978,7 @@ define i1 @logical_or_logical_or_icmps_comm2(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[X_M1:%.*]] = and i8 [[X:%.*]], 1 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] -; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[Z_SHIFT]], [[X]] +; CHECK-NEXT: [[X_M2:%.*]] = and i8 [[X]], [[Z_SHIFT]] ; CHECK-NEXT: [[C2:%.*]] = icmp eq i8 [[X_M1]], 0 ; CHECK-NEXT: [[C3:%.*]] = icmp eq i8 [[X_M2]], 0 ; CHECK-NEXT: [[OR1:%.*]] = select i1 [[C2]], i1 true, i1 [[C1]] @@ -2001,7 +2001,7 @@ define i1 @logical_or_logical_or_icmps_comm3(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[Y:%.*]], 42 ; CHECK-NEXT: [[Z_SHIFT:%.*]] = shl nuw i8 1, [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[Z_SHIFT]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i8 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = select i1 [[TMP3]], i1 true, i1 [[C1]] ; CHECK-NEXT: ret i1 [[OR2]] @@ -2052,7 +2052,7 @@ define i1 @bitwise_and_logical_and_masked_icmp_allzeros(i1 %c, i32 %x) { define i1 
@bitwise_and_logical_and_masked_icmp_allzeros_poison1(i1 %c, i32 %x, i32 %y) { ; CHECK-LABEL: @bitwise_and_logical_and_masked_icmp_allzeros_poison1( ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[Y:%.*]], 7 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 ; CHECK-NEXT: [[AND2:%.*]] = select i1 [[TMP3]], i1 [[C:%.*]], i1 false ; CHECK-NEXT: ret i1 [[AND2]] @@ -2104,7 +2104,7 @@ define i1 @bitwise_and_logical_and_masked_icmp_allones(i1 %c, i32 %x) { define i1 @bitwise_and_logical_and_masked_icmp_allones_poison1(i1 %c, i32 %x, i32 %y) { ; CHECK-LABEL: @bitwise_and_logical_and_masked_icmp_allones_poison1( ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[Y:%.*]], 7 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = select i1 [[TMP3]], i1 [[C:%.*]], i1 false ; CHECK-NEXT: ret i1 [[AND2]] @@ -3118,8 +3118,8 @@ entry: define i1 @icmp_eq_or_z_or_pow2orz(i8 %x, i8 %y) { ; CHECK-LABEL: @icmp_eq_or_z_or_pow2orz( ; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[POW2ORZ:%.*]] = and i8 [[NY]], [[Y]] -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[POW2ORZ]], [[X:%.*]] +; CHECK-NEXT: [[POW2ORZ:%.*]] = and i8 [[Y]], [[NY]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], [[POW2ORZ]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP1]], [[X]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -3136,8 +3136,8 @@ define i1 @icmp_eq_or_z_or_pow2orz(i8 %x, i8 %y) { define i1 @icmp_eq_or_z_or_pow2orz_logical(i8 %x, i8 %y) { ; CHECK-LABEL: @icmp_eq_or_z_or_pow2orz_logical( ; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[POW2ORZ:%.*]] = and i8 [[NY]], [[Y]] -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[POW2ORZ]], [[X:%.*]] +; CHECK-NEXT: [[POW2ORZ:%.*]] = and i8 [[Y]], [[NY]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], [[POW2ORZ]] ; CHECK-NEXT: 
[[R:%.*]] = icmp eq i8 [[TMP1]], [[X]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -3155,9 +3155,9 @@ define i1 @icmp_eq_or_z_or_pow2orz_logical(i8 %x, i8 %y) { define i1 @icmp_eq_or_z_or_pow2orz_fail_multiuse(i8 %x, i8 %y) { ; CHECK-LABEL: @icmp_eq_or_z_or_pow2orz_fail_multiuse( ; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[POW2ORZ:%.*]] = and i8 [[NY]], [[Y]] +; CHECK-NEXT: [[POW2ORZ:%.*]] = and i8 [[Y]], [[NY]] ; CHECK-NEXT: [[C0:%.*]] = icmp eq i8 [[X:%.*]], 0 -; CHECK-NEXT: [[CP2:%.*]] = icmp eq i8 [[POW2ORZ]], [[X]] +; CHECK-NEXT: [[CP2:%.*]] = icmp eq i8 [[X]], [[POW2ORZ]] ; CHECK-NEXT: call void @use(i1 [[C0]]) ; CHECK-NEXT: [[R:%.*]] = or i1 [[C0]], [[CP2]] ; CHECK-NEXT: ret i1 [[R]] @@ -3176,9 +3176,9 @@ define i1 @icmp_eq_or_z_or_pow2orz_fail_multiuse(i8 %x, i8 %y) { define i1 @icmp_eq_or_z_or_pow2orz_fail_logic_or(i8 %x, i8 %y) { ; CHECK-LABEL: @icmp_eq_or_z_or_pow2orz_fail_logic_or( ; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[POW2ORZ:%.*]] = and i8 [[NY]], [[Y]] +; CHECK-NEXT: [[POW2ORZ:%.*]] = and i8 [[Y]], [[NY]] ; CHECK-NEXT: [[C0:%.*]] = icmp eq i8 [[X:%.*]], 0 -; CHECK-NEXT: [[CP2:%.*]] = icmp eq i8 [[POW2ORZ]], [[X]] +; CHECK-NEXT: [[CP2:%.*]] = icmp eq i8 [[X]], [[POW2ORZ]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[C0]], i1 true, i1 [[CP2]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -3195,8 +3195,8 @@ define i1 @icmp_eq_or_z_or_pow2orz_fail_logic_or(i8 %x, i8 %y) { define <2 x i1> @icmp_ne_and_z_and_pow2orz(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @icmp_ne_and_z_and_pow2orz( ; CHECK-NEXT: [[NY:%.*]] = sub <2 x i8> zeroinitializer, [[Y:%.*]] -; CHECK-NEXT: [[POW2ORZ:%.*]] = and <2 x i8> [[NY]], [[Y]] -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[POW2ORZ]], [[X:%.*]] +; CHECK-NEXT: [[POW2ORZ:%.*]] = and <2 x i8> [[Y]], [[NY]] +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[X:%.*]], [[POW2ORZ]] ; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i8> [[TMP1]], [[X]] ; CHECK-NEXT: ret <2 x i1> [[R]] ; @@ -3226,9 +3226,9 @@ define i1 
@icmp_ne_and_z_and_onefail(i8 %x) { define i1 @icmp_ne_and_z_and_pow2orz_fail_multiuse1(i8 %x, i8 %y) { ; CHECK-LABEL: @icmp_ne_and_z_and_pow2orz_fail_multiuse1( ; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[POW2ORZ:%.*]] = and i8 [[NY]], [[Y]] +; CHECK-NEXT: [[POW2ORZ:%.*]] = and i8 [[Y]], [[NY]] ; CHECK-NEXT: [[C0:%.*]] = icmp eq i8 [[X:%.*]], 0 -; CHECK-NEXT: [[CP2:%.*]] = icmp eq i8 [[POW2ORZ]], [[X]] +; CHECK-NEXT: [[CP2:%.*]] = icmp eq i8 [[X]], [[POW2ORZ]] ; CHECK-NEXT: call void @use(i1 [[C0]]) ; CHECK-NEXT: [[R:%.*]] = or i1 [[C0]], [[CP2]] ; CHECK-NEXT: ret i1 [[R]] @@ -3247,9 +3247,9 @@ define i1 @icmp_ne_and_z_and_pow2orz_fail_multiuse1(i8 %x, i8 %y) { define <2 x i1> @icmp_ne_and_z_and_pow2orz_fail_logic_and(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @icmp_ne_and_z_and_pow2orz_fail_logic_and( ; CHECK-NEXT: [[NY:%.*]] = sub <2 x i8> zeroinitializer, [[Y:%.*]] -; CHECK-NEXT: [[POW2ORZ:%.*]] = and <2 x i8> [[NY]], [[Y]] +; CHECK-NEXT: [[POW2ORZ:%.*]] = and <2 x i8> [[Y]], [[NY]] ; CHECK-NEXT: [[C0:%.*]] = icmp ne <2 x i8> [[X:%.*]], zeroinitializer -; CHECK-NEXT: [[CP2:%.*]] = icmp ne <2 x i8> [[POW2ORZ]], [[X]] +; CHECK-NEXT: [[CP2:%.*]] = icmp ne <2 x i8> [[X]], [[POW2ORZ]] ; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[C0]], <2 x i1> [[CP2]], <2 x i1> zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[R]] ; @@ -3267,7 +3267,7 @@ define i1 @icmp_eq_or_z_or_pow2orz_fail_not_pow2(i8 %x, i8 %y) { ; CHECK-NEXT: [[NY:%.*]] = sub i8 1, [[Y:%.*]] ; CHECK-NEXT: [[POW2ORZ:%.*]] = and i8 [[NY]], [[Y]] ; CHECK-NEXT: [[C0:%.*]] = icmp eq i8 [[X:%.*]], 0 -; CHECK-NEXT: [[CP2:%.*]] = icmp eq i8 [[POW2ORZ]], [[X]] +; CHECK-NEXT: [[CP2:%.*]] = icmp eq i8 [[X]], [[POW2ORZ]] ; CHECK-NEXT: [[R:%.*]] = or i1 [[C0]], [[CP2]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -3283,9 +3283,9 @@ define i1 @icmp_eq_or_z_or_pow2orz_fail_not_pow2(i8 %x, i8 %y) { define i1 @icmp_eq_or_z_or_pow2orz_fail_nonzero_const(i8 %x, i8 %y) { ; CHECK-LABEL: 
@icmp_eq_or_z_or_pow2orz_fail_nonzero_const( ; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[POW2ORZ:%.*]] = and i8 [[NY]], [[Y]] +; CHECK-NEXT: [[POW2ORZ:%.*]] = and i8 [[Y]], [[NY]] ; CHECK-NEXT: [[C0:%.*]] = icmp eq i8 [[X:%.*]], 1 -; CHECK-NEXT: [[CP2:%.*]] = icmp eq i8 [[POW2ORZ]], [[X]] +; CHECK-NEXT: [[CP2:%.*]] = icmp eq i8 [[X]], [[POW2ORZ]] ; CHECK-NEXT: [[R:%.*]] = or i1 [[C0]], [[CP2]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -3301,8 +3301,8 @@ define i1 @icmp_eq_or_z_or_pow2orz_fail_nonzero_const(i8 %x, i8 %y) { define <2 x i1> @icmp_ne_and_z_and_pow2orz_fail_bad_pred(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @icmp_ne_and_z_and_pow2orz_fail_bad_pred( ; CHECK-NEXT: [[NY:%.*]] = sub <2 x i8> zeroinitializer, [[Y:%.*]] -; CHECK-NEXT: [[POW2ORZ:%.*]] = and <2 x i8> [[NY]], [[Y]] -; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i8> [[POW2ORZ]], [[X:%.*]] +; CHECK-NEXT: [[POW2ORZ:%.*]] = and <2 x i8> [[Y]], [[NY]] +; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i8> [[X:%.*]], [[POW2ORZ]] ; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i8> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[R]] ; @@ -3318,9 +3318,9 @@ define <2 x i1> @icmp_ne_and_z_and_pow2orz_fail_bad_pred(<2 x i8> %x, <2 x i8> % define i1 @icmp_eq_or_z_or_pow2orz_fail_bad_pred2(i8 %x, i8 %y) { ; CHECK-LABEL: @icmp_eq_or_z_or_pow2orz_fail_bad_pred2( ; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[POW2ORZ:%.*]] = and i8 [[NY]], [[Y]] +; CHECK-NEXT: [[POW2ORZ:%.*]] = and i8 [[Y]], [[NY]] ; CHECK-NEXT: [[C0:%.*]] = icmp slt i8 [[X:%.*]], 1 -; CHECK-NEXT: [[CP2:%.*]] = icmp sge i8 [[POW2ORZ]], [[X]] +; CHECK-NEXT: [[CP2:%.*]] = icmp sle i8 [[X]], [[POW2ORZ]] ; CHECK-NEXT: [[R:%.*]] = or i1 [[C0]], [[CP2]] ; CHECK-NEXT: ret i1 [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/and-or-not.ll b/llvm/test/Transforms/InstCombine/and-or-not.ll index 2e351c30ea1f7b..5e6c480df5d103 100644 --- a/llvm/test/Transforms/InstCombine/and-or-not.ll +++ b/llvm/test/Transforms/InstCombine/and-or-not.ll @@ -506,8 
+506,8 @@ define i64 @PR32830(i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: @PR32830( ; CHECK-NEXT: [[NOTA:%.*]] = xor i64 [[A:%.*]], -1 ; CHECK-NEXT: [[NOTB:%.*]] = xor i64 [[B:%.*]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i64 [[NOTB]], [[A]] -; CHECK-NEXT: [[OR2:%.*]] = or i64 [[NOTA]], [[C:%.*]] +; CHECK-NEXT: [[OR1:%.*]] = or i64 [[A]], [[NOTB]] +; CHECK-NEXT: [[OR2:%.*]] = or i64 [[C:%.*]], [[NOTA]] ; CHECK-NEXT: [[AND:%.*]] = and i64 [[OR1]], [[OR2]] ; CHECK-NEXT: ret i64 [[AND]] ; @@ -813,7 +813,7 @@ define i4 @reduce_xor_common_op_commute1(i4 %x, i4 %y, i4 %z) { define i4 @annihilate_xor_common_op_commute2(i4 %x, i4 %y, i4 %p, i4 %q) { ; CHECK-LABEL: @annihilate_xor_common_op_commute2( ; CHECK-NEXT: [[Z:%.*]] = mul i4 [[P:%.*]], [[P]] -; CHECK-NEXT: [[TMP1:%.*]] = xor i4 [[Z]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i4 [[Y:%.*]], [[Z]] ; CHECK-NEXT: [[TMP2:%.*]] = xor i4 [[TMP1]], [[Q:%.*]] ; CHECK-NEXT: ret i4 [[TMP2]] ; @@ -828,8 +828,8 @@ define i4 @annihilate_xor_common_op_commute2(i4 %x, i4 %y, i4 %p, i4 %q) { define <2 x i4> @reduce_xor_common_op_commute3(<2 x i4> %x, <2 x i4> %y, <2 x i4> %p) { ; CHECK-LABEL: @reduce_xor_common_op_commute3( ; CHECK-NEXT: [[Z:%.*]] = mul <2 x i4> [[P:%.*]], [[P]] -; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i4> [[Z]], [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = or <2 x i4> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i4> [[Y:%.*]], [[Z]] +; CHECK-NEXT: [[R:%.*]] = or <2 x i4> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i4> [[R]] ; %z = mul <2 x i4> %p, %p ; thwart complexity-based canonicalization diff --git a/llvm/test/Transforms/InstCombine/and-or.ll b/llvm/test/Transforms/InstCombine/and-or.ll index b4ef27607121d2..fee055a2e12451 100644 --- a/llvm/test/Transforms/InstCombine/and-or.ll +++ b/llvm/test/Transforms/InstCombine/and-or.ll @@ -385,7 +385,7 @@ define i8 @or_or_and_noOneUse(i8 %a, i8 %b, i8 %c, i8 %d) { ; CHECK-NEXT: call void @use(i8 [[AND1]]) ; CHECK-NEXT: [[AND2:%.*]] = and i8 [[A]], [[D:%.*]] ; CHECK-NEXT: 
call void @use(i8 [[AND2]]) -; CHECK-NEXT: [[OR1:%.*]] = or i8 [[AND2]], [[C:%.*]] +; CHECK-NEXT: [[OR1:%.*]] = or i8 [[C:%.*]], [[AND2]] ; CHECK-NEXT: call void @use(i8 [[OR1]]) ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[OR1]], [[AND1]] ; CHECK-NEXT: ret i8 [[OR2]] @@ -405,7 +405,7 @@ define i8 @or_or_and_pat1(i8 %a, i8 %b, i8 %c, i8 %d) { ; CHECK-LABEL: @or_or_and_pat1( ; CHECK-NEXT: [[CT:%.*]] = udiv i8 42, [[C:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[D:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[CT]], [[TMP2]] ; CHECK-NEXT: ret i8 [[OR2]] ; @@ -439,7 +439,7 @@ define i8 @or_or_and_pat3(i8 %a, i8 %b, i8 %c, i8 %d) { ; CHECK-LABEL: @or_or_and_pat3( ; CHECK-NEXT: [[CT:%.*]] = udiv i8 42, [[C:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[D:%.*]], [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[CT]], [[TMP2]] ; CHECK-NEXT: ret i8 [[OR2]] ; @@ -472,7 +472,7 @@ define i8 @or_or_and_pat4(i8 %a, i8 %b, i8 %c, i8 %d) { define i8 @or_or_and_pat5(i8 %a, i8 %b, i8 %c, i8 %d) { ; CHECK-LABEL: @or_or_and_pat5( ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[D:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[TMP2]], [[C:%.*]] ; CHECK-NEXT: ret i8 [[OR2]] ; @@ -502,7 +502,7 @@ define i8 @or_or_and_pat6(i8 %a, i8 %b, i8 %c, i8 %d) { define i8 @or_or_and_pat7(i8 %a, i8 %b, i8 %c, i8 %d) { ; CHECK-LABEL: @or_or_and_pat7( ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[D:%.*]], [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[TMP2]], [[C:%.*]] ; CHECK-NEXT: ret i8 [[OR2]] ; @@ -535,7 +535,7 @@ define i8 @or_and_or_noOneUse(i8 %a, i8 %b, i8 %c, i8 %d) { ; 
CHECK-NEXT: call void @use(i8 [[AND1]]) ; CHECK-NEXT: [[AND2:%.*]] = and i8 [[A]], [[D:%.*]] ; CHECK-NEXT: call void @use(i8 [[AND2]]) -; CHECK-NEXT: [[OR1:%.*]] = or i8 [[AND2]], [[C:%.*]] +; CHECK-NEXT: [[OR1:%.*]] = or i8 [[C:%.*]], [[AND2]] ; CHECK-NEXT: call void @use(i8 [[OR1]]) ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[AND1]], [[OR1]] ; CHECK-NEXT: ret i8 [[OR2]] @@ -555,7 +555,7 @@ define i8 @or_and_or_pat1(i8 %a, i8 %b, i8 %c, i8 %d) { ; CHECK-LABEL: @or_and_or_pat1( ; CHECK-NEXT: [[CT:%.*]] = udiv i8 42, [[C:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[D:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[CT]], [[TMP2]] ; CHECK-NEXT: ret i8 [[OR2]] ; @@ -589,7 +589,7 @@ define i8 @or_and_or_pat3(i8 %a, i8 %b, i8 %c, i8 %d) { ; CHECK-LABEL: @or_and_or_pat3( ; CHECK-NEXT: [[CT:%.*]] = udiv i8 42, [[C:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[D:%.*]], [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[CT]], [[TMP2]] ; CHECK-NEXT: ret i8 [[OR2]] ; @@ -622,7 +622,7 @@ define i8 @or_and_or_pat4(i8 %a, i8 %b, i8 %c, i8 %d) { define i8 @or_and_or_pat5(i8 %a, i8 %b, i8 %c, i8 %d) { ; CHECK-LABEL: @or_and_or_pat5( ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[D:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[TMP2]], [[C:%.*]] ; CHECK-NEXT: ret i8 [[OR2]] ; @@ -652,7 +652,7 @@ define i8 @or_and_or_pat6(i8 %a, i8 %b, i8 %c, i8 %d) { define i8 @or_and_or_pat7(i8 %a, i8 %b, i8 %c, i8 %d) { ; CHECK-LABEL: @or_and_or_pat7( ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[D:%.*]], [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[TMP2]], [[C:%.*]] ; 
CHECK-NEXT: ret i8 [[OR2]] ; @@ -687,8 +687,8 @@ define i32 @or_or_and_noOneUse_fail1(i32 %a, i32 %b) { ; CHECK-NEXT: call void @use2(i32 [[AND]]) ; CHECK-NEXT: [[AND1:%.*]] = or i32 [[B:%.*]], 157 ; CHECK-NEXT: [[OR:%.*]] = and i32 [[SHR]], [[AND1]] -; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[B]], 23 -; CHECK-NEXT: [[AND9:%.*]] = and i32 [[TMP1]], 157 +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[B]], 23 +; CHECK-NEXT: [[AND9:%.*]] = and i32 [[SHR8]], 157 ; CHECK-NEXT: [[R:%.*]] = or i32 [[OR]], [[AND9]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -714,7 +714,7 @@ define { i1, i1, i1, i1, i1 } @or_or_and_noOneUse_fail2(i1 %a_0, i1 %a_1, i1 %a_ ; CHECK-NEXT: [[TMP3:%.*]] = and i1 [[A_1:%.*]], [[B_1:%.*]] ; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true ; CHECK-NEXT: [[TMP5:%.*]] = and i1 [[TMP0]], [[A_1]] -; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[A_1]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[A_1]], [[TMP2]] ; CHECK-NEXT: [[TMP7:%.*]] = and i1 [[TMP6]], [[B_1]] ; CHECK-NEXT: [[D:%.*]] = or i1 [[TMP7]], [[TMP5]] ; CHECK-NEXT: [[DOTNOT1:%.*]] = or i1 [[TMP1]], [[TMP3]] diff --git a/llvm/test/Transforms/InstCombine/and-xor-merge.ll b/llvm/test/Transforms/InstCombine/and-xor-merge.ll index 80bdf67525faae..cf1285cbc11a47 100644 --- a/llvm/test/Transforms/InstCombine/and-xor-merge.ll +++ b/llvm/test/Transforms/InstCombine/and-xor-merge.ll @@ -5,7 +5,7 @@ define i32 @test1(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: [[T61:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[T7:%.*]] = and i32 [[T61]], [[Z:%.*]] +; CHECK-NEXT: [[T7:%.*]] = and i32 [[Z:%.*]], [[T61]] ; CHECK-NEXT: ret i32 [[T7]] ; %t3 = and i32 %z, %x diff --git a/llvm/test/Transforms/InstCombine/and-xor-or.ll b/llvm/test/Transforms/InstCombine/and-xor-or.ll index b26d6e16c2db27..3dbf9af7e19343 100644 --- a/llvm/test/Transforms/InstCombine/and-xor-or.ll +++ b/llvm/test/Transforms/InstCombine/and-xor-or.ll @@ -339,8 +339,8 @@ define i64 @and_xor_or_negative(i64 %x, i64 %y, i64 %z, i64 %w) { ; 
CHECK-LABEL: define {{[^@]+}}@and_xor_or_negative ; CHECK-SAME: (i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[Z:%.*]], i64 [[W:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[Y]], [[X]] -; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], [[Z]] -; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP2]], [[W]] +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[Z]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[W]], [[TMP2]] ; CHECK-NEXT: ret i64 [[TMP3]] ; %1 = and i64 %y, %x @@ -585,7 +585,7 @@ define i64 @sext_or_chain(i64 %a, i16 %b, i16 %c) { ; CHECK-SAME: (i64 [[A:%.*]], i16 [[B:%.*]], i16 [[C:%.*]]) { ; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[B]] to i64 ; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[C]] to i64 -; CHECK-NEXT: [[OR:%.*]] = or i64 [[CONV]], [[A]] +; CHECK-NEXT: [[OR:%.*]] = or i64 [[A]], [[CONV]] ; CHECK-NEXT: [[OR2:%.*]] = or i64 [[OR]], [[CONV2]] ; CHECK-NEXT: ret i64 [[OR2]] ; @@ -601,7 +601,7 @@ define i64 @zext_or_chain(i64 %a, i16 %b, i16 %c) { ; CHECK-SAME: (i64 [[A:%.*]], i16 [[B:%.*]], i16 [[C:%.*]]) { ; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[B]] to i64 ; CHECK-NEXT: [[CONV2:%.*]] = zext i16 [[C]] to i64 -; CHECK-NEXT: [[OR:%.*]] = or i64 [[CONV]], [[A]] +; CHECK-NEXT: [[OR:%.*]] = or i64 [[A]], [[CONV]] ; CHECK-NEXT: [[OR2:%.*]] = or i64 [[OR]], [[CONV2]] ; CHECK-NEXT: ret i64 [[OR2]] ; @@ -617,7 +617,7 @@ define i64 @sext_and_chain(i64 %a, i16 %b, i16 %c) { ; CHECK-SAME: (i64 [[A:%.*]], i16 [[B:%.*]], i16 [[C:%.*]]) { ; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[B]] to i64 ; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[C]] to i64 -; CHECK-NEXT: [[AND:%.*]] = and i64 [[CONV]], [[A]] +; CHECK-NEXT: [[AND:%.*]] = and i64 [[A]], [[CONV]] ; CHECK-NEXT: [[AND2:%.*]] = and i64 [[AND]], [[CONV2]] ; CHECK-NEXT: ret i64 [[AND2]] ; @@ -633,7 +633,7 @@ define i64 @zext_and_chain(i64 %a, i16 %b, i16 %c) { ; CHECK-SAME: (i64 [[A:%.*]], i16 [[B:%.*]], i16 [[C:%.*]]) { ; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[B]] to i64 ; CHECK-NEXT: [[CONV2:%.*]] = zext i16 [[C]] to i64 -; CHECK-NEXT: [[AND:%.*]] = and i64 
[[CONV]], [[A]] +; CHECK-NEXT: [[AND:%.*]] = and i64 [[A]], [[CONV]] ; CHECK-NEXT: [[AND2:%.*]] = and i64 [[AND]], [[CONV2]] ; CHECK-NEXT: ret i64 [[AND2]] ; @@ -649,7 +649,7 @@ define i64 @sext_xor_chain(i64 %a, i16 %b, i16 %c) { ; CHECK-SAME: (i64 [[A:%.*]], i16 [[B:%.*]], i16 [[C:%.*]]) { ; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[B]] to i64 ; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[C]] to i64 -; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[CONV]], [[A]] +; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[A]], [[CONV]] ; CHECK-NEXT: [[XOR2:%.*]] = xor i64 [[XOR]], [[CONV2]] ; CHECK-NEXT: ret i64 [[XOR2]] ; @@ -665,7 +665,7 @@ define i64 @zext_xor_chain(i64 %a, i16 %b, i16 %c) { ; CHECK-SAME: (i64 [[A:%.*]], i16 [[B:%.*]], i16 [[C:%.*]]) { ; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[B]] to i64 ; CHECK-NEXT: [[CONV2:%.*]] = zext i16 [[C]] to i64 -; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[CONV]], [[A]] +; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[A]], [[CONV]] ; CHECK-NEXT: [[XOR2:%.*]] = xor i64 [[XOR]], [[CONV2]] ; CHECK-NEXT: ret i64 [[XOR2]] ; @@ -682,7 +682,7 @@ define i64 @sext_or_chain_two_uses1(i64 %a, i16 %b, i16 %c, i64 %d) { ; CHECK-SAME: (i64 [[A:%.*]], i16 [[B:%.*]], i16 [[C:%.*]], i64 [[D:%.*]]) { ; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[B]] to i64 ; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[C]] to i64 -; CHECK-NEXT: [[OR:%.*]] = or i64 [[CONV]], [[A]] +; CHECK-NEXT: [[OR:%.*]] = or i64 [[A]], [[CONV]] ; CHECK-NEXT: [[OR2:%.*]] = or i64 [[OR]], [[CONV2]] ; CHECK-NEXT: [[USE:%.*]] = udiv i64 [[OR]], [[D]] ; CHECK-NEXT: [[RETVAL:%.*]] = udiv i64 [[OR2]], [[USE]] @@ -702,7 +702,7 @@ define i64 @sext_or_chain_two_uses2(i64 %a, i16 %b, i16 %c, i64 %d) { ; CHECK-SAME: (i64 [[A:%.*]], i16 [[B:%.*]], i16 [[C:%.*]], i64 [[D:%.*]]) { ; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[B]] to i64 ; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[C]] to i64 -; CHECK-NEXT: [[OR:%.*]] = or i64 [[CONV]], [[A]] +; CHECK-NEXT: [[OR:%.*]] = or i64 [[A]], [[CONV]] ; CHECK-NEXT: [[OR2:%.*]] = or i64 [[OR]], [[CONV2]] ; CHECK-NEXT: 
[[USE1:%.*]] = udiv i64 [[OR2]], [[D]] ; CHECK-NEXT: [[USE2:%.*]] = udiv i64 [[OR2]], [[USE1]] @@ -761,7 +761,7 @@ define i32 @not_and_and_not_commute1(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[B]], [[C]] ; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], -1 -; CHECK-NEXT: [[AND2:%.*]] = and i32 [[TMP2]], [[A]] +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[A]], [[TMP2]] ; CHECK-NEXT: ret i32 [[AND2]] ; %not1 = xor i32 %b, -1 @@ -856,7 +856,7 @@ define i32 @not_or_or_not_commute1(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[B]], [[C]] ; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], -1 -; CHECK-NEXT: [[OR2:%.*]] = or i32 [[TMP2]], [[A]] +; CHECK-NEXT: [[OR2:%.*]] = or i32 [[A]], [[TMP2]] ; CHECK-NEXT: ret i32 [[OR2]] ; %not1 = xor i32 %b, -1 @@ -952,7 +952,7 @@ define i32 @or_not_and_commute2(i32 %a, i32 %b0, i32 %c) { ; CHECK-LABEL: define {{[^@]+}}@or_not_and_commute2 ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B0:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[B:%.*]] = sdiv i32 42, [[B0]] -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[C]], [[B]] ; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[OR3:%.*]] = and i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret i32 [[OR3]] @@ -990,7 +990,7 @@ define i32 @or_not_and_commute4(i32 %a, i32 %b, i32 %c0) { ; CHECK-LABEL: define {{[^@]+}}@or_not_and_commute4 ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C0:%.*]]) { ; CHECK-NEXT: [[C:%.*]] = sdiv i32 42, [[C0]] -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[C]], [[B]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[OR3:%.*]] = and i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret i32 [[OR3]] @@ -1011,7 +1011,7 @@ define i32 @or_not_and_commute5(i32 %a0, i32 %b, i32 %c0) { ; CHECK-SAME: (i32 [[A0:%.*]], i32 [[B:%.*]], i32 [[C0:%.*]]) { 
; CHECK-NEXT: [[A:%.*]] = sdiv i32 42, [[A0]] ; CHECK-NEXT: [[C:%.*]] = sdiv i32 42, [[C0]] -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[C]], [[B]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[OR3:%.*]] = and i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret i32 [[OR3]] @@ -1137,10 +1137,10 @@ define i32 @or_not_and_extra_not_use2(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[B]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR1]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT1]], [[C]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[C]], [[NOT1]] ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[OR2]], -1 -; CHECK-NEXT: [[AND2:%.*]] = and i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[OR3:%.*]] = or i32 [[AND1]], [[AND2]] ; CHECK-NEXT: call void @use(i32 [[NOT2]]) ; CHECK-NEXT: ret i32 [[OR3]] @@ -1161,7 +1161,7 @@ define i32 @or_not_and_extra_and_use1(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[B]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR1]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT1]], [[C]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[C]], [[NOT1]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[OR3:%.*]] = and i32 [[TMP1]], [[TMP2]] @@ -1184,10 +1184,10 @@ define i32 @or_not_and_extra_and_use2(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[B]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR1]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT1]], [[C]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[C]], [[NOT1]] ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[OR2]], -1 -; CHECK-NEXT: [[AND2:%.*]] = 
and i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[OR3:%.*]] = or i32 [[AND1]], [[AND2]] ; CHECK-NEXT: call void @use(i32 [[AND2]]) ; CHECK-NEXT: ret i32 [[OR3]] @@ -1250,10 +1250,10 @@ define i32 @or_not_and_wrong_c(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[B]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR1]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT1]], [[C]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[C]], [[NOT1]] ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[A]], [[D]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[OR2]], -1 -; CHECK-NEXT: [[AND2:%.*]] = and i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[OR3:%.*]] = or i32 [[AND1]], [[AND2]] ; CHECK-NEXT: ret i32 [[OR3]] ; @@ -1272,10 +1272,10 @@ define i32 @or_not_and_wrong_b(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[B]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR1]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT1]], [[C]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[C]], [[NOT1]] ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[OR2]], -1 -; CHECK-NEXT: [[AND2:%.*]] = and i32 [[NOT2]], [[D]] +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[D]], [[NOT2]] ; CHECK-NEXT: [[OR3:%.*]] = or i32 [[AND1]], [[AND2]] ; CHECK-NEXT: ret i32 [[OR3]] ; @@ -1333,7 +1333,7 @@ define i32 @and_not_or_commute2(i32 %a, i32 %b0, i32 %c) { ; CHECK-LABEL: define {{[^@]+}}@and_not_or_commute2 ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B0:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[B:%.*]] = sdiv i32 42, [[B0]] -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[C]], [[B]] ; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[TMP2]], -1 ; CHECK-NEXT: ret i32 
[[AND3]] @@ -1371,7 +1371,7 @@ define i32 @and_not_or_commute4(i32 %a, i32 %b, i32 %c0) { ; CHECK-LABEL: define {{[^@]+}}@and_not_or_commute4 ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C0:%.*]]) { ; CHECK-NEXT: [[C:%.*]] = sdiv i32 42, [[C0]] -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[C]], [[B]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[TMP2]], -1 ; CHECK-NEXT: ret i32 [[AND3]] @@ -1392,7 +1392,7 @@ define i32 @and_not_or_commute5(i32 %a0, i32 %b, i32 %c0) { ; CHECK-SAME: (i32 [[A0:%.*]], i32 [[B:%.*]], i32 [[C0:%.*]]) { ; CHECK-NEXT: [[A:%.*]] = sdiv i32 42, [[A0]] ; CHECK-NEXT: [[C:%.*]] = sdiv i32 42, [[C0]] -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[C]], [[B]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[TMP2]], -1 ; CHECK-NEXT: ret i32 [[AND3]] @@ -1518,10 +1518,10 @@ define i32 @and_not_or_extra_not_use2(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[A]], [[B]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT1]], [[C]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[C]], [[NOT1]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[AND2]], -1 -; CHECK-NEXT: [[OR2:%.*]] = or i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[OR2:%.*]] = or i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[AND3:%.*]] = and i32 [[OR1]], [[OR2]] ; CHECK-NEXT: call void @use(i32 [[NOT2]]) ; CHECK-NEXT: ret i32 [[AND3]] @@ -1542,7 +1542,7 @@ define i32 @and_not_or_extra_and_use1(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[A]], [[B]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT1]], [[C]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[C]], [[NOT1]] 
; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[TMP2]], -1 @@ -1565,10 +1565,10 @@ define i32 @and_not_or_extra_and_use2(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[A]], [[B]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT1]], [[C]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[C]], [[NOT1]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[AND2]], -1 -; CHECK-NEXT: [[OR2:%.*]] = or i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[OR2:%.*]] = or i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[AND3:%.*]] = and i32 [[OR1]], [[OR2]] ; CHECK-NEXT: call void @use(i32 [[OR2]]) ; CHECK-NEXT: ret i32 [[AND3]] @@ -1631,10 +1631,10 @@ define i32 @and_not_or_wrong_c(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[A]], [[B]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT1]], [[C]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[C]], [[NOT1]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[A]], [[D]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[AND2]], -1 -; CHECK-NEXT: [[OR2:%.*]] = or i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[OR2:%.*]] = or i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[AND3:%.*]] = and i32 [[OR1]], [[OR2]] ; CHECK-NEXT: ret i32 [[AND3]] ; @@ -1653,10 +1653,10 @@ define i32 @and_not_or_wrong_b(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[A]], [[B]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT1]], [[C]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[C]], [[NOT1]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[AND2]], -1 -; CHECK-NEXT: [[OR2:%.*]] = 
or i32 [[NOT2]], [[D]] +; CHECK-NEXT: [[OR2:%.*]] = or i32 [[D]], [[NOT2]] ; CHECK-NEXT: [[AND3:%.*]] = and i32 [[OR1]], [[OR2]] ; CHECK-NEXT: ret i32 [[AND3]] ; @@ -1693,7 +1693,7 @@ define i32 @or_and_not_not_commute1(i32 %a, i32 %b0, i32 %c) { ; CHECK-LABEL: define {{[^@]+}}@or_and_not_not_commute1 ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B0:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[B:%.*]] = sdiv i32 42, [[B0]] -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[B]], [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[C]], [[B]] ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[A]] ; CHECK-NEXT: [[OR3:%.*]] = xor i32 [[TMP2]], -1 ; CHECK-NEXT: ret i32 [[OR3]] @@ -1780,7 +1780,7 @@ define i32 @or_and_not_not_commute6(i32 %a, i32 %b0, i32 %c) { ; CHECK-LABEL: define {{[^@]+}}@or_and_not_not_commute6 ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B0:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[B:%.*]] = sdiv i32 42, [[B0]] -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[B]], [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[C]], [[B]] ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[A]] ; CHECK-NEXT: [[OR3:%.*]] = xor i32 [[TMP2]], -1 ; CHECK-NEXT: ret i32 [[OR3]] @@ -1819,7 +1819,7 @@ define i32 @or_and_not_not_extra_not_use1(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR1]], -1 ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[OR2]], -1 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[OR3:%.*]] = or i32 [[AND]], [[NOT1]] ; CHECK-NEXT: call void @use(i32 [[NOT1]]) ; CHECK-NEXT: ret i32 [[OR3]] @@ -1860,7 +1860,7 @@ define i32 @or_and_not_not_extra_and_use(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[OR2]], -1 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[C]], [[B]] 
; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[A]] ; CHECK-NEXT: [[OR3:%.*]] = xor i32 [[TMP2]], -1 @@ -1884,7 +1884,7 @@ define i32 @or_and_not_not_extra_or_use1(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR1]], -1 ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[OR2]], -1 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[OR3:%.*]] = or i32 [[AND]], [[NOT1]] ; CHECK-NEXT: call void @use(i32 [[OR1]]) ; CHECK-NEXT: ret i32 [[OR3]] @@ -1929,7 +1929,7 @@ define i32 @or_and_not_not_2_extra_uses(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR1]], -1 ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[OR2]], -1 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[B]], [[NOT2]] ; CHECK-NEXT: call void @use(i32 [[AND]]) ; CHECK-NEXT: [[OR3:%.*]] = or i32 [[AND]], [[NOT1]] ; CHECK-NEXT: ret i32 [[OR3]] @@ -1952,7 +1952,7 @@ define i32 @or_and_not_not_wrong_a(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR1]], -1 ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[OR2]], -1 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[OR3:%.*]] = or i32 [[AND]], [[NOT1]] ; CHECK-NEXT: ret i32 [[OR3]] ; @@ -1972,7 +1972,7 @@ define i32 @or_and_not_not_wrong_b(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR1]], -1 ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[OR2]], -1 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[OR3:%.*]] = or i32 [[AND]], [[NOT1]] ; CHECK-NEXT: ret i32 [[OR3]] ; @@ -2008,7 +2008,7 @@ define i32 @and_or_not_not_commute1(i32 %a, i32 %b0, i32 %c) { ; CHECK-LABEL: define 
{{[^@]+}}@and_or_not_not_commute1 ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B0:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[B:%.*]] = sdiv i32 42, [[B0]] -; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[B]], [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[C]], [[B]] ; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[TMP2]], -1 ; CHECK-NEXT: ret i32 [[AND3]] @@ -2095,7 +2095,7 @@ define i32 @and_or_not_not_commute6(i32 %a, i32 %b0, i32 %c) { ; CHECK-LABEL: define {{[^@]+}}@and_or_not_not_commute6 ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B0:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[B:%.*]] = sdiv i32 42, [[B0]] -; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[B]], [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[C]], [[B]] ; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[TMP2]], -1 ; CHECK-NEXT: ret i32 [[AND3]] @@ -2134,7 +2134,7 @@ define i32 @and_or_not_not_extra_not_use1(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[AND2]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[AND1]], [[OR]] ; CHECK-NEXT: call void @use(i32 [[NOT1]]) ; CHECK-NEXT: ret i32 [[AND3]] @@ -2175,7 +2175,7 @@ define i32 @and_or_not_not_extra_and_use(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[AND2]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[C]], [[B]] ; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[TMP2]], -1 @@ -2198,7 +2198,7 @@ define i32 @and_or_not_not_extra_or_use1(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[A]] ; CHECK-NEXT: 
[[AND2:%.*]] = and i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[AND2]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[AND1]], [[OR]] ; CHECK-NEXT: call void @use(i32 [[AND1]]) ; CHECK-NEXT: ret i32 [[AND3]] @@ -2240,7 +2240,7 @@ define i32 @and_or_not_not_2_extra_uses(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: call void @use(i32 [[AND1]]) ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[AND2]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[B]], [[NOT2]] ; CHECK-NEXT: call void @use(i32 [[OR]]) ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[AND1]], [[OR]] ; CHECK-NEXT: ret i32 [[AND3]] @@ -2262,7 +2262,7 @@ define i32 @and_or_not_not_wrong_a(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[D]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[AND2]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[AND1]], [[OR]] ; CHECK-NEXT: ret i32 [[AND3]] ; @@ -2282,7 +2282,7 @@ define i32 @and_or_not_not_wrong_b(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[A]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[AND2]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[AND3:%.*]] = and i32 [[OR]], [[NOT1]] ; CHECK-NEXT: ret i32 [[AND3]] ; @@ -2471,7 +2471,7 @@ define i32 @and_not_or_or_not_or_xor_use3(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[B]], [[C]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR1]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT1]], [[A]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[A]], [[NOT1]] ; 
CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[XOR1]], [[A]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[OR2]], -1 @@ -2539,7 +2539,7 @@ define i32 @and_not_or_or_not_or_xor_use6(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[B]], [[C]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR1]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT1]], [[A]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[A]], [[NOT1]] ; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[XOR1]], [[A]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[OR2]], -1 @@ -2567,7 +2567,7 @@ define i32 @or_not_and_and_not_and_xor(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[C]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT1]], [[A]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[NOT1]] ; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[XOR1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[AND2]], [[OR1]] @@ -2588,7 +2588,7 @@ define i32 @or_not_and_and_not_and_xor_commute1(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[C]], [[B]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT1]], [[A]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[NOT1]] ; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[XOR1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[AND2]], [[OR1]] @@ -2632,7 +2632,7 @@ define i32 @or_not_and_and_not_and_xor_commute3(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[C]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT1]], [[A]] +; 
CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[NOT1]] ; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[C]], [[B]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[XOR1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[AND2]], [[OR1]] @@ -2676,7 +2676,7 @@ define i32 @or_not_and_and_not_and_xor_commute5(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[C]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT1]], [[A]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[NOT1]] ; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[XOR1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[AND2]], [[OR1]] @@ -2697,7 +2697,7 @@ define i32 @or_not_and_and_not_and_xor_use1(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[C]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT1]], [[A]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[NOT1]] ; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[XOR1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[AND2]], [[OR1]] @@ -2720,7 +2720,7 @@ define i32 @or_not_and_and_not_and_xor_use2(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[C]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT1]], [[A]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[NOT1]] ; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[XOR1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[AND2]], [[OR1]] @@ -2743,7 +2743,7 @@ define i32 @or_not_and_and_not_and_xor_use3(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[C]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 
[[AND1]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT1]], [[A]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[NOT1]] ; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[XOR1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[AND2]], [[OR1]] @@ -2766,7 +2766,7 @@ define i32 @or_not_and_and_not_and_xor_use4(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[C]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT1]], [[A]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[NOT1]] ; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[XOR1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[AND2]], [[OR1]] @@ -2789,7 +2789,7 @@ define i32 @or_not_and_and_not_and_xor_use5(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[C]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT1]], [[A]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[NOT1]] ; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[XOR1]], [[A]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[AND2]], [[OR1]] @@ -2812,7 +2812,7 @@ define i32 @or_not_and_and_not_and_xor_use6(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[C]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT1]], [[A]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[NOT1]] ; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[B]], [[C]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[XOR1]], [[A]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[AND2]], -1 @@ -2965,7 +2965,7 @@ define i32 @not_and_and_or_not_or_or_commute3(i32 %a, i32 %b0, i32 %c) { ; CHECK-LABEL: define {{[^@]+}}@not_and_and_or_not_or_or_commute3 ; CHECK-SAME: (i32 
[[A:%.*]], i32 [[B0:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[B:%.*]] = sdiv i32 42, [[B0]] -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[C]], [[B]] ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[A]] ; CHECK-NEXT: [[OR3:%.*]] = xor i32 [[TMP2]], -1 ; CHECK-NEXT: ret i32 [[OR3]] @@ -3051,7 +3051,7 @@ define i32 @not_and_and_or_not_or_or_use3(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[OR1]], [[C]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR2]], -1 ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[AND1]], [[C]] ; CHECK-NEXT: [[OR3:%.*]] = or i32 [[AND2]], [[NOT1]] ; CHECK-NEXT: call void @use(i32 [[NOT1]]) @@ -3093,7 +3093,7 @@ define i32 @not_and_and_or_not_or_or_use5(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: define {{[^@]+}}@not_and_and_or_not_or_or_use5 ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[C]], [[B]] ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[A]] ; CHECK-NEXT: [[OR3:%.*]] = xor i32 [[TMP2]], -1 @@ -3118,7 +3118,7 @@ define i32 @not_and_and_or_not_or_or_use6(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[OR1]], [[C]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR2]], -1 ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[AND1]], [[C]] ; CHECK-NEXT: [[OR3:%.*]] = or i32 [[AND2]], [[NOT1]] ; CHECK-NEXT: call void @use(i32 [[AND2]]) @@ -3270,7 +3270,7 @@ define i32 @not_or_or_and_not_and_and_commute3(i32 %a, i32 %b0, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B0:%.*]], i32 [[C:%.*]]) { ; 
CHECK-NEXT: [[B:%.*]] = sdiv i32 42, [[B0]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[C]], [[B]] ; CHECK-NEXT: [[AND3:%.*]] = or i32 [[TMP1]], [[NOT2]] ; CHECK-NEXT: ret i32 [[AND3]] ; @@ -3355,7 +3355,7 @@ define i32 @not_or_or_and_not_and_and_use3(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[AND1]], [[C]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND2]], -1 ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[OR1]], [[C]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[AND2]], [[OR2]] ; CHECK-NEXT: call void @use(i32 [[NOT1]]) @@ -3396,7 +3396,7 @@ define i32 @not_or_or_and_not_and_and_use5(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: define {{[^@]+}}@not_or_or_and_not_and_and_use5 ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[C]], [[B]] ; CHECK-NEXT: [[AND3:%.*]] = or i32 [[TMP1]], [[NOT2]] ; CHECK-NEXT: call void @use(i32 [[OR1]]) @@ -3419,7 +3419,7 @@ define i32 @not_or_or_and_not_and_and_use6(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[A]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[AND1]], [[C]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[OR1]], [[C]] ; CHECK-NEXT: [[AND3:%.*]] = xor i32 [[AND2]], [[OR2]] ; CHECK-NEXT: call void @use(i32 [[OR2]]) @@ -3443,7 +3443,7 @@ define i32 @not_and_and_or_no_or(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = 
xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = and i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: ret i32 [[OR2]] ; @@ -3461,7 +3461,7 @@ define i32 @not_and_and_or_no_or_commute1_and(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = and i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: ret i32 [[OR2]] ; @@ -3479,7 +3479,7 @@ define i32 @not_and_and_or_no_or_commute2_and(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = and i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: ret i32 [[OR2]] ; @@ -3497,7 +3497,7 @@ define i32 @not_and_and_or_no_or_commute1(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = and i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: ret i32 [[OR2]] ; @@ -3516,7 +3516,7 @@ define i32 @not_and_and_or_no_or_commute2(i32 %a, i32 %b0, i32 %c) { ; CHECK-NEXT: [[B:%.*]] = sdiv i32 42, [[B0]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = and i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: ret i32 [[OR2]] ; @@ -3555,7 +3555,7 @@ define i32 
@not_and_and_or_no_or_use1(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = and i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: call void @use(i32 [[NOT2]]) ; CHECK-NEXT: ret i32 [[OR2]] @@ -3575,7 +3575,7 @@ define i32 @not_and_and_or_no_or_use2(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = and i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: call void @use(i32 [[NOT2]]) ; CHECK-NEXT: ret i32 [[OR2]] @@ -3595,7 +3595,7 @@ define i32 @not_and_and_or_no_or_use3(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = and i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: call void @use(i32 [[NOT2]]) ; CHECK-NEXT: ret i32 [[OR2]] @@ -3615,7 +3615,7 @@ define i32 @not_and_and_or_no_or_use4(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = and i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: call void @use(i32 [[NOT2]]) ; CHECK-NEXT: ret i32 [[OR2]] @@ -3636,7 +3636,7 @@ define i32 @not_and_and_or_no_or_use5(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[B]], 
[[A]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR1]], -1 ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[NOT2]], [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[C]], [[NOT2]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[TMP1]], [[B]] ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[AND2]], [[NOT1]] ; CHECK-NEXT: call void @use(i32 [[OR1]]) @@ -3658,7 +3658,7 @@ define i32 @not_and_and_or_no_or_use6(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[B]], [[A]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR1]], -1 ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[NOT2]], [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[C]], [[NOT2]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[TMP1]], [[B]] ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[AND2]], [[NOT1]] ; CHECK-NEXT: call void @use(i32 [[NOT1]]) @@ -3678,9 +3678,9 @@ define i32 @not_and_and_or_no_or_use7(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: define {{[^@]+}}@not_and_and_or_no_or_use7 ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[OR2:%.*]] = and i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: call void @use(i32 [[AND1]]) ; CHECK-NEXT: ret i32 [[OR2]] @@ -3701,7 +3701,7 @@ define i32 @not_and_and_or_no_or_use8(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[B]], [[A]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[OR1]], -1 ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[NOT2]], [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[C]], [[NOT2]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[TMP1]], [[B]] ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[AND2]], [[NOT1]] ; CHECK-NEXT: call void @use(i32 [[AND2]]) @@ -3724,7 +3724,7 @@ 
define i32 @not_or_or_and_no_and(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = or i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: ret i32 [[AND2]] ; @@ -3742,7 +3742,7 @@ define i32 @not_or_or_and_no_and_commute1_or(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = or i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: ret i32 [[AND2]] ; @@ -3760,7 +3760,7 @@ define i32 @not_or_or_and_no_and_commute2_or(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = or i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: ret i32 [[AND2]] ; @@ -3778,7 +3778,7 @@ define i32 @not_or_or_and_no_and_commute1(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = or i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: ret i32 [[AND2]] ; @@ -3797,7 +3797,7 @@ define i32 @not_or_or_and_no_and_commute2(i32 %a, i32 %b0, i32 %c) { ; CHECK-NEXT: [[B:%.*]] = sdiv i32 42, [[B0]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: 
[[TMP2:%.*]] = and i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = or i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: ret i32 [[AND2]] ; @@ -3836,7 +3836,7 @@ define i32 @not_or_or_and_no_and_use1(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = or i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: call void @use(i32 [[NOT2]]) ; CHECK-NEXT: ret i32 [[AND2]] @@ -3856,7 +3856,7 @@ define i32 @not_or_or_and_no_and_use2(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = or i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: call void @use(i32 [[NOT2]]) ; CHECK-NEXT: ret i32 [[AND2]] @@ -3876,7 +3876,7 @@ define i32 @not_or_or_and_no_and_use3(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = or i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: call void @use(i32 [[NOT2]]) ; CHECK-NEXT: ret i32 [[AND2]] @@ -3896,7 +3896,7 @@ define i32 @not_or_or_and_no_and_use4(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = or 
i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: call void @use(i32 [[NOT2]]) ; CHECK-NEXT: ret i32 [[AND2]] @@ -3916,7 +3916,7 @@ define i32 @not_or_or_and_no_and_use5(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[A]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[NOT2]], [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[C]], [[NOT2]] ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[TMP1]], [[B]] ; CHECK-NEXT: [[AND2:%.*]] = xor i32 [[AND1]], [[OR2]] ; CHECK-NEXT: call void @use(i32 [[AND1]]) @@ -3938,7 +3938,7 @@ define i32 @not_or_or_and_no_and_use6(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[A]] ; CHECK-NEXT: [[NOT1:%.*]] = xor i32 [[AND1]], -1 ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[NOT2]], [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[C]], [[NOT2]] ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[TMP1]], [[B]] ; CHECK-NEXT: [[AND2:%.*]] = xor i32 [[AND1]], [[OR2]] ; CHECK-NEXT: call void @use(i32 [[NOT1]]) @@ -3958,9 +3958,9 @@ define i32 @not_or_or_and_no_and_use7(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: define {{[^@]+}}@not_or_or_and_no_and_use7 ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[NOT2]], [[B]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[B]], [[NOT2]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[C]], [[TMP1]] ; CHECK-NEXT: [[AND2:%.*]] = or i32 [[TMP2]], [[NOT2]] ; CHECK-NEXT: call void @use(i32 [[OR1]]) ; CHECK-NEXT: ret i32 [[AND2]] @@ -3980,7 +3980,7 @@ define i32 @not_or_or_and_no_and_use8(i32 %a, i32 %b, i32 %c) { ; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) { ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B]], [[A]] ; CHECK-NEXT: [[NOT2:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = 
or i32 [[NOT2]], [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[C]], [[NOT2]] ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[TMP1]], [[B]] ; CHECK-NEXT: [[AND2:%.*]] = xor i32 [[AND1]], [[OR2]] ; CHECK-NEXT: call void @use(i32 [[OR2]]) @@ -4000,7 +4000,7 @@ define i4 @and_orn_xor(i4 %a, i4 %b) { ; CHECK-LABEL: define {{[^@]+}}@and_orn_xor ; CHECK-SAME: (i4 [[A:%.*]], i4 [[B:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = xor i4 [[A]], -1 -; CHECK-NEXT: [[R:%.*]] = and i4 [[TMP1]], [[B]] +; CHECK-NEXT: [[R:%.*]] = and i4 [[B]], [[TMP1]] ; CHECK-NEXT: ret i4 [[R]] ; %xor = xor i4 %a, %b @@ -4014,7 +4014,7 @@ define <2 x i4> @and_orn_xor_commute1(<2 x i4> %a, <2 x i4> %b) { ; CHECK-LABEL: define {{[^@]+}}@and_orn_xor_commute1 ; CHECK-SAME: (<2 x i4> [[A:%.*]], <2 x i4> [[B:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i4> [[A]], -; CHECK-NEXT: [[R:%.*]] = and <2 x i4> [[TMP1]], [[B]] +; CHECK-NEXT: [[R:%.*]] = and <2 x i4> [[B]], [[TMP1]] ; CHECK-NEXT: ret <2 x i4> [[R]] ; %xor = xor <2 x i4> %a, %b @@ -4030,7 +4030,7 @@ define i32 @and_orn_xor_commute2(i32 %a, i32 %b) { ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[B]], [[A]] ; CHECK-NEXT: call void @use(i32 [[XOR]]) ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[R:%.*]] = and i32 [[TMP1]], [[B]] +; CHECK-NEXT: [[R:%.*]] = and i32 [[B]], [[TMP1]] ; CHECK-NEXT: ret i32 [[R]] ; %xor = xor i32 %b, %a @@ -4047,7 +4047,7 @@ define i32 @and_orn_xor_commute3(i32 %a, i32 %b) { ; CHECK-NEXT: [[NOTA:%.*]] = xor i32 [[A]], -1 ; CHECK-NEXT: call void @use(i32 [[NOTA]]) ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A]], -1 -; CHECK-NEXT: [[R:%.*]] = and i32 [[TMP1]], [[B]] +; CHECK-NEXT: [[R:%.*]] = and i32 [[B]], [[TMP1]] ; CHECK-NEXT: ret i32 [[R]] ; %xor = xor i32 %b, %a @@ -4207,7 +4207,7 @@ define i16 @and_zext_zext(i8 %x, i4 %y) { ; CHECK-LABEL: define {{[^@]+}}@and_zext_zext ; CHECK-SAME: (i8 [[X:%.*]], i4 [[Y:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[Y]] to i8 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X]] +; CHECK-NEXT: 
[[TMP2:%.*]] = and i8 [[X]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = zext nneg i8 [[TMP2]] to i16 ; CHECK-NEXT: ret i16 [[R]] ; @@ -4221,7 +4221,7 @@ define i16 @or_zext_zext(i8 %x, i4 %y) { ; CHECK-LABEL: define {{[^@]+}}@or_zext_zext ; CHECK-SAME: (i8 [[X:%.*]], i4 [[Y:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = zext i4 [[Y]] to i8 -; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[X]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = zext i8 [[TMP2]] to i16 ; CHECK-NEXT: ret i16 [[R]] ; @@ -4235,7 +4235,7 @@ define <2 x i16> @xor_zext_zext(<2 x i8> %x, <2 x i4> %y) { ; CHECK-LABEL: define {{[^@]+}}@xor_zext_zext ; CHECK-SAME: (<2 x i8> [[X:%.*]], <2 x i4> [[Y:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i4> [[Y]] to <2 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i8> [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i8> [[X]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i16> ; CHECK-NEXT: ret <2 x i16> [[R]] ; @@ -4249,7 +4249,7 @@ define i16 @and_sext_sext(i8 %x, i4 %y) { ; CHECK-LABEL: define {{[^@]+}}@and_sext_sext ; CHECK-SAME: (i8 [[X:%.*]], i4 [[Y:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = sext i4 [[Y]] to i8 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = sext i8 [[TMP2]] to i16 ; CHECK-NEXT: ret i16 [[R]] ; @@ -4263,7 +4263,7 @@ define i16 @or_sext_sext(i8 %x, i4 %y) { ; CHECK-LABEL: define {{[^@]+}}@or_sext_sext ; CHECK-SAME: (i8 [[X:%.*]], i4 [[Y:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = sext i4 [[Y]] to i8 -; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[TMP1]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[X]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = sext i8 [[TMP2]] to i16 ; CHECK-NEXT: ret i16 [[R]] ; @@ -4277,7 +4277,7 @@ define i16 @xor_sext_sext(i8 %x, i4 %y) { ; CHECK-LABEL: define {{[^@]+}}@xor_sext_sext ; CHECK-SAME: (i8 [[X:%.*]], i4 [[Y:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = sext i4 [[Y]] to i8 -; CHECK-NEXT: [[TMP2:%.*]] = xor i8 [[TMP1]], 
[[X]] +; CHECK-NEXT: [[TMP2:%.*]] = xor i8 [[X]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = sext i8 [[TMP2]] to i16 ; CHECK-NEXT: ret i16 [[R]] ; @@ -4801,7 +4801,7 @@ define i1 @test_and_xor_freely_invertable_multiuse(i32 %x, i32 %y, i1 %z) { ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], [[Y]] ; CHECK-NEXT: call void @use_i1(i1 [[CMP]]) ; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[CMP]], true -; CHECK-NEXT: [[AND:%.*]] = and i1 [[TMP1]], [[Z]] +; CHECK-NEXT: [[AND:%.*]] = and i1 [[Z]], [[TMP1]] ; CHECK-NEXT: ret i1 [[AND]] ; %cmp = icmp sgt i32 %x, %y diff --git a/llvm/test/Transforms/InstCombine/and.ll b/llvm/test/Transforms/InstCombine/and.ll index b5250fc1a7849d..466718c8023007 100644 --- a/llvm/test/Transforms/InstCombine/and.ll +++ b/llvm/test/Transforms/InstCombine/and.ll @@ -831,7 +831,7 @@ define i64 @test39(i32 %X) { define i32 @lowmask_add_zext(i8 %x, i32 %y) { ; CHECK-LABEL: @lowmask_add_zext( ; CHECK-NEXT: [[Y_TR:%.*]] = trunc i32 [[Y:%.*]] to i8 -; CHECK-NEXT: [[BO_NARROW:%.*]] = add i8 [[Y_TR]], [[X:%.*]] +; CHECK-NEXT: [[BO_NARROW:%.*]] = add i8 [[X:%.*]], [[Y_TR]] ; CHECK-NEXT: [[R:%.*]] = zext i8 [[BO_NARROW]] to i32 ; CHECK-NEXT: ret i32 [[R]] ; @@ -845,7 +845,7 @@ define i32 @lowmask_add_zext_commute(i16 %x, i32 %p) { ; CHECK-LABEL: @lowmask_add_zext_commute( ; CHECK-NEXT: [[Y:%.*]] = mul i32 [[P:%.*]], [[P]] ; CHECK-NEXT: [[Y_TR:%.*]] = trunc i32 [[Y]] to i16 -; CHECK-NEXT: [[BO_NARROW:%.*]] = add i16 [[Y_TR]], [[X:%.*]] +; CHECK-NEXT: [[BO_NARROW:%.*]] = add i16 [[X:%.*]], [[Y_TR]] ; CHECK-NEXT: [[R:%.*]] = zext i16 [[BO_NARROW]] to i32 ; CHECK-NEXT: ret i32 [[R]] ; @@ -861,7 +861,7 @@ define i32 @lowmask_add_zext_commute(i16 %x, i32 %p) { define i32 @lowmask_add_zext_wrong_mask(i8 %x, i32 %y) { ; CHECK-LABEL: @lowmask_add_zext_wrong_mask( ; CHECK-NEXT: [[ZX:%.*]] = zext i8 [[X:%.*]] to i32 -; CHECK-NEXT: [[BO:%.*]] = add i32 [[ZX]], [[Y:%.*]] +; CHECK-NEXT: [[BO:%.*]] = add i32 [[Y:%.*]], [[ZX]] ; CHECK-NEXT: [[R:%.*]] = and i32 [[BO]], 511 ; CHECK-NEXT: 
ret i32 [[R]] ; @@ -877,7 +877,7 @@ define i32 @lowmask_add_zext_use1(i8 %x, i32 %y) { ; CHECK-LABEL: @lowmask_add_zext_use1( ; CHECK-NEXT: [[ZX:%.*]] = zext i8 [[X:%.*]] to i32 ; CHECK-NEXT: call void @use32(i32 [[ZX]]) -; CHECK-NEXT: [[BO:%.*]] = add i32 [[ZX]], [[Y:%.*]] +; CHECK-NEXT: [[BO:%.*]] = add i32 [[Y:%.*]], [[ZX]] ; CHECK-NEXT: [[R:%.*]] = and i32 [[BO]], 255 ; CHECK-NEXT: ret i32 [[R]] ; @@ -893,7 +893,7 @@ define i32 @lowmask_add_zext_use1(i8 %x, i32 %y) { define i32 @lowmask_add_zext_use2(i8 %x, i32 %y) { ; CHECK-LABEL: @lowmask_add_zext_use2( ; CHECK-NEXT: [[ZX:%.*]] = zext i8 [[X:%.*]] to i32 -; CHECK-NEXT: [[BO:%.*]] = add i32 [[ZX]], [[Y:%.*]] +; CHECK-NEXT: [[BO:%.*]] = add i32 [[Y:%.*]], [[ZX]] ; CHECK-NEXT: call void @use32(i32 [[BO]]) ; CHECK-NEXT: [[R:%.*]] = and i32 [[BO]], 255 ; CHECK-NEXT: ret i32 [[R]] @@ -938,7 +938,7 @@ define i17 @lowmask_sub_zext_commute(i5 %x, i17 %y) { define i32 @lowmask_mul_zext(i8 %x, i32 %y) { ; CHECK-LABEL: @lowmask_mul_zext( ; CHECK-NEXT: [[Y_TR:%.*]] = trunc i32 [[Y:%.*]] to i8 -; CHECK-NEXT: [[BO_NARROW:%.*]] = mul i8 [[Y_TR]], [[X:%.*]] +; CHECK-NEXT: [[BO_NARROW:%.*]] = mul i8 [[X:%.*]], [[Y_TR]] ; CHECK-NEXT: [[R:%.*]] = zext i8 [[BO_NARROW]] to i32 ; CHECK-NEXT: ret i32 [[R]] ; @@ -952,7 +952,7 @@ define i32 @lowmask_xor_zext_commute(i8 %x, i32 %p) { ; CHECK-LABEL: @lowmask_xor_zext_commute( ; CHECK-NEXT: [[Y:%.*]] = mul i32 [[P:%.*]], [[P]] ; CHECK-NEXT: [[Y_TR:%.*]] = trunc i32 [[Y]] to i8 -; CHECK-NEXT: [[BO_NARROW:%.*]] = xor i8 [[Y_TR]], [[X:%.*]] +; CHECK-NEXT: [[BO_NARROW:%.*]] = xor i8 [[X:%.*]], [[Y_TR]] ; CHECK-NEXT: [[R:%.*]] = zext i8 [[BO_NARROW]] to i32 ; CHECK-NEXT: ret i32 [[R]] ; @@ -966,7 +966,7 @@ define i32 @lowmask_xor_zext_commute(i8 %x, i32 %p) { define i24 @lowmask_or_zext_commute(i16 %x, i24 %y) { ; CHECK-LABEL: @lowmask_or_zext_commute( ; CHECK-NEXT: [[Y_TR:%.*]] = trunc i24 [[Y:%.*]] to i16 -; CHECK-NEXT: [[BO_NARROW:%.*]] = or i16 [[Y_TR]], [[X:%.*]] +; CHECK-NEXT: 
[[BO_NARROW:%.*]] = or i16 [[X:%.*]], [[Y_TR]] ; CHECK-NEXT: [[R:%.*]] = zext i16 [[BO_NARROW]] to i24 ; CHECK-NEXT: ret i24 [[R]] ; @@ -1127,7 +1127,7 @@ define i32 @test45(i32 %x, i32 %y) nounwind { ; y & (~y | x) -> y | x define i32 @test46(i32 %x, i32 %y) nounwind { ; CHECK-LABEL: @test46( -; CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = and i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret i32 [[A]] ; %n = xor i32 %y, -1 @@ -1139,7 +1139,7 @@ define i32 @test46(i32 %x, i32 %y) nounwind { ; y & (x | ~y) -> y | x define i32 @test47(i32 %x, i32 %y) nounwind { ; CHECK-LABEL: @test47( -; CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = and i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret i32 [[A]] ; %n = xor i32 %y, -1 @@ -1814,7 +1814,7 @@ define i16 @signbit_splat_mask_use2(i8 %x, i16 %y) { ; CHECK-NEXT: [[A:%.*]] = ashr i8 [[X:%.*]], 7 ; CHECK-NEXT: [[S:%.*]] = sext i8 [[A]] to i16 ; CHECK-NEXT: call void @use16(i16 [[S]]) -; CHECK-NEXT: [[R:%.*]] = and i16 [[S]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i16 [[Y:%.*]], [[S]] ; CHECK-NEXT: ret i16 [[R]] ; %a = ashr i8 %x, 7 @@ -1830,7 +1830,7 @@ define i16 @not_signbit_splat_mask1(i8 %x, i16 %y) { ; CHECK-LABEL: @not_signbit_splat_mask1( ; CHECK-NEXT: [[A:%.*]] = ashr i8 [[X:%.*]], 7 ; CHECK-NEXT: [[Z:%.*]] = zext i8 [[A]] to i16 -; CHECK-NEXT: [[R:%.*]] = and i16 [[Z]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i16 [[Y:%.*]], [[Z]] ; CHECK-NEXT: ret i16 [[R]] ; %a = ashr i8 %x, 7 @@ -1845,7 +1845,7 @@ define i16 @not_signbit_splat_mask2(i8 %x, i16 %y) { ; CHECK-LABEL: @not_signbit_splat_mask2( ; CHECK-NEXT: [[A:%.*]] = ashr i8 [[X:%.*]], 6 ; CHECK-NEXT: [[S:%.*]] = sext i8 [[A]] to i16 -; CHECK-NEXT: [[R:%.*]] = and i16 [[S]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i16 [[Y:%.*]], [[S]] ; CHECK-NEXT: ret i16 [[R]] ; %a = ashr i8 %x, 6 @@ -1920,7 +1920,7 @@ define i8 @not_ashr_not_bitwidth_mask(i8 %x, i8 %y) { ; CHECK-LABEL: @not_ashr_not_bitwidth_mask( ; 
CHECK-NEXT: [[SIGN:%.*]] = ashr i8 [[X:%.*]], 6 ; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[SIGN]], -1 -; CHECK-NEXT: [[R:%.*]] = and i8 [[NOT]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i8 [[Y:%.*]], [[NOT]] ; CHECK-NEXT: ret i8 [[R]] ; %sign = ashr i8 %x, 6 @@ -1935,7 +1935,7 @@ define i8 @not_lshr_bitwidth_mask(i8 %x, i8 %y) { ; CHECK-LABEL: @not_lshr_bitwidth_mask( ; CHECK-NEXT: [[SIGN:%.*]] = lshr i8 [[X:%.*]], 7 ; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[SIGN]], -1 -; CHECK-NEXT: [[R:%.*]] = and i8 [[NOT]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i8 [[Y:%.*]], [[NOT]] ; CHECK-NEXT: ret i8 [[R]] ; %sign = lshr i8 %x, 7 @@ -2029,7 +2029,7 @@ define i16 @not_invert_signbit_splat_mask1(i8 %x, i16 %y) { ; CHECK-NEXT: [[ISNOTNEG:%.*]] = icmp sgt i8 [[X:%.*]], -1 ; CHECK-NEXT: [[N:%.*]] = sext i1 [[ISNOTNEG]] to i8 ; CHECK-NEXT: [[Z:%.*]] = zext i8 [[N]] to i16 -; CHECK-NEXT: [[R:%.*]] = and i16 [[Z]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i16 [[Y:%.*]], [[Z]] ; CHECK-NEXT: ret i16 [[R]] ; %a = ashr i8 %x, 7 @@ -2046,7 +2046,7 @@ define i16 @not_invert_signbit_splat_mask2(i8 %x, i16 %y) { ; CHECK-NEXT: [[A:%.*]] = ashr i8 [[X:%.*]], 6 ; CHECK-NEXT: [[N:%.*]] = xor i8 [[A]], -1 ; CHECK-NEXT: [[S:%.*]] = sext i8 [[N]] to i16 -; CHECK-NEXT: [[R:%.*]] = and i16 [[S]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i16 [[Y:%.*]], [[S]] ; CHECK-NEXT: ret i16 [[R]] ; %a = ashr i8 %x, 6 @@ -2504,7 +2504,7 @@ define i8 @negate_lowbitmask_use2(i8 %x, i8 %y) { ; CHECK-NEXT: [[A:%.*]] = and i8 [[X:%.*]], 1 ; CHECK-NEXT: [[N:%.*]] = sub nsw i8 0, [[A]] ; CHECK-NEXT: call void @use8(i8 [[N]]) -; CHECK-NEXT: [[R:%.*]] = and i8 [[N]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i8 [[Y:%.*]], [[N]] ; CHECK-NEXT: ret i8 [[R]] ; %a = and i8 %x, 1 @@ -2553,7 +2553,7 @@ define i32 @and_zext_multiuse(i32 %a, i1 %b) { ; CHECK-LABEL: @and_zext_multiuse( ; CHECK-NEXT: [[MASK:%.*]] = zext i1 [[B:%.*]] to i32 ; CHECK-NEXT: call void @use32(i32 [[MASK]]) -; CHECK-NEXT: [[R:%.*]] = and i32 [[MASK]], [[A:%.*]] +; 
CHECK-NEXT: [[R:%.*]] = and i32 [[A:%.*]], [[MASK]] ; CHECK-NEXT: ret i32 [[R]] ; %mask = zext i1 %b to i32 @@ -2636,7 +2636,7 @@ define i32 @and_zext_eq_zero(i32 %A, i32 %C) { define i32 @canonicalize_and_add_power2_or_zero(i32 %x, i32 %y) { ; CHECK-LABEL: @canonicalize_and_add_power2_or_zero( ; CHECK-NEXT: [[NY:%.*]] = sub i32 0, [[Y:%.*]] -; CHECK-NEXT: [[P2:%.*]] = and i32 [[NY]], [[Y]] +; CHECK-NEXT: [[P2:%.*]] = and i32 [[Y]], [[NY]] ; CHECK-NEXT: call void @use32(i32 [[P2]]) ; CHECK-NEXT: [[X2:%.*]] = mul i32 [[X:%.*]], [[X]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X2]], -1 @@ -2656,7 +2656,7 @@ define i32 @canonicalize_and_add_power2_or_zero(i32 %x, i32 %y) { define i32 @canonicalize_and_sub_power2_or_zero(i32 %x, i32 %y) { ; CHECK-LABEL: @canonicalize_and_sub_power2_or_zero( ; CHECK-NEXT: [[NY:%.*]] = sub i32 0, [[Y:%.*]] -; CHECK-NEXT: [[P2:%.*]] = and i32 [[NY]], [[Y]] +; CHECK-NEXT: [[P2:%.*]] = and i32 [[Y]], [[NY]] ; CHECK-NEXT: call void @use32(i32 [[P2]]) ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], -1 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[P2]], [[TMP1]] @@ -2674,7 +2674,7 @@ define i32 @canonicalize_and_sub_power2_or_zero(i32 %x, i32 %y) { define i32 @canonicalize_and_add_power2_or_zero_commuted1(i32 %x, i32 %y) { ; CHECK-LABEL: @canonicalize_and_add_power2_or_zero_commuted1( ; CHECK-NEXT: [[NY:%.*]] = sub i32 0, [[Y:%.*]] -; CHECK-NEXT: [[P2:%.*]] = and i32 [[NY]], [[Y]] +; CHECK-NEXT: [[P2:%.*]] = and i32 [[Y]], [[NY]] ; CHECK-NEXT: call void @use32(i32 [[P2]]) ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], -1 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[P2]], [[TMP1]] @@ -2692,7 +2692,7 @@ define i32 @canonicalize_and_add_power2_or_zero_commuted1(i32 %x, i32 %y) { define i32 @canonicalize_and_add_power2_or_zero_commuted2(i32 %x, i32 %y) { ; CHECK-LABEL: @canonicalize_and_add_power2_or_zero_commuted2( ; CHECK-NEXT: [[NY:%.*]] = sub i32 0, [[Y:%.*]] -; CHECK-NEXT: [[P2:%.*]] = and i32 [[NY]], [[Y]] +; CHECK-NEXT: [[P2:%.*]] = and i32 [[Y]], [[NY]] ; 
CHECK-NEXT: call void @use32(i32 [[P2]]) ; CHECK-NEXT: [[X2:%.*]] = mul i32 [[X:%.*]], [[X]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X2]], -1 @@ -2712,7 +2712,7 @@ define i32 @canonicalize_and_add_power2_or_zero_commuted2(i32 %x, i32 %y) { define i32 @canonicalize_and_add_power2_or_zero_commuted3(i32 %x, i32 %y) { ; CHECK-LABEL: @canonicalize_and_add_power2_or_zero_commuted3( ; CHECK-NEXT: [[NY:%.*]] = sub i32 0, [[Y:%.*]] -; CHECK-NEXT: [[P2:%.*]] = and i32 [[NY]], [[Y]] +; CHECK-NEXT: [[P2:%.*]] = and i32 [[Y]], [[NY]] ; CHECK-NEXT: call void @use32(i32 [[P2]]) ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], -1 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[P2]], [[TMP1]] @@ -2730,7 +2730,7 @@ define i32 @canonicalize_and_add_power2_or_zero_commuted3(i32 %x, i32 %y) { define i32 @canonicalize_and_sub_power2_or_zero_commuted_nofold(i32 %x, i32 %y) { ; CHECK-LABEL: @canonicalize_and_sub_power2_or_zero_commuted_nofold( ; CHECK-NEXT: [[NY:%.*]] = sub i32 0, [[Y:%.*]] -; CHECK-NEXT: [[P2:%.*]] = and i32 [[NY]], [[Y]] +; CHECK-NEXT: [[P2:%.*]] = and i32 [[Y]], [[NY]] ; CHECK-NEXT: call void @use32(i32 [[P2]]) ; CHECK-NEXT: [[VAL:%.*]] = sub i32 [[P2]], [[X:%.*]] ; CHECK-NEXT: [[AND:%.*]] = and i32 [[VAL]], [[P2]] @@ -2759,7 +2759,7 @@ define i32 @canonicalize_and_add_non_power2_or_zero_nofold(i32 %x, i32 %y) { define i32 @canonicalize_and_add_power2_or_zero_multiuse_nofold(i32 %x, i32 %y) { ; CHECK-LABEL: @canonicalize_and_add_power2_or_zero_multiuse_nofold( ; CHECK-NEXT: [[NY:%.*]] = sub i32 0, [[Y:%.*]] -; CHECK-NEXT: [[P2:%.*]] = and i32 [[NY]], [[Y]] +; CHECK-NEXT: [[P2:%.*]] = and i32 [[Y]], [[NY]] ; CHECK-NEXT: call void @use32(i32 [[P2]]) ; CHECK-NEXT: [[X2:%.*]] = mul i32 [[X:%.*]], [[X]] ; CHECK-NEXT: [[VAL:%.*]] = add i32 [[X2]], [[P2]] @@ -2781,7 +2781,7 @@ define i32 @canonicalize_and_add_power2_or_zero_multiuse_nofold(i32 %x, i32 %y) define i32 @canonicalize_and_sub_power2_or_zero_multiuse_nofold(i32 %x, i32 %y) { ; CHECK-LABEL: 
@canonicalize_and_sub_power2_or_zero_multiuse_nofold( ; CHECK-NEXT: [[NY:%.*]] = sub i32 0, [[Y:%.*]] -; CHECK-NEXT: [[P2:%.*]] = and i32 [[NY]], [[Y]] +; CHECK-NEXT: [[P2:%.*]] = and i32 [[Y]], [[NY]] ; CHECK-NEXT: call void @use32(i32 [[P2]]) ; CHECK-NEXT: [[VAL:%.*]] = sub i32 [[X:%.*]], [[P2]] ; CHECK-NEXT: call void @use32(i32 [[VAL]]) diff --git a/llvm/test/Transforms/InstCombine/apint-and-xor-merge.ll b/llvm/test/Transforms/InstCombine/apint-and-xor-merge.ll index 9810e5057d8a96..eca38586d01d0e 100644 --- a/llvm/test/Transforms/InstCombine/apint-and-xor-merge.ll +++ b/llvm/test/Transforms/InstCombine/apint-and-xor-merge.ll @@ -8,7 +8,7 @@ define i57 @test1(i57 %x, i57 %y, i57 %z) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: [[TMP61:%.*]] = xor i57 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = and i57 [[TMP61]], [[Z:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = and i57 [[Z:%.*]], [[TMP61]] ; CHECK-NEXT: ret i57 [[TMP7]] ; %tmp3 = and i57 %z, %x diff --git a/llvm/test/Transforms/InstCombine/apint-or.ll b/llvm/test/Transforms/InstCombine/apint-or.ll index 38bffdf35a364e..07a0e497e521e1 100644 --- a/llvm/test/Transforms/InstCombine/apint-or.ll +++ b/llvm/test/Transforms/InstCombine/apint-or.ll @@ -20,7 +20,7 @@ define i39 @test2(i39 %V, i39 %M) { ; CHECK-LABEL: define i39 @test2( ; CHECK-SAME: i39 [[V:%.*]], i39 [[M:%.*]]) { ; CHECK-NEXT: [[N:%.*]] = and i39 [[M]], -274877906944 -; CHECK-NEXT: [[A:%.*]] = add i39 [[N]], [[V]] +; CHECK-NEXT: [[A:%.*]] = add i39 [[V]], [[N]] ; CHECK-NEXT: ret i39 [[A]] ; %C1 = xor i39 274877906943, -1 ;; C2 = 274877906943 @@ -51,7 +51,7 @@ define i399 @test5(i399 %V, i399 %M) { ; CHECK-LABEL: define i399 @test5( ; CHECK-SAME: i399 [[V:%.*]], i399 [[M:%.*]]) { ; CHECK-NEXT: [[N:%.*]] = and i399 [[M]], 18446742974197923840 -; CHECK-NEXT: [[A:%.*]] = add i399 [[N]], [[V]] +; CHECK-NEXT: [[A:%.*]] = add i399 [[V]], [[N]] ; CHECK-NEXT: ret i399 [[A]] ; %C1 = xor i399 274877906943, -1 ;; C2 = 274877906943 diff --git 
a/llvm/test/Transforms/InstCombine/apint-shift.ll b/llvm/test/Transforms/InstCombine/apint-shift.ll index ecf9c4e9c4e692..21c6c18009d1df 100644 --- a/llvm/test/Transforms/InstCombine/apint-shift.ll +++ b/llvm/test/Transforms/InstCombine/apint-shift.ll @@ -538,7 +538,7 @@ define <2 x i43> @lshr_shl_eq_amt_multi_use_splat_vec(<2 x i43> %A) { define i37 @test25(i37 %AA, i37 %BB) { ; CHECK-LABEL: @test25( ; CHECK-NEXT: [[D:%.*]] = and i37 [[AA:%.*]], -131072 -; CHECK-NEXT: [[C2:%.*]] = add i37 [[D]], [[BB:%.*]] +; CHECK-NEXT: [[C2:%.*]] = add i37 [[BB:%.*]], [[D]] ; CHECK-NEXT: [[F:%.*]] = and i37 [[C2]], -131072 ; CHECK-NEXT: ret i37 [[F]] ; diff --git a/llvm/test/Transforms/InstCombine/apint-sub.ll b/llvm/test/Transforms/InstCombine/apint-sub.ll index 1c0374d4437409..e9abe1a7e627d5 100644 --- a/llvm/test/Transforms/InstCombine/apint-sub.ll +++ b/llvm/test/Transforms/InstCombine/apint-sub.ll @@ -50,7 +50,7 @@ define i19 @test5(i19 %A, i19 %Bok, i19 %Cok) { define i57 @test6(i57 %A, i57 %B) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: [[B_NOT:%.*]] = xor i57 [[B:%.*]], -1 -; CHECK-NEXT: [[D:%.*]] = and i57 [[B_NOT]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = and i57 [[A:%.*]], [[B_NOT]] ; CHECK-NEXT: ret i57 [[D]] ; %C = and i57 %A, %B diff --git a/llvm/test/Transforms/InstCombine/ashr-lshr.ll b/llvm/test/Transforms/InstCombine/ashr-lshr.ll index a81cd47b1cd4b6..9e31c9b0738c61 100644 --- a/llvm/test/Transforms/InstCombine/ashr-lshr.ll +++ b/llvm/test/Transforms/InstCombine/ashr-lshr.ll @@ -620,7 +620,7 @@ define <2 x i8> @ashr_known_pos_exact_vec(<2 x i8> %x, <2 x i8> %y) { define i32 @lshr_mul_times_3_div_2(i32 %0) { ; CHECK-LABEL: @lshr_mul_times_3_div_2( ; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0:%.*]], 1 -; CHECK-NEXT: [[LSHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[LSHR:%.*]] = add nuw nsw i32 [[TMP0]], [[TMP2]] ; CHECK-NEXT: ret i32 [[LSHR]] ; %mul = mul nsw nuw i32 %0, 3 @@ -631,7 +631,7 @@ define i32 @lshr_mul_times_3_div_2(i32 %0) { define i32 
@lshr_mul_times_3_div_2_exact(i32 %x) { ; CHECK-LABEL: @lshr_mul_times_3_div_2_exact( ; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 1 -; CHECK-NEXT: [[LSHR:%.*]] = add nsw i32 [[TMP1]], [[X]] +; CHECK-NEXT: [[LSHR:%.*]] = add nsw i32 [[X]], [[TMP1]] ; CHECK-NEXT: ret i32 [[LSHR]] ; %mul = mul nsw i32 %x, 3 @@ -670,7 +670,7 @@ define i32 @mul_times_3_div_2_multiuse_lshr(i32 %x) { define i32 @lshr_mul_times_3_div_2_exact_2(i32 %x) { ; CHECK-LABEL: @lshr_mul_times_3_div_2_exact_2( ; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 1 -; CHECK-NEXT: [[LSHR:%.*]] = add nuw i32 [[TMP1]], [[X]] +; CHECK-NEXT: [[LSHR:%.*]] = add nuw i32 [[X]], [[TMP1]] ; CHECK-NEXT: ret i32 [[LSHR]] ; %mul = mul nuw i32 %x, 3 @@ -681,7 +681,7 @@ define i32 @lshr_mul_times_3_div_2_exact_2(i32 %x) { define i32 @lshr_mul_times_5_div_4(i32 %0) { ; CHECK-LABEL: @lshr_mul_times_5_div_4( ; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0:%.*]], 2 -; CHECK-NEXT: [[LSHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[LSHR:%.*]] = add nuw nsw i32 [[TMP0]], [[TMP2]] ; CHECK-NEXT: ret i32 [[LSHR]] ; %mul = mul nsw nuw i32 %0, 5 @@ -692,7 +692,7 @@ define i32 @lshr_mul_times_5_div_4(i32 %0) { define i32 @lshr_mul_times_5_div_4_exact(i32 %x) { ; CHECK-LABEL: @lshr_mul_times_5_div_4_exact( ; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 2 -; CHECK-NEXT: [[LSHR:%.*]] = add nsw i32 [[TMP1]], [[X]] +; CHECK-NEXT: [[LSHR:%.*]] = add nsw i32 [[X]], [[TMP1]] ; CHECK-NEXT: ret i32 [[LSHR]] ; %mul = mul nsw i32 %x, 5 @@ -731,7 +731,7 @@ define i32 @mul_times_5_div_4_multiuse_lshr(i32 %x) { define i32 @lshr_mul_times_5_div_4_exact_2(i32 %x) { ; CHECK-LABEL: @lshr_mul_times_5_div_4_exact_2( ; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 2 -; CHECK-NEXT: [[LSHR:%.*]] = add nuw i32 [[TMP1]], [[X]] +; CHECK-NEXT: [[LSHR:%.*]] = add nuw i32 [[X]], [[TMP1]] ; CHECK-NEXT: ret i32 [[LSHR]] ; %mul = mul nuw i32 %x, 5 @@ -742,7 +742,7 @@ define i32 @lshr_mul_times_5_div_4_exact_2(i32 %x) { 
define i32 @ashr_mul_times_3_div_2(i32 %0) { ; CHECK-LABEL: @ashr_mul_times_3_div_2( ; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[TMP0:%.*]], 1 -; CHECK-NEXT: [[ASHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[ASHR:%.*]] = add nuw nsw i32 [[TMP0]], [[TMP2]] ; CHECK-NEXT: ret i32 [[ASHR]] ; %mul = mul nuw nsw i32 %0, 3 @@ -753,7 +753,7 @@ define i32 @ashr_mul_times_3_div_2(i32 %0) { define i32 @ashr_mul_times_3_div_2_exact(i32 %x) { ; CHECK-LABEL: @ashr_mul_times_3_div_2_exact( ; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 1 -; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]] +; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[X]], [[TMP1]] ; CHECK-NEXT: ret i32 [[ASHR]] ; %mul = mul nsw i32 %x, 3 @@ -805,7 +805,7 @@ define i32 @mul_times_3_div_2_multiuse_ashr(i32 %x) { define i32 @ashr_mul_times_3_div_2_exact_2(i32 %x) { ; CHECK-LABEL: @ashr_mul_times_3_div_2_exact_2( ; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 1 -; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]] +; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[X]], [[TMP1]] ; CHECK-NEXT: ret i32 [[ASHR]] ; %mul = mul nsw i32 %x, 3 @@ -816,7 +816,7 @@ define i32 @ashr_mul_times_3_div_2_exact_2(i32 %x) { define i32 @ashr_mul_times_5_div_4(i32 %0) { ; CHECK-LABEL: @ashr_mul_times_5_div_4( ; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[TMP0:%.*]], 2 -; CHECK-NEXT: [[ASHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[ASHR:%.*]] = add nuw nsw i32 [[TMP0]], [[TMP2]] ; CHECK-NEXT: ret i32 [[ASHR]] ; %mul = mul nuw nsw i32 %0, 5 @@ -827,7 +827,7 @@ define i32 @ashr_mul_times_5_div_4(i32 %0) { define i32 @ashr_mul_times_5_div_4_exact(i32 %x) { ; CHECK-LABEL: @ashr_mul_times_5_div_4_exact( ; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 2 -; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]] +; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[X]], [[TMP1]] ; CHECK-NEXT: ret i32 [[ASHR]] ; %mul = mul nsw i32 %x, 5 @@ -866,7 +866,7 @@ define i32 @mul_times_5_div_4_multiuse_ashr(i32 %x) 
{ define i32 @ashr_mul_times_5_div_4_exact_2(i32 %x) { ; CHECK-LABEL: @ashr_mul_times_5_div_4_exact_2( ; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 2 -; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]] +; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[X]], [[TMP1]] ; CHECK-NEXT: ret i32 [[ASHR]] ; %mul = mul nsw i32 %x, 5 diff --git a/llvm/test/Transforms/InstCombine/assume-align.ll b/llvm/test/Transforms/InstCombine/assume-align.ll index 798707f317d299..ce3195d50be7ca 100644 --- a/llvm/test/Transforms/InstCombine/assume-align.ll +++ b/llvm/test/Transforms/InstCombine/assume-align.ll @@ -88,7 +88,7 @@ define void @f3(i64 %a, ptr %b) { ; CHECK-LABEL: @f3( ; CHECK-NEXT: [[C:%.*]] = ptrtoint ptr [[B:%.*]] to i64 ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[B]], i64 4294967296) ] -; CHECK-NEXT: [[D:%.*]] = add i64 [[C]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = add i64 [[A:%.*]], [[C]] ; CHECK-NEXT: call void @g(i64 [[D]]) ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/assume-separate_storage.ll b/llvm/test/Transforms/InstCombine/assume-separate_storage.ll index 8fa8c3e80786d5..b94c303e5a70c5 100644 --- a/llvm/test/Transforms/InstCombine/assume-separate_storage.ll +++ b/llvm/test/Transforms/InstCombine/assume-separate_storage.ll @@ -24,7 +24,7 @@ define i64 @folds_removed_operands(ptr %a, ptr %b, i64 %n1, i64 %n2) { ; CHECK-LABEL: @folds_removed_operands( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[REASS_ADD:%.*]] = shl i64 [[N2:%.*]], 1 -; CHECK-NEXT: [[Y:%.*]] = add i64 [[REASS_ADD]], [[N1:%.*]] +; CHECK-NEXT: [[Y:%.*]] = add i64 [[N1:%.*]], [[REASS_ADD]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "separate_storage"(ptr [[A:%.*]], ptr [[B:%.*]]) ] ; CHECK-NEXT: ret i64 [[Y]] ; diff --git a/llvm/test/Transforms/InstCombine/avg-lsb.ll b/llvm/test/Transforms/InstCombine/avg-lsb.ll index 23a47166bf2fef..1e9e4e3bcafb27 100644 --- a/llvm/test/Transforms/InstCombine/avg-lsb.ll +++ b/llvm/test/Transforms/InstCombine/avg-lsb.ll @@ 
-5,7 +5,7 @@ define i8 @avg_lsb(i8 %a, i8 %b) { ; CHECK-LABEL: define i8 @avg_lsb( ; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) { ; CHECK-NEXT: [[REM:%.*]] = and i8 [[A]], 1 -; CHECK-NEXT: [[DIV2:%.*]] = and i8 [[REM]], [[B]] +; CHECK-NEXT: [[DIV2:%.*]] = and i8 [[B]], [[REM]] ; CHECK-NEXT: ret i8 [[DIV2]] ; %rem = and i8 %a, 1 @@ -35,7 +35,7 @@ define <2 x i8> @avg_lsb_vector(<2 x i8> %a, <2 x i8> %b) { ; CHECK-LABEL: define <2 x i8> @avg_lsb_vector( ; CHECK-SAME: <2 x i8> [[A:%.*]], <2 x i8> [[B:%.*]]) { ; CHECK-NEXT: [[REM:%.*]] = and <2 x i8> [[A]], -; CHECK-NEXT: [[DIV2:%.*]] = and <2 x i8> [[REM]], [[B]] +; CHECK-NEXT: [[DIV2:%.*]] = and <2 x i8> [[B]], [[REM]] ; CHECK-NEXT: ret <2 x i8> [[DIV2]] ; %rem = and <2 x i8> %a, diff --git a/llvm/test/Transforms/InstCombine/binop-and-shifts.ll b/llvm/test/Transforms/InstCombine/binop-and-shifts.ll index f776dc13bb4e5a..4b5de41fc7095d 100644 --- a/llvm/test/Transforms/InstCombine/binop-and-shifts.ll +++ b/llvm/test/Transforms/InstCombine/binop-and-shifts.ll @@ -77,7 +77,7 @@ define i8 @shl_and_and_fail2(i8 %x, i8 %y) { define <2 x i8> @lshr_and_or(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @lshr_and_or( ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: [[BW1:%.*]] = lshr <2 x i8> [[TMP2]], ; CHECK-NEXT: ret <2 x i8> [[BW1]] ; @@ -106,7 +106,7 @@ define <2 x i8> @lshr_and_or_fail(<2 x i8> %x, <2 x i8> %y) { define i8 @shl_and_xor(i8 %x, i8 %y) { ; CHECK-LABEL: @shl_and_xor( ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], 10 -; CHECK-NEXT: [[TMP2:%.*]] = xor i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = xor i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: [[BW1:%.*]] = shl i8 [[TMP2]], 1 ; CHECK-NEXT: ret i8 [[BW1]] ; @@ -120,7 +120,7 @@ define i8 @shl_and_xor(i8 %x, i8 %y) { define i8 @shl_and_add(i8 %x, i8 %y) { ; CHECK-LABEL: @shl_and_add( ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[Y:%.*]], 
59 -; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[BW1:%.*]] = shl i8 [[TMP2]], 1 ; CHECK-NEXT: ret i8 [[BW1]] ; @@ -149,7 +149,7 @@ define i8 @shl_xor_add_fail(i8 %x, i8 %y) { define i8 @lshr_or_and(i8 %x, i8 %y) { ; CHECK-LABEL: @lshr_or_and( ; CHECK-NEXT: [[TMP1:%.*]] = or i8 [[X:%.*]], -64 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: [[BW1:%.*]] = lshr i8 [[TMP2]], 5 ; CHECK-NEXT: ret i8 [[BW1]] ; @@ -177,7 +177,7 @@ define i8 @lshr_or_or_fail(i8 %x, i8 %y) { define <2 x i8> @shl_xor_and(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @shl_xor_and( ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> [[Y:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i8> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[BW1:%.*]] = shl <2 x i8> [[TMP2]], ; CHECK-NEXT: ret <2 x i8> [[BW1]] ; @@ -307,7 +307,7 @@ define i8 @lshr_add_add_no_const_fail(i8 %x, i8 %y, i8 %sh, i8 %mask) { define <2 x i8> @lshr_add_and(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @lshr_add_and( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i8> [[Y:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i8> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[BW1:%.*]] = lshr <2 x i8> [[TMP2]], ; CHECK-NEXT: ret <2 x i8> [[BW1]] ; @@ -393,7 +393,7 @@ define i8 @lshr_xor_or_fail_bad_mask(i8 %x, i8 %y) { define <2 x i8> @lshr_or_xor_good_mask(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @lshr_or_xor_good_mask( ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i8> [[Y:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i8> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[BW1:%.*]] = lshr <2 x i8> [[TMP2]], ; CHECK-NEXT: ret <2 x i8> [[BW1]] ; @@ -450,7 +450,7 @@ define i8 @shl_xor_xor_bad_mask_distribute(i8 %x, i8 %y) { define i8 @shl_add_and(i8 %x, i8 %y) { ; 
CHECK-LABEL: @shl_add_and( ; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y:%.*]], 61 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[BW1:%.*]] = shl i8 [[TMP2]], 1 ; CHECK-NEXT: ret i8 [[BW1]] ; @@ -509,7 +509,7 @@ define i8 @lshr_add_xor_fail(i8 %x, i8 %y) { define <2 x i8> @lshr_and_add(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @lshr_and_add( ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i8> [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i8> [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: [[BW1:%.*]] = shl <2 x i8> [[TMP2]], ; CHECK-NEXT: ret <2 x i8> [[BW1]] ; @@ -555,7 +555,7 @@ define i8 @shl_add_and_fail_mismatch_shift(i8 %x, i8 %y) { define i8 @and_ashr_not(i8 %x, i8 %y, i8 %shamt) { ; CHECK-LABEL: @and_ashr_not( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[AND:%.*]] = ashr i8 [[TMP2]], [[SHAMT:%.*]] ; CHECK-NEXT: ret i8 [[AND]] ; @@ -569,7 +569,7 @@ define i8 @and_ashr_not(i8 %x, i8 %y, i8 %shamt) { define i8 @and_ashr_not_commuted(i8 %x, i8 %y, i8 %shamt) { ; CHECK-LABEL: @and_ashr_not_commuted( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[AND:%.*]] = ashr i8 [[TMP2]], [[SHAMT:%.*]] ; CHECK-NEXT: ret i8 [[AND]] ; @@ -634,7 +634,7 @@ define i8 @and_ashr_not_fail_invalid_xor_constant(i8 %x, i8 %y, i8 %shamt) { define <4 x i8> @and_ashr_not_vec(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { ; CHECK-LABEL: @and_ashr_not_vec( ; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i8> [[Y:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i8> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[AND:%.*]] = ashr <4 x i8> [[TMP2]], [[SHAMT:%.*]] ; CHECK-NEXT: 
ret <4 x i8> [[AND]] ; @@ -648,7 +648,7 @@ define <4 x i8> @and_ashr_not_vec(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { define <4 x i8> @and_ashr_not_vec_commuted(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { ; CHECK-LABEL: @and_ashr_not_vec_commuted( ; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i8> [[Y:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i8> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[AND:%.*]] = ashr <4 x i8> [[TMP2]], [[SHAMT:%.*]] ; CHECK-NEXT: ret <4 x i8> [[AND]] ; @@ -662,7 +662,7 @@ define <4 x i8> @and_ashr_not_vec_commuted(<4 x i8> %x, <4 x i8> %y, <4 x i8> %s define <4 x i8> @and_ashr_not_vec_poison_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { ; CHECK-LABEL: @and_ashr_not_vec_poison_1( ; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i8> [[Y:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i8> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[AND:%.*]] = ashr <4 x i8> [[TMP2]], [[SHAMT:%.*]] ; CHECK-NEXT: ret <4 x i8> [[AND]] ; @@ -689,7 +689,7 @@ define <4 x i8> @and_ashr_not_vec_poison_2(<4 x i8> %x, <4 x i8> %y, <4 x i8> %s define i8 @or_ashr_not(i8 %x, i8 %y, i8 %shamt) { ; CHECK-LABEL: @or_ashr_not( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = ashr i8 [[TMP2]], [[SHAMT:%.*]] ; CHECK-NEXT: ret i8 [[OR]] ; @@ -703,7 +703,7 @@ define i8 @or_ashr_not(i8 %x, i8 %y, i8 %shamt) { define i8 @or_ashr_not_commuted(i8 %x, i8 %y, i8 %shamt) { ; CHECK-LABEL: @or_ashr_not_commuted( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = ashr i8 [[TMP2]], [[SHAMT:%.*]] ; CHECK-NEXT: ret i8 [[OR]] ; @@ -768,7 +768,7 @@ define i8 @or_ashr_not_fail_invalid_xor_constant(i8 %x, i8 %y, i8 %shamt) { 
define <4 x i8> @or_ashr_not_vec(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { ; CHECK-LABEL: @or_ashr_not_vec( ; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i8> [[Y:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i8> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = ashr <4 x i8> [[TMP2]], [[SHAMT:%.*]] ; CHECK-NEXT: ret <4 x i8> [[OR]] ; @@ -782,7 +782,7 @@ define <4 x i8> @or_ashr_not_vec(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { define <4 x i8> @or_ashr_not_vec_commuted(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { ; CHECK-LABEL: @or_ashr_not_vec_commuted( ; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i8> [[Y:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i8> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = ashr <4 x i8> [[TMP2]], [[SHAMT:%.*]] ; CHECK-NEXT: ret <4 x i8> [[OR]] ; @@ -796,7 +796,7 @@ define <4 x i8> @or_ashr_not_vec_commuted(<4 x i8> %x, <4 x i8> %y, <4 x i8> %sh define <4 x i8> @or_ashr_not_vec_poison_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { ; CHECK-LABEL: @or_ashr_not_vec_poison_1( ; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i8> [[Y:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i8> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = ashr <4 x i8> [[TMP2]], [[SHAMT:%.*]] ; CHECK-NEXT: ret <4 x i8> [[OR]] ; diff --git a/llvm/test/Transforms/InstCombine/binop-cast.ll b/llvm/test/Transforms/InstCombine/binop-cast.ll index d521a7d5a2b3a1..9d3b18c5e79ed5 100644 --- a/llvm/test/Transforms/InstCombine/binop-cast.ll +++ b/llvm/test/Transforms/InstCombine/binop-cast.ll @@ -129,7 +129,7 @@ define i32 @and_not_zext_to_sel(i32 %x, i1 %y) { ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[Y:%.*]] to i32 ; CHECK-NEXT: call void @use(i32 [[ZEXT]]) ; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[ZEXT]], -1 -; CHECK-NEXT: [[R:%.*]] = and i32 [[NOT]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i32 [[X:%.*]], [[NOT]] ; CHECK-NEXT: 
ret i32 [[R]] ; %zext = zext i1 %y to i32 @@ -175,7 +175,7 @@ define i32 @or_sext_to_sel_multi_use(i32 %x, i1 %y) { ; CHECK-LABEL: @or_sext_to_sel_multi_use( ; CHECK-NEXT: [[SEXT:%.*]] = sext i1 [[Y:%.*]] to i32 ; CHECK-NEXT: call void @use(i32 [[SEXT]]) -; CHECK-NEXT: [[R:%.*]] = or i32 [[SEXT]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = or i32 [[X:%.*]], [[SEXT]] ; CHECK-NEXT: ret i32 [[R]] ; %sext = sext i1 %y to i32 @@ -200,7 +200,7 @@ define i32 @or_sext_to_sel_multi_use_constant_mask(i1 %y) { define i32 @xor_sext_to_sel(i32 %x, i1 %y) { ; CHECK-LABEL: @xor_sext_to_sel( ; CHECK-NEXT: [[SEXT:%.*]] = sext i1 [[Y:%.*]] to i32 -; CHECK-NEXT: [[R:%.*]] = xor i32 [[SEXT]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = xor i32 [[X:%.*]], [[SEXT]] ; CHECK-NEXT: ret i32 [[R]] ; %sext = sext i1 %y to i32 @@ -236,7 +236,7 @@ define i32 @xor_sext_to_sel_multi_use(i32 %x, i1 %y) { ; CHECK-LABEL: @xor_sext_to_sel_multi_use( ; CHECK-NEXT: [[SEXT:%.*]] = sext i1 [[Y:%.*]] to i32 ; CHECK-NEXT: call void @use(i32 [[SEXT]]) -; CHECK-NEXT: [[R:%.*]] = xor i32 [[SEXT]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = xor i32 [[X:%.*]], [[SEXT]] ; CHECK-NEXT: ret i32 [[R]] ; %sext = sext i1 %y to i32 diff --git a/llvm/test/Transforms/InstCombine/bit-checks.ll b/llvm/test/Transforms/InstCombine/bit-checks.ll index c7e1fbb8945493..208b2b16e99033 100644 --- a/llvm/test/Transforms/InstCombine/bit-checks.ll +++ b/llvm/test/Transforms/InstCombine/bit-checks.ll @@ -137,7 +137,7 @@ define i32 @main3b_logical(i32 %argc) { define i32 @main3e_like(i32 %argc, i32 %argc2, i32 %argc3) { ; CHECK-LABEL: @main3e_like( ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[ARGC2:%.*]], [[ARGC3:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[ARGC:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ARGC:%.*]], [[TMP1]] ; CHECK-NEXT: [[AND_COND:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[AND_COND]] to i32 ; CHECK-NEXT: ret i32 [[STOREMERGE]] @@ -238,7 +238,7 @@ define i32 @main3d_logical(i32 %argc) { define 
i32 @main3f_like(i32 %argc, i32 %argc2, i32 %argc3) { ; CHECK-LABEL: @main3f_like( ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[ARGC2:%.*]], [[ARGC3:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[ARGC:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ARGC:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR_COND_NOT:%.*]] = icmp eq i32 [[TMP2]], 0 ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[OR_COND_NOT]] to i32 ; CHECK-NEXT: ret i32 [[STOREMERGE]] @@ -355,7 +355,7 @@ define i32 @main4b_logical(i32 %argc) { define i32 @main4e_like(i32 %argc, i32 %argc2, i32 %argc3) { ; CHECK-LABEL: @main4e_like( ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[ARGC2:%.*]], [[ARGC3:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[ARGC:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ARGC:%.*]], [[TMP1]] ; CHECK-NEXT: [[AND_COND:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[AND_COND]] to i32 ; CHECK-NEXT: ret i32 [[STOREMERGE]] @@ -456,7 +456,7 @@ define i32 @main4d_logical(i32 %argc) { define i32 @main4f_like(i32 %argc, i32 %argc2, i32 %argc3) { ; CHECK-LABEL: @main4f_like( ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[ARGC2:%.*]], [[ARGC3:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[ARGC:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ARGC:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR_COND_NOT:%.*]] = icmp eq i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[OR_COND_NOT]] to i32 ; CHECK-NEXT: ret i32 [[STOREMERGE]] @@ -529,7 +529,7 @@ define i32 @main5_like_logical(i32 %argc, i32 %argc2) { define i32 @main5e_like(i32 %argc, i32 %argc2, i32 %argc3) { ; CHECK-LABEL: @main5e_like( ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[ARGC2:%.*]], [[ARGC3:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[ARGC:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ARGC:%.*]], [[TMP1]] ; CHECK-NEXT: [[AND_COND:%.*]] = icmp ne i32 [[TMP2]], [[ARGC]] ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[AND_COND]] to i32 ; CHECK-NEXT: ret i32 [[STOREMERGE]] @@ -602,7 +602,7 @@ define i32 
@main5c_like_logical(i32 %argc, i32 %argc2) { define i32 @main5f_like(i32 %argc, i32 %argc2, i32 %argc3) { ; CHECK-LABEL: @main5f_like( ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[ARGC2:%.*]], [[ARGC3:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[ARGC:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ARGC:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR_COND_NOT:%.*]] = icmp eq i32 [[TMP2]], [[ARGC]] ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[OR_COND_NOT]] to i32 ; CHECK-NEXT: ret i32 [[STOREMERGE]] @@ -772,7 +772,7 @@ define i32 @main6d_logical(i32 %argc) { define i32 @main7a(i32 %argc, i32 %argc2, i32 %argc3) { ; CHECK-LABEL: @main7a( ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[ARGC2:%.*]], [[ARGC3:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[ARGC:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ARGC:%.*]], [[TMP1]] ; CHECK-NEXT: [[AND_COND:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[AND_COND]] to i32 ; CHECK-NEXT: ret i32 [[STOREMERGE]] @@ -810,8 +810,8 @@ define i32 @main7b(i32 %argc, i32 %argc2, i32 %argc3x) { ; CHECK-LABEL: @main7b( ; CHECK-NEXT: [[ARGC3:%.*]] = mul i32 [[ARGC3X:%.*]], 42 ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[ARGC:%.*]], [[ARGC2:%.*]] -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[AND1]], [[ARGC2]] -; CHECK-NEXT: [[AND2:%.*]] = and i32 [[ARGC3]], [[ARGC]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[ARGC2]], [[AND1]] +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[ARGC]], [[ARGC3]] ; CHECK-NEXT: [[TOBOOL3:%.*]] = icmp ne i32 [[ARGC3]], [[AND2]] ; CHECK-NEXT: [[AND_COND_NOT:%.*]] = or i1 [[TOBOOL]], [[TOBOOL3]] ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[AND_COND_NOT]] to i32 @@ -830,9 +830,9 @@ define i32 @main7b(i32 %argc, i32 %argc2, i32 %argc3x) { define i32 @main7b_logical(i32 %argc, i32 %argc2, i32 %argc3) { ; CHECK-LABEL: @main7b_logical( ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[ARGC:%.*]], [[ARGC2:%.*]] -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[AND1]], [[ARGC2]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 
[[ARGC2]], [[AND1]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[ARGC]], [[ARGC3:%.*]] -; CHECK-NEXT: [[TOBOOL3:%.*]] = icmp ne i32 [[AND2]], [[ARGC3]] +; CHECK-NEXT: [[TOBOOL3:%.*]] = icmp ne i32 [[ARGC3]], [[AND2]] ; CHECK-NEXT: [[AND_COND_NOT:%.*]] = select i1 [[TOBOOL]], i1 true, i1 [[TOBOOL3]] ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[AND_COND_NOT]] to i32 ; CHECK-NEXT: ret i32 [[STOREMERGE]] @@ -851,7 +851,7 @@ define i32 @main7c(i32 %argc, i32 %argc2, i32 %argc3x) { ; CHECK-LABEL: @main7c( ; CHECK-NEXT: [[ARGC3:%.*]] = mul i32 [[ARGC3X:%.*]], 42 ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[ARGC2:%.*]], [[ARGC:%.*]] -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[AND1]], [[ARGC2]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[ARGC2]], [[AND1]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[ARGC3]], [[ARGC]] ; CHECK-NEXT: [[TOBOOL3:%.*]] = icmp ne i32 [[ARGC3]], [[AND2]] ; CHECK-NEXT: [[AND_COND_NOT:%.*]] = or i1 [[TOBOOL]], [[TOBOOL3]] @@ -871,9 +871,9 @@ define i32 @main7c(i32 %argc, i32 %argc2, i32 %argc3x) { define i32 @main7c_logical(i32 %argc, i32 %argc2, i32 %argc3) { ; CHECK-LABEL: @main7c_logical( ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[ARGC2:%.*]], [[ARGC:%.*]] -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[AND1]], [[ARGC2]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[ARGC2]], [[AND1]] ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[ARGC3:%.*]], [[ARGC]] -; CHECK-NEXT: [[TOBOOL3:%.*]] = icmp ne i32 [[AND2]], [[ARGC3]] +; CHECK-NEXT: [[TOBOOL3:%.*]] = icmp ne i32 [[ARGC3]], [[AND2]] ; CHECK-NEXT: [[AND_COND_NOT:%.*]] = select i1 [[TOBOOL]], i1 true, i1 [[TOBOOL3]] ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[AND_COND_NOT]] to i32 ; CHECK-NEXT: ret i32 [[STOREMERGE]] @@ -893,7 +893,7 @@ define i32 @main7d(i32 %argc, i32 %argc2, i32 %argc3, i32 %argc4, i32 %argc5) { ; CHECK-NEXT: [[BC:%.*]] = and i32 [[ARGC2:%.*]], [[ARGC4:%.*]] ; CHECK-NEXT: [[DE:%.*]] = and i32 [[ARGC3:%.*]], [[ARGC5:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[BC]], [[DE]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 
[[TMP1]], [[ARGC:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ARGC:%.*]], [[TMP1]] ; CHECK-NEXT: [[AND_COND:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[AND_COND]] to i32 ; CHECK-NEXT: ret i32 [[STOREMERGE]] @@ -913,9 +913,9 @@ define i32 @main7d_logical(i32 %argc, i32 %argc2, i32 %argc3, i32 %argc4, i32 %a ; CHECK-LABEL: @main7d_logical( ; CHECK-NEXT: [[BC:%.*]] = and i32 [[ARGC2:%.*]], [[ARGC4:%.*]] ; CHECK-NEXT: [[DE:%.*]] = and i32 [[ARGC3:%.*]], [[ARGC5:%.*]] -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[BC]], [[ARGC:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[ARGC:%.*]], [[BC]] ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[AND1]], [[BC]] -; CHECK-NEXT: [[AND2:%.*]] = and i32 [[DE]], [[ARGC]] +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[ARGC]], [[DE]] ; CHECK-NEXT: [[TOBOOL3:%.*]] = icmp ne i32 [[AND2]], [[DE]] ; CHECK-NEXT: [[AND_COND_NOT:%.*]] = select i1 [[TOBOOL]], i1 true, i1 [[TOBOOL3]] ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[AND_COND_NOT]] to i32 @@ -938,7 +938,7 @@ define i32 @main7e(i32 %argc, i32 %argc2, i32 %argc3, i32 %argc4, i32 %argc5) { ; CHECK-NEXT: [[BC:%.*]] = and i32 [[ARGC2:%.*]], [[ARGC4:%.*]] ; CHECK-NEXT: [[DE:%.*]] = and i32 [[ARGC3:%.*]], [[ARGC5:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[BC]], [[DE]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[ARGC:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ARGC:%.*]], [[TMP1]] ; CHECK-NEXT: [[AND_COND:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[AND_COND]] to i32 ; CHECK-NEXT: ret i32 [[STOREMERGE]] @@ -983,7 +983,7 @@ define i32 @main7f(i32 %argc, i32 %argc2, i32 %argc3, i32 %argc4, i32 %argc5) { ; CHECK-NEXT: [[BC:%.*]] = and i32 [[ARGC2:%.*]], [[ARGC4:%.*]] ; CHECK-NEXT: [[DE:%.*]] = and i32 [[ARGC3:%.*]], [[ARGC5:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[BC]], [[DE]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[ARGC:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ARGC:%.*]], [[TMP1]] ; CHECK-NEXT: 
[[AND_COND:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[AND_COND]] to i32 ; CHECK-NEXT: ret i32 [[STOREMERGE]] @@ -1003,9 +1003,9 @@ define i32 @main7f_logical(i32 %argc, i32 %argc2, i32 %argc3, i32 %argc4, i32 %a ; CHECK-LABEL: @main7f_logical( ; CHECK-NEXT: [[BC:%.*]] = and i32 [[ARGC2:%.*]], [[ARGC4:%.*]] ; CHECK-NEXT: [[DE:%.*]] = and i32 [[ARGC3:%.*]], [[ARGC5:%.*]] -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[BC]], [[ARGC:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[ARGC:%.*]], [[BC]] ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[BC]], [[AND1]] -; CHECK-NEXT: [[AND2:%.*]] = and i32 [[DE]], [[ARGC]] +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[ARGC]], [[DE]] ; CHECK-NEXT: [[TOBOOL3:%.*]] = icmp ne i32 [[DE]], [[AND2]] ; CHECK-NEXT: [[AND_COND_NOT:%.*]] = select i1 [[TOBOOL]], i1 true, i1 [[TOBOOL3]] ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[AND_COND_NOT]] to i32 @@ -1028,7 +1028,7 @@ define i32 @main7g(i32 %argc, i32 %argc2, i32 %argc3, i32 %argc4, i32 %argc5) { ; CHECK-NEXT: [[BC:%.*]] = and i32 [[ARGC2:%.*]], [[ARGC4:%.*]] ; CHECK-NEXT: [[DE:%.*]] = and i32 [[ARGC3:%.*]], [[ARGC5:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[BC]], [[DE]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[ARGC:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ARGC:%.*]], [[TMP1]] ; CHECK-NEXT: [[AND_COND:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[STOREMERGE:%.*]] = zext i1 [[AND_COND]] to i32 ; CHECK-NEXT: ret i32 [[STOREMERGE]] diff --git a/llvm/test/Transforms/InstCombine/bitcast-inseltpoison.ll b/llvm/test/Transforms/InstCombine/bitcast-inseltpoison.ll index 061182fdaf3c80..3744d8c9171c7d 100644 --- a/llvm/test/Transforms/InstCombine/bitcast-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/bitcast-inseltpoison.ll @@ -120,7 +120,7 @@ define <2 x i8> @canonicalize_bitcast_logic_with_constant(<4 x i4> %x) { define <4 x i32> @bitcasts_and_bitcast(<4 x i32> %a, <8 x i16> %b) { ; CHECK-LABEL: @bitcasts_and_bitcast( ; CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <4 x i32> -; CHECK-NEXT: [[BC3:%.*]] = and <4 x i32> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[BC3:%.*]] = and <4 x i32> [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret <4 x i32> [[BC3]] ; %bc1 = bitcast <4 x i32> %a to <2 x i64> @@ -133,7 +133,7 @@ define <4 x i32> @bitcasts_and_bitcast(<4 x i32> %a, <8 x i16> %b) { define <4 x float> @bitcasts_and_bitcast_to_fp(<4 x float> %a, <8 x i16> %b) { ; CHECK-LABEL: @bitcasts_and_bitcast_to_fp( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <8 x i16> -; CHECK-NEXT: [[TMP2:%.*]] = and <8 x i16> [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <8 x i16> [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[BC3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x float> ; CHECK-NEXT: ret <4 x float> [[BC3]] ; @@ -149,7 +149,7 @@ define <4 x float> @bitcasts_and_bitcast_to_fp(<4 x float> %a, <8 x i16> %b) { define i128 @bitcast_or_bitcast(i128 %a, <2 x i64> %b) { ; CHECK-LABEL: @bitcast_or_bitcast( ; CHECK-NEXT: [[BC1:%.*]] = bitcast i128 [[A:%.*]] to <2 x i64> -; CHECK-NEXT: [[OR:%.*]] = or <2 x i64> [[BC1]], [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or <2 x i64> [[B:%.*]], [[BC1]] ; CHECK-NEXT: [[BC2:%.*]] = bitcast <2 x i64> [[OR]] to i128 ; CHECK-NEXT: ret i128 [[BC2]] ; @@ -164,7 +164,7 @@ define i128 @bitcast_or_bitcast(i128 %a, <2 x i64> %b) { define <4 x i32> @bitcast_xor_bitcast(<4 x i32> %a, i128 %b) { ; CHECK-LABEL: @bitcast_xor_bitcast( ; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[A:%.*]] to i128 -; CHECK-NEXT: [[XOR:%.*]] = xor i128 [[BC1]], [[B:%.*]] +; CHECK-NEXT: [[XOR:%.*]] = xor i128 [[B:%.*]], [[BC1]] ; CHECK-NEXT: [[BC2:%.*]] = bitcast i128 [[XOR]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[BC2]] ; diff --git a/llvm/test/Transforms/InstCombine/bitcast.ll b/llvm/test/Transforms/InstCombine/bitcast.ll index 26047f2c899a38..4ab24ce7b925dc 100644 --- a/llvm/test/Transforms/InstCombine/bitcast.ll +++ b/llvm/test/Transforms/InstCombine/bitcast.ll @@ -122,7 +122,7 @@ define <2 x i8> 
@canonicalize_bitcast_logic_with_constant(<4 x i4> %x) { define <4 x i32> @bitcasts_and_bitcast(<4 x i32> %a, <8 x i16> %b) { ; CHECK-LABEL: @bitcasts_and_bitcast( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <4 x i32> -; CHECK-NEXT: [[BC3:%.*]] = and <4 x i32> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[BC3:%.*]] = and <4 x i32> [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret <4 x i32> [[BC3]] ; %bc1 = bitcast <4 x i32> %a to <2 x i64> @@ -135,7 +135,7 @@ define <4 x i32> @bitcasts_and_bitcast(<4 x i32> %a, <8 x i16> %b) { define <4 x float> @bitcasts_and_bitcast_to_fp(<4 x float> %a, <8 x i16> %b) { ; CHECK-LABEL: @bitcasts_and_bitcast_to_fp( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <8 x i16> -; CHECK-NEXT: [[TMP2:%.*]] = and <8 x i16> [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <8 x i16> [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[BC3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x float> ; CHECK-NEXT: ret <4 x float> [[BC3]] ; @@ -149,7 +149,7 @@ define <4 x float> @bitcasts_and_bitcast_to_fp(<4 x float> %a, <8 x i16> %b) { define <2 x double> @bitcasts_or_bitcast_to_fp(<4 x float> %a, <8 x i16> %b) { ; CHECK-LABEL: @bitcasts_or_bitcast_to_fp( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <8 x i16> -; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i16> [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i16> [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[BC3:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x double> ; CHECK-NEXT: ret <2 x double> [[BC3]] ; @@ -163,7 +163,7 @@ define <2 x double> @bitcasts_or_bitcast_to_fp(<4 x float> %a, <8 x i16> %b) { define <4 x float> @bitcasts_xor_bitcast_to_fp(<2 x double> %a, <8 x i16> %b) { ; CHECK-LABEL: @bitcasts_xor_bitcast_to_fp( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[A:%.*]] to <8 x i16> -; CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i16> [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i16> [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[BC3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x float> ; CHECK-NEXT: 
ret <4 x float> [[BC3]] ; @@ -198,7 +198,7 @@ define <4 x float> @bitcasts_and_bitcast_to_fp_multiuse(<4 x float> %a, <8 x i16 define i128 @bitcast_or_bitcast(i128 %a, <2 x i64> %b) { ; CHECK-LABEL: @bitcast_or_bitcast( ; CHECK-NEXT: [[BC1:%.*]] = bitcast i128 [[A:%.*]] to <2 x i64> -; CHECK-NEXT: [[OR:%.*]] = or <2 x i64> [[BC1]], [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or <2 x i64> [[B:%.*]], [[BC1]] ; CHECK-NEXT: [[BC2:%.*]] = bitcast <2 x i64> [[OR]] to i128 ; CHECK-NEXT: ret i128 [[BC2]] ; @@ -213,7 +213,7 @@ define i128 @bitcast_or_bitcast(i128 %a, <2 x i64> %b) { define <4 x i32> @bitcast_xor_bitcast(<4 x i32> %a, i128 %b) { ; CHECK-LABEL: @bitcast_xor_bitcast( ; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[A:%.*]] to i128 -; CHECK-NEXT: [[XOR:%.*]] = xor i128 [[BC1]], [[B:%.*]] +; CHECK-NEXT: [[XOR:%.*]] = xor i128 [[B:%.*]], [[BC1]] ; CHECK-NEXT: [[BC2:%.*]] = bitcast i128 [[XOR]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[BC2]] ; diff --git a/llvm/test/Transforms/InstCombine/bitreverse.ll b/llvm/test/Transforms/InstCombine/bitreverse.ll index cbe9695c48690b..fe44a7a77bdff8 100644 --- a/llvm/test/Transforms/InstCombine/bitreverse.ll +++ b/llvm/test/Transforms/InstCombine/bitreverse.ll @@ -403,7 +403,7 @@ define i64 @PR59897(i1 %X1_2) { define i16 @rev_xor_lhs_rev16(i16 %a, i16 %b) #0 { ; CHECK-LABEL: @rev_xor_lhs_rev16( ; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = xor i16 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = xor i16 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i16 [[TMP2]] ; %1 = tail call i16 @llvm.bitreverse.i16(i16 %a) @@ -475,7 +475,7 @@ define <2 x i32> @rev_xor_rhs_i32vec(<2 x i32> %a, <2 x i32> %b) #0 { define i64 @rev_and_rhs_rev64_multiuse1(i64 %a, i64 %b) #0 { ; CHECK-LABEL: @rev_and_rhs_rev64_multiuse1( ; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.bitreverse.i64(i64 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i64 
[[A:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.bitreverse.i64(i64 [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret i64 [[TMP4]] @@ -490,7 +490,7 @@ define i64 @rev_and_rhs_rev64_multiuse1(i64 %a, i64 %b) #0 { define i64 @rev_and_rhs_rev64_multiuse2(i64 %a, i64 %b) #0 { ; CHECK-LABEL: @rev_and_rhs_rev64_multiuse2( ; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.bitreverse.i64(i64 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.bitreverse.i64(i64 [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP1]], [[TMP3]] ; CHECK-NEXT: ret i64 [[TMP4]] diff --git a/llvm/test/Transforms/InstCombine/bswap-fold.ll b/llvm/test/Transforms/InstCombine/bswap-fold.ll index 91674c6017a9e5..ddc0430896e7d1 100644 --- a/llvm/test/Transforms/InstCombine/bswap-fold.ll +++ b/llvm/test/Transforms/InstCombine/bswap-fold.ll @@ -544,7 +544,7 @@ define i64 @bs_and64i_multiuse(i64 %a, i64 %b) #0 { define i16 @bs_and_lhs_bs16(i16 %a, i16 %b) #0 { ; CHECK-LABEL: @bs_and_lhs_bs16( ; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i16 [[TMP2]] ; %1 = tail call i16 @llvm.bswap.i16(i16 %a) @@ -556,7 +556,7 @@ define i16 @bs_and_lhs_bs16(i16 %a, i16 %b) #0 { define i16 @bs_or_lhs_bs16(i16 %a, i16 %b) #0 { ; CHECK-LABEL: @bs_or_lhs_bs16( ; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = or i16 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = or i16 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i16 [[TMP2]] ; %1 = tail call i16 @llvm.bswap.i16(i16 %a) @@ -568,7 +568,7 @@ define i16 @bs_or_lhs_bs16(i16 %a, i16 %b) #0 { define i16 @bs_xor_lhs_bs16(i16 %a, i16 %b) #0 { ; CHECK-LABEL: @bs_xor_lhs_bs16( ; CHECK-NEXT: [[TMP1:%.*]] = call 
i16 @llvm.bswap.i16(i16 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = xor i16 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = xor i16 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i16 [[TMP2]] ; %1 = tail call i16 @llvm.bswap.i16(i16 %a) @@ -724,7 +724,7 @@ define <2 x i32> @bs_xor_rhs_i32vec(<2 x i32> %a, <2 x i32> %b) #0 { define i64 @bs_and_rhs_bs64_multiuse1(i64 %a, i64 %b) #0 { ; CHECK-LABEL: @bs_and_rhs_bs64_multiuse1( ; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret i64 [[TMP4]] @@ -739,7 +739,7 @@ define i64 @bs_and_rhs_bs64_multiuse1(i64 %a, i64 %b) #0 { define i64 @bs_and_rhs_bs64_multiuse2(i64 %a, i64 %b) #0 { ; CHECK-LABEL: @bs_and_rhs_bs64_multiuse2( ; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[B:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP1]], [[TMP3]] ; CHECK-NEXT: ret i64 [[TMP4]] diff --git a/llvm/test/Transforms/InstCombine/call-guard.ll b/llvm/test/Transforms/InstCombine/call-guard.ll index 358518b9bd1cba..6b31c78118d0b8 100644 --- a/llvm/test/Transforms/InstCombine/call-guard.ll +++ b/llvm/test/Transforms/InstCombine/call-guard.ll @@ -80,7 +80,7 @@ define void @negative_load(i32 %V1, ptr %P) { define void @deref_load(i32 %V1, ptr dereferenceable(4) align 4 %P) nofree nosync { ; CHECK-LABEL: @deref_load( ; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[V2]], [[V1:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[V1:%.*]], [[V2]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0 ; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 [[TMP2]], i32 123) [ "deopt"() ] ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-eq-to-icmp-ule.ll index 759770688cf209..3d5696a0245139 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-eq-to-icmp-ule.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-eq-to-icmp-ule.ll @@ -27,7 +27,7 @@ define i1 @p0(i8 %x) { define i1 @pv(i8 %x, i8 %y) { ; CHECK-LABEL: @pv( ; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[RET]] ; %tmp0 = lshr i8 -1, %y diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll index 9b28129dd9e172..21daeb8983a85d 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll @@ -27,7 +27,7 @@ define i1 @p0(i8 %x) { define i1 @pv(i8 %x, i8 %y) { ; CHECK-LABEL: @pv( ; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[RET]] ; %tmp0 = lshr i8 -1, %y diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-uge-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-uge-to-icmp-ule.ll index cfd48821b2c1d5..1dac73df387896 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-uge-to-icmp-ule.ll +++ 
b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-uge-to-icmp-ule.ll @@ -27,7 +27,7 @@ define i1 @p0(i8 %x) { define i1 @pv(i8 %x, i8 %y) { ; CHECK-LABEL: @pv( ; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[RET]] ; %tmp0 = lshr i8 -1, %y diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ult-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ult-to-icmp-ugt.ll index 70fb34f4992899..7eda7bb58f2700 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ult-to-icmp-ugt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ult-to-icmp-ugt.ll @@ -27,7 +27,7 @@ define i1 @p0(i8 %x) { define i1 @pv(i8 %x, i8 %y) { ; CHECK-LABEL: @pv( ; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[RET]] ; %tmp0 = lshr i8 -1, %y diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-eq-to-icmp-ule.ll index dc5658d302d991..5a58fc96c6643f 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-eq-to-icmp-ule.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-eq-to-icmp-ule.ll @@ -15,7 +15,7 @@ define i1 @p0(i8 %x, i8 %y) { ; CHECK-LABEL: @p0( ; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[RET]] ; %tmp0 = lshr i8 -1, %y @@ -31,7 +31,7 @@ define i1 @p0(i8 %x, i8 %y) { define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) { 
; CHECK-LABEL: @p1_vec( ; CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i8> , [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp uge <2 x i8> [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule <2 x i8> [[X:%.*]], [[TMP0]] ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %tmp0 = lshr <2 x i8> , %y @@ -43,7 +43,7 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) { define <3 x i1> @p2_vec_poison(<3 x i8> %x, <3 x i8> %y) { ; CHECK-LABEL: @p2_vec_poison( ; CHECK-NEXT: [[TMP0:%.*]] = lshr <3 x i8> , [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp uge <3 x i8> [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule <3 x i8> [[X:%.*]], [[TMP0]] ; CHECK-NEXT: ret <3 x i1> [[RET]] ; %tmp0 = lshr <3 x i8> , %y @@ -110,7 +110,7 @@ define i1 @oneuse0(i8 %x, i8 %y) { ; CHECK-LABEL: @oneuse0( ; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[TMP0]]) -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[RET]] ; %tmp0 = lshr i8 -1, %y @@ -125,7 +125,7 @@ define i1 @oneuse1(i8 %x, i8 %y) { ; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], [[X:%.*]] ; CHECK-NEXT: call void @use8(i8 [[TMP1]]) -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[TMP0]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X]], [[TMP0]] ; CHECK-NEXT: ret i1 [[RET]] ; %tmp0 = lshr i8 -1, %y @@ -141,7 +141,7 @@ define i1 @oneuse2(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[TMP0]]) ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], [[X:%.*]] ; CHECK-NEXT: call void @use8(i8 [[TMP1]]) -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[TMP0]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X]], [[TMP0]] ; CHECK-NEXT: ret i1 [[RET]] ; %tmp0 = lshr i8 -1, %y diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll index 
8fbbd2bb9907d9..edd528b500e557 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll @@ -15,7 +15,7 @@ define i1 @p0(i8 %x, i8 %y) { ; CHECK-LABEL: @p0( ; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[RET]] ; %tmp0 = lshr i8 -1, %y @@ -31,7 +31,7 @@ define i1 @p0(i8 %x, i8 %y) { define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @p1_vec( ; CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i8> , [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp ult <2 x i8> [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt <2 x i8> [[X:%.*]], [[TMP0]] ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %tmp0 = lshr <2 x i8> , %y @@ -43,7 +43,7 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) { define <3 x i1> @p2_vec_poison(<3 x i8> %x, <3 x i8> %y) { ; CHECK-LABEL: @p2_vec_poison( ; CHECK-NEXT: [[TMP0:%.*]] = lshr <3 x i8> , [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp ult <3 x i8> [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt <3 x i8> [[X:%.*]], [[TMP0]] ; CHECK-NEXT: ret <3 x i1> [[RET]] ; %tmp0 = lshr <3 x i8> , %y @@ -110,7 +110,7 @@ define i1 @oneuse0(i8 %x, i8 %y) { ; CHECK-LABEL: @oneuse0( ; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[TMP0]]) -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[TMP0]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[RET]] ; %tmp0 = lshr i8 -1, %y @@ -125,7 +125,7 @@ define i1 @oneuse1(i8 %x, i8 %y) { ; CHECK-NEXT: [[TMP0:%.*]] = lshr i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], [[X:%.*]] ; CHECK-NEXT: call void @use8(i8 [[TMP1]]) -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[TMP0]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X]], [[TMP0]] ; CHECK-NEXT: ret i1 
[[RET]] ; %tmp0 = lshr i8 -1, %y @@ -141,7 +141,7 @@ define i1 @oneuse2(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[TMP0]]) ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[TMP0]], [[X:%.*]] ; CHECK-NEXT: call void @use8(i8 [[TMP1]]) -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[TMP0]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X]], [[TMP0]] ; CHECK-NEXT: ret i1 [[RET]] ; %tmp0 = lshr i8 -1, %y diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll index 443efbe1ecaf6d..1adef8b0710b31 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll @@ -144,7 +144,7 @@ define i1 @oneuse0(i8 %x, i8 %y) { ; CHECK-LABEL: @oneuse0( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; @@ -161,7 +161,7 @@ define i1 @oneuse1(i8 %x, i8 %y) { ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; @@ -177,9 +177,9 @@ define i1 @oneuse2(i8 %x, i8 %y) { ; CHECK-LABEL: @oneuse2( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = and i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[T1]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 
[[X]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -196,7 +196,7 @@ define i1 @oneuse3(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; @@ -214,9 +214,9 @@ define i1 @oneuse4(i8 %x, i8 %y) { ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = and i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[T1]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -234,9 +234,9 @@ define i1 @oneuse5(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = and i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[T1]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -257,7 +257,7 @@ define i1 @n0(i8 %x, i8 %y, i8 %notx) { ; CHECK-LABEL: @n0( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = and i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[T2]], [[NOTX:%.*]] ; CHECK-NEXT: ret i1 [[RET]] ; @@ -271,7 +271,7 @@ define i1 @n0(i8 %x, i8 %y, i8 %notx) { define i1 @n1(i8 %x, i8 %y) { ; CHECK-LABEL: @n1( ; CHECK-NEXT: [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = and 
i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; @@ -286,7 +286,7 @@ define i1 @n2(i8 %x, i8 %y) { ; CHECK-LABEL: @n2( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[T0]], -2 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll index ffde4eae777cb2..36238c75370ab6 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll @@ -144,7 +144,7 @@ define i1 @oneuse0(i8 %x, i8 %y) { ; CHECK-LABEL: @oneuse0( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; @@ -161,7 +161,7 @@ define i1 @oneuse1(i8 %x, i8 %y) { ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; @@ -177,9 +177,9 @@ define i1 @oneuse2(i8 %x, i8 %y) { ; CHECK-LABEL: @oneuse2( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = and i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: 
call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[T1]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -196,7 +196,7 @@ define i1 @oneuse3(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; @@ -214,9 +214,9 @@ define i1 @oneuse4(i8 %x, i8 %y) { ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = and i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[T1]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -234,9 +234,9 @@ define i1 @oneuse5(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = and i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[T1]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -257,7 +257,7 @@ define i1 @n0(i8 %x, i8 %y, i8 %notx) { ; CHECK-LABEL: @n0( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = and i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[T2]], [[NOTX:%.*]] ; CHECK-NEXT: ret i1 [[RET]] ; @@ -271,7 +271,7 @@ define i1 @n0(i8 %x, i8 %y, i8 %notx) { define i1 
@n1(i8 %x, i8 %y) { ; CHECK-LABEL: @n1( ; CHECK-NEXT: [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; @@ -286,7 +286,7 @@ define i1 @n2(i8 %x, i8 %y) { ; CHECK-LABEL: @n2( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[T0]], -2 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll index 946bb03e04f7e4..fd56324f10dc38 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll @@ -174,7 +174,7 @@ define i1 @oneuse0(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = add i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 1, %y @@ -193,7 +193,7 @@ define i1 @oneuse1(i8 %x, i8 %y) { ; CHECK-NEXT: [[T1:%.*]] = add i8 [[T0]], -1 ; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[T1]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 1, %y @@ -213,7 +213,7 @@ define i1 @oneuse2(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[T1]]) ; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[RET:%.*]] = 
icmp uge i8 [[T1]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 1, %y @@ -252,7 +252,7 @@ define i1 @n1(i8 %x, i8 %y) { ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[T0]] -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; @@ -269,7 +269,7 @@ define i1 @n2(i8 %x, i8 %y) { ; CHECK-NEXT: [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub nuw i8 -2, [[T0]] -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[RET:%.*]] = icmp eq i8 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll index 63d406d54179fc..4d8ce5d9a6cca0 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll @@ -174,7 +174,7 @@ define i1 @oneuse0(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = add i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 1, %y @@ -193,7 +193,7 @@ define i1 @oneuse1(i8 %x, i8 %y) { ; CHECK-NEXT: [[T1:%.*]] = add i8 [[T0]], -1 ; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[T1]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X]], [[T1]] ; CHECK-NEXT: 
ret i1 [[RET]] ; %t0 = shl i8 1, %y @@ -213,7 +213,7 @@ define i1 @oneuse2(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[T1]]) ; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[T1]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 1, %y @@ -252,7 +252,7 @@ define i1 @n1(i8 %x, i8 %y) { ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[T0]] -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; @@ -269,7 +269,7 @@ define i1 @n2(i8 %x, i8 %y) { ; CHECK-NEXT: [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub nuw i8 -2, [[T0]] -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[RET:%.*]] = icmp ne i8 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll index f48d284e085bcd..5fab93092a050e 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll @@ -23,7 +23,7 @@ define i1 @p0(i8 %x, i8 %y) { ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]] -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -43,7 +43,7 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 
x i8> %y) { ; CHECK-NEXT: [[T0:%.*]] = shl nsw <2 x i8> , [[Y:%.*]] ; CHECK-NEXT: call void @use2i8(<2 x i8> [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = lshr <2 x i8> , [[Y]] -; CHECK-NEXT: [[RET:%.*]] = icmp uge <2 x i8> [[T1]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule <2 x i8> [[X:%.*]], [[T1]] ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %t0 = shl <2 x i8> , %y @@ -59,7 +59,7 @@ define <3 x i1> @p2_vec_poison0(<3 x i8> %x, <3 x i8> %y) { ; CHECK-NEXT: [[T0:%.*]] = shl nsw <3 x i8> , [[Y:%.*]] ; CHECK-NEXT: call void @use3i8(<3 x i8> [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = lshr <3 x i8> , [[Y]] -; CHECK-NEXT: [[RET:%.*]] = icmp uge <3 x i8> [[T1]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule <3 x i8> [[X:%.*]], [[T1]] ; CHECK-NEXT: ret <3 x i1> [[RET]] ; %t0 = shl <3 x i8> , %y @@ -140,7 +140,7 @@ define i1 @oneuse0(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]] ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -159,7 +159,7 @@ define i1 @oneuse1(i8 %x, i8 %y) { ; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]] ; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[T1]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -179,7 +179,7 @@ define i1 @oneuse2(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[T1]]) ; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[RET:%.*]] = icmp uge i8 [[T1]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ule i8 [[X]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll 
b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll index f4b3c67164e492..40a67ce1d60cb4 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll @@ -23,7 +23,7 @@ define i1 @p0(i8 %x, i8 %y) { ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]] -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -43,7 +43,7 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-NEXT: [[T0:%.*]] = shl nsw <2 x i8> , [[Y:%.*]] ; CHECK-NEXT: call void @use2i8(<2 x i8> [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = lshr <2 x i8> , [[Y]] -; CHECK-NEXT: [[RET:%.*]] = icmp ult <2 x i8> [[T1]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt <2 x i8> [[X:%.*]], [[T1]] ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %t0 = shl <2 x i8> , %y @@ -59,7 +59,7 @@ define <3 x i1> @p2_vec_poison0(<3 x i8> %x, <3 x i8> %y) { ; CHECK-NEXT: [[T0:%.*]] = shl nsw <3 x i8> , [[Y:%.*]] ; CHECK-NEXT: call void @use3i8(<3 x i8> [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = lshr <3 x i8> , [[Y]] -; CHECK-NEXT: [[RET:%.*]] = icmp ult <3 x i8> [[T1]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt <3 x i8> [[X:%.*]], [[T1]] ; CHECK-NEXT: ret <3 x i1> [[RET]] ; %t0 = shl <3 x i8> , %y @@ -140,7 +140,7 @@ define i1 @oneuse0(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]] ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -159,7 +159,7 @@ define i1 @oneuse1(i8 %x, i8 %y) { ; CHECK-NEXT: [[T1:%.*]] = lshr i8 -1, [[Y]] ; CHECK-NEXT: [[T2:%.*]] = and 
i8 [[T1]], [[X:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[T1]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y @@ -179,7 +179,7 @@ define i1 @oneuse2(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use8(i8 [[T1]]) ; CHECK-NEXT: [[T2:%.*]] = and i8 [[T1]], [[X:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[RET:%.*]] = icmp ult i8 [[T1]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp ugt i8 [[X]], [[T1]] ; CHECK-NEXT: ret i1 [[RET]] ; %t0 = shl i8 -1, %y diff --git a/llvm/test/Transforms/InstCombine/cast-mul-select.ll b/llvm/test/Transforms/InstCombine/cast-mul-select.ll index d185e226805231..6eb3a8c0a2049b 100644 --- a/llvm/test/Transforms/InstCombine/cast-mul-select.ll +++ b/llvm/test/Transforms/InstCombine/cast-mul-select.ll @@ -196,7 +196,7 @@ define void @PR36225(i32 %a, i32 %b, i1 %c1, i3 %v1, i3 %v2) { ; CHECK: for.end: ; CHECK-NEXT: [[H:%.*]] = phi i8 [ [[SPEC_SELECT]], [[FOR_BODY3_US]] ], [ [[SPEC_SELECT]], [[FOR_BODY3_US]] ], [ 0, [[FOR_BODY3]] ], [ 0, [[FOR_BODY3]] ] ; CHECK-NEXT: [[CONV:%.*]] = zext nneg i8 [[H]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[CONV]], [[A:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], [[CONV]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT]], label [[EXIT2:%.*]] ; CHECK: exit2: ; CHECK-NEXT: unreachable @@ -228,7 +228,7 @@ define void @PR36225(i32 %a, i32 %b, i1 %c1, i3 %v1, i3 %v2) { ; DBGINFO-NEXT: #dbg_value(i8 [[H]], [[META91:![0-9]+]], !DIExpression(), [[DBG100]]) ; DBGINFO-NEXT: [[CONV:%.*]] = zext nneg i8 [[H]] to i32, !dbg [[DBG101:![0-9]+]] ; DBGINFO-NEXT: #dbg_value(i32 [[CONV]], [[META92:![0-9]+]], !DIExpression(), [[DBG101]]) -; DBGINFO-NEXT: [[CMP:%.*]] = icmp slt i32 [[CONV]], [[A:%.*]], !dbg [[DBG102:![0-9]+]] +; DBGINFO-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], [[CONV]], !dbg [[DBG102:![0-9]+]] ; DBGINFO-NEXT: #dbg_value(i1 [[CMP]], [[META93:![0-9]+]], !DIExpression(), [[DBG102]]) 
; DBGINFO-NEXT: br i1 [[CMP]], label [[EXIT]], label [[EXIT2:%.*]], !dbg [[DBG103:![0-9]+]] ; DBGINFO: exit2: diff --git a/llvm/test/Transforms/InstCombine/cast.ll b/llvm/test/Transforms/InstCombine/cast.ll index 43c198b9e19d8e..0e44dc1b8ca9c0 100644 --- a/llvm/test/Transforms/InstCombine/cast.ll +++ b/llvm/test/Transforms/InstCombine/cast.ll @@ -587,7 +587,7 @@ define i64 @test44(i8 %T) { define i64 @test45(i8 %A, i64 %Q) { ; ALL-LABEL: @test45( ; ALL-NEXT: [[B:%.*]] = sext i8 [[A:%.*]] to i64 -; ALL-NEXT: [[C:%.*]] = or i64 [[B]], [[Q:%.*]] +; ALL-NEXT: [[C:%.*]] = or i64 [[Q:%.*]], [[B]] ; ALL-NEXT: [[E:%.*]] = and i64 [[C]], 4294967295 ; ALL-NEXT: ret i64 [[E]] ; @@ -1144,10 +1144,10 @@ define %s @test78(ptr %p, i64 %i, i64 %j, i32 %k, i32 %l, i128 %m, i128 %n) { ; ALL-NEXT: [[A:%.*]] = mul nsw i32 [[K:%.*]], 36 ; ALL-NEXT: [[B:%.*]] = mul nsw i32 [[A]], [[L:%.*]] ; ALL-NEXT: [[C:%.*]] = sext i32 [[B]] to i128 -; ALL-NEXT: [[D:%.*]] = mul nsw i128 [[C]], [[M:%.*]] +; ALL-NEXT: [[D:%.*]] = mul nsw i128 [[M:%.*]], [[C]] ; ALL-NEXT: [[E:%.*]] = mul i128 [[D]], [[N:%.*]] ; ALL-NEXT: [[F:%.*]] = trunc i128 [[E]] to i64 -; ALL-NEXT: [[G:%.*]] = mul nsw i64 [[F]], [[I:%.*]] +; ALL-NEXT: [[G:%.*]] = mul nsw i64 [[I:%.*]], [[F]] ; ALL-NEXT: [[H:%.*]] = mul nsw i64 [[G]], [[J:%.*]] ; ALL-NEXT: [[PP:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 [[H]] ; ALL-NEXT: [[LOAD:%.*]] = load [[S:%.*]], ptr [[PP]], align 4 diff --git a/llvm/test/Transforms/InstCombine/cast_phi.ll b/llvm/test/Transforms/InstCombine/cast_phi.ll index 68847e73ac5d29..7dfe60539138d6 100644 --- a/llvm/test/Transforms/InstCombine/cast_phi.ll +++ b/llvm/test/Transforms/InstCombine/cast_phi.ll @@ -350,7 +350,7 @@ define i32 @zext_in_loop_and_exit_block(i8 %step, i32 %end) { ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[IV_EXT:%.*]] = zext i8 [[IV]] to i32 -; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[IV_EXT]], 
[[END:%.*]] +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[END:%.*]], [[IV_EXT]] ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], [[STEP:%.*]] diff --git a/llvm/test/Transforms/InstCombine/cast_ptr.ll b/llvm/test/Transforms/InstCombine/cast_ptr.ll index 9f2d128ecc3771..db576b9679b14f 100644 --- a/llvm/test/Transforms/InstCombine/cast_ptr.ll +++ b/llvm/test/Transforms/InstCombine/cast_ptr.ll @@ -259,7 +259,7 @@ define i32 @ptr_add_in_int(i32 %x, i32 %y) { define i32 @ptr_add_in_int_2(i32 %x, i32 %y) { ; CHECK-LABEL: @ptr_add_in_int_2( ; CHECK-NEXT: [[P2_IDX:%.*]] = shl nsw i32 [[Y:%.*]], 2 -; CHECK-NEXT: [[R:%.*]] = add i32 [[P2_IDX]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = add i32 [[X:%.*]], [[P2_IDX]] ; CHECK-NEXT: ret i32 [[R]] ; %ptr = inttoptr i32 %x to ptr @@ -271,7 +271,7 @@ define i32 @ptr_add_in_int_2(i32 %x, i32 %y) { define i32 @ptr_add_in_int_nneg(i32 %x, i32 %y) { ; CHECK-LABEL: @ptr_add_in_int_nneg( ; CHECK-NEXT: [[Z:%.*]] = call i32 @llvm.abs.i32(i32 [[Y:%.*]], i1 true) -; CHECK-NEXT: [[R:%.*]] = add nuw i32 [[Z]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = add nuw i32 [[X:%.*]], [[Z]] ; CHECK-NEXT: ret i32 [[R]] ; %z = call i32 @llvm.abs.i32(i32 %y, i1 true) @@ -308,7 +308,7 @@ define i16 @ptr_add_in_int_different_type_2(i32 %x, i32 %y) { define i32 @ptr_add_in_int_different_type_3(i16 %x, i32 %y) { ; CHECK-LABEL: @ptr_add_in_int_different_type_3( ; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32 -; CHECK-NEXT: [[R:%.*]] = add i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = add i32 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[R]] ; %ptr = inttoptr i16 %x to ptr @@ -320,7 +320,7 @@ define i32 @ptr_add_in_int_different_type_3(i16 %x, i32 %y) { define i32 @ptr_add_in_int_different_type_4(i64 %x, i32 %y) { ; CHECK-LABEL: @ptr_add_in_int_different_type_4( ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[X:%.*]] to i32 -; CHECK-NEXT: [[R:%.*]] = add i32 [[TMP1]], [[Y:%.*]] +; 
CHECK-NEXT: [[R:%.*]] = add i32 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[R]] ; %ptr = inttoptr i64 %x to ptr @@ -332,7 +332,7 @@ define i32 @ptr_add_in_int_different_type_4(i64 %x, i32 %y) { define i32 @ptr_add_in_int_not_inbounds(i32 %x, i32 %y) { ; CHECK-LABEL: @ptr_add_in_int_not_inbounds( ; CHECK-NEXT: [[Z:%.*]] = call i32 @llvm.abs.i32(i32 [[Y:%.*]], i1 true) -; CHECK-NEXT: [[R:%.*]] = add i32 [[Z]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = add i32 [[X:%.*]], [[Z]] ; CHECK-NEXT: ret i32 [[R]] ; %z = call i32 @llvm.abs.i32(i32 %y, i1 true) diff --git a/llvm/test/Transforms/InstCombine/cmp-x-vs-neg-x.ll b/llvm/test/Transforms/InstCombine/cmp-x-vs-neg-x.ll index c5ff0f90fdaeec..96b03e6cd054c9 100644 --- a/llvm/test/Transforms/InstCombine/cmp-x-vs-neg-x.ll +++ b/llvm/test/Transforms/InstCombine/cmp-x-vs-neg-x.ll @@ -132,7 +132,7 @@ define i1 @t9(i8 %x) { define i1 @n10(i8 %x) { ; CHECK-LABEL: @n10( ; CHECK-NEXT: [[NEG_X:%.*]] = sub i8 0, [[X:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[NEG_X]], [[X]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X]], [[NEG_X]] ; CHECK-NEXT: ret i1 [[CMP]] ; %neg_x = sub i8 0, %x ; not nsw @@ -154,7 +154,7 @@ define i1 @n11(i8 %x) { define i1 @n12(i8 %x1, i8 %x2) { ; CHECK-LABEL: @n12( ; CHECK-NEXT: [[NEG_X:%.*]] = sub nsw i8 0, [[X1:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[NEG_X]], [[X2:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X2:%.*]], [[NEG_X]] ; CHECK-NEXT: ret i1 [[CMP]] ; %neg_x = sub nsw i8 0, %x1 ; not %x2 diff --git a/llvm/test/Transforms/InstCombine/conditional-negation.ll b/llvm/test/Transforms/InstCombine/conditional-negation.ll index 1bdfd76edb341c..0ae1af8f8e67f7 100644 --- a/llvm/test/Transforms/InstCombine/conditional-negation.ll +++ b/llvm/test/Transforms/InstCombine/conditional-negation.ll @@ -44,7 +44,7 @@ define i8 @t2(i8 %x, i1 %cond0, i1 %cond1) { ; CHECK-LABEL: @t2( ; CHECK-NEXT: [[COND_SPLAT0:%.*]] = sext i1 [[COND0:%.*]] to i8 ; CHECK-NEXT: [[COND_SPLAT1:%.*]] = sext i1 [[COND1:%.*]] to i8 
-; CHECK-NEXT: [[SUB:%.*]] = add i8 [[COND_SPLAT0]], [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i8 [[X:%.*]], [[COND_SPLAT0]] ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[SUB]], [[COND_SPLAT1]] ; CHECK-NEXT: ret i8 [[XOR]] ; @@ -59,7 +59,7 @@ define i8 @t2(i8 %x, i1 %cond0, i1 %cond1) { define i8 @t3(i8 %x, i2 %cond) { ; CHECK-LABEL: @t3( ; CHECK-NEXT: [[COND_SPLAT:%.*]] = sext i2 [[COND:%.*]] to i8 -; CHECK-NEXT: [[SUB:%.*]] = add i8 [[COND_SPLAT]], [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i8 [[X:%.*]], [[COND_SPLAT]] ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[SUB]], [[COND_SPLAT]] ; CHECK-NEXT: ret i8 [[XOR]] ; @@ -71,7 +71,7 @@ define i8 @t3(i8 %x, i2 %cond) { define <2 x i8> @t3_vec(<2 x i8> %x, <2 x i2> %cond) { ; CHECK-LABEL: @t3_vec( ; CHECK-NEXT: [[COND_SPLAT:%.*]] = sext <2 x i2> [[COND:%.*]] to <2 x i8> -; CHECK-NEXT: [[SUB:%.*]] = add <2 x i8> [[COND_SPLAT]], [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add <2 x i8> [[X:%.*]], [[COND_SPLAT]] ; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i8> [[SUB]], [[COND_SPLAT]] ; CHECK-NEXT: ret <2 x i8> [[XOR]] ; @@ -115,7 +115,7 @@ define i8 @extrause01_v1(i8 %x, i1 %cond) { define i8 @extrause10_v1(i8 %x, i1 %cond) { ; CHECK-LABEL: @extrause10_v1( ; CHECK-NEXT: [[COND_SPLAT:%.*]] = sext i1 [[COND:%.*]] to i8 -; CHECK-NEXT: [[SUB:%.*]] = add i8 [[COND_SPLAT]], [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i8 [[X:%.*]], [[COND_SPLAT]] ; CHECK-NEXT: call void @use.i8(i8 [[SUB]]) ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[SUB]], [[COND_SPLAT]] ; CHECK-NEXT: ret i8 [[XOR]] @@ -130,7 +130,7 @@ define i8 @extrause11_v1(i8 %x, i1 %cond) { ; CHECK-LABEL: @extrause11_v1( ; CHECK-NEXT: [[COND_SPLAT:%.*]] = sext i1 [[COND:%.*]] to i8 ; CHECK-NEXT: call void @use.i8(i8 [[COND_SPLAT]]) -; CHECK-NEXT: [[SUB:%.*]] = add i8 [[COND_SPLAT]], [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i8 [[X:%.*]], [[COND_SPLAT]] ; CHECK-NEXT: call void @use.i8(i8 [[SUB]]) ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[SUB]], [[COND_SPLAT]] ; CHECK-NEXT: ret i8 [[XOR]] @@ -195,7 +195,7 @@ define 
i8 @extrause011_v2(i8 %x, i1 %cond) { define i8 @extrause100_v2(i8 %x, i1 %cond) { ; CHECK-LABEL: @extrause100_v2( ; CHECK-NEXT: [[COND_SPLAT0:%.*]] = sext i1 [[COND:%.*]] to i8 -; CHECK-NEXT: [[SUB:%.*]] = add i8 [[COND_SPLAT0]], [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i8 [[X:%.*]], [[COND_SPLAT0]] ; CHECK-NEXT: call void @use.i8(i8 [[SUB]]) ; CHECK-NEXT: [[X_NEG:%.*]] = sub i8 0, [[X]] ; CHECK-NEXT: [[XOR:%.*]] = select i1 [[COND]], i8 [[X_NEG]], i8 [[X]] @@ -212,7 +212,7 @@ define i8 @extrause101_v2(i8 %x, i1 %cond) { ; CHECK-LABEL: @extrause101_v2( ; CHECK-NEXT: [[COND_SPLAT0:%.*]] = sext i1 [[COND:%.*]] to i8 ; CHECK-NEXT: call void @use.i8(i8 [[COND_SPLAT0]]) -; CHECK-NEXT: [[SUB:%.*]] = add i8 [[COND_SPLAT0]], [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i8 [[X:%.*]], [[COND_SPLAT0]] ; CHECK-NEXT: call void @use.i8(i8 [[SUB]]) ; CHECK-NEXT: [[X_NEG:%.*]] = sub i8 0, [[X]] ; CHECK-NEXT: [[XOR:%.*]] = select i1 [[COND]], i8 [[X_NEG]], i8 [[X]] @@ -231,7 +231,7 @@ define i8 @extrause110_v2(i8 %x, i1 %cond) { ; CHECK-NEXT: [[COND_SPLAT0:%.*]] = sext i1 [[COND:%.*]] to i8 ; CHECK-NEXT: [[COND_SPLAT1:%.*]] = sext i1 [[COND]] to i8 ; CHECK-NEXT: call void @use.i8(i8 [[COND_SPLAT1]]) -; CHECK-NEXT: [[SUB:%.*]] = add i8 [[COND_SPLAT0]], [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i8 [[X:%.*]], [[COND_SPLAT0]] ; CHECK-NEXT: call void @use.i8(i8 [[SUB]]) ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[SUB]], [[COND_SPLAT1]] ; CHECK-NEXT: ret i8 [[XOR]] @@ -250,7 +250,7 @@ define i8 @extrause111_v2(i8 %x, i1 %cond) { ; CHECK-NEXT: call void @use.i8(i8 [[COND_SPLAT0]]) ; CHECK-NEXT: [[COND_SPLAT1:%.*]] = sext i1 [[COND]] to i8 ; CHECK-NEXT: call void @use.i8(i8 [[COND_SPLAT1]]) -; CHECK-NEXT: [[SUB:%.*]] = add i8 [[COND_SPLAT0]], [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i8 [[X:%.*]], [[COND_SPLAT0]] ; CHECK-NEXT: call void @use.i8(i8 [[SUB]]) ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[SUB]], [[COND_SPLAT1]] ; CHECK-NEXT: ret i8 [[XOR]] diff --git 
a/llvm/test/Transforms/InstCombine/ctpop-cttz.ll b/llvm/test/Transforms/InstCombine/ctpop-cttz.ll index a505654fa96e7f..bcfbce8dfd3d22 100644 --- a/llvm/test/Transforms/InstCombine/ctpop-cttz.ll +++ b/llvm/test/Transforms/InstCombine/ctpop-cttz.ll @@ -33,7 +33,7 @@ define <2 x i32> @ctpop1v(<2 x i32> %0) { define i32 @ctpop1_multiuse(i32 %0) { ; CHECK-LABEL: @ctpop1_multiuse( ; CHECK-NEXT: [[TMP2:%.*]] = sub i32 0, [[TMP0:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP3]], -1 ; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP3]] diff --git a/llvm/test/Transforms/InstCombine/ctpop-pow2.ll b/llvm/test/Transforms/InstCombine/ctpop-pow2.ll index 4ef1ed0ec4976b..17997b25d096c1 100644 --- a/llvm/test/Transforms/InstCombine/ctpop-pow2.ll +++ b/llvm/test/Transforms/InstCombine/ctpop-pow2.ll @@ -12,7 +12,7 @@ declare void @llvm.assume(i1) define i16 @ctpop_x_and_negx(i16 %x) { ; CHECK-LABEL: @ctpop_x_and_negx( ; CHECK-NEXT: [[V0:%.*]] = sub i16 0, [[X:%.*]] -; CHECK-NEXT: [[V1:%.*]] = and i16 [[V0]], [[X]] +; CHECK-NEXT: [[V1:%.*]] = and i16 [[X]], [[V0]] ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i16 [[V1]], 0 ; CHECK-NEXT: [[CNT:%.*]] = zext i1 [[TMP1]] to i16 ; CHECK-NEXT: ret i16 [[CNT]] @@ -74,7 +74,7 @@ define i8 @ctpop_imin_plus1_lshr_nz(i8 %x) { define i64 @ctpop_x_and_negx_nz(i64 %x) { ; CHECK-LABEL: @ctpop_x_and_negx_nz( ; CHECK-NEXT: [[V0:%.*]] = sub i64 0, [[X:%.*]] -; CHECK-NEXT: [[V1:%.*]] = and i64 [[V0]], [[X]] +; CHECK-NEXT: [[V1:%.*]] = and i64 [[X]], [[V0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[V1]], 0 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) ; CHECK-NEXT: ret i64 1 @@ -127,7 +127,7 @@ define <2 x i32> @ctpop_shl2_1_vec_nz(<2 x i32> %x) { define <2 x i64> @ctpop_x_and_negx_vec(<2 x i64> %x) { ; CHECK-LABEL: @ctpop_x_and_negx_vec( ; CHECK-NEXT: 
[[SUB:%.*]] = sub <2 x i64> zeroinitializer, [[X:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and <2 x i64> [[SUB]], [[X]] +; CHECK-NEXT: [[AND:%.*]] = and <2 x i64> [[X]], [[SUB]] ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i64> [[AND]], zeroinitializer ; CHECK-NEXT: [[CNT:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[CNT]] diff --git a/llvm/test/Transforms/InstCombine/cttz.ll b/llvm/test/Transforms/InstCombine/cttz.ll index 66b7a03fe5d7be..e106faf9cb38f0 100644 --- a/llvm/test/Transforms/InstCombine/cttz.ll +++ b/llvm/test/Transforms/InstCombine/cttz.ll @@ -193,7 +193,7 @@ define i32 @cttz_of_lowest_set_bit_wrong_const(i32 %x) { define i32 @cttz_of_lowest_set_bit_wrong_operand(i32 %x, i32 %y) { ; CHECK-LABEL: @cttz_of_lowest_set_bit_wrong_operand( ; CHECK-NEXT: [[SUB:%.*]] = sub i32 0, [[Y:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and i32 [[SUB]], [[X:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[SUB]] ; CHECK-NEXT: [[TZ:%.*]] = call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[AND]], i1 false) ; CHECK-NEXT: ret i32 [[TZ]] ; @@ -206,7 +206,7 @@ define i32 @cttz_of_lowest_set_bit_wrong_operand(i32 %x, i32 %y) { define i32 @cttz_of_lowest_set_bit_wrong_intrinsic(i32 %x) { ; CHECK-LABEL: @cttz_of_lowest_set_bit_wrong_intrinsic( ; CHECK-NEXT: [[SUB:%.*]] = sub i32 0, [[X:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and i32 [[SUB]], [[X]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[SUB]] ; CHECK-NEXT: [[TZ:%.*]] = call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[AND]], i1 false) ; CHECK-NEXT: ret i32 [[TZ]] ; diff --git a/llvm/test/Transforms/InstCombine/demorgan.ll b/llvm/test/Transforms/InstCombine/demorgan.ll index 11052d38f9bc7e..460758d512bb38 100644 --- a/llvm/test/Transforms/InstCombine/demorgan.ll +++ b/llvm/test/Transforms/InstCombine/demorgan.ll @@ -191,7 +191,7 @@ define i71 @test5_apint(i71 %A, i71 %B) { define i8 @demorgan_nand(i8 %A, i8 %B) { ; CHECK-LABEL: @demorgan_nand( ; CHECK-NEXT: [[B_NOT:%.*]] = xor i8 [[B:%.*]], -1 -; CHECK-NEXT: 
[[NOTC:%.*]] = or i8 [[B_NOT]], [[A:%.*]] +; CHECK-NEXT: [[NOTC:%.*]] = or i8 [[A:%.*]], [[B_NOT]] ; CHECK-NEXT: ret i8 [[NOTC]] ; %notx = xor i8 %A, -1 @@ -205,7 +205,7 @@ define i8 @demorgan_nand(i8 %A, i8 %B) { define i7 @demorgan_nand_apint1(i7 %A, i7 %B) { ; CHECK-LABEL: @demorgan_nand_apint1( ; CHECK-NEXT: [[B_NOT:%.*]] = xor i7 [[B:%.*]], -1 -; CHECK-NEXT: [[NOTC:%.*]] = or i7 [[B_NOT]], [[A:%.*]] +; CHECK-NEXT: [[NOTC:%.*]] = or i7 [[A:%.*]], [[B_NOT]] ; CHECK-NEXT: ret i7 [[NOTC]] ; %nota = xor i7 %A, -1 @@ -219,7 +219,7 @@ define i7 @demorgan_nand_apint1(i7 %A, i7 %B) { define i117 @demorgan_nand_apint2(i117 %A, i117 %B) { ; CHECK-LABEL: @demorgan_nand_apint2( ; CHECK-NEXT: [[B_NOT:%.*]] = xor i117 [[B:%.*]], -1 -; CHECK-NEXT: [[NOTC:%.*]] = or i117 [[B_NOT]], [[A:%.*]] +; CHECK-NEXT: [[NOTC:%.*]] = or i117 [[A:%.*]], [[B_NOT]] ; CHECK-NEXT: ret i117 [[NOTC]] ; %nota = xor i117 %A, -1 @@ -233,7 +233,7 @@ define i117 @demorgan_nand_apint2(i117 %A, i117 %B) { define i8 @demorgan_nor(i8 %A, i8 %B) { ; CHECK-LABEL: @demorgan_nor( ; CHECK-NEXT: [[B_NOT:%.*]] = xor i8 [[B:%.*]], -1 -; CHECK-NEXT: [[NOTC:%.*]] = and i8 [[B_NOT]], [[A:%.*]] +; CHECK-NEXT: [[NOTC:%.*]] = and i8 [[A:%.*]], [[B_NOT]] ; CHECK-NEXT: ret i8 [[NOTC]] ; %notx = xor i8 %A, -1 @@ -249,7 +249,7 @@ define i8 @demorgan_nor_use2a(i8 %A, i8 %B) { ; CHECK-NEXT: [[NOTA:%.*]] = xor i8 [[A:%.*]], -1 ; CHECK-NEXT: [[USE2A:%.*]] = mul i8 [[NOTA]], 23 ; CHECK-NEXT: [[B_NOT:%.*]] = xor i8 [[B:%.*]], -1 -; CHECK-NEXT: [[NOTC:%.*]] = and i8 [[B_NOT]], [[A]] +; CHECK-NEXT: [[NOTC:%.*]] = and i8 [[A]], [[B_NOT]] ; CHECK-NEXT: [[R:%.*]] = sdiv i8 [[NOTC]], [[USE2A]] ; CHECK-NEXT: ret i8 [[R]] ; @@ -267,7 +267,7 @@ define i8 @demorgan_nor_use2b(i8 %A, i8 %B) { ; CHECK-LABEL: @demorgan_nor_use2b( ; CHECK-NEXT: [[USE2B:%.*]] = mul i8 [[B:%.*]], 23 ; CHECK-NEXT: [[B_NOT:%.*]] = xor i8 [[B]], -1 -; CHECK-NEXT: [[NOTC:%.*]] = and i8 [[B_NOT]], [[A:%.*]] +; CHECK-NEXT: [[NOTC:%.*]] = and i8 [[A:%.*]], [[B_NOT]] ; 
CHECK-NEXT: [[R:%.*]] = sdiv i8 [[NOTC]], [[USE2B]] ; CHECK-NEXT: ret i8 [[R]] ; @@ -284,7 +284,7 @@ define i8 @demorgan_nor_use2b(i8 %A, i8 %B) { define i8 @demorgan_nor_use2c(i8 %A, i8 %B) { ; CHECK-LABEL: @demorgan_nor_use2c( ; CHECK-NEXT: [[NOTA:%.*]] = xor i8 [[A:%.*]], -1 -; CHECK-NEXT: [[C:%.*]] = or i8 [[NOTA]], [[B:%.*]] +; CHECK-NEXT: [[C:%.*]] = or i8 [[B:%.*]], [[NOTA]] ; CHECK-NEXT: [[USE2C:%.*]] = mul i8 [[C]], 23 ; CHECK-NEXT: [[NOTC:%.*]] = xor i8 [[C]], -1 ; CHECK-NEXT: [[R:%.*]] = sdiv i8 [[NOTC]], [[USE2C]] @@ -306,7 +306,7 @@ define i8 @demorgan_nor_use2ab(i8 %A, i8 %B) { ; CHECK-NEXT: [[NOTA:%.*]] = xor i8 [[A:%.*]], -1 ; CHECK-NEXT: [[USE2A:%.*]] = mul i8 [[NOTA]], 17 ; CHECK-NEXT: [[B_NOT:%.*]] = xor i8 [[B]], -1 -; CHECK-NEXT: [[NOTC:%.*]] = and i8 [[B_NOT]], [[A]] +; CHECK-NEXT: [[NOTC:%.*]] = and i8 [[A]], [[B_NOT]] ; CHECK-NEXT: [[R1:%.*]] = sdiv i8 [[NOTC]], [[USE2B]] ; CHECK-NEXT: [[R2:%.*]] = sdiv i8 [[R1]], [[USE2A]] ; CHECK-NEXT: ret i8 [[R2]] @@ -327,7 +327,7 @@ define i8 @demorgan_nor_use2ac(i8 %A, i8 %B) { ; CHECK-LABEL: @demorgan_nor_use2ac( ; CHECK-NEXT: [[NOTA:%.*]] = xor i8 [[A:%.*]], -1 ; CHECK-NEXT: [[USE2A:%.*]] = mul i8 [[NOTA]], 17 -; CHECK-NEXT: [[C:%.*]] = or i8 [[NOTA]], [[B:%.*]] +; CHECK-NEXT: [[C:%.*]] = or i8 [[B:%.*]], [[NOTA]] ; CHECK-NEXT: [[USE2C:%.*]] = mul i8 [[C]], 23 ; CHECK-NEXT: [[NOTC:%.*]] = xor i8 [[C]], -1 ; CHECK-NEXT: [[R1:%.*]] = sdiv i8 [[NOTC]], [[USE2C]] @@ -350,7 +350,7 @@ define i8 @demorgan_nor_use2bc(i8 %A, i8 %B) { ; CHECK-LABEL: @demorgan_nor_use2bc( ; CHECK-NEXT: [[USE2B:%.*]] = mul i8 [[B:%.*]], 23 ; CHECK-NEXT: [[NOTA:%.*]] = xor i8 [[A:%.*]], -1 -; CHECK-NEXT: [[C:%.*]] = or i8 [[NOTA]], [[B]] +; CHECK-NEXT: [[C:%.*]] = or i8 [[B]], [[NOTA]] ; CHECK-NEXT: [[USE2C:%.*]] = mul i8 [[C]], 23 ; CHECK-NEXT: [[NOTC:%.*]] = xor i8 [[C]], -1 ; CHECK-NEXT: [[R1:%.*]] = sdiv i8 [[NOTC]], [[USE2C]] diff --git a/llvm/test/Transforms/InstCombine/dependent-ivs.ll 
b/llvm/test/Transforms/InstCombine/dependent-ivs.ll index e3207daefee09a..e4a042ff5fe515 100644 --- a/llvm/test/Transforms/InstCombine/dependent-ivs.ll +++ b/llvm/test/Transforms/InstCombine/dependent-ivs.ll @@ -452,7 +452,7 @@ define void @int_iv_add_wrong_start(i64 %base, i64 %end) { ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 1, [[ENTRY]] ] ; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 -; CHECK-NEXT: [[IV2_NEXT]] = add i64 [[IV_NEXT]], [[BASE]] +; CHECK-NEXT: [[IV2_NEXT]] = add i64 [[BASE]], [[IV_NEXT]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -706,7 +706,7 @@ define void @different_loops(i64 %base) { ; CHECK: loop2: ; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP2]] ], [ [[BASE]], [[LOOP1]] ] ; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) -; CHECK-NEXT: [[IV2_NEXT]] = add nuw i64 [[IV_NEXT]], [[BASE]] +; CHECK-NEXT: [[IV2_NEXT]] = add nuw i64 [[BASE]], [[IV_NEXT]] ; CHECK-NEXT: [[CMP2:%.*]] = call i1 @get.i1() ; CHECK-NEXT: br i1 [[CMP2]], label [[EXIT:%.*]], label [[LOOP2]] ; CHECK: exit: diff --git a/llvm/test/Transforms/InstCombine/fadd-fsub-factor.ll b/llvm/test/Transforms/InstCombine/fadd-fsub-factor.ll index 4b9c4fd9f95446..0be7f50cfddaee 100644 --- a/llvm/test/Transforms/InstCombine/fadd-fsub-factor.ll +++ b/llvm/test/Transforms/InstCombine/fadd-fsub-factor.ll @@ -474,8 +474,8 @@ define float @fdiv_fsub_denorm(float %x) { define float @lerp_commute0(float %a, float %b, float %c) { ; CHECK-LABEL: @lerp_commute0( ; CHECK-NEXT: [[TMP1:%.*]] = fsub fast float [[B:%.*]], [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast float [[TMP1]], [[C:%.*]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP2]], [[A]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast float [[C:%.*]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[A]], [[TMP2]] ; CHECK-NEXT: ret float [[ADD]] ; %sub 
= fsub fast float 1.0, %c @@ -488,8 +488,8 @@ define float @lerp_commute0(float %a, float %b, float %c) { define <2 x float> @lerp_commute1(<2 x float> %a, <2 x float> %b, <2 x float> %c) { ; CHECK-LABEL: @lerp_commute1( ; CHECK-NEXT: [[TMP1:%.*]] = fsub fast <2 x float> [[B:%.*]], [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[C:%.*]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast <2 x float> [[TMP2]], [[A]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[C:%.*]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast <2 x float> [[A]], [[TMP2]] ; CHECK-NEXT: ret <2 x float> [[ADD]] ; %sub = fsub <2 x float> , %c @@ -502,8 +502,8 @@ define <2 x float> @lerp_commute1(<2 x float> %a, <2 x float> %b, <2 x float> %c define float @lerp_commute2(float %a, float %b, float %c) { ; CHECK-LABEL: @lerp_commute2( ; CHECK-NEXT: [[TMP1:%.*]] = fsub reassoc nsz float [[B:%.*]], [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc nsz float [[TMP1]], [[C:%.*]] -; CHECK-NEXT: [[ADD:%.*]] = fadd reassoc nsz float [[TMP2]], [[A]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc nsz float [[C:%.*]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = fadd reassoc nsz float [[A]], [[TMP2]] ; CHECK-NEXT: ret float [[ADD]] ; %sub = fsub float 1.0, %c @@ -516,8 +516,8 @@ define float @lerp_commute2(float %a, float %b, float %c) { define float @lerp_commute3(float %a, float %b, float %c) { ; CHECK-LABEL: @lerp_commute3( ; CHECK-NEXT: [[TMP1:%.*]] = fsub reassoc ninf nsz float [[B:%.*]], [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc ninf nsz float [[TMP1]], [[C:%.*]] -; CHECK-NEXT: [[ADD:%.*]] = fadd reassoc ninf nsz float [[TMP2]], [[A]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc ninf nsz float [[C:%.*]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = fadd reassoc ninf nsz float [[A]], [[TMP2]] ; CHECK-NEXT: ret float [[ADD]] ; %sub = fsub fast float 1.0, %c @@ -530,8 +530,8 @@ define float @lerp_commute3(float %a, float %b, float %c) { define double @lerp_commute4(double %a, double %b, 
double %c) { ; CHECK-LABEL: @lerp_commute4( ; CHECK-NEXT: [[TMP1:%.*]] = fsub fast double [[B:%.*]], [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast double [[TMP1]], [[C:%.*]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[TMP2]], [[A]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast double [[C:%.*]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[A]], [[TMP2]] ; CHECK-NEXT: ret double [[ADD]] ; %sub = fsub fast double 1.0, %c @@ -544,8 +544,8 @@ define double @lerp_commute4(double %a, double %b, double %c) { define double @lerp_commute5(double %a, double %b, double %c) { ; CHECK-LABEL: @lerp_commute5( ; CHECK-NEXT: [[TMP1:%.*]] = fsub fast double [[B:%.*]], [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast double [[TMP1]], [[C:%.*]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[TMP2]], [[A]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast double [[C:%.*]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[A]], [[TMP2]] ; CHECK-NEXT: ret double [[ADD]] ; %sub = fsub fast double 1.0, %c @@ -558,8 +558,8 @@ define double @lerp_commute5(double %a, double %b, double %c) { define half @lerp_commute6(half %a, half %b, half %c) { ; CHECK-LABEL: @lerp_commute6( ; CHECK-NEXT: [[TMP1:%.*]] = fsub fast half [[B:%.*]], [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast half [[TMP1]], [[C:%.*]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP2]], [[A]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast half [[C:%.*]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[A]], [[TMP2]] ; CHECK-NEXT: ret half [[ADD]] ; %sub = fsub fast half 1.0, %c @@ -572,8 +572,8 @@ define half @lerp_commute6(half %a, half %b, half %c) { define half @lerp_commute7(half %a, half %b, half %c) { ; CHECK-LABEL: @lerp_commute7( ; CHECK-NEXT: [[TMP1:%.*]] = fsub fast half [[B:%.*]], [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast half [[TMP1]], [[C:%.*]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP2]], [[A]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast half [[C:%.*]], [[TMP1]] +; CHECK-NEXT: 
[[ADD:%.*]] = fadd fast half [[A]], [[TMP2]] ; CHECK-NEXT: ret half [[ADD]] ; %sub = fsub fast half 1.0, %c @@ -586,7 +586,7 @@ define half @lerp_commute7(half %a, half %b, half %c) { define float @lerp_extra_use1(float %a, float %b, float %c) { ; CHECK-LABEL: @lerp_extra_use1( ; CHECK-NEXT: [[SUB:%.*]] = fsub fast float 1.000000e+00, [[C:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[SUB]], [[A:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[A:%.*]], [[SUB]] ; CHECK-NEXT: [[BC:%.*]] = fmul fast float [[B:%.*]], [[C]] ; CHECK-NEXT: call void @use(float [[BC]]) ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[BC]], [[MUL]] @@ -603,7 +603,7 @@ define float @lerp_extra_use1(float %a, float %b, float %c) { define float @lerp_extra_use2(float %a, float %b, float %c) { ; CHECK-LABEL: @lerp_extra_use2( ; CHECK-NEXT: [[SUB:%.*]] = fsub fast float 1.000000e+00, [[C:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[SUB]], [[A:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[A:%.*]], [[SUB]] ; CHECK-NEXT: call void @use(float [[MUL]]) ; CHECK-NEXT: [[BC:%.*]] = fmul fast float [[B:%.*]], [[C]] ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[BC]], [[MUL]] @@ -621,7 +621,7 @@ define float @lerp_extra_use3(float %a, float %b, float %c) { ; CHECK-LABEL: @lerp_extra_use3( ; CHECK-NEXT: [[SUB:%.*]] = fsub fast float 1.000000e+00, [[C:%.*]] ; CHECK-NEXT: call void @use(float [[SUB]]) -; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[SUB]], [[A:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[A:%.*]], [[SUB]] ; CHECK-NEXT: [[BC:%.*]] = fmul fast float [[B:%.*]], [[C]] ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[BC]], [[MUL]] ; CHECK-NEXT: ret float [[ADD]] diff --git a/llvm/test/Transforms/InstCombine/fadd.ll b/llvm/test/Transforms/InstCombine/fadd.ll index 38508cdb09e1f0..840ccaef1086ab 100644 --- a/llvm/test/Transforms/InstCombine/fadd.ll +++ b/llvm/test/Transforms/InstCombine/fadd.ll @@ -83,7 +83,7 @@ define double @fmul_fneg2(double %x, double %py, double %pz) { ; 
CHECK-LABEL: @fmul_fneg2( ; CHECK-NEXT: [[Y:%.*]] = frem double -4.200000e+01, [[PY:%.*]] ; CHECK-NEXT: [[Z:%.*]] = frem double 4.200000e+01, [[PZ:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = fmul double [[Y]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = fmul double [[X:%.*]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = fsub double [[Z]], [[TMP1]] ; CHECK-NEXT: ret double [[R]] ; @@ -149,7 +149,7 @@ define double @fmul_fneg2_commute(double %x, double %py, double %pz) { ; CHECK-LABEL: @fmul_fneg2_commute( ; CHECK-NEXT: [[Y:%.*]] = frem double 4.100000e+01, [[PY:%.*]] ; CHECK-NEXT: [[Z:%.*]] = frem double 4.200000e+01, [[PZ:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = fmul double [[Y]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = fmul double [[X:%.*]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = fsub double [[Z]], [[TMP1]] ; CHECK-NEXT: ret double [[R]] ; @@ -207,7 +207,7 @@ define <2 x float> @fmul_fneg1_extra_use(<2 x float> %x, <2 x float> %y, <2 x fl ; CHECK-LABEL: @fmul_fneg1_extra_use( ; CHECK-NEXT: [[Z:%.*]] = frem <2 x float> , [[PZ:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = fneg <2 x float> [[X:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[Y:%.*]], [[NEG]] ; CHECK-NEXT: call void @use_vec(<2 x float> [[MUL]]) ; CHECK-NEXT: [[R:%.*]] = fadd <2 x float> [[Z]], [[MUL]] ; CHECK-NEXT: ret <2 x float> [[R]] @@ -299,7 +299,7 @@ define float @fmul_fneg2_extra_use2(float %x, float %py, float %z) { ; CHECK-NEXT: [[Y:%.*]] = frem float -4.200000e+01, [[PY:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = fneg float [[X:%.*]] ; CHECK-NEXT: call void @use(float [[NEG]]) -; CHECK-NEXT: [[TMP1:%.*]] = fmul float [[Y]], [[X]] +; CHECK-NEXT: [[TMP1:%.*]] = fmul float [[X]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = fsub float [[Z:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[R]] ; @@ -355,7 +355,7 @@ define <2 x float> @fmul_fneg1_extra_use3(<2 x float> %x, <2 x float> %y, <2 x f ; CHECK-LABEL: @fmul_fneg1_extra_use3( ; CHECK-NEXT: [[NEG:%.*]] = fneg <2 x float> [[X:%.*]] ; CHECK-NEXT: call 
void @use_vec(<2 x float> [[NEG]]) -; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[Y:%.*]], [[NEG]] ; CHECK-NEXT: call void @use_vec(<2 x float> [[MUL]]) ; CHECK-NEXT: [[R:%.*]] = fadd <2 x float> [[MUL]], [[Z:%.*]] ; CHECK-NEXT: ret <2 x float> [[R]] diff --git a/llvm/test/Transforms/InstCombine/fast-basictest.ll b/llvm/test/Transforms/InstCombine/fast-basictest.ll index 3c7776a43e55e7..b0e43ba4321621 100644 --- a/llvm/test/Transforms/InstCombine/fast-basictest.ll +++ b/llvm/test/Transforms/InstCombine/fast-basictest.ll @@ -424,7 +424,7 @@ define float @test14_reassoc(float %arg) { define float @test15(float %b, float %a) { ; CHECK-LABEL: @test15( ; CHECK-NEXT: [[TMP1:%.*]] = fadd fast float [[A:%.*]], 1.234000e+03 -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast float [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast float [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast float [[TMP2]], [[A]] ; CHECK-NEXT: ret float [[TMP3]] ; @@ -438,7 +438,7 @@ define float @test15(float %b, float %a) { define float @test15_unary_fneg(float %b, float %a) { ; CHECK-LABEL: @test15_unary_fneg( ; CHECK-NEXT: [[TMP1:%.*]] = fadd fast float [[A:%.*]], 1.234000e+03 -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast float [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast float [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast float [[TMP2]], [[A]] ; CHECK-NEXT: ret float [[TMP3]] ; @@ -452,7 +452,7 @@ define float @test15_unary_fneg(float %b, float %a) { define float @test15_reassoc_nsz(float %b, float %a) { ; CHECK-LABEL: @test15_reassoc_nsz( ; CHECK-NEXT: [[TMP1:%.*]] = fadd reassoc nsz float [[A:%.*]], 1.234000e+03 -; CHECK-NEXT: [[TMP2:%.*]] = fadd reassoc nsz float [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd reassoc nsz float [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = fsub reassoc nsz float [[TMP2]], [[A]] ; CHECK-NEXT: ret float [[TMP3]] ; @@ -466,7 +466,7 @@ define float 
@test15_reassoc_nsz(float %b, float %a) { define float @test15_reassoc(float %b, float %a) { ; CHECK-LABEL: @test15_reassoc( ; CHECK-NEXT: [[TMP1:%.*]] = fadd reassoc float [[A:%.*]], 1.234000e+03 -; CHECK-NEXT: [[TMP2:%.*]] = fadd reassoc float [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd reassoc float [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = fsub reassoc float 0.000000e+00, [[A]] ; CHECK-NEXT: [[TMP4:%.*]] = fadd reassoc float [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret float [[TMP4]] @@ -550,7 +550,7 @@ define float @test16_reassoc(float %a, float %b, float %z) { define float @test17(float %a, float %b, float %z) { ; CHECK-LABEL: @test17( ; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float [[Z:%.*]], 4.000000e+01 -; CHECK-NEXT: [[F:%.*]] = fmul fast float [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[F:%.*]] = fmul fast float [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[F]] ; %d = fmul fast float %z, 4.000000e+01 @@ -563,7 +563,7 @@ define float @test17(float %a, float %b, float %z) { define float @test17_unary_fneg(float %a, float %b, float %z) { ; CHECK-LABEL: @test17_unary_fneg( ; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float [[Z:%.*]], 4.000000e+01 -; CHECK-NEXT: [[F:%.*]] = fmul fast float [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[F:%.*]] = fmul fast float [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[F]] ; %d = fmul fast float %z, 4.000000e+01 @@ -576,7 +576,7 @@ define float @test17_unary_fneg(float %a, float %b, float %z) { define float @test17_reassoc_nsz(float %a, float %b, float %z) { ; CHECK-LABEL: @test17_reassoc_nsz( ; CHECK-NEXT: [[TMP1:%.*]] = fmul reassoc nsz float [[Z:%.*]], 4.000000e+01 -; CHECK-NEXT: [[F:%.*]] = fmul reassoc nsz float [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[F:%.*]] = fmul reassoc nsz float [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[F]] ; %d = fmul reassoc nsz float %z, 4.000000e+01 @@ -591,7 +591,7 @@ define float @test17_reassoc(float %a, float %b, float %z) { ; CHECK-LABEL: @test17_reassoc( ; CHECK-NEXT: [[D:%.*]] = fmul reassoc float 
[[Z:%.*]], 4.000000e+01 ; CHECK-NEXT: [[C:%.*]] = fsub reassoc float 0.000000e+00, [[D]] -; CHECK-NEXT: [[E:%.*]] = fmul reassoc float [[C]], [[A:%.*]] +; CHECK-NEXT: [[E:%.*]] = fmul reassoc float [[A:%.*]], [[C]] ; CHECK-NEXT: [[F:%.*]] = fsub reassoc float 0.000000e+00, [[E]] ; CHECK-NEXT: ret float [[F]] ; @@ -607,7 +607,7 @@ define float @test17_reassoc(float %a, float %b, float %z) { define float @test17_unary_fneg_no_FMF(float %a, float %b, float %z) { ; CHECK-LABEL: @test17_unary_fneg_no_FMF( ; CHECK-NEXT: [[TMP1:%.*]] = fmul float [[Z:%.*]], 4.000000e+01 -; CHECK-NEXT: [[F:%.*]] = fmul float [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[F:%.*]] = fmul float [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[F]] ; %d = fmul float %z, 4.000000e+01 @@ -620,7 +620,7 @@ define float @test17_unary_fneg_no_FMF(float %a, float %b, float %z) { define float @test17_reassoc_unary_fneg(float %a, float %b, float %z) { ; CHECK-LABEL: @test17_reassoc_unary_fneg( ; CHECK-NEXT: [[TMP1:%.*]] = fmul reassoc float [[Z:%.*]], 4.000000e+01 -; CHECK-NEXT: [[F:%.*]] = fmul reassoc float [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[F:%.*]] = fmul reassoc float [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[F]] ; %d = fmul reassoc float %z, 4.000000e+01 diff --git a/llvm/test/Transforms/InstCombine/fast-math.ll b/llvm/test/Transforms/InstCombine/fast-math.ll index d7c90e82ab520a..32f136d53fab4b 100644 --- a/llvm/test/Transforms/InstCombine/fast-math.ll +++ b/llvm/test/Transforms/InstCombine/fast-math.ll @@ -65,7 +65,7 @@ define double @fold3_reassoc_nsz(double %f1) { define double @fold3_reassoc(double %f1) { ; CHECK-LABEL: @fold3_reassoc( ; CHECK-NEXT: [[T1:%.*]] = fmul reassoc double [[F1:%.*]], 5.000000e+00 -; CHECK-NEXT: [[T2:%.*]] = fadd reassoc double [[T1]], [[F1]] +; CHECK-NEXT: [[T2:%.*]] = fadd reassoc double [[F1]], [[T1]] ; CHECK-NEXT: ret double [[T2]] ; %t1 = fmul reassoc double 5.000000e+00, %f1 @@ -175,7 +175,7 @@ define float @fold6_reassoc_nsz(float %f1) { define float 
@fold6_reassoc(float %f1) { ; CHECK-LABEL: @fold6_reassoc( ; CHECK-NEXT: [[T1:%.*]] = fadd reassoc float [[F1:%.*]], [[F1]] -; CHECK-NEXT: [[T2:%.*]] = fadd reassoc float [[T1]], [[F1]] +; CHECK-NEXT: [[T2:%.*]] = fadd reassoc float [[F1]], [[T1]] ; CHECK-NEXT: [[T3:%.*]] = fadd reassoc float [[T2]], [[F1]] ; CHECK-NEXT: ret float [[T3]] ; @@ -506,7 +506,7 @@ define float @fold16(float %x, float %y) { ; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = fneg float [[Y]] ; CHECK-NEXT: [[R_P:%.*]] = select i1 [[CMP]], float [[Y]], float [[TMP1]] -; CHECK-NEXT: [[R:%.*]] = fadd float [[R_P]], [[X]] +; CHECK-NEXT: [[R:%.*]] = fadd float [[X]], [[R_P]] ; CHECK-NEXT: ret float [[R]] ; %cmp = fcmp ogt float %x, %y diff --git a/llvm/test/Transforms/InstCombine/fcmp.ll b/llvm/test/Transforms/InstCombine/fcmp.ll index 8afb6463b669db..0d45baddcb2fc7 100644 --- a/llvm/test/Transforms/InstCombine/fcmp.ll +++ b/llvm/test/Transforms/InstCombine/fcmp.ll @@ -1439,7 +1439,7 @@ define i1 @fcmp_fadd_neg_zero(float %x, float %y) { define i1 @fcmp_fadd_zero_switched(float %x, float %y) { ; CHECK-LABEL: @fcmp_fadd_zero_switched( -; CHECK-NEXT: [[CMP:%.*]] = fcmp ult float [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret i1 [[CMP]] ; %add = fadd float %y, 0.000000e+00 diff --git a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll index 9f030c5ebf7bb2..c5078ff1efc5ab 100644 --- a/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll +++ b/llvm/test/Transforms/InstCombine/fdiv-sqrt.ll @@ -8,7 +8,7 @@ define double @sqrt_div_fast(double %x, double %y, double %z) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = fdiv fast double [[Z:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = call fast double @llvm.sqrt.f64(double [[TMP0]]) -; CHECK-NEXT: [[DIV1:%.*]] = fmul fast double [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[DIV1:%.*]] = fmul fast double [[X:%.*]], [[TMP1]] ; 
CHECK-NEXT: ret double [[DIV1]] ; entry: @@ -38,7 +38,7 @@ define double @sqrt_div_reassoc_arcp(double %x, double %y, double %z) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc arcp double [[Z:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[TMP0]]) -; CHECK-NEXT: [[DIV1:%.*]] = fmul reassoc arcp double [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[DIV1:%.*]] = fmul reassoc arcp double [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret double [[DIV1]] ; entry: @@ -98,7 +98,7 @@ define double @sqrt_div_arcp_missing(double %x, double %y, double %z) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double [[Z:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = call reassoc arcp double @llvm.sqrt.f64(double [[TMP0]]) -; CHECK-NEXT: [[DIV1:%.*]] = fmul reassoc arcp double [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[DIV1:%.*]] = fmul reassoc arcp double [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret double [[DIV1]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/fdiv.ll b/llvm/test/Transforms/InstCombine/fdiv.ll index ca11685c98417a..12d6e6463de657 100644 --- a/llvm/test/Transforms/InstCombine/fdiv.ll +++ b/llvm/test/Transforms/InstCombine/fdiv.ll @@ -678,7 +678,7 @@ define float @pow_divisor(float %x, float %y, float %z) { ; CHECK-LABEL: @pow_divisor( ; CHECK-NEXT: [[TMP1:%.*]] = fneg reassoc arcp float [[Y:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = call reassoc arcp float @llvm.pow.f32(float [[X:%.*]], float [[TMP1]]) -; CHECK-NEXT: [[R:%.*]] = fmul reassoc arcp float [[TMP2]], [[Z:%.*]] +; CHECK-NEXT: [[R:%.*]] = fmul reassoc arcp float [[Z:%.*]], [[TMP2]] ; CHECK-NEXT: ret float [[R]] ; %p = call float @llvm.pow.f32(float %x, float %y) @@ -744,7 +744,7 @@ define float @exp_divisor(float %y, float %z) { ; CHECK-LABEL: @exp_divisor( ; CHECK-NEXT: [[TMP1:%.*]] = fneg reassoc arcp float [[Y:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = call reassoc arcp float @llvm.exp.f32(float [[TMP1]]) -; CHECK-NEXT: [[R:%.*]] = fmul reassoc arcp float [[TMP2]], 
[[Z:%.*]] +; CHECK-NEXT: [[R:%.*]] = fmul reassoc arcp float [[Z:%.*]], [[TMP2]] ; CHECK-NEXT: ret float [[R]] ; %p = call float @llvm.exp.f32(float %y) @@ -810,7 +810,7 @@ define float @exp2_divisor(float %y, float %z) { ; CHECK-LABEL: @exp2_divisor( ; CHECK-NEXT: [[TMP1:%.*]] = fneg reassoc arcp float [[Y:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = call reassoc arcp float @llvm.exp2.f32(float [[TMP1]]) -; CHECK-NEXT: [[R:%.*]] = fmul reassoc arcp float [[TMP2]], [[Z:%.*]] +; CHECK-NEXT: [[R:%.*]] = fmul reassoc arcp float [[Z:%.*]], [[TMP2]] ; CHECK-NEXT: ret float [[R]] ; %p = call float @llvm.exp2.f32(float %y) @@ -876,7 +876,7 @@ define float @powi_divisor(float %x, i32 %y, float %z) { ; CHECK-LABEL: @powi_divisor( ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[Y:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = call reassoc ninf arcp float @llvm.powi.f32.i32(float [[X:%.*]], i32 [[TMP1]]) -; CHECK-NEXT: [[R:%.*]] = fmul reassoc ninf arcp float [[TMP2]], [[Z:%.*]] +; CHECK-NEXT: [[R:%.*]] = fmul reassoc ninf arcp float [[Z:%.*]], [[TMP2]] ; CHECK-NEXT: ret float [[R]] ; %p = call float @llvm.powi.f32.i32(float %x, i32 %y) diff --git a/llvm/test/Transforms/InstCombine/float-shrink-compare.ll b/llvm/test/Transforms/InstCombine/float-shrink-compare.ll index e6e41ad03ce596..77b6ed7c5abe84 100644 --- a/llvm/test/Transforms/InstCombine/float-shrink-compare.ll +++ b/llvm/test/Transforms/InstCombine/float-shrink-compare.ll @@ -215,7 +215,7 @@ define i1 @test7_intrin(float %x, float %y) { define i1 @test8(float %x, float %y) { ; CHECK-LABEL: @test8( ; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.ceil.f32(float [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %x.ext = fpext float %x to double @@ -228,7 +228,7 @@ define i1 @test8(float %x, float %y) { define i1 @test8_intrin(float %x, float %y) { ; CHECK-LABEL: @test8_intrin( ; CHECK-NEXT: [[TMP1:%.*]] = call float 
@llvm.ceil.f32(float [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %x.ext = fpext float %x to double @@ -241,7 +241,7 @@ define i1 @test8_intrin(float %x, float %y) { define i1 @test9(float %x, float %y) { ; CHECK-LABEL: @test9( ; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %x.ext = fpext float %x to double @@ -254,7 +254,7 @@ define i1 @test9(float %x, float %y) { define i1 @test9_intrin(float %x, float %y) { ; CHECK-LABEL: @test9_intrin( ; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %x.ext = fpext float %x to double @@ -319,7 +319,7 @@ define i1 @test11_intrin(float %x, float %y) { define i1 @test12(float %x, float %y) { ; CHECK-LABEL: @test12( ; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.rint.f32(float [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %x.ext = fpext float %x to double @@ -332,7 +332,7 @@ define i1 @test12(float %x, float %y) { define i1 @test13(float %x, float %y) { ; CHECK-LABEL: @test13( ; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.round.f32(float [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %x.ext = fpext float %x to double @@ -345,7 +345,7 @@ define i1 @test13(float %x, float %y) { define i1 @test13_intrin(float %x, float %y) { ; CHECK-LABEL: @test13_intrin( ; CHECK-NEXT: [[TMP1:%.*]] = call float 
@llvm.round.f32(float [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %x.ext = fpext float %x to double @@ -358,7 +358,7 @@ define i1 @test13_intrin(float %x, float %y) { define i1 @test13a(float %x, float %y) { ; CHECK-LABEL: @test13a( ; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.roundeven.f32(float [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %x.ext = fpext float %x to double @@ -371,7 +371,7 @@ define i1 @test13a(float %x, float %y) { define i1 @test13a_intrin(float %x, float %y) { ; CHECK-LABEL: @test13a_intrin( ; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.roundeven.f32(float [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %x.ext = fpext float %x to double @@ -384,7 +384,7 @@ define i1 @test13a_intrin(float %x, float %y) { define i1 @test14(float %x, float %y) { ; CHECK-LABEL: @test14( ; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.trunc.f32(float [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %x.ext = fpext float %x to double @@ -397,7 +397,7 @@ define i1 @test14(float %x, float %y) { define i1 @test14_intrin(float %x, float %y) { ; CHECK-LABEL: @test14_intrin( ; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.trunc.f32(float [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %x.ext = fpext float %x to double @@ -424,7 +424,7 @@ define i1 @test15(float %x, float %y, float %z) { define i1 @test16(float %x, float %y, float %z) { ; CHECK-LABEL: @test16( ; CHECK-NEXT: 
[[FMINF:%.*]] = call nsz float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[TMP1:%.*]] = fcmp oeq float [[FMINF]], [[Z:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = fcmp oeq float [[Z:%.*]], [[FMINF]] ; CHECK-NEXT: ret i1 [[TMP1]] ; %1 = fpext float %z to double @@ -452,7 +452,7 @@ define i1 @test17(float %x, float %y, float %z) { define i1 @test18(float %x, float %y, float %z) { ; CHECK-LABEL: @test18( ; CHECK-NEXT: [[FMAXF:%.*]] = call nsz float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[TMP1:%.*]] = fcmp oeq float [[FMAXF]], [[Z:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = fcmp oeq float [[Z:%.*]], [[FMAXF]] ; CHECK-NEXT: ret i1 [[TMP1]] ; %1 = fpext float %z to double @@ -480,7 +480,7 @@ define i1 @test19(float %x, float %y, float %z) { define i1 @test20(float %x, float %y) { ; CHECK-LABEL: @test20( ; CHECK-NEXT: [[FMINF:%.*]] = call nsz float @llvm.minnum.f32(float [[X:%.*]], float 1.000000e+00) -; CHECK-NEXT: [[TMP1:%.*]] = fcmp oeq float [[FMINF]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = fcmp oeq float [[Y:%.*]], [[FMINF]] ; CHECK-NEXT: ret i1 [[TMP1]] ; %1 = fpext float %y to double diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll index ae2df634b02009..4554b4ed8844de 100644 --- a/llvm/test/Transforms/InstCombine/fmul.ll +++ b/llvm/test/Transforms/InstCombine/fmul.ll @@ -281,7 +281,7 @@ define float @neg_unary_neg_multi_use(float %x, float %y) { define float @neg_mul(float %x, float %y) { ; CHECK-LABEL: @neg_mul( ; CHECK-NEXT: [[SUB:%.*]] = fneg float [[X:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[SUB]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[Y:%.*]], [[SUB]] ; CHECK-NEXT: ret float [[MUL]] ; %sub = fsub float -0.0, %x @@ -292,7 +292,7 @@ define float @neg_mul(float %x, float %y) { define float @unary_neg_mul(float %x, float %y) { ; CHECK-LABEL: @unary_neg_mul( ; CHECK-NEXT: [[NEG:%.*]] = fneg float [[X:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[NEG]], 
[[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[Y:%.*]], [[NEG]] ; CHECK-NEXT: ret float [[MUL]] ; %neg = fneg float %x @@ -303,7 +303,7 @@ define float @unary_neg_mul(float %x, float %y) { define <2 x float> @neg_mul_vec(<2 x float> %x, <2 x float> %y) { ; CHECK-LABEL: @neg_mul_vec( ; CHECK-NEXT: [[SUB:%.*]] = fneg <2 x float> [[X:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[SUB]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[Y:%.*]], [[SUB]] ; CHECK-NEXT: ret <2 x float> [[MUL]] ; %sub = fsub <2 x float> , %x @@ -314,7 +314,7 @@ define <2 x float> @neg_mul_vec(<2 x float> %x, <2 x float> %y) { define <2 x float> @unary_neg_mul_vec(<2 x float> %x, <2 x float> %y) { ; CHECK-LABEL: @unary_neg_mul_vec( ; CHECK-NEXT: [[SUB:%.*]] = fneg <2 x float> [[X:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[SUB]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[Y:%.*]], [[SUB]] ; CHECK-NEXT: ret <2 x float> [[MUL]] ; %sub = fneg <2 x float> %x @@ -325,7 +325,7 @@ define <2 x float> @unary_neg_mul_vec(<2 x float> %x, <2 x float> %y) { define <2 x float> @neg_mul_vec_poison(<2 x float> %x, <2 x float> %y) { ; CHECK-LABEL: @neg_mul_vec_poison( ; CHECK-NEXT: [[SUB:%.*]] = fneg <2 x float> [[X:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[SUB]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[Y:%.*]], [[SUB]] ; CHECK-NEXT: ret <2 x float> [[MUL]] ; %sub = fsub <2 x float> , %x @@ -337,7 +337,7 @@ define <2 x float> @neg_mul_vec_poison(<2 x float> %x, <2 x float> %y) { define float @neg_sink_nsz(float %x, float %y) { ; CHECK-LABEL: @neg_sink_nsz( ; CHECK-NEXT: [[SUB1:%.*]] = fneg nsz float [[X:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[SUB1]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[Y:%.*]], [[SUB1]] ; CHECK-NEXT: ret float [[MUL]] ; %sub1 = fsub nsz float 0.0, %x @@ -348,7 +348,7 @@ define float @neg_sink_nsz(float %x, float %y) { define float @neg_sink_multi_use(float %x, float %y) { ; CHECK-LABEL: 
@neg_sink_multi_use( ; CHECK-NEXT: [[SUB1:%.*]] = fneg float [[X:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[SUB1]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[Y:%.*]], [[SUB1]] ; CHECK-NEXT: [[MUL2:%.*]] = fmul float [[MUL]], [[SUB1]] ; CHECK-NEXT: ret float [[MUL2]] ; @@ -361,7 +361,7 @@ define float @neg_sink_multi_use(float %x, float %y) { define float @unary_neg_mul_multi_use(float %x, float %y) { ; CHECK-LABEL: @unary_neg_mul_multi_use( ; CHECK-NEXT: [[SUB1:%.*]] = fneg float [[X:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[SUB1]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[Y:%.*]], [[SUB1]] ; CHECK-NEXT: [[MUL2:%.*]] = fmul float [[MUL]], [[SUB1]] ; CHECK-NEXT: ret float [[MUL2]] ; @@ -449,7 +449,7 @@ declare double @llvm.sqrt.f64(double) define double @sqrt_squared2(double %f) { ; CHECK-LABEL: @sqrt_squared2( ; CHECK-NEXT: [[SQRT:%.*]] = call double @llvm.sqrt.f64(double [[F:%.*]]) -; CHECK-NEXT: [[MUL2:%.*]] = fmul double [[SQRT]], [[F]] +; CHECK-NEXT: [[MUL2:%.*]] = fmul double [[F]], [[SQRT]] ; CHECK-NEXT: ret double [[MUL2]] ; %sqrt = call double @llvm.sqrt.f64(double %f) @@ -1132,7 +1132,7 @@ for.body: define double @fmul_negated_constant_expression(double %x) { ; CHECK-LABEL: @fmul_negated_constant_expression( ; CHECK-NEXT: [[FSUB:%.*]] = fneg double bitcast (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr @g, i64 16) to i64) to double) -; CHECK-NEXT: [[R:%.*]] = fmul double [[FSUB]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = fmul double [[X:%.*]], [[FSUB]] ; CHECK-NEXT: ret double [[R]] ; %fsub = fsub double -0.000000e+00, bitcast (i64 ptrtoint (ptr getelementptr inbounds ({ [2 x ptr] }, ptr @g, i64 0, i32 0, i64 2) to i64) to double) diff --git a/llvm/test/Transforms/InstCombine/fold-ext-eq-c-with-op.ll b/llvm/test/Transforms/InstCombine/fold-ext-eq-c-with-op.ll index 4d02d492d2aa76..248c802d03f9db 100644 --- a/llvm/test/Transforms/InstCombine/fold-ext-eq-c-with-op.ll +++ 
b/llvm/test/Transforms/InstCombine/fold-ext-eq-c-with-op.ll @@ -31,7 +31,7 @@ define i8 @fold_add_zext_eq_0_fail_multiuse_exp(i8 %x) { ; CHECK-LABEL: @fold_add_zext_eq_0_fail_multiuse_exp( ; CHECK-NEXT: [[X_EQ:%.*]] = icmp eq i8 [[X:%.*]], 0 ; CHECK-NEXT: [[X_EQ_EXT:%.*]] = zext i1 [[X_EQ]] to i8 -; CHECK-NEXT: [[R:%.*]] = add i8 [[X_EQ_EXT]], [[X]] +; CHECK-NEXT: [[R:%.*]] = add i8 [[X]], [[X_EQ_EXT]] ; CHECK-NEXT: call void @use.i8(i8 [[X_EQ_EXT]]) ; CHECK-NEXT: ret i8 [[R]] ; @@ -46,7 +46,7 @@ define i8 @fold_add_sext_eq_4_fail_wrong_cond(i8 %x, i8 %y) { ; CHECK-LABEL: @fold_add_sext_eq_4_fail_wrong_cond( ; CHECK-NEXT: [[X_EQ:%.*]] = icmp eq i8 [[Y:%.*]], 4 ; CHECK-NEXT: [[X_EQ_EXT:%.*]] = sext i1 [[X_EQ]] to i8 -; CHECK-NEXT: [[R:%.*]] = add i8 [[X_EQ_EXT]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = add i8 [[X:%.*]], [[X_EQ_EXT]] ; CHECK-NEXT: call void @use.i8(i8 [[X_EQ_EXT]]) ; CHECK-NEXT: ret i8 [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/fold-inc-of-add-of-not-x-and-y-to-sub-x-from-y.ll b/llvm/test/Transforms/InstCombine/fold-inc-of-add-of-not-x-and-y-to-sub-x-from-y.ll index 1fd570bf2635b2..d16f36927d71a2 100644 --- a/llvm/test/Transforms/InstCombine/fold-inc-of-add-of-not-x-and-y-to-sub-x-from-y.ll +++ b/llvm/test/Transforms/InstCombine/fold-inc-of-add-of-not-x-and-y-to-sub-x-from-y.ll @@ -92,7 +92,7 @@ define i32 @t5(i32 %x, i32 %y) { define i32 @t6(i32 %x, i32 %y) { ; CHECK-LABEL: @t6( ; CHECK-NEXT: [[T0:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[T1:%.*]] = add i32 [[T0]], [[Y:%.*]] +; CHECK-NEXT: [[T1:%.*]] = add i32 [[Y:%.*]], [[T0]] ; CHECK-NEXT: call void @use32(i32 [[T1]]) ; CHECK-NEXT: [[T2:%.*]] = sub i32 [[Y]], [[X]] ; CHECK-NEXT: ret i32 [[T2]] @@ -108,7 +108,7 @@ define i32 @t7(i32 %x, i32 %y) { ; CHECK-LABEL: @t7( ; CHECK-NEXT: [[T0:%.*]] = xor i32 [[X:%.*]], -1 ; CHECK-NEXT: call void @use32(i32 [[T0]]) -; CHECK-NEXT: [[T1:%.*]] = add i32 [[T0]], [[Y:%.*]] +; CHECK-NEXT: [[T1:%.*]] = add i32 [[Y:%.*]], [[T0]] ; CHECK-NEXT: call void 
@use32(i32 [[T1]]) ; CHECK-NEXT: [[T2:%.*]] = sub i32 [[Y]], [[X]] ; CHECK-NEXT: ret i32 [[T2]] @@ -202,7 +202,7 @@ define i32 @n11(i32 %x, i32 %y) { define i32 @n12(i32 %x, i32 %y) { ; CHECK-LABEL: @n12( ; CHECK-NEXT: [[T0:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[T1:%.*]] = add i32 [[T0]], [[Y:%.*]] +; CHECK-NEXT: [[T1:%.*]] = add i32 [[Y:%.*]], [[T0]] ; CHECK-NEXT: [[T2:%.*]] = add i32 [[T1]], 2 ; CHECK-NEXT: ret i32 [[T2]] ; diff --git a/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll b/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll index dedd12f8cc7a3d..1c28b151825c12 100644 --- a/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll +++ b/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll @@ -428,7 +428,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -442,7 +442,7 @@ define float @fmul_by_fabs_var_if_0_oeq_zero_f32(float %x, float %y) { ; CHECK-NEXT: [[Y_FABS:%.*]] = call float @llvm.fabs.f32(float [[Y:%.*]]) ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y_FABS]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %y.fabs = call float @llvm.fabs.f32(float %y) @@ -468,7 +468,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_fmul(float %x, float %y) { ; CHECK-LABEL: 
@fmul_by_var_if_0_oeq_zero_f32_nsz_fmul( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nsz float [[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nsz float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -482,7 +482,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul ninf nsz float [[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul ninf nsz float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -496,7 +496,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_fmul( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan nsz float [[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan nsz float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -510,7 +510,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nnan_ninf_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nnan_ninf_fmul( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: 
[[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -524,7 +524,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_ninf_select(float %x, float ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_ninf_select( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan ninf nsz i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -559,7 +559,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(float %x, float % ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -572,7 +572,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz_commuted(float %x ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz_commuted( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] 
; %x.is.zero = fcmp oeq float %x, 0.0 @@ -586,7 +586,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_ne ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -599,7 +599,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_ne ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero_negsub( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -623,7 +623,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_known_never_nan_inf_negzero(flo ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_known_never_nan_inf_negzero( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -636,7 +636,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_known_never_nan_inf_negzero_nsu ; CHECK-LABEL: 
@fmul_by_var_if_0_oeq_zero_f32_fmul_known_never_nan_inf_negzero_nsub( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -693,7 +693,7 @@ define float @fmul_by_self_if_0_oeq_zero_f32(float %x) { ; CHECK-LABEL: @fmul_by_self_if_0_oeq_zero_f32( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[X]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -706,7 +706,7 @@ define float @fmul_by_self_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(float %x) { ; CHECK-LABEL: @fmul_by_self_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[X]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -719,7 +719,7 @@ define float @fmul_by_self_if_0_oeq_zero_f32_select_nnan_ninf_nsz(float %x) { ; CHECK-LABEL: @fmul_by_self_if_0_oeq_zero_f32_select_nnan_ninf_nsz( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan ninf nsz i1 [[X_IS_ZERO]], float [[X]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float 
[[SCALED_X]], [[X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 diff --git a/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll b/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll index f5024664f58c3e..a5c7cb3306ed08 100644 --- a/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll +++ b/llvm/test/Transforms/InstCombine/fold-signbit-test-power2.ll @@ -124,7 +124,7 @@ define i1 @pow2_or_zero_is_negative_extra_use(i8 %x) { ; CHECK-LABEL: @pow2_or_zero_is_negative_extra_use( ; CHECK-NEXT: [[NEG:%.*]] = sub i8 0, [[X:%.*]] ; CHECK-NEXT: call void @use(i8 [[NEG]]) -; CHECK-NEXT: [[POW2_OR_ZERO:%.*]] = and i8 [[NEG]], [[X]] +; CHECK-NEXT: [[POW2_OR_ZERO:%.*]] = and i8 [[X]], [[NEG]] ; CHECK-NEXT: call void @use(i8 [[POW2_OR_ZERO]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X]], -128 ; CHECK-NEXT: ret i1 [[CMP]] diff --git a/llvm/test/Transforms/InstCombine/fpextend.ll b/llvm/test/Transforms/InstCombine/fpextend.ll index 19f512d717a978..c9adbe10d8db44 100644 --- a/llvm/test/Transforms/InstCombine/fpextend.ll +++ b/llvm/test/Transforms/InstCombine/fpextend.ll @@ -142,7 +142,7 @@ define float @test9(half %x, half %y) nounwind { define float @test10(half %x, float %y) nounwind { ; CHECK-LABEL: @test10( ; CHECK-NEXT: [[TMP1:%.*]] = fpext half [[X:%.*]] to float -; CHECK-NEXT: [[T56:%.*]] = fmul float [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[T56:%.*]] = fmul float [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[T56]] ; %t1 = fpext half %x to double @@ -167,7 +167,7 @@ define float @test11(half %x) nounwind { define float @test12(float %x, half %y) nounwind { ; CHECK-LABEL: @test12( ; CHECK-NEXT: [[TMP1:%.*]] = fpext half [[Y:%.*]] to float -; CHECK-NEXT: [[T34:%.*]] = fadd float [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[T34:%.*]] = fadd float [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[T34]] ; %t1 = fpext float %x to double @@ -440,8 +440,8 @@ 
define half @bf16_to_f32_to_f16(bfloat %a) nounwind { define bfloat @bf16_frem(bfloat %x) { ; CHECK-LABEL: @bf16_frem( -; CHECK-NEXT: [[FREM:%.*]] = frem bfloat [[X:%.*]], 0xR40C9 -; CHECK-NEXT: ret bfloat [[FREM]] +; CHECK-NEXT: [[TMP1:%.*]] = frem bfloat [[X:%.*]], 0xR40C9 +; CHECK-NEXT: ret bfloat [[TMP1]] ; %t1 = fpext bfloat %x to float %t2 = frem float %t1, 6.281250e+00 diff --git a/llvm/test/Transforms/InstCombine/fptrunc.ll b/llvm/test/Transforms/InstCombine/fptrunc.ll index c78df0b83d9cdf..825868b1070336 100644 --- a/llvm/test/Transforms/InstCombine/fptrunc.ll +++ b/llvm/test/Transforms/InstCombine/fptrunc.ll @@ -4,7 +4,7 @@ define float @fadd_fpext_op0(float %x, double %y) { ; CHECK-LABEL: @fadd_fpext_op0( ; CHECK-NEXT: [[EXT:%.*]] = fpext float [[X:%.*]] to double -; CHECK-NEXT: [[BO:%.*]] = fadd reassoc double [[EXT]], [[Y:%.*]] +; CHECK-NEXT: [[BO:%.*]] = fadd reassoc double [[Y:%.*]], [[EXT]] ; CHECK-NEXT: [[R:%.*]] = fptrunc double [[BO]] to float ; CHECK-NEXT: ret float [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/free-inversion.ll b/llvm/test/Transforms/InstCombine/free-inversion.ll index a89887a586b582..ebb9310ee0a78c 100644 --- a/llvm/test/Transforms/InstCombine/free-inversion.ll +++ b/llvm/test/Transforms/InstCombine/free-inversion.ll @@ -30,7 +30,7 @@ define i8 @xor_2(i8 %a, i1 %c, i8 %x, i8 %y) { ; CHECK-LABEL: @xor_2( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], -124 ; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[C:%.*]], i8 [[X:%.*]], i8 [[TMP1]] -; CHECK-NEXT: [[NOT_AB:%.*]] = xor i8 [[TMP2]], [[A:%.*]] +; CHECK-NEXT: [[NOT_AB:%.*]] = xor i8 [[A:%.*]], [[TMP2]] ; CHECK-NEXT: ret i8 [[NOT_AB]] ; %nx = xor i8 %x, -1 @@ -45,7 +45,7 @@ define i8 @xor_fail(i8 %a, i1 %c, i8 %x, i8 %y) { ; CHECK-LABEL: @xor_fail( ; CHECK-NEXT: [[NX:%.*]] = xor i8 [[X:%.*]], -1 ; CHECK-NEXT: [[B:%.*]] = select i1 [[C:%.*]], i8 [[NX]], i8 [[Y:%.*]] -; CHECK-NEXT: [[AB:%.*]] = xor i8 [[B]], [[A:%.*]] +; CHECK-NEXT: [[AB:%.*]] = xor i8 [[A:%.*]], [[B]] ; 
CHECK-NEXT: [[NOT_AB:%.*]] = xor i8 [[AB]], -1 ; CHECK-NEXT: ret i8 [[NOT_AB]] ; @@ -91,7 +91,7 @@ define i8 @add_fail(i8 %a, i1 %c, i8 %x, i8 %y) { ; CHECK-NEXT: [[NX:%.*]] = xor i8 [[X:%.*]], [[A:%.*]] ; CHECK-NEXT: [[YY:%.*]] = xor i8 [[Y:%.*]], 123 ; CHECK-NEXT: [[B:%.*]] = select i1 [[C:%.*]], i8 [[NX]], i8 [[YY]] -; CHECK-NEXT: [[AB:%.*]] = add i8 [[B]], [[A]] +; CHECK-NEXT: [[AB:%.*]] = add i8 [[A]], [[B]] ; CHECK-NEXT: [[NOT_AB:%.*]] = xor i8 [[AB]], -1 ; CHECK-NEXT: ret i8 [[NOT_AB]] ; @@ -605,7 +605,7 @@ define i32 @test_inv_free_i32(i1 %c1, i1 %c2, i32 %c3, i32 %c4) { ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 0, [[B1]] ], [ -1, [[B2]] ], [ [[C3:%.*]], [[B3]] ] -; CHECK-NEXT: [[COND:%.*]] = xor i32 [[TMP0]], [[C4:%.*]] +; CHECK-NEXT: [[COND:%.*]] = xor i32 [[C4:%.*]], [[TMP0]] ; CHECK-NEXT: ret i32 [[COND]] ; entry: @@ -682,7 +682,7 @@ define i32 @test_inv_free_i32_newinst(i1 %c1, i1 %c2, i32 %c3, i32 %c4) { ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: [[VAL:%.*]] = phi i32 [ -1, [[B1]] ], [ 0, [[B2]] ], [ [[ASHR]], [[B3]] ] -; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[VAL]], [[C4:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[C4:%.*]], [[VAL]] ; CHECK-NEXT: [[COND:%.*]] = xor i32 [[TMP0]], -1 ; CHECK-NEXT: ret i32 [[COND]] ; diff --git a/llvm/test/Transforms/InstCombine/fsh.ll b/llvm/test/Transforms/InstCombine/fsh.ll index 505a2283672548..f1fba6cb272f9f 100644 --- a/llvm/test/Transforms/InstCombine/fsh.ll +++ b/llvm/test/Transforms/InstCombine/fsh.ll @@ -725,7 +725,7 @@ define i32 @fsh_orconst_rotate(i32 %a) { define i32 @fsh_rotate_5(i8 %x, i32 %y) { ; CHECK-LABEL: @fsh_rotate_5( ; CHECK-NEXT: [[T1:%.*]] = zext i8 [[X:%.*]] to i32 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[T1]], [[Y:%.*]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y:%.*]], [[T1]] ; CHECK-NEXT: [[OR2:%.*]] = call i32 @llvm.fshl.i32(i32 [[OR1]], i32 [[OR1]], i32 5) ; CHECK-NEXT: ret i32 [[OR2]] ; @@ -741,7 +741,7 @@ define i32 @fsh_rotate_5(i8 
%x, i32 %y) { define i32 @fsh_rotate_18(i8 %x, i32 %y) { ; CHECK-LABEL: @fsh_rotate_18( ; CHECK-NEXT: [[T1:%.*]] = zext i8 [[X:%.*]] to i32 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[T1]], [[Y:%.*]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y:%.*]], [[T1]] ; CHECK-NEXT: [[OR2:%.*]] = call i32 @llvm.fshl.i32(i32 [[OR1]], i32 [[OR1]], i32 18) ; CHECK-NEXT: ret i32 [[OR2]] ; diff --git a/llvm/test/Transforms/InstCombine/fsub.ll b/llvm/test/Transforms/InstCombine/fsub.ll index f1e7086e697e86..cffc63405ddcbc 100644 --- a/llvm/test/Transforms/InstCombine/fsub.ll +++ b/llvm/test/Transforms/InstCombine/fsub.ll @@ -86,7 +86,7 @@ define float @unary_neg_sub_nsz_extra_use(float %x, float %y) { define float @sub_sub_nsz(float %x, float %y, float %z) { ; CHECK-LABEL: @sub_sub_nsz( ; CHECK-NEXT: [[TMP1:%.*]] = fsub nsz float [[Y:%.*]], [[X:%.*]] -; CHECK-NEXT: [[T2:%.*]] = fadd nsz float [[TMP1]], [[Z:%.*]] +; CHECK-NEXT: [[T2:%.*]] = fadd nsz float [[Z:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[T2]] ; %t1 = fsub float %x, %y @@ -219,7 +219,7 @@ define <2 x float> @neg_op1_vec_poison(<2 x float> %x, <2 x float> %y) { define double @neg_ext_op1(float %a, double %b) { ; CHECK-LABEL: @neg_ext_op1( ; CHECK-NEXT: [[TMP1:%.*]] = fpext float [[A:%.*]] to double -; CHECK-NEXT: [[T3:%.*]] = fadd double [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[T3:%.*]] = fadd double [[B:%.*]], [[TMP1]] ; CHECK-NEXT: ret double [[T3]] ; %t1 = fsub float -0.0, %a @@ -231,7 +231,7 @@ define double @neg_ext_op1(float %a, double %b) { define double @unary_neg_ext_op1(float %a, double %b) { ; CHECK-LABEL: @unary_neg_ext_op1( ; CHECK-NEXT: [[TMP1:%.*]] = fpext float [[A:%.*]] to double -; CHECK-NEXT: [[T3:%.*]] = fadd double [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[T3:%.*]] = fadd double [[B:%.*]], [[TMP1]] ; CHECK-NEXT: ret double [[T3]] ; %t1 = fneg float %a @@ -245,7 +245,7 @@ define double @unary_neg_ext_op1(float %a, double %b) { define <2 x float> @neg_trunc_op1(<2 x double> %a, <2 x float> %b) { ; CHECK-LABEL: 
@neg_trunc_op1( ; CHECK-NEXT: [[TMP1:%.*]] = fptrunc <2 x double> [[A:%.*]] to <2 x float> -; CHECK-NEXT: [[T3:%.*]] = fadd <2 x float> [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[T3:%.*]] = fadd <2 x float> [[B:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x float> [[T3]] ; %t1 = fsub <2 x double> , %a @@ -257,7 +257,7 @@ define <2 x float> @neg_trunc_op1(<2 x double> %a, <2 x float> %b) { define <2 x float> @unary_neg_trunc_op1(<2 x double> %a, <2 x float> %b) { ; CHECK-LABEL: @unary_neg_trunc_op1( ; CHECK-NEXT: [[TMP1:%.*]] = fptrunc <2 x double> [[A:%.*]] to <2 x float> -; CHECK-NEXT: [[T3:%.*]] = fadd <2 x float> [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[T3:%.*]] = fadd <2 x float> [[B:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x float> [[T3]] ; %t1 = fneg <2 x double> %a @@ -271,7 +271,7 @@ define <2 x float> @unary_neg_trunc_op1(<2 x double> %a, <2 x float> %b) { define double @neg_ext_op1_fast(float %a, double %b) { ; CHECK-LABEL: @neg_ext_op1_fast( ; CHECK-NEXT: [[TMP1:%.*]] = fpext float [[A:%.*]] to double -; CHECK-NEXT: [[T3:%.*]] = fadd fast double [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[T3:%.*]] = fadd fast double [[B:%.*]], [[TMP1]] ; CHECK-NEXT: ret double [[T3]] ; %t1 = fsub float -0.0, %a @@ -283,7 +283,7 @@ define double @neg_ext_op1_fast(float %a, double %b) { define double @unary_neg_ext_op1_fast(float %a, double %b) { ; CHECK-LABEL: @unary_neg_ext_op1_fast( ; CHECK-NEXT: [[TMP1:%.*]] = fpext float [[A:%.*]] to double -; CHECK-NEXT: [[T3:%.*]] = fadd fast double [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[T3:%.*]] = fadd fast double [[B:%.*]], [[TMP1]] ; CHECK-NEXT: ret double [[T3]] ; %t1 = fneg float %a @@ -332,7 +332,7 @@ define float @neg_trunc_op1_extra_use(double %a, float %b) { ; CHECK-LABEL: @neg_trunc_op1_extra_use( ; CHECK-NEXT: [[TMP1:%.*]] = fptrunc double [[A:%.*]] to float ; CHECK-NEXT: [[T2:%.*]] = fneg float [[TMP1]] -; CHECK-NEXT: [[T3:%.*]] = fadd float [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[T3:%.*]] = fadd float [[B:%.*]], [[TMP1]] ; CHECK-NEXT: call void 
@use(float [[T2]]) ; CHECK-NEXT: ret float [[T3]] ; @@ -347,7 +347,7 @@ define float @unary_neg_trunc_op1_extra_use(double %a, float %b) { ; CHECK-LABEL: @unary_neg_trunc_op1_extra_use( ; CHECK-NEXT: [[TMP1:%.*]] = fptrunc double [[A:%.*]] to float ; CHECK-NEXT: [[T2:%.*]] = fneg float [[TMP1]] -; CHECK-NEXT: [[T3:%.*]] = fadd float [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[T3:%.*]] = fadd float [[B:%.*]], [[TMP1]] ; CHECK-NEXT: call void @use(float [[T2]]) ; CHECK-NEXT: ret float [[T3]] ; @@ -407,7 +407,7 @@ define float @PR37605(float %conv) { define double @fsub_fdiv_fneg1(double %x, double %y, double %z) { ; CHECK-LABEL: @fsub_fdiv_fneg1( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv double [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = fadd double [[TMP1]], [[Z:%.*]] +; CHECK-NEXT: [[R:%.*]] = fadd double [[Z:%.*]], [[TMP1]] ; CHECK-NEXT: ret double [[R]] ; %neg = fsub double -0.000000e+00, %x @@ -419,7 +419,7 @@ define double @fsub_fdiv_fneg1(double %x, double %y, double %z) { define <2 x double> @fsub_fdiv_fneg2(<2 x double> %x, <2 x double> %y, <2 x double> %z) { ; CHECK-LABEL: @fsub_fdiv_fneg2( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[Y:%.*]], [[X:%.*]] -; CHECK-NEXT: [[R:%.*]] = fadd <2 x double> [[TMP1]], [[Z:%.*]] +; CHECK-NEXT: [[R:%.*]] = fadd <2 x double> [[Z:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x double> [[R]] ; %neg = fsub <2 x double> , %x @@ -431,7 +431,7 @@ define <2 x double> @fsub_fdiv_fneg2(<2 x double> %x, <2 x double> %y, <2 x doub define double @fsub_fmul_fneg1(double %x, double %y, double %z) { ; CHECK-LABEL: @fsub_fmul_fneg1( ; CHECK-NEXT: [[TMP1:%.*]] = fmul double [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = fadd double [[TMP1]], [[Z:%.*]] +; CHECK-NEXT: [[R:%.*]] = fadd double [[Z:%.*]], [[TMP1]] ; CHECK-NEXT: ret double [[R]] ; %neg = fsub double -0.000000e+00, %x @@ -443,7 +443,7 @@ define double @fsub_fmul_fneg1(double %x, double %y, double %z) { define double @fsub_fmul_fneg2(double %x, double %y, double %z) { ; CHECK-LABEL: 
@fsub_fmul_fneg2( ; CHECK-NEXT: [[TMP1:%.*]] = fmul double [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = fadd double [[TMP1]], [[Z:%.*]] +; CHECK-NEXT: [[R:%.*]] = fadd double [[Z:%.*]], [[TMP1]] ; CHECK-NEXT: ret double [[R]] ; %neg = fsub double -0.000000e+00, %x @@ -487,7 +487,7 @@ declare void @use_vec(<2 x float>) define <2 x float> @fsub_fmul_fneg1_extra_use(<2 x float> %x, <2 x float> %y, <2 x float> %z) { ; CHECK-LABEL: @fsub_fmul_fneg1_extra_use( ; CHECK-NEXT: [[NEG:%.*]] = fneg <2 x float> [[X:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[Y:%.*]], [[NEG]] ; CHECK-NEXT: call void @use_vec(<2 x float> [[MUL]]) ; CHECK-NEXT: [[R:%.*]] = fsub <2 x float> [[Z:%.*]], [[MUL]] ; CHECK-NEXT: ret <2 x float> [[R]] @@ -502,7 +502,7 @@ define <2 x float> @fsub_fmul_fneg1_extra_use(<2 x float> %x, <2 x float> %y, <2 define float @fsub_fmul_fneg2_extra_use(float %x, float %y, float %z) { ; CHECK-LABEL: @fsub_fmul_fneg2_extra_use( ; CHECK-NEXT: [[NEG:%.*]] = fneg float [[X:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[Y:%.*]], [[NEG]] ; CHECK-NEXT: call void @use(float [[MUL]]) ; CHECK-NEXT: [[R:%.*]] = fsub float [[Z:%.*]], [[MUL]] ; CHECK-NEXT: ret float [[R]] @@ -519,7 +519,7 @@ define float @fsub_fdiv_fneg1_extra_use2(float %x, float %y, float %z) { ; CHECK-NEXT: [[NEG:%.*]] = fneg float [[X:%.*]] ; CHECK-NEXT: call void @use(float [[NEG]]) ; CHECK-NEXT: [[TMP1:%.*]] = fdiv float [[X]], [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = fadd float [[TMP1]], [[Z:%.*]] +; CHECK-NEXT: [[R:%.*]] = fadd float [[Z:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[R]] ; %neg = fsub float -0.000000e+00, %x @@ -534,7 +534,7 @@ define float @fsub_fdiv_fneg2_extra_use2(float %x, float %y, float %z) { ; CHECK-NEXT: [[NEG:%.*]] = fneg float [[X:%.*]] ; CHECK-NEXT: call void @use(float [[NEG]]) ; CHECK-NEXT: [[TMP1:%.*]] = fdiv float [[Y:%.*]], [[X]] -; CHECK-NEXT: 
[[R:%.*]] = fadd float [[TMP1]], [[Z:%.*]] +; CHECK-NEXT: [[R:%.*]] = fadd float [[Z:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[R]] ; %neg = fsub float -0.000000e+00, %x @@ -549,7 +549,7 @@ define <2 x float> @fsub_fmul_fneg1_extra_use2(<2 x float> %x, <2 x float> %y, < ; CHECK-NEXT: [[NEG:%.*]] = fneg <2 x float> [[X:%.*]] ; CHECK-NEXT: call void @use_vec(<2 x float> [[NEG]]) ; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[X]], [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = fadd <2 x float> [[TMP1]], [[Z:%.*]] +; CHECK-NEXT: [[R:%.*]] = fadd <2 x float> [[Z:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x float> [[R]] ; %neg = fsub <2 x float> , %x @@ -564,7 +564,7 @@ define float @fsub_fmul_fneg2_extra_use2(float %x, float %y, float %z) { ; CHECK-NEXT: [[NEG:%.*]] = fneg float [[X:%.*]] ; CHECK-NEXT: call void @use(float [[NEG]]) ; CHECK-NEXT: [[TMP1:%.*]] = fmul float [[X]], [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = fadd float [[TMP1]], [[Z:%.*]] +; CHECK-NEXT: [[R:%.*]] = fadd float [[Z:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[R]] ; %neg = fsub float -0.000000e+00, %x @@ -612,7 +612,7 @@ define <2 x float> @fsub_fmul_fneg1_extra_use3(<2 x float> %x, <2 x float> %y, < ; CHECK-LABEL: @fsub_fmul_fneg1_extra_use3( ; CHECK-NEXT: [[NEG:%.*]] = fneg <2 x float> [[X:%.*]] ; CHECK-NEXT: call void @use_vec(<2 x float> [[NEG]]) -; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[Y:%.*]], [[NEG]] ; CHECK-NEXT: call void @use_vec(<2 x float> [[MUL]]) ; CHECK-NEXT: [[R:%.*]] = fsub <2 x float> [[Z:%.*]], [[MUL]] ; CHECK-NEXT: ret <2 x float> [[R]] @@ -629,7 +629,7 @@ define float @fsub_fmul_fneg2_extra_use3(float %x, float %y, float %z) { ; CHECK-LABEL: @fsub_fmul_fneg2_extra_use3( ; CHECK-NEXT: [[NEG:%.*]] = fneg float [[X:%.*]] ; CHECK-NEXT: call void @use(float [[NEG]]) -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[Y:%.*]], [[NEG]] ; CHECK-NEXT: call void @use(float [[MUL]]) ; 
CHECK-NEXT: [[R:%.*]] = fsub float [[Z:%.*]], [[MUL]] ; CHECK-NEXT: ret float [[R]] @@ -805,7 +805,7 @@ define float @fsub_fadd_fsub_reassoc(float %w, float %x, float %y, float %z) { define <2 x float> @fsub_fadd_fsub_reassoc_commute(<2 x float> %w, <2 x float> %x, <2 x float> %y, <2 x float> %z) { ; CHECK-LABEL: @fsub_fadd_fsub_reassoc_commute( ; CHECK-NEXT: [[D:%.*]] = fdiv <2 x float> [[Y:%.*]], -; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[D]], [[W:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[W:%.*]], [[D]] ; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[X:%.*]], [[Z:%.*]] ; CHECK-NEXT: [[S2:%.*]] = fsub fast <2 x float> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <2 x float> [[S2]] @@ -823,7 +823,7 @@ define float @fsub_fadd_fsub_reassoc_twice(float %v, float %w, float %x, float % ; CHECK-LABEL: @fsub_fadd_fsub_reassoc_twice( ; CHECK-NEXT: [[TMP1:%.*]] = fadd reassoc nsz float [[W:%.*]], [[Z:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = fadd reassoc nsz float [[X:%.*]], [[V:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = fadd reassoc nsz float [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd reassoc nsz float [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: [[S3:%.*]] = fsub reassoc nsz float [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret float [[S3]] ; diff --git a/llvm/test/Transforms/InstCombine/funnel.ll b/llvm/test/Transforms/InstCombine/funnel.ll index a54e6e4642b753..fa0d59b2269983 100644 --- a/llvm/test/Transforms/InstCombine/funnel.ll +++ b/llvm/test/Transforms/InstCombine/funnel.ll @@ -464,10 +464,10 @@ define i32 @fshl_concat_i8_i8_different_slot(i8 %x, i8 %y, ptr %addr) { define i32 @fshl_concat_unknown_source(i32 %zext.x, i32 %zext.y, ptr %addr) { ; CHECK-LABEL: @fshl_concat_unknown_source( ; CHECK-NEXT: [[SLX:%.*]] = shl i32 [[ZEXT_X:%.*]], 16 -; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y:%.*]] +; CHECK-NEXT: [[XY:%.*]] = or i32 [[ZEXT_Y:%.*]], [[SLX]] ; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4 ; CHECK-NEXT: [[SLY:%.*]] = shl i32 [[ZEXT_Y]], 16 
-; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]] +; CHECK-NEXT: [[YX:%.*]] = or i32 [[ZEXT_X]], [[SLY]] ; CHECK-NEXT: ret i32 [[YX]] ; %slx = shl i32 %zext.x, 16 diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll index a9addfcb182f70..c805a64d5cd070 100644 --- a/llvm/test/Transforms/InstCombine/getelementptr.ll +++ b/llvm/test/Transforms/InstCombine/getelementptr.ll @@ -269,7 +269,7 @@ define <2 x i1> @test13_fixed_scalable(i64 %X, ptr %P, <2 x i64> %y) nounwind { ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 4 ; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[B_IDX:%.*]] = mul nsw <2 x i64> [[DOTSPLAT]], [[Y:%.*]] +; CHECK-NEXT: [[B_IDX:%.*]] = mul nsw <2 x i64> [[Y:%.*]], [[DOTSPLAT]] ; CHECK-NEXT: [[C:%.*]] = icmp eq <2 x i64> [[A_IDX]], [[B_IDX]] ; CHECK-NEXT: ret <2 x i1> [[C]] ; @@ -288,7 +288,7 @@ define @test13_scalable_scalable(i64 %X, ptr %P, poison, i64 [[TMP2]], i64 0 ; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector [[DOTSPLATINSERT1]], poison, zeroinitializer -; CHECK-NEXT: [[B_IDX:%.*]] = mul nsw [[DOTSPLAT2]], [[Y:%.*]] +; CHECK-NEXT: [[B_IDX:%.*]] = mul nsw [[Y:%.*]], [[DOTSPLAT2]] ; CHECK-NEXT: [[C:%.*]] = icmp eq [[A_IDX]], [[B_IDX]] ; CHECK-NEXT: ret [[C]] ; diff --git a/llvm/test/Transforms/InstCombine/hoist-negation-out-of-bias-calculation.ll b/llvm/test/Transforms/InstCombine/hoist-negation-out-of-bias-calculation.ll index e4cae135197830..6049997db4d1ae 100644 --- a/llvm/test/Transforms/InstCombine/hoist-negation-out-of-bias-calculation.ll +++ b/llvm/test/Transforms/InstCombine/hoist-negation-out-of-bias-calculation.ll @@ -15,7 +15,7 @@ define i8 @t0(i8 %x, i8 %y) { ; CHECK-LABEL: @t0( ; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; 
CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[NEGBIAS:%.*]] = sub i8 0, [[TMP2]] ; CHECK-NEXT: ret i8 [[NEGBIAS]] ; @@ -45,7 +45,7 @@ define i8 @t1_commutative(i8 %y) { define <2 x i8> @t2_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @t2_vec( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i8> [[Y:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i8> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[NEGBIAS:%.*]] = sub <2 x i8> zeroinitializer, [[TMP2]] ; CHECK-NEXT: ret <2 x i8> [[NEGBIAS]] ; @@ -58,7 +58,7 @@ define <2 x i8> @t2_vec(<2 x i8> %x, <2 x i8> %y) { define <2 x i8> @t3_vec_poison(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @t3_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i8> [[Y:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i8> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[NEGBIAS:%.*]] = sub <2 x i8> zeroinitializer, [[TMP2]] ; CHECK-NEXT: ret <2 x i8> [[NEGBIAS]] ; @@ -76,7 +76,7 @@ define i8 @n4_extrause0(i8 %x, i8 %y) { ; CHECK-LABEL: @n4_extrause0( ; CHECK-NEXT: [[NEGY:%.*]] = sub i8 0, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[NEGY]]) -; CHECK-NEXT: [[UNBIASEDX:%.*]] = and i8 [[NEGY]], [[X:%.*]] +; CHECK-NEXT: [[UNBIASEDX:%.*]] = and i8 [[X:%.*]], [[NEGY]] ; CHECK-NEXT: [[NEGBIAS:%.*]] = sub i8 [[UNBIASEDX]], [[X]] ; CHECK-NEXT: ret i8 [[NEGBIAS]] ; @@ -89,7 +89,7 @@ define i8 @n4_extrause0(i8 %x, i8 %y) { define i8 @n5_extrause1(i8 %x, i8 %y) { ; CHECK-LABEL: @n5_extrause1( ; CHECK-NEXT: [[NEGY:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[UNBIASEDX:%.*]] = and i8 [[NEGY]], [[X:%.*]] +; CHECK-NEXT: [[UNBIASEDX:%.*]] = and i8 [[X:%.*]], [[NEGY]] ; CHECK-NEXT: call void @use8(i8 [[UNBIASEDX]]) ; CHECK-NEXT: [[NEGBIAS:%.*]] = sub i8 [[UNBIASEDX]], [[X]] ; CHECK-NEXT: ret i8 [[NEGBIAS]] @@ -104,7 +104,7 @@ define i8 @n6_extrause2(i8 %x, i8 %y) { ; CHECK-LABEL: @n6_extrause2( ; CHECK-NEXT: [[NEGY:%.*]] = sub i8 0, [[Y:%.*]] ; 
CHECK-NEXT: call void @use8(i8 [[NEGY]]) -; CHECK-NEXT: [[UNBIASEDX:%.*]] = and i8 [[NEGY]], [[X:%.*]] +; CHECK-NEXT: [[UNBIASEDX:%.*]] = and i8 [[X:%.*]], [[NEGY]] ; CHECK-NEXT: call void @use8(i8 [[UNBIASEDX]]) ; CHECK-NEXT: [[NEGBIAS:%.*]] = sub i8 [[UNBIASEDX]], [[X]] ; CHECK-NEXT: ret i8 [[NEGBIAS]] @@ -122,7 +122,7 @@ define i8 @n6_extrause2(i8 %x, i8 %y) { define i8 @n7(i8 %x, i8 %y) { ; CHECK-LABEL: @n7( ; CHECK-NEXT: [[NEGY_NOT:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[NEGBIAS:%.*]] = and i8 [[NEGY_NOT]], [[X:%.*]] +; CHECK-NEXT: [[NEGBIAS:%.*]] = and i8 [[X:%.*]], [[NEGY_NOT]] ; CHECK-NEXT: ret i8 [[NEGBIAS]] ; %negy = sub i8 0, %y @@ -147,7 +147,7 @@ define i8 @n8(i8 %x, i8 %y) { define i8 @n9(i8 %x0, i8 %x1, i8 %y) { ; CHECK-LABEL: @n9( ; CHECK-NEXT: [[NEGY:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[UNBIASEDX:%.*]] = and i8 [[NEGY]], [[X1:%.*]] +; CHECK-NEXT: [[UNBIASEDX:%.*]] = and i8 [[X1:%.*]], [[NEGY]] ; CHECK-NEXT: [[NEGBIAS:%.*]] = sub i8 [[UNBIASEDX]], [[X0:%.*]] ; CHECK-NEXT: ret i8 [[NEGBIAS]] ; diff --git a/llvm/test/Transforms/InstCombine/hoist-xor-by-constant-from-xor-by-value.ll b/llvm/test/Transforms/InstCombine/hoist-xor-by-constant-from-xor-by-value.ll index 200e7ba8e67730..f92b10b0ccb371 100644 --- a/llvm/test/Transforms/InstCombine/hoist-xor-by-constant-from-xor-by-value.ll +++ b/llvm/test/Transforms/InstCombine/hoist-xor-by-constant-from-xor-by-value.ll @@ -64,7 +64,7 @@ define i8 @t4_extrause(i8 %x, i8 %y) { define i8 @t5_commutativity(i8 %x) { ; CHECK-LABEL: @t5_commutativity( ; CHECK-NEXT: [[Y:%.*]] = call i8 @gen8() -; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X:%.*]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = xor i8 [[TMP1]], 42 ; CHECK-NEXT: ret i8 [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/icmp-add.ll b/llvm/test/Transforms/InstCombine/icmp-add.ll index 2ceb44b89eb9e9..0c141d4b8e73aa 100644 --- a/llvm/test/Transforms/InstCombine/icmp-add.ll +++ 
b/llvm/test/Transforms/InstCombine/icmp-add.ll @@ -207,7 +207,7 @@ define i1 @cvt_icmp_neg_1_sext_plus_zext_eq(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @cvt_icmp_neg_1_sext_plus_zext_eq( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true -; CHECK-NEXT: [[T:%.*]] = and i1 [[TMP0]], [[ARG:%.*]] +; CHECK-NEXT: [[T:%.*]] = and i1 [[ARG:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[T]] ; bb: @@ -237,7 +237,7 @@ define i1 @cvt_icmp_1_sext_plus_zext_eq(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @cvt_icmp_1_sext_plus_zext_eq( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true -; CHECK-NEXT: [[T:%.*]] = and i1 [[TMP0]], [[ARG1:%.*]] +; CHECK-NEXT: [[T:%.*]] = and i1 [[ARG1:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[T]] ; bb: @@ -458,7 +458,7 @@ define i1 @cvt_icmp_neg_1_sext_plus_zext_ne(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @cvt_icmp_neg_1_sext_plus_zext_ne( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true -; CHECK-NEXT: [[T:%.*]] = or i1 [[TMP0]], [[ARG1:%.*]] +; CHECK-NEXT: [[T:%.*]] = or i1 [[ARG1:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[T]] ; bb: @@ -487,7 +487,7 @@ define i1 @cvt_icmp_1_sext_plus_zext_ne(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @cvt_icmp_1_sext_plus_zext_ne( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true -; CHECK-NEXT: [[T:%.*]] = or i1 [[TMP0]], [[ARG:%.*]] +; CHECK-NEXT: [[T:%.*]] = or i1 [[ARG:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[T]] ; bb: @@ -557,7 +557,7 @@ define i1 @cvt_icmp_neg_1_zext_plus_sext_eq(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @cvt_icmp_neg_1_zext_plus_sext_eq( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true -; CHECK-NEXT: [[T:%.*]] = and i1 [[TMP0]], [[ARG1:%.*]] +; CHECK-NEXT: [[T:%.*]] = and i1 [[ARG1:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[T]] ; bb: @@ -587,7 +587,7 @@ define i1 @cvt_icmp_1_zext_plus_sext_eq(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @cvt_icmp_1_zext_plus_sext_eq( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true -; 
CHECK-NEXT: [[T:%.*]] = and i1 [[TMP0]], [[ARG:%.*]] +; CHECK-NEXT: [[T:%.*]] = and i1 [[ARG:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[T]] ; bb: @@ -628,7 +628,7 @@ define i1 @cvt_icmp_neg_1_zext_plus_sext_ne(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @cvt_icmp_neg_1_zext_plus_sext_ne( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true -; CHECK-NEXT: [[T:%.*]] = or i1 [[TMP0]], [[ARG:%.*]] +; CHECK-NEXT: [[T:%.*]] = or i1 [[ARG:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[T]] ; bb: @@ -657,7 +657,7 @@ define i1 @cvt_icmp_1_zext_plus_sext_ne(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @cvt_icmp_1_zext_plus_sext_ne( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true -; CHECK-NEXT: [[T:%.*]] = or i1 [[TMP0]], [[ARG1:%.*]] +; CHECK-NEXT: [[T:%.*]] = or i1 [[ARG1:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[T]] ; bb: @@ -849,7 +849,7 @@ define i1 @test_sext_zext_cvt_neg_2_ult_icmp(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @test_sext_zext_cvt_neg_2_ult_icmp( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[ARG_NOT:%.*]] = xor i1 [[ARG:%.*]], true -; CHECK-NEXT: [[I4:%.*]] = or i1 [[ARG_NOT]], [[ARG1:%.*]] +; CHECK-NEXT: [[I4:%.*]] = or i1 [[ARG1:%.*]], [[ARG_NOT]] ; CHECK-NEXT: ret i1 [[I4]] ; bb: @@ -864,7 +864,7 @@ define i1 @test_sext_zext_cvt_neg_1_ult_icmp(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @test_sext_zext_cvt_neg_1_ult_icmp( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true -; CHECK-NEXT: [[I4:%.*]] = or i1 [[TMP0]], [[ARG1:%.*]] +; CHECK-NEXT: [[I4:%.*]] = or i1 [[ARG1:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[I4]] ; bb: @@ -892,7 +892,7 @@ define i1 @test_sext_zext_cvt_2_ult_icmp(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @test_sext_zext_cvt_2_ult_icmp( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[ARG_NOT:%.*]] = xor i1 [[ARG:%.*]], true -; CHECK-NEXT: [[I4:%.*]] = or i1 [[ARG_NOT]], [[ARG1:%.*]] +; CHECK-NEXT: [[I4:%.*]] = or i1 [[ARG1:%.*]], [[ARG_NOT]] ; CHECK-NEXT: ret i1 [[I4]] ; bb: @@ -907,7 +907,7 @@ define i1 @test_zext_sext_cvt_neg_1_ult_icmp(i1 %arg, i1 
%arg1) { ; CHECK-LABEL: @test_zext_sext_cvt_neg_1_ult_icmp( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true -; CHECK-NEXT: [[I4:%.*]] = or i1 [[TMP0]], [[ARG:%.*]] +; CHECK-NEXT: [[I4:%.*]] = or i1 [[ARG:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[I4]] ; bb: @@ -1047,7 +1047,7 @@ define i1 @test_zext_sext_cvt_neg_2_ugt_icmp(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @test_zext_sext_cvt_neg_2_ugt_icmp( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true -; CHECK-NEXT: [[I4:%.*]] = and i1 [[TMP0]], [[ARG1:%.*]] +; CHECK-NEXT: [[I4:%.*]] = and i1 [[ARG1:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[I4]] ; bb: @@ -1089,7 +1089,7 @@ define i1 @test_zext_sext_cvt_1_ugt_icmp(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @test_zext_sext_cvt_1_ugt_icmp( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true -; CHECK-NEXT: [[TMP1:%.*]] = and i1 [[TMP0]], [[ARG1:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i1 [[ARG1:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[TMP1]] ; bb: @@ -1104,7 +1104,7 @@ define i1 @test_zext_sext_cvt_2_ugt_icmp(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @test_zext_sext_cvt_2_ugt_icmp( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true -; CHECK-NEXT: [[I4:%.*]] = and i1 [[TMP0]], [[ARG1:%.*]] +; CHECK-NEXT: [[I4:%.*]] = and i1 [[ARG1:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[I4]] ; bb: @@ -1256,7 +1256,7 @@ define i1 @test_zext_sext_cvt_neg_1_sgt_icmp(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @test_zext_sext_cvt_neg_1_sgt_icmp( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[ARG1_NOT:%.*]] = xor i1 [[ARG1:%.*]], true -; CHECK-NEXT: [[I4:%.*]] = or i1 [[ARG1_NOT]], [[ARG:%.*]] +; CHECK-NEXT: [[I4:%.*]] = or i1 [[ARG:%.*]], [[ARG1_NOT]] ; CHECK-NEXT: ret i1 [[I4]] ; bb: @@ -1271,7 +1271,7 @@ define i1 @test_zext_sext_cvt_0_sgt_icmp(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @test_zext_sext_cvt_0_sgt_icmp( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true -; CHECK-NEXT: [[I4:%.*]] = and i1 [[TMP0]], [[ARG:%.*]] +; CHECK-NEXT: 
[[I4:%.*]] = and i1 [[ARG:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[I4]] ; bb: @@ -1420,7 +1420,7 @@ define i1 @test_zext_sext_cvt_0_slt_icmp(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @test_zext_sext_cvt_0_slt_icmp( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true -; CHECK-NEXT: [[TMP1:%.*]] = and i1 [[TMP0]], [[ARG1:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i1 [[ARG1:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[TMP1]] ; bb: @@ -1435,7 +1435,7 @@ define i1 @test_zext_sext_cvt_1_slt_icmp(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @test_zext_sext_cvt_1_slt_icmp( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true -; CHECK-NEXT: [[I4:%.*]] = or i1 [[TMP0]], [[ARG1:%.*]] +; CHECK-NEXT: [[I4:%.*]] = or i1 [[ARG1:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[I4]] ; bb: @@ -1617,7 +1617,7 @@ define i1 @test_cvt_icmp19(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @test_cvt_icmp19( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true -; CHECK-NEXT: [[I4:%.*]] = and i1 [[TMP0]], [[ARG:%.*]] +; CHECK-NEXT: [[I4:%.*]] = and i1 [[ARG:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[I4]] ; bb: @@ -1646,7 +1646,7 @@ define i1 @test_cvt_icmp21(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @test_cvt_icmp21( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[ARG_NOT:%.*]] = xor i1 [[ARG:%.*]], true -; CHECK-NEXT: [[I4:%.*]] = or i1 [[ARG_NOT]], [[ARG1:%.*]] +; CHECK-NEXT: [[I4:%.*]] = or i1 [[ARG1:%.*]], [[ARG_NOT]] ; CHECK-NEXT: ret i1 [[I4]] ; bb: @@ -1661,7 +1661,7 @@ define i1 @test_cvt_icmp22(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @test_cvt_icmp22( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG:%.*]], true -; CHECK-NEXT: [[I4:%.*]] = or i1 [[TMP0]], [[ARG1:%.*]] +; CHECK-NEXT: [[I4:%.*]] = or i1 [[ARG1:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[I4]] ; bb: @@ -1689,7 +1689,7 @@ define i1 @test_cvt_icmp24(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @test_cvt_icmp24( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[ARG_NOT:%.*]] = xor i1 [[ARG:%.*]], true -; CHECK-NEXT: [[I4:%.*]] = or i1 [[ARG_NOT]], 
[[ARG1:%.*]] +; CHECK-NEXT: [[I4:%.*]] = or i1 [[ARG1:%.*]], [[ARG_NOT]] ; CHECK-NEXT: ret i1 [[I4]] ; bb: @@ -1704,7 +1704,7 @@ define i1 @test_cvt_icmp25(i1 %arg, i1 %arg1) { ; CHECK-LABEL: @test_cvt_icmp25( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[ARG1:%.*]], true -; CHECK-NEXT: [[TMP1:%.*]] = and i1 [[TMP0]], [[ARG:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i1 [[ARG:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[TMP1]] ; bb: @@ -2390,7 +2390,7 @@ define <2 x i1> @icmp_eq_add_non_splat2(<2 x i32> %a) { define i1 @without_nsw_nuw(i8 %x, i8 %y) { ; CHECK-LABEL: @without_nsw_nuw( ; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X:%.*]], 2 -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[TOBOOL]] ; %t1 = add i8 %x, 37 @@ -2402,7 +2402,7 @@ define i1 @without_nsw_nuw(i8 %x, i8 %y) { define i1 @with_nsw_nuw(i8 %x, i8 %y) { ; CHECK-LABEL: @with_nsw_nuw( ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i8 [[X:%.*]], 2 -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[TOBOOL]] ; %t1 = add nsw nuw i8 %x, 37 @@ -2414,7 +2414,7 @@ define i1 @with_nsw_nuw(i8 %x, i8 %y) { define i1 @with_nsw_large(i8 %x, i8 %y) { ; CHECK-LABEL: @with_nsw_large( ; CHECK-NEXT: [[TMP1:%.*]] = add nsw i8 [[X:%.*]], 2 -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[TOBOOL]] ; %t1 = add nsw i8 %x, 37 @@ -2438,7 +2438,7 @@ define i1 @with_nsw_small(i8 %x, i8 %y) { define i1 @with_nuw_large(i8 %x, i8 %y) { ; CHECK-LABEL: @with_nuw_large( ; CHECK-NEXT: [[TMP1:%.*]] = add nuw i8 [[X:%.*]], 2 -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[TOBOOL]] ; %t1 = add nuw i8 %x, 37 @@ -2462,7 +2462,7 @@ define i1 @with_nuw_small(i8 
%x, i8 %y) { define i1 @with_nuw_large_negative(i8 %x, i8 %y) { ; CHECK-LABEL: @with_nuw_large_negative( ; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X:%.*]], -2 -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[TOBOOL]] ; %t1 = add nuw i8 %x, -37 @@ -2751,7 +2751,7 @@ define i32 @decrement_min(i32 %x) { define i1 @icmp_add_add_C(i32 %a, i32 %b) { ; CHECK-LABEL: @icmp_add_add_C( ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[B:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %add1 = add i32 %a, %b @@ -2763,7 +2763,7 @@ define i1 @icmp_add_add_C(i32 %a, i32 %b) { define i1 @icmp_add_add_C_pred(i32 %a, i32 %b) { ; CHECK-LABEL: @icmp_add_add_C_pred( ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[B:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ule i32 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %add1 = add i32 %a, %b @@ -2837,7 +2837,7 @@ define <2 x i1> @icmp_add_add_C_vector_undef(<2 x i8> %a, <2 x i8> %b) { define i1 @icmp_add_add_C_comm1(i32 %a, i32 %b) { ; CHECK-LABEL: @icmp_add_add_C_comm1( ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[B:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %add1 = add i32 %b, %a @@ -2923,7 +2923,7 @@ define i1 @icmp_add_add_C_extra_use2(i32 %a, i32 %b) { ; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: call void @use(i32 [[ADD1]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[B]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], [[A]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[A]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %add1 = add i32 %a, %b diff --git a/llvm/test/Transforms/InstCombine/icmp-and-add-sub-xor-p2.ll 
b/llvm/test/Transforms/InstCombine/icmp-and-add-sub-xor-p2.ll index c8a3dfcd68cd46..711d59c1ebfd55 100644 --- a/llvm/test/Transforms/InstCombine/icmp-and-add-sub-xor-p2.ll +++ b/llvm/test/Transforms/InstCombine/icmp-and-add-sub-xor-p2.ll @@ -6,10 +6,10 @@ declare void @use.v2i8(<2 x i8>) define i1 @src_add_eq_p2(i8 %x, i8 %yy) { ; CHECK-LABEL: @src_add_eq_p2( ; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[YY:%.*]] -; CHECK-NEXT: [[Y:%.*]] = and i8 [[NY]], [[YY]] +; CHECK-NEXT: [[Y:%.*]] = and i8 [[YY]], [[NY]] ; CHECK-NEXT: [[X1:%.*]] = add i8 [[Y]], [[X:%.*]] ; CHECK-NEXT: call void @use.i8(i8 [[X1]]) -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[Y]], [[X]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[R]] ; @@ -25,8 +25,8 @@ define i1 @src_add_eq_p2(i8 %x, i8 %yy) { define i1 @src_add_eq_p2_fail_multiuse(i8 %x, i8 %yy) { ; CHECK-LABEL: @src_add_eq_p2_fail_multiuse( ; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[YY:%.*]] -; CHECK-NEXT: [[Y:%.*]] = and i8 [[NY]], [[YY]] -; CHECK-NEXT: [[X1:%.*]] = add i8 [[Y]], [[X:%.*]] +; CHECK-NEXT: [[Y:%.*]] = and i8 [[YY]], [[NY]] +; CHECK-NEXT: [[X1:%.*]] = add i8 [[X:%.*]], [[Y]] ; CHECK-NEXT: call void @use.i8(i8 [[X1]]) ; CHECK-NEXT: [[V:%.*]] = and i8 [[X1]], [[Y]] ; CHECK-NEXT: call void @use.i8(i8 [[V]]) @@ -46,10 +46,10 @@ define i1 @src_add_eq_p2_fail_multiuse(i8 %x, i8 %yy) { define i1 @src_xor_ne_zero(i8 %x, i8 %yy) { ; CHECK-LABEL: @src_xor_ne_zero( ; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[YY:%.*]] -; CHECK-NEXT: [[Y:%.*]] = and i8 [[NY]], [[YY]] -; CHECK-NEXT: [[X1:%.*]] = xor i8 [[Y]], [[X:%.*]] +; CHECK-NEXT: [[Y:%.*]] = and i8 [[YY]], [[NY]] +; CHECK-NEXT: [[X1:%.*]] = xor i8 [[X:%.*]], [[Y]] ; CHECK-NEXT: call void @use.i8(i8 [[X1]]) -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[Y]], [[X]] +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[TMP1]], [[Y]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -65,9 +65,9 @@ define i1 @src_xor_ne_zero(i8 
%x, i8 %yy) { define i1 @src_xor_ne_zero_fail_different_p2(i8 %x, i8 %yy) { ; CHECK-LABEL: @src_xor_ne_zero_fail_different_p2( ; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[YY:%.*]] -; CHECK-NEXT: [[Y:%.*]] = and i8 [[NY]], [[YY]] +; CHECK-NEXT: [[Y:%.*]] = and i8 [[YY]], [[NY]] ; CHECK-NEXT: [[Y2:%.*]] = shl i8 [[Y]], 1 -; CHECK-NEXT: [[X1:%.*]] = xor i8 [[Y]], [[X:%.*]] +; CHECK-NEXT: [[X1:%.*]] = xor i8 [[X:%.*]], [[Y]] ; CHECK-NEXT: call void @use.i8(i8 [[X1]]) ; CHECK-NEXT: [[V:%.*]] = and i8 [[X1]], [[Y2]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[V]], 0 @@ -86,10 +86,10 @@ define i1 @src_xor_ne_zero_fail_different_p2(i8 %x, i8 %yy) { define <2 x i1> @src_sub_ne_p2(<2 x i8> %x, <2 x i8> %yy) { ; CHECK-LABEL: @src_sub_ne_p2( ; CHECK-NEXT: [[NY:%.*]] = sub <2 x i8> zeroinitializer, [[YY:%.*]] -; CHECK-NEXT: [[Y:%.*]] = and <2 x i8> [[NY]], [[YY]] +; CHECK-NEXT: [[Y:%.*]] = and <2 x i8> [[YY]], [[NY]] ; CHECK-NEXT: [[X1:%.*]] = sub <2 x i8> [[X:%.*]], [[Y]] ; CHECK-NEXT: call void @use.v2i8(<2 x i8> [[X1]]) -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[Y]], [[X]] +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[X]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i8> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[R]] ; @@ -107,7 +107,7 @@ define <2 x i1> @src_sub_eq_zero(<2 x i8> %x, <2 x i8> %yy) { ; CHECK-NEXT: [[Y:%.*]] = shl <2 x i8> , [[YY:%.*]] ; CHECK-NEXT: [[X1:%.*]] = sub <2 x i8> [[X:%.*]], [[Y]] ; CHECK-NEXT: call void @use.v2i8(<2 x i8> [[X1]]) -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[Y]], [[X]] +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[X]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i8> [[TMP1]], [[Y]] ; CHECK-NEXT: ret <2 x i1> [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll b/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll index 0aace5f52c96c9..46fd96193909da 100644 --- a/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll +++ b/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll @@ -137,7 +137,7 @@ 
define i1 @src_is_mask_or(i8 %x_in, i8 %y) { define i1 @src_is_mask_xor(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_xor( ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[MASK:%.*]] = xor i8 [[Y_M1]], [[Y]] +; CHECK-NEXT: [[MASK:%.*]] = xor i8 [[Y]], [[Y_M1]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP1]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] @@ -153,7 +153,7 @@ define i1 @src_is_mask_xor(i8 %x_in, i8 %y) { define i1 @src_is_mask_xor_fail_notmask(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_xor_fail_notmask( ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[NOTMASK:%.*]] = xor i8 [[TMP1]], [[Y]] +; CHECK-NEXT: [[NOTMASK:%.*]] = xor i8 [[Y]], [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = xor i8 [[X_IN:%.*]], -124 ; CHECK-NEXT: [[TMP3:%.*]] = or i8 [[NOTMASK]], [[TMP2]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[TMP3]], -1 @@ -171,7 +171,7 @@ define i1 @src_is_mask_xor_fail_notmask(i8 %x_in, i8 %y) { define i1 @src_is_mask_select(i8 %x_in, i8 %y, i1 %cond) { ; CHECK-LABEL: @src_is_mask_select( ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] +; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y]], [[Y_M1]] ; CHECK-NEXT: [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15 ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP1]], [[MASK]] @@ -191,7 +191,7 @@ define i1 @src_is_mask_select_fail_wrong_pattern(i8 %x_in, i8 %y, i1 %cond, i8 % ; CHECK-LABEL: @src_is_mask_select_fail_wrong_pattern( ; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] +; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y]], [[Y_M1]] ; CHECK-NEXT: [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15 ; CHECK-NEXT: [[AND:%.*]] = and i8 [[MASK]], [[X]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[AND]], [[Z:%.*]] @@ -246,7 +246,7 @@ define 
i1 @src_is_mask_shl_lshr_fail_not_allones(i8 %x_in, i8 %y, i1 %cond) { define i1 @src_is_mask_lshr(i8 %x_in, i8 %y, i8 %z, i1 %cond) { ; CHECK-LABEL: @src_is_mask_lshr( ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] +; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y]], [[Y_M1]] ; CHECK-NEXT: [[SMASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15 ; CHECK-NEXT: [[MASK:%.*]] = lshr i8 [[SMASK]], [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 @@ -266,7 +266,7 @@ define i1 @src_is_mask_lshr(i8 %x_in, i8 %y, i8 %z, i1 %cond) { define i1 @src_is_mask_ashr(i8 %x_in, i8 %y, i8 %z, i1 %cond) { ; CHECK-LABEL: @src_is_mask_ashr( ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] +; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y]], [[Y_M1]] ; CHECK-NEXT: [[SMASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15 ; CHECK-NEXT: [[MASK:%.*]] = ashr i8 [[SMASK]], [[Z:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 @@ -302,7 +302,7 @@ define i1 @src_is_mask_p2_m1(i8 %x_in, i8 %y) { define i1 @src_is_mask_umax(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_umax( ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] +; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y]], [[Y_M1]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.umax.i8(i8 [[YMASK]], i8 3) ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP1]], [[MASK]] @@ -321,7 +321,7 @@ define i1 @src_is_mask_umax(i8 %x_in, i8 %y) { define i1 @src_is_mask_umin(i8 %x_in, i8 %y, i8 %z) { ; CHECK-LABEL: @src_is_mask_umin( ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] +; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y]], [[Y_M1]] ; CHECK-NEXT: [[ZMASK:%.*]] = lshr i8 15, [[Z:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.umin.i8(i8 [[YMASK]], i8 [[ZMASK]]) ; CHECK-NEXT: [[TMP1:%.*]] 
= xor i8 [[X_IN:%.*]], 123 @@ -342,7 +342,7 @@ define i1 @src_is_mask_umin(i8 %x_in, i8 %y, i8 %z) { define i1 @src_is_mask_umin_fail_mismatch(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_umin_fail_mismatch( ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] +; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y]], [[Y_M1]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.umin.i8(i8 [[YMASK]], i8 -32) ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], -124 ; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[MASK]], [[TMP1]] @@ -362,7 +362,7 @@ define i1 @src_is_mask_umin_fail_mismatch(i8 %x_in, i8 %y) { define i1 @src_is_mask_smax(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_smax( ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] +; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y]], [[Y_M1]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.smax.i8(i8 [[YMASK]], i8 -1) ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[TMP1]], [[MASK]] @@ -381,7 +381,7 @@ define i1 @src_is_mask_smax(i8 %x_in, i8 %y) { define i1 @src_is_mask_smin(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_smin( ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] +; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y]], [[Y_M1]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.smin.i8(i8 [[YMASK]], i8 0) ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[TMP1]], [[MASK]] @@ -456,7 +456,7 @@ define i1 @src_is_notmask_x_xor_neg_x(i8 %x_in, i8 %y, i1 %cond) { ; CHECK-LABEL: @src_is_notmask_x_xor_neg_x( ; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = xor i8 [[TMP1]], [[Y]] +; CHECK-NEXT: [[TMP2:%.*]] = xor i8 [[Y]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i8 [[TMP2]], i8 7 ; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], 
[[TMP3]] ; CHECK-NEXT: ret i1 [[R]] @@ -474,7 +474,7 @@ define i1 @src_is_notmask_x_xor_neg_x_inv(i8 %x_in, i8 %y, i1 %cond) { ; CHECK-LABEL: @src_is_notmask_x_xor_neg_x_inv( ; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = xor i8 [[TMP1]], [[Y]] +; CHECK-NEXT: [[TMP2:%.*]] = xor i8 [[Y]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i8 [[TMP2]], i8 7 ; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[TMP3]] ; CHECK-NEXT: ret i1 [[R]] @@ -590,7 +590,7 @@ define i1 @src_is_notmask_neg_p2_fail_not_invertable(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_notmask_neg_p2_fail_not_invertable( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], -124 ; CHECK-NEXT: [[TMP2:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = or i8 [[TMP2]], [[Y]] +; CHECK-NEXT: [[TMP3:%.*]] = or i8 [[Y]], [[TMP2]] ; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[TMP1]], [[TMP3]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -607,7 +607,7 @@ define i1 @src_is_notmask_xor_fail(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_notmask_xor_fail( ; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[NOTMASK_REV:%.*]] = xor i8 [[TMP1]], [[Y]] +; CHECK-NEXT: [[NOTMASK_REV:%.*]] = xor i8 [[Y]], [[TMP1]] ; CHECK-NEXT: [[NOTMASK:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[NOTMASK_REV]]) ; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[NOTMASK]] ; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[AND]], [[X]] @@ -675,7 +675,7 @@ define i1 @src_x_and_mask_slt(i8 %x, i8 %y, i1 %cond) { ; CHECK-NEXT: [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0 ; CHECK-NEXT: [[MASK_POS:%.*]] = icmp sgt i8 [[MASK]], -1 ; CHECK-NEXT: call void @llvm.assume(i1 [[MASK_POS]]) -; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[MASK]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp sgt i8 [[X:%.*]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %mask0 = lshr i8 -1, %y @@ -693,7 +693,7 @@ define i1 @src_x_and_mask_sge(i8 %x, i8 %y, i1 
%cond) { ; CHECK-NEXT: [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0 ; CHECK-NEXT: [[MASK_POS:%.*]] = icmp sgt i8 [[MASK]], -1 ; CHECK-NEXT: call void @llvm.assume(i1 [[MASK_POS]]) -; CHECK-NEXT: [[R:%.*]] = icmp sge i8 [[MASK]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X:%.*]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %mask0 = lshr i8 -1, %y @@ -709,7 +709,7 @@ define i1 @src_x_and_mask_slt_fail_maybe_neg(i8 %x, i8 %y, i1 %cond) { ; CHECK-LABEL: @src_x_and_mask_slt_fail_maybe_neg( ; CHECK-NEXT: [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[MASK]], [[X:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], [[MASK]] ; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[AND]], [[X]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -724,7 +724,7 @@ define i1 @src_x_and_mask_sge_fail_maybe_neg(i8 %x, i8 %y, i1 %cond) { ; CHECK-LABEL: @src_x_and_mask_sge_fail_maybe_neg( ; CHECK-NEXT: [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[MASK]], [[X:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], [[MASK]] ; CHECK-NEXT: [[R:%.*]] = icmp sge i8 [[AND]], [[X]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -821,7 +821,7 @@ define i1 @src_x_and_nmask_slt_fail_maybe_z(i8 %x, i8 %y, i1 %cond) { ; CHECK-LABEL: @src_x_and_nmask_slt_fail_maybe_z( ; CHECK-NEXT: [[NOT_MASK0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[NOT_MASK:%.*]] = select i1 [[COND:%.*]], i8 [[NOT_MASK0]], i8 0 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[NOT_MASK]], [[X:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], [[NOT_MASK]] ; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[AND]], [[NOT_MASK]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -836,7 +836,7 @@ define i1 @src_x_and_nmask_sge_fail_maybe_z(i8 %x, i8 %y, i1 %cond) { ; CHECK-LABEL: @src_x_and_nmask_sge_fail_maybe_z( ; CHECK-NEXT: [[NOT_MASK0:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: 
[[NOT_MASK:%.*]] = select i1 [[COND:%.*]], i8 [[NOT_MASK0]], i8 0 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[NOT_MASK]], [[X:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], [[NOT_MASK]] ; CHECK-NEXT: [[R:%.*]] = icmp sge i8 [[AND]], [[NOT_MASK]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -874,7 +874,7 @@ define i1 @src_x_or_mask_ne(i8 %x, i8 %y, i1 %cond) { ; CHECK-LABEL: @src_x_or_mask_ne( ; CHECK-NEXT: [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0 -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[MASK]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X:%.*]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %mask0 = lshr i8 -1, %y diff --git a/llvm/test/Transforms/InstCombine/icmp-and-shift.ll b/llvm/test/Transforms/InstCombine/icmp-and-shift.ll index 2f797d726afe32..684ece21b1166e 100644 --- a/llvm/test/Transforms/InstCombine/icmp-and-shift.ll +++ b/llvm/test/Transforms/InstCombine/icmp-and-shift.ll @@ -496,7 +496,7 @@ define i1 @eq_and_lshr_minval_commute(i8 %px, i8 %y) { define i1 @eq_and_shl_two(i8 %x, i8 %y) { ; CHECK-LABEL: @eq_and_shl_two( ; CHECK-NEXT: [[POW2_OR_ZERO:%.*]] = shl i8 2, [[Y:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and i8 [[POW2_OR_ZERO]], [[X:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], [[POW2_OR_ZERO]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[AND]], [[POW2_OR_ZERO]] ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -510,7 +510,7 @@ define i1 @eq_and_shl_two(i8 %x, i8 %y) { define i1 @slt_and_shl_one(i8 %x, i8 %y) { ; CHECK-LABEL: @slt_and_shl_one( ; CHECK-NEXT: [[POW2:%.*]] = shl nuw i8 1, [[Y:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and i8 [[POW2]], [[X:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], [[POW2]] ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[AND]], [[POW2]] ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -609,7 +609,7 @@ define i1 @fold_ne_rhs_fail_shift_not_1s(i8 %x, i8 %yy) { define i1 @test_shr_and_1_ne_0(i32 %a, i32 %b) { ; CHECK-LABEL: @test_shr_and_1_ne_0( ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 1, [[B:%.*]] -; 
CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -710,7 +710,7 @@ define i1 @test_const_shr_and_1_ne_0_multi_use_lshr_negative(i32 %b) { ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 42, [[B:%.*]] ; CHECK-NEXT: [[AND:%.*]] = and i32 [[SHR]], 1 ; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[AND]], 0 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[SHR]], [[B]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[B]], [[SHR]] ; CHECK-NEXT: [[RET:%.*]] = and i1 [[CMP1]], [[CMP2]] ; CHECK-NEXT: ret i1 [[RET]] ; @@ -727,7 +727,7 @@ define i1 @test_const_shr_and_1_ne_0_multi_use_and_negative(i32 %b) { ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 42, [[B:%.*]] ; CHECK-NEXT: [[AND:%.*]] = and i32 [[SHR]], 1 ; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[AND]], 0 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[AND]], [[B]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[B]], [[AND]] ; CHECK-NEXT: [[RET:%.*]] = and i1 [[CMP1]], [[CMP2]] ; CHECK-NEXT: ret i1 [[RET]] ; diff --git a/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll b/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll index a595ddb07db566..76f8c926e9bec9 100644 --- a/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll +++ b/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll @@ -102,7 +102,7 @@ define i1 @test60_addrspacecast_larger(ptr addrspace(1) %foo, i32 %i, i16 %j) { ; CHECK-LABEL: @test60_addrspacecast_larger( ; CHECK-NEXT: [[I_TR:%.*]] = trunc i32 [[I:%.*]] to i16 ; CHECK-NEXT: [[TMP1:%.*]] = shl i16 [[I_TR]], 2 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[TMP1]], [[J:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i16 [[J:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %bit = addrspacecast ptr addrspace(1) %foo to ptr addrspace(2) diff --git a/llvm/test/Transforms/InstCombine/icmp-equality-rotate.ll b/llvm/test/Transforms/InstCombine/icmp-equality-rotate.ll index 30c97a7f25275e..154958b0e3fada 100644 --- 
a/llvm/test/Transforms/InstCombine/icmp-equality-rotate.ll +++ b/llvm/test/Transforms/InstCombine/icmp-equality-rotate.ll @@ -7,7 +7,7 @@ declare void @use.i8(i8) define i1 @cmpeq_rorr_to_rorl(i8 %x, i8 %C) { ; CHECK-LABEL: @cmpeq_rorr_to_rorl( ; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.fshl.i8(i8 [[X:%.*]], i8 [[X]], i8 [[C:%.*]]) -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP1]], [[X]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[X]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %x_rorr = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 %C) @@ -65,7 +65,7 @@ define i1 @cmpne_rorr_rorr(i8 %x, i8 %C0, i8 %C1) { ; CHECK-LABEL: @cmpne_rorr_rorr( ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 [[C0:%.*]], [[C1:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.fshl.i8(i8 [[X:%.*]], i8 [[X]], i8 [[TMP1]]) -; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[TMP2]], [[X]] +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[X]], [[TMP2]] ; CHECK-NEXT: ret i1 [[R]] ; %x_rorr0 = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 %C0) @@ -78,7 +78,7 @@ define i1 @cmpne_rorrX_rorrY(i8 %x, i8 %y, i8 %C0, i8 %C1) { ; CHECK-LABEL: @cmpne_rorrX_rorrY( ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 [[C0:%.*]], [[C1:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.fshr.i8(i8 [[X:%.*]], i8 [[X]], i8 [[TMP1]]) -; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i1 [[R]] ; %x_rorr0 = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 %C0) @@ -135,7 +135,7 @@ define i1 @cmpeq_rorlXC_rorlYC_multiuse1(i8 %x, i8 %y) { ; CHECK-NEXT: [[Y_RORL1:%.*]] = call i8 @llvm.fshl.i8(i8 [[Y:%.*]], i8 [[Y]], i8 3) ; CHECK-NEXT: call void @use.i8(i8 [[Y_RORL1]]) ; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.fshl.i8(i8 [[X:%.*]], i8 [[X]], i8 3) -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP1]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %x_rorl0 = call i8 @llvm.fshl.i8(i8 %x, i8 %x, i8 6) diff --git a/llvm/test/Transforms/InstCombine/icmp-equality-xor.ll 
b/llvm/test/Transforms/InstCombine/icmp-equality-xor.ll index e8a78df6d5f756..b8e8ed0eaf1da6 100644 --- a/llvm/test/Transforms/InstCombine/icmp-equality-xor.ll +++ b/llvm/test/Transforms/InstCombine/icmp-equality-xor.ll @@ -84,7 +84,7 @@ define i1 @cmpeq_xor_cst1_multiuse(i32 %a, i32 %b) { define i1 @cmpeq_xor_cst1_commuted(i32 %a, i32 %b) { ; CHECK-LABEL: @cmpeq_xor_cst1_commuted( ; CHECK-NEXT: [[B2:%.*]] = mul i32 [[B:%.*]], [[B]] -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B2]], [[A:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], [[B2]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 10 ; CHECK-NEXT: ret i1 [[CMP]] ; diff --git a/llvm/test/Transforms/InstCombine/icmp-ext-ext.ll b/llvm/test/Transforms/InstCombine/icmp-ext-ext.ll index 7fc42c65d758b3..1f012d82bc23f4 100644 --- a/llvm/test/Transforms/InstCombine/icmp-ext-ext.ll +++ b/llvm/test/Transforms/InstCombine/icmp-ext-ext.ll @@ -39,7 +39,7 @@ define i1 @zext_zext_eq(i8 %x, i8 %y) { define i1 @zext_zext_sle_op0_narrow(i8 %x, i16 %y) { ; CHECK-LABEL: @zext_zext_sle_op0_narrow( ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i16 -; CHECK-NEXT: [[C:%.*]] = icmp ule i16 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp uge i16 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %a = zext i8 %x to i32 @@ -51,7 +51,7 @@ define i1 @zext_zext_sle_op0_narrow(i8 %x, i16 %y) { define i1 @zext_zext_ule_op0_wide(i9 %x, i8 %y) { ; CHECK-LABEL: @zext_zext_ule_op0_wide( ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i9 -; CHECK-NEXT: [[C:%.*]] = icmp uge i9 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ule i9 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %a = zext i9 %x to i32 @@ -96,7 +96,7 @@ define i1 @sext_sext_ne(i8 %x, i8 %y) { define i1 @sext_sext_sge_op0_narrow(i5 %x, i8 %y) { ; CHECK-LABEL: @sext_sext_sge_op0_narrow( ; CHECK-NEXT: [[TMP1:%.*]] = sext i5 [[X:%.*]] to i8 -; CHECK-NEXT: [[C:%.*]] = icmp sge i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp sle i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: 
ret i1 [[C]] ; %a = sext i5 %x to i32 @@ -108,7 +108,7 @@ define i1 @sext_sext_sge_op0_narrow(i5 %x, i8 %y) { define <2 x i1> @sext_sext_uge_op0_wide(<2 x i16> %x, <2 x i8> %y) { ; CHECK-LABEL: @sext_sext_uge_op0_wide( ; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i8> [[Y:%.*]] to <2 x i16> -; CHECK-NEXT: [[C:%.*]] = icmp ule <2 x i16> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp uge <2 x i16> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[C]] ; %a = sext <2 x i16> %x to <2 x i32> @@ -208,7 +208,7 @@ define i1 @zext_sext_sle_op0_narrow(i8 %x, i16 %y) { define i1 @zext_nneg_sext_sle_op0_narrow(i8 %x, i16 %y) { ; CHECK-LABEL: @zext_nneg_sext_sle_op0_narrow( ; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i16 -; CHECK-NEXT: [[C:%.*]] = icmp sle i16 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp sge i16 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %a = zext nneg i8 %x to i32 @@ -233,7 +233,7 @@ define i1 @zext_sext_ule_op0_wide(i9 %x, i8 %y) { define i1 @zext_nneg_sext_ule_op0_wide(i9 %x, i8 %y) { ; CHECK-LABEL: @zext_nneg_sext_ule_op0_wide( ; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[Y:%.*]] to i9 -; CHECK-NEXT: [[C:%.*]] = icmp uge i9 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ule i9 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %a = zext nneg i9 %x to i32 @@ -333,7 +333,7 @@ define i1 @sext_zext_sge_op0_narrow(i5 %x, i8 %y) { define i1 @sext_zext_nneg_sge_op0_narrow(i5 %x, i8 %y) { ; CHECK-LABEL: @sext_zext_nneg_sge_op0_narrow( ; CHECK-NEXT: [[TMP1:%.*]] = sext i5 [[X:%.*]] to i8 -; CHECK-NEXT: [[C:%.*]] = icmp sge i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp sle i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %a = sext i5 %x to i32 @@ -359,7 +359,7 @@ define i1 @sext_zext_uge_op0_wide(i16 %x, i8 %y) { define i1 @sext_zext_nneg_uge_op0_wide(i16 %x, i8 %y) { ; CHECK-LABEL: @sext_zext_nneg_uge_op0_wide( ; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[Y:%.*]] to i16 -; CHECK-NEXT: [[C:%.*]] = icmp ule i16 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: 
[[C:%.*]] = icmp uge i16 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %a = sext i16 %x to i32 @@ -411,7 +411,7 @@ define i1 @zext_sext_sle_known_nonneg_op0_narrow(i8 %x, i16 %y) { ; CHECK-LABEL: @zext_sext_sle_known_nonneg_op0_narrow( ; CHECK-NEXT: [[N:%.*]] = and i8 [[X:%.*]], 12 ; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i8 [[N]] to i16 -; CHECK-NEXT: [[C:%.*]] = icmp sle i16 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp sge i16 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %n = and i8 %x, 12 @@ -438,7 +438,7 @@ define i1 @zext_sext_ule_known_nonneg_op0_wide(i9 %x, i8 %y) { define i1 @sext_zext_slt_known_nonneg(i8 %x, i8 %y) { ; CHECK-LABEL: @sext_zext_slt_known_nonneg( ; CHECK-NEXT: [[N:%.*]] = and i8 [[Y:%.*]], 126 -; CHECK-NEXT: [[C:%.*]] = icmp sgt i8 [[N]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp slt i8 [[X:%.*]], [[N]] ; CHECK-NEXT: ret i1 [[C]] ; %a = sext i8 %x to i32 @@ -451,7 +451,7 @@ define i1 @sext_zext_slt_known_nonneg(i8 %x, i8 %y) { define i1 @sext_zext_ult_known_nonneg(i8 %x, i8 %y) { ; CHECK-LABEL: @sext_zext_ult_known_nonneg( ; CHECK-NEXT: [[N:%.*]] = lshr i8 [[Y:%.*]], 6 -; CHECK-NEXT: [[C:%.*]] = icmp ugt i8 [[N]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ult i8 [[X:%.*]], [[N]] ; CHECK-NEXT: ret i1 [[C]] ; %a = sext i8 %x to i32 @@ -464,7 +464,7 @@ define i1 @sext_zext_ult_known_nonneg(i8 %x, i8 %y) { define i1 @sext_zext_ne_known_nonneg(i8 %x, i8 %y) { ; CHECK-LABEL: @sext_zext_ne_known_nonneg( ; CHECK-NEXT: [[N:%.*]] = udiv i8 [[Y:%.*]], 6 -; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[N]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[X:%.*]], [[N]] ; CHECK-NEXT: ret i1 [[C]] ; %a = sext i8 %x to i32 @@ -492,7 +492,7 @@ define i1 @sext_zext_uge_known_nonneg_op0_wide(i16 %x, i8 %y) { ; CHECK-LABEL: @sext_zext_uge_known_nonneg_op0_wide( ; CHECK-NEXT: [[N:%.*]] = and i8 [[Y:%.*]], 12 ; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i8 [[N]] to i16 -; CHECK-NEXT: [[C:%.*]] = icmp ule i16 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp uge 
i16 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %a = sext i16 %x to i32 diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll index ce64ab1c6305a4..01bee5a0f9cbd5 100644 --- a/llvm/test/Transforms/InstCombine/icmp-gep.ll +++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll @@ -329,7 +329,7 @@ define i1 @test60_as1(ptr addrspace(1) %foo, i64 %i, i64 %j) { define i1 @test60_addrspacecast(ptr %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test60_addrspacecast( ; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[I:%.*]], 2 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[GEP1_IDX]], [[J:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[J:%.*]], [[GEP1_IDX]] ; CHECK-NEXT: ret i1 [[CMP]] ; %bit = addrspacecast ptr %foo to ptr addrspace(3) @@ -359,7 +359,7 @@ define i1 @test60_addrspacecast_larger(ptr addrspace(1) %foo, i32 %i, i16 %j) { ; CHECK-LABEL: @test60_addrspacecast_larger( ; CHECK-NEXT: [[I_TR:%.*]] = trunc i32 [[I:%.*]] to i16 ; CHECK-NEXT: [[TMP1:%.*]] = shl i16 [[I_TR]], 2 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[TMP1]], [[J:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i16 [[J:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %bit = addrspacecast ptr addrspace(1) %foo to ptr addrspace(2) @@ -515,10 +515,10 @@ define i1 @test_scalable_xy(ptr %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test_scalable_xy( ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 4 -; CHECK-NEXT: [[GEP1_IDX:%.*]] = mul nsw i64 [[TMP2]], [[I:%.*]] +; CHECK-NEXT: [[GEP1_IDX:%.*]] = mul nsw i64 [[I:%.*]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 2 -; CHECK-NEXT: [[GEP2_IDX:%.*]] = mul nsw i64 [[TMP4]], [[J:%.*]] +; CHECK-NEXT: [[GEP2_IDX:%.*]] = mul nsw i64 [[J:%.*]], [[TMP4]] ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[GEP2_IDX]], [[GEP1_IDX]] ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -534,10 +534,10 @@ define i1 @test_scalable_ij(ptr %foo, i64 
%i, i64 %j) { ; CHECK-LABEL: @test_scalable_ij( ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 4 -; CHECK-NEXT: [[GEP1_IDX:%.*]] = mul nsw i64 [[TMP2]], [[I:%.*]] +; CHECK-NEXT: [[GEP1_IDX:%.*]] = mul nsw i64 [[I:%.*]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 2 -; CHECK-NEXT: [[GEP2_IDX:%.*]] = mul nsw i64 [[TMP4]], [[J:%.*]] +; CHECK-NEXT: [[GEP2_IDX:%.*]] = mul nsw i64 [[J:%.*]], [[TMP4]] ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[GEP1_IDX]], [[GEP2_IDX]] ; CHECK-NEXT: ret i1 [[CMP]] ; diff --git a/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll b/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll index aa23a6d27f69b7..07536f271ceb19 100644 --- a/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll +++ b/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll @@ -16,7 +16,7 @@ define i32 @sterix(i32, i8, i64) { ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp ult i64 [[MUL3]], 4294967296 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[LOR_RHS:%.*]], label [[LOR_END:%.*]] ; CHECK: lor.rhs: -; CHECK-NEXT: [[AND:%.*]] = and i64 [[MUL3]], [[TMP2]] +; CHECK-NEXT: [[AND:%.*]] = and i64 [[TMP2]], [[MUL3]] ; CHECK-NEXT: [[TOBOOL7_NOT:%.*]] = icmp eq i64 [[AND]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TOBOOL7_NOT]] to i32 ; CHECK-NEXT: br label [[LOR_END]] @@ -128,12 +128,12 @@ define i1 @PR46561(i1 %a, i1 %x, i1 %y, i8 %z) { ; CHECK-NEXT: br i1 [[A:%.*]], label [[COND_TRUE:%.*]], label [[END:%.*]] ; CHECK: cond.true: ; CHECK-NEXT: [[MULBOOL:%.*]] = and i1 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i8 [[Z:%.*]] to i1 -; CHECK-NEXT: [[TMP2:%.*]] = xor i1 [[MULBOOL]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true +; CHECK-NEXT: [[TMP0:%.*]] = trunc i8 [[Z:%.*]] to i1 +; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[MULBOOL]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = xor i1 [[TMP1]], true ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: 
[[P:%.*]] = phi i1 [ [[TMP3]], [[COND_TRUE]] ], [ false, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[P:%.*]] = phi i1 [ [[TMP2]], [[COND_TRUE]] ], [ false, [[ENTRY:%.*]] ] ; CHECK-NEXT: ret i1 [[P]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/icmp-mul.ll b/llvm/test/Transforms/InstCombine/icmp-mul.ll index 12c77367b10f78..3ba21abb069ba7 100644 --- a/llvm/test/Transforms/InstCombine/icmp-mul.ll +++ b/llvm/test/Transforms/InstCombine/icmp-mul.ll @@ -1111,7 +1111,7 @@ define i1 @mul_xy_z_assumeodd_eq(i8 %x, i8 %y, i8 %z) { define <2 x i1> @reused_mul_nsw_xy_z_setnonzero_vec_ne(<2 x i8> %x, <2 x i8> %y, <2 x i8> %zi) { ; CHECK-LABEL: @reused_mul_nsw_xy_z_setnonzero_vec_ne( ; CHECK-NEXT: [[Z:%.*]] = or <2 x i8> [[ZI:%.*]], -; CHECK-NEXT: [[MULY:%.*]] = mul nsw <2 x i8> [[Z]], [[Y:%.*]] +; CHECK-NEXT: [[MULY:%.*]] = mul nsw <2 x i8> [[Y:%.*]], [[Z]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i8> [[Y]], [[X:%.*]] ; CHECK-NEXT: call void @usev2xi8(<2 x i8> [[MULY]]) ; CHECK-NEXT: ret <2 x i1> [[CMP]] @@ -1127,8 +1127,8 @@ define <2 x i1> @reused_mul_nsw_xy_z_setnonzero_vec_ne(<2 x i8> %x, <2 x i8> %y, define i1 @mul_mixed_nuw_nsw_xy_z_setodd_ult(i8 %x, i8 %y, i8 %zi) { ; CHECK-LABEL: @mul_mixed_nuw_nsw_xy_z_setodd_ult( ; CHECK-NEXT: [[Z:%.*]] = or i8 [[ZI:%.*]], 1 -; CHECK-NEXT: [[MULX:%.*]] = mul nsw i8 [[Z]], [[X:%.*]] -; CHECK-NEXT: [[MULY:%.*]] = mul nuw nsw i8 [[Z]], [[Y:%.*]] +; CHECK-NEXT: [[MULX:%.*]] = mul nsw i8 [[X:%.*]], [[Z]] +; CHECK-NEXT: [[MULY:%.*]] = mul nuw nsw i8 [[Y:%.*]], [[Z]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[MULX]], [[MULY]] ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -1212,7 +1212,7 @@ define i1 @reused_mul_nuw_xy_z_selectnonzero_ugt(i8 %x, i8 %y, i8 %z) { define <2 x i1> @mul_mixed_nsw_nuw_xy_z_setnonzero_vec_ule(<2 x i8> %x, <2 x i8> %y, <2 x i8> %zi) { ; CHECK-LABEL: @mul_mixed_nsw_nuw_xy_z_setnonzero_vec_ule( ; CHECK-NEXT: [[Z:%.*]] = or <2 x i8> [[ZI:%.*]], -; CHECK-NEXT: [[MULX:%.*]] = mul nuw <2 x i8> [[Z]], [[X:%.*]] +; CHECK-NEXT: 
[[MULX:%.*]] = mul nuw <2 x i8> [[X:%.*]], [[Z]] ; CHECK-NEXT: [[MULY:%.*]] = mul nsw <2 x i8> [[Z]], [[Y:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ule <2 x i8> [[MULY]], [[MULX]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] diff --git a/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll b/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll index 75badabda01aeb..09c9c1ebc83159 100644 --- a/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll +++ b/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll @@ -228,7 +228,7 @@ define i1 @icmp_sle_negx_y_fail_maybe_zero(i8 %x, i8 %y) { define i1 @icmp_eq_x_invertable_y_todo(i8 %x, i1 %y) { ; CHECK-LABEL: @icmp_eq_x_invertable_y_todo( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[Y:%.*]], i8 -8, i8 -25 -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[R]] ; @@ -253,9 +253,9 @@ define i1 @icmp_eq_x_invertable_y(i8 %x, i8 %y) { define i1 @icmp_eq_x_invertable_y_fail_multiuse(i8 %x, i8 %y) { ; CHECK-LABEL: @icmp_eq_x_invertable_y_fail_multiuse( ; CHECK-NEXT: [[YY:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[YY]], [[X:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], [[YY]] ; CHECK-NEXT: call void @use.i8(i8 [[AND]]) -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[AND]], [[X]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[X]], [[AND]] ; CHECK-NEXT: ret i1 [[R]] ; %yy = xor i8 %y, -1 @@ -268,7 +268,7 @@ define i1 @icmp_eq_x_invertable_y_fail_multiuse(i8 %x, i8 %y) { define i1 @icmp_eq_x_invertable_y2_todo(i8 %x, i1 %y) { ; CHECK-LABEL: @icmp_eq_x_invertable_y2_todo( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[Y:%.*]], i8 -8, i8 -25 -; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP2]], -1 ; CHECK-NEXT: ret i1 [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll 
b/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll index 7ff111c42a9e06..93eeab4732185b 100644 --- a/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll +++ b/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll @@ -95,7 +95,7 @@ define i1 @or_eq_notY_eq_0(i8 %x, i8 %y) { define i1 @or_eq_notY_eq_0_fail_multiuse(i8 %x, i8 %y) { ; CHECK-LABEL: @or_eq_notY_eq_0_fail_multiuse( ; CHECK-NEXT: [[NY:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i8 [[NY]], [[X:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i8 [[X:%.*]], [[NY]] ; CHECK-NEXT: call void @use.i8(i8 [[OR]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[OR]], [[NY]] ; CHECK-NEXT: ret i1 [[CMP]] @@ -122,7 +122,7 @@ define i1 @or_ne_notY_eq_1s(i8 %x, i8 %y) { define i1 @or_ne_notY_eq_1s_fail_bad_not(i8 %x, i8 %y) { ; CHECK-LABEL: @or_ne_notY_eq_1s_fail_bad_not( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = or i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[TMP2]], -1 ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -307,7 +307,7 @@ define i1 @or_simplify_uge(i8 %y_in, i8 %rhs_in, i1 %c) { define i1 @or_simplify_ule_fail(i8 %y_in, i8 %rhs_in) { ; CHECK-LABEL: @or_simplify_ule_fail( ; CHECK-NEXT: [[RHS:%.*]] = and i8 [[RHS_IN:%.*]], 127 -; CHECK-NEXT: [[Y:%.*]] = or i8 [[RHS]], [[Y_IN:%.*]] +; CHECK-NEXT: [[Y:%.*]] = or i8 [[Y_IN:%.*]], [[RHS]] ; CHECK-NEXT: [[LBO:%.*]] = or i8 [[Y]], 64 ; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[LBO]], [[RHS]] ; CHECK-NEXT: ret i1 [[R]] @@ -352,7 +352,7 @@ define i1 @or_simplify_ult(i8 %y_in, i8 %rhs_in) { define i1 @or_simplify_ugt_fail(i8 %y_in, i8 %rhs_in) { ; CHECK-LABEL: @or_simplify_ugt_fail( ; CHECK-NEXT: [[RHS:%.*]] = or i8 [[RHS_IN:%.*]], 1 -; CHECK-NEXT: [[LBO:%.*]] = or i8 [[RHS]], [[Y_IN:%.*]] +; CHECK-NEXT: [[LBO:%.*]] = or i8 [[Y_IN:%.*]], [[RHS]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[LBO]], [[RHS]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -377,7 +377,7 @@ define i1 @pr64610(ptr %b) { define i1 
@icmp_eq_x_invertable_y2_todo(i8 %x, i1 %y, i8 %z) { ; CHECK-LABEL: @icmp_eq_x_invertable_y2_todo( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[Y:%.*]], i8 -8, i8 [[Z:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll b/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll index c0cd3e775f68ac..f2a02fac90b17c 100644 --- a/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll +++ b/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll @@ -50,7 +50,7 @@ define i1 @icmp_trunc_x_trunc_y_illegal_trunc_to_legal_anyways(i123 %x, i32 %y) ; CHECK-NEXT: call void @llvm.assume(i1 [[X_LB_ONLY]]) ; CHECK-NEXT: call void @llvm.assume(i1 [[Y_LB_ONLY]]) ; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw nsw i123 [[X]] to i32 -; CHECK-NEXT: [[R:%.*]] = icmp eq i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i32 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %x_lb_only = icmp ult i123 %x, 65536 @@ -70,7 +70,7 @@ define i1 @icmp_trunc_x_trunc_y_2_illegal_anyways(i33 %x, i63 %y) { ; CHECK-NEXT: call void @llvm.assume(i1 [[X_LB_ONLY]]) ; CHECK-NEXT: call void @llvm.assume(i1 [[Y_LB_ONLY]]) ; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i33 [[X]] to i63 -; CHECK-NEXT: [[R:%.*]] = icmp ugt i63 [[TMP1]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = icmp ult i63 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %x_lb_only = icmp ult i33 %x, 512 @@ -90,7 +90,7 @@ define i1 @icmp_trunc_x_trunc_y_3(i64 %x, i32 %y) { ; CHECK-NEXT: call void @llvm.assume(i1 [[X_LB_ONLY]]) ; CHECK-NEXT: call void @llvm.assume(i1 [[Y_LB_ONLY]]) ; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw nsw i64 [[X]] to i32 -; CHECK-NEXT: [[R:%.*]] = icmp uge i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i32 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %x_lb_only = icmp ult i64 %x, 123 @@ -152,7 +152,7 @@ define i1 @icmp_trunc_x_trunc_y_swap0(i33 
%x, i32 %y) { ; CHECK-NEXT: call void @llvm.assume(i1 [[X_LB_ONLY]]) ; CHECK-NEXT: call void @llvm.assume(i1 [[Y_LB_ONLY]]) ; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw nsw i33 [[X]] to i32 -; CHECK-NEXT: [[R:%.*]] = icmp ule i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = icmp uge i32 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %x_lb_only = icmp ult i33 %x, 65536 @@ -172,7 +172,7 @@ define i1 @icmp_trunc_x_trunc_y_swap1(i33 %x, i32 %y) { ; CHECK-NEXT: call void @llvm.assume(i1 [[X_LB_ONLY]]) ; CHECK-NEXT: call void @llvm.assume(i1 [[Y_LB_ONLY]]) ; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw nsw i33 [[X]] to i32 -; CHECK-NEXT: [[R:%.*]] = icmp uge i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i32 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %x_lb_only = icmp ult i33 %x, 65536 @@ -190,7 +190,7 @@ define i1 @icmp_trunc_x_zext_y(i32 %x, i8 %y) { ; CHECK-NEXT: [[X_LB_ONLY:%.*]] = icmp ult i32 [[X:%.*]], 65536 ; CHECK-NEXT: call void @llvm.assume(i1 [[X_LB_ONLY]]) ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[R:%.*]] = icmp ult i32 [[TMP1]], [[X]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[X]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %x_lb_only = icmp ult i32 %x, 65536 @@ -206,7 +206,7 @@ define i1 @icmp_trunc_x_zext_y_2(i32 %x, i8 %y) { ; CHECK-NEXT: [[X_LB_ONLY:%.*]] = icmp ult i32 [[X:%.*]], 65536 ; CHECK-NEXT: call void @llvm.assume(i1 [[X_LB_ONLY]]) ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[R:%.*]] = icmp uge i32 [[TMP1]], [[X]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i32 [[X]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %x_lb_only = icmp ult i32 %x, 65536 @@ -222,7 +222,7 @@ define i1 @icmp_trunc_x_zext_y_3(i6 %x, i32 %y) { ; CHECK-NEXT: [[Y_LB_ONLY:%.*]] = icmp ult i32 [[Y:%.*]], 65536 ; CHECK-NEXT: call void @llvm.assume(i1 [[Y_LB_ONLY]]) ; CHECK-NEXT: [[TMP1:%.*]] = zext i6 [[X:%.*]] to i32 -; CHECK-NEXT: [[R:%.*]] = icmp ne i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = icmp ne i32 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i1 
[[R]] ; %y_lb_only = icmp ult i32 %y, 65536 @@ -412,7 +412,7 @@ define i1 @trunc_equality_either(i16 %x, i16 %y) { define i1 @trunc_unsigned_nuw_zext(i32 %x, i8 %y) { ; CHECK-LABEL: @trunc_unsigned_nuw_zext( ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %xt = trunc nuw i32 %x to i16 @@ -437,7 +437,7 @@ define i1 @trunc_unsigned_nuw_sext(i32 %x, i8 %y) { define i1 @trunc_unsigned_nsw_zext(i32 %x, i8 %y) { ; CHECK-LABEL: @trunc_unsigned_nsw_zext( ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %xt = trunc nsw i32 %x to i16 @@ -449,7 +449,7 @@ define i1 @trunc_unsigned_nsw_zext(i32 %x, i8 %y) { define i1 @trunc_unsigned_nsw_sext(i32 %x, i8 %y) { ; CHECK-LABEL: @trunc_unsigned_nsw_sext( ; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %xt = trunc nsw i32 %x to i16 @@ -461,7 +461,7 @@ define i1 @trunc_unsigned_nsw_sext(i32 %x, i8 %y) { define i1 @trunc_signed_nsw_sext(i32 %x, i8 %y) { ; CHECK-LABEL: @trunc_signed_nsw_sext( ; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[C:%.*]] = icmp sgt i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp slt i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %xt = trunc nsw i32 %x to i16 @@ -473,7 +473,7 @@ define i1 @trunc_signed_nsw_sext(i32 %x, i8 %y) { define i1 @trunc_signed_nsw_zext(i32 %x, i8 %y) { ; CHECK-LABEL: @trunc_signed_nsw_zext( ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[C:%.*]] = icmp sgt i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp slt i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %xt = 
trunc nsw i32 %x to i16 @@ -511,7 +511,7 @@ define i1 @trunc_signed_nuw_zext(i32 %x, i8 %y) { define i1 @trunc_equality_nuw_zext(i32 %x, i8 %y) { ; CHECK-LABEL: @trunc_equality_nuw_zext( ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %xt = trunc nuw i32 %x to i16 @@ -536,7 +536,7 @@ define i1 @trunc_equality_nuw_sext(i32 %x, i8 %y) { define i1 @trunc_equality_nsw_zext(i32 %x, i8 %y) { ; CHECK-LABEL: @trunc_equality_nsw_zext( ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %xt = trunc nsw i32 %x to i16 @@ -548,7 +548,7 @@ define i1 @trunc_equality_nsw_zext(i32 %x, i8 %y) { define i1 @trunc_equality_nsw_sext(i32 %x, i8 %y) { ; CHECK-LABEL: @trunc_equality_nsw_sext( ; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %xt = trunc nsw i32 %x to i16 @@ -560,7 +560,7 @@ define i1 @trunc_equality_nsw_sext(i32 %x, i8 %y) { define i1 @trunc_equality_both_sext(i32 %x, i8 %y) { ; CHECK-LABEL: @trunc_equality_both_sext( ; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %xt = trunc nuw nsw i32 %x to i16 @@ -572,7 +572,7 @@ define i1 @trunc_equality_both_sext(i32 %x, i8 %y) { define i1 @test_eq1(i32 %x, i16 %y) { ; CHECK-LABEL: @test_eq1( ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[Y:%.*]] to i32 -; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[COND]] ; %conv1 = trunc nsw i32 %x to i8 @@ 
-586,7 +586,7 @@ define i1 @test_eq1(i32 %x, i16 %y) { define i1 @test_eq2(i32 %x, i16 %y) { ; CHECK-LABEL: @test_eq2( ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i16 -; CHECK-NEXT: [[COND:%.*]] = icmp eq i16 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[COND:%.*]] = icmp eq i16 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[COND]] ; %conv1 = trunc nsw i32 %x to i8 @@ -598,7 +598,7 @@ define i1 @test_eq2(i32 %x, i16 %y) { define i1 @test_ult(i32 %x, i16 %y) { ; CHECK-LABEL: @test_ult( ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[Y:%.*]] to i32 -; CHECK-NEXT: [[COND:%.*]] = icmp ugt i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[COND]] ; %conv1 = trunc nsw i32 %x to i8 @@ -610,7 +610,7 @@ define i1 @test_ult(i32 %x, i16 %y) { define i1 @test_slt(i32 %x, i16 %y) { ; CHECK-LABEL: @test_slt( ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[Y:%.*]] to i32 -; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[COND:%.*]] = icmp slt i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[COND]] ; %conv1 = trunc nsw i32 %x to i8 @@ -622,7 +622,7 @@ define i1 @test_slt(i32 %x, i16 %y) { define i1 @test_ult_nuw(i32 %x, i16 %y) { ; CHECK-LABEL: @test_ult_nuw( ; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[Y:%.*]] to i32 -; CHECK-NEXT: [[COND:%.*]] = icmp ugt i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[COND]] ; %conv1 = trunc nuw nsw i32 %x to i8 @@ -634,7 +634,7 @@ define i1 @test_ult_nuw(i32 %x, i16 %y) { define i1 @test_slt_nuw(i32 %x, i16 %y) { ; CHECK-LABEL: @test_slt_nuw( ; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[Y:%.*]] to i32 -; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[COND:%.*]] = icmp slt i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[COND]] ; %conv1 = trunc nuw nsw i32 %x to i8 diff --git a/llvm/test/Transforms/InstCombine/icmp-of-xor-x.ll b/llvm/test/Transforms/InstCombine/icmp-of-xor-x.ll index 
fd61c8a301662e..a4e7acbca930dc 100644 --- a/llvm/test/Transforms/InstCombine/icmp-of-xor-x.ll +++ b/llvm/test/Transforms/InstCombine/icmp-of-xor-x.ll @@ -10,7 +10,7 @@ define i1 @test_xor1(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @test_xor1( ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X:%.*]], -1 ; CHECK-NEXT: call void @use.i8(i8 [[XOR]]) -; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], [[X]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt i8 [[TMP1]], [[Z:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -27,7 +27,7 @@ define i1 @test_xor2(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @test_xor2( ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use.i8(i8 [[XOR]]) -; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X:%.*]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[TMP1]], [[Z:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -44,7 +44,7 @@ define i1 @test_xor3(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @test_xor3( ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X:%.*]], -1 ; CHECK-NEXT: call void @use.i8(i8 [[XOR]]) -; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], [[X]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt i8 [[TMP1]], [[Z:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -60,7 +60,7 @@ define i1 @test_xor3(i8 %x, i8 %y, i8 %z) { define i1 @test_xor_ne(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @test_xor_ne( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], [[X:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[TMP1]], [[Z:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[Z:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %nz = xor i8 %z, -1 @@ -73,7 +73,7 @@ define i1 @test_xor_ne(i8 %x, i8 %y, i8 %z) { define i1 @test_xor_eq(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @test_xor_eq( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], [[X:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP1]], [[Z:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[Z:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %nz = xor i8 %z, 
-1 @@ -88,7 +88,7 @@ define i1 @test_xor4(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @test_xor4( ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X:%.*]], -1 ; CHECK-NEXT: call void @use.i8(i8 [[XOR]]) -; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], [[X]] ; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[TMP1]], [[Z:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -104,7 +104,7 @@ define i1 @test_xor5(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @test_xor5( ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X:%.*]], -1 ; CHECK-NEXT: call void @use.i8(i8 [[XOR]]) -; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], [[X]] ; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP1]], [[Z:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -120,7 +120,7 @@ define i1 @test_xor6(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @test_xor6( ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X:%.*]], -1 ; CHECK-NEXT: call void @use.i8(i8 [[XOR]]) -; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], [[X]] ; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[TMP1]], [[Z:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -136,7 +136,7 @@ define i1 @test_xor7(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @test_xor7( ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X:%.*]], -1 ; CHECK-NEXT: call void @use.i8(i8 [[XOR]]) -; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], [[X]] ; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[Z:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -152,7 +152,7 @@ define i1 @test_xor8(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @test_xor8( ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X:%.*]], -1 ; CHECK-NEXT: call void @use.i8(i8 [[XOR]]) -; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], [[X]] ; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[TMP1]], [[Z:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -167,7 +167,7 @@ define i1 @test_xor8(i8 %x, i8 %y, i8 %z) { ; test (~a ^ b) < ~a define i1 
@test_slt_xor(i32 %x, i32 %y) { ; CHECK-LABEL: @test_slt_xor( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt i32 [[TMP1]], [[X]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -180,7 +180,7 @@ define i1 @test_slt_xor(i32 %x, i32 %y) { ; test (a ^ ~b) <= ~b define i1 @test_sle_xor(i32 %x, i32 %y) { ; CHECK-LABEL: @test_sle_xor( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp sge i32 [[TMP1]], [[Y]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -193,7 +193,7 @@ define i1 @test_sle_xor(i32 %x, i32 %y) { ; test ~a > (~a ^ b) define i1 @test_sgt_xor(i32 %x, i32 %y) { ; CHECK-LABEL: @test_sgt_xor( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], [[X]] ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -205,7 +205,7 @@ define i1 @test_sgt_xor(i32 %x, i32 %y) { define i1 @test_sge_xor(i32 %x, i32 %y) { ; CHECK-LABEL: @test_sge_xor( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 [[TMP1]], [[X]] ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -217,7 +217,7 @@ define i1 @test_sge_xor(i32 %x, i32 %y) { define i1 @test_ult_xor(i32 %x, i32 %y) { ; CHECK-LABEL: @test_ult_xor( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP1]], [[X]] ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -229,7 +229,7 @@ define i1 @test_ult_xor(i32 %x, i32 %y) { define i1 @test_ule_xor(i32 %x, i32 %y) { ; CHECK-LABEL: @test_ule_xor( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[TMP1]], [[X]] ; CHECK-NEXT: ret i1 
[[CMP]] ; @@ -241,7 +241,7 @@ define i1 @test_ule_xor(i32 %x, i32 %y) { define i1 @test_ugt_xor(i32 %x, i32 %y) { ; CHECK-LABEL: @test_ugt_xor( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], [[X]] ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -253,7 +253,7 @@ define i1 @test_ugt_xor(i32 %x, i32 %y) { define i1 @test_uge_xor(i32 %x, i32 %y) { ; CHECK-LABEL: @test_uge_xor( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP1]], [[X]] ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -386,7 +386,7 @@ define <2 x i1> @xor_sgt(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @xor_sgt( ; CHECK-NEXT: [[YZ:%.*]] = and <2 x i8> [[Y:%.*]], ; CHECK-NEXT: [[Y1:%.*]] = or disjoint <2 x i8> [[YZ]], -; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i8> [[Y1]], [[X:%.*]] +; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i8> [[X:%.*]], [[Y1]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt <2 x i8> [[XOR]], [[X]] ; CHECK-NEXT: ret <2 x i1> [[R]] ; @@ -401,7 +401,7 @@ define <2 x i1> @xor_sgt_fail_no_known_msb(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @xor_sgt_fail_no_known_msb( ; CHECK-NEXT: [[YZ:%.*]] = and <2 x i8> [[Y:%.*]], ; CHECK-NEXT: [[Y1:%.*]] = or disjoint <2 x i8> [[YZ]], -; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i8> [[Y1]], [[X:%.*]] +; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i8> [[X:%.*]], [[Y1]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt <2 x i8> [[XOR]], [[X]] ; CHECK-NEXT: ret <2 x i1> [[R]] ; @@ -415,7 +415,7 @@ define <2 x i1> @xor_sgt_fail_no_known_msb(<2 x i8> %x, <2 x i8> %y) { define i1 @xor_slt_2(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @xor_slt_2( ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X:%.*]], 88 -; CHECK-NEXT: [[R:%.*]] = icmp sgt i8 [[XOR]], [[X]] +; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[X]], [[XOR]] ; CHECK-NEXT: ret i1 [[R]] ; %xor = xor i8 %x, 88 diff --git 
a/llvm/test/Transforms/InstCombine/icmp-or-of-select-with-zero.ll b/llvm/test/Transforms/InstCombine/icmp-or-of-select-with-zero.ll index 90e0461f8b789e..75301ce5d72a78 100644 --- a/llvm/test/Transforms/InstCombine/icmp-or-of-select-with-zero.ll +++ b/llvm/test/Transforms/InstCombine/icmp-or-of-select-with-zero.ll @@ -271,7 +271,7 @@ define i1 @src_tv_ne_invert(i1 %c1, i8 %a, i8 %b, i8 %x, i8 %yy) { ; CHECK-NEXT: [[C0:%.*]] = xor i1 [[NOT_C0]], true ; CHECK-NEXT: [[Y:%.*]] = add nuw i8 [[YY:%.*]], 1 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[NOT_C0]], i8 [[Y]], i8 0 -; CHECK-NEXT: [[CC:%.*]] = or i1 [[C0]], [[C1:%.*]] +; CHECK-NEXT: [[CC:%.*]] = or i1 [[C1:%.*]], [[C0]] ; CHECK-NEXT: [[SEL_OTHER:%.*]] = select i1 [[CC]], i8 [[Y]], i8 [[B]] ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i8 [[X:%.*]], 0 ; CHECK-NEXT: [[R:%.*]] = or i1 [[TMP1]], [[NOT_C0]] diff --git a/llvm/test/Transforms/InstCombine/icmp-or.ll b/llvm/test/Transforms/InstCombine/icmp-or.ll index bedaf591fb070e..36b3216196f846 100644 --- a/llvm/test/Transforms/InstCombine/icmp-or.ll +++ b/llvm/test/Transforms/InstCombine/icmp-or.ll @@ -172,7 +172,7 @@ define i1 @eq_const_mask_not_same(i8 %x, i8 %y) { define i1 @eq_const_mask_wrong_opcode(i8 %x, i8 %y) { ; CHECK-LABEL: @eq_const_mask_wrong_opcode( ; CHECK-NEXT: [[B0:%.*]] = or i8 [[X:%.*]], 5 -; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[B0]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], [[B0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP1]], 5 ; CHECK-NEXT: ret i1 [[CMP]] ; diff --git a/llvm/test/Transforms/InstCombine/icmp-range.ll b/llvm/test/Transforms/InstCombine/icmp-range.ll index 9ed2f2a4860c62..8b690826a7bf9a 100644 --- a/llvm/test/Transforms/InstCombine/icmp-range.ll +++ b/llvm/test/Transforms/InstCombine/icmp-range.ll @@ -152,7 +152,7 @@ define i1 @test_two_ranges(ptr nocapture readonly %arg1, ptr nocapture readonly ; Values' ranges overlap each other, so it can not be simplified. 
define i1 @test_two_attribute_ranges(i32 range(i32 5, 10) %arg1, i32 range(i32 8, 16) %arg2) { ; CHECK-LABEL: @test_two_attribute_ranges( -; CHECK-NEXT: [[RVAL:%.*]] = icmp ult i32 [[ARG1:%.*]], [[ARG2:%.*]] +; CHECK-NEXT: [[RVAL:%.*]] = icmp ult i32 [[ARG2:%.*]], [[ARG1:%.*]] ; CHECK-NEXT: ret i1 [[RVAL]] ; %rval = icmp ult i32 %arg2, %arg1 @@ -249,7 +249,7 @@ define <2 x i1> @test_two_ranges_vec_true(ptr nocapture readonly %arg1, ptr noca ; Values' ranges overlap each other, so it can not be simplified. define <2 x i1> @test_two_argument_ranges_vec(<2 x i32> range(i32 5, 10) %arg1, <2 x i32> range(i32 8, 16) %arg2) { ; CHECK-LABEL: @test_two_argument_ranges_vec( -; CHECK-NEXT: [[RVAL:%.*]] = icmp ult <2 x i32> [[VAL2:%.*]], [[VAL1:%.*]] +; CHECK-NEXT: [[RVAL:%.*]] = icmp ult <2 x i32> [[ARG2:%.*]], [[ARG1:%.*]] ; CHECK-NEXT: ret <2 x i1> [[RVAL]] ; %rval = icmp ult <2 x i32> %arg2, %arg1 @@ -281,9 +281,9 @@ declare range(i32 1, 6) i32 @create_range3() ; Values' ranges overlap each other, so it can not be simplified. 
define i1 @test_two_return_attribute_ranges_not_simplified() { ; CHECK-LABEL: @test_two_return_attribute_ranges_not_simplified( -; CHECK-NEXT: [[ARG2:%.*]] = call range(i32 5, 10) i32 @create_range1() -; CHECK-NEXT: [[ARG1:%.*]] = call i32 @create_range2() -; CHECK-NEXT: [[RVAL:%.*]] = icmp ult i32 [[ARG1]], [[ARG2]] +; CHECK-NEXT: [[VAL1:%.*]] = call range(i32 5, 10) i32 @create_range1() +; CHECK-NEXT: [[VAL2:%.*]] = call i32 @create_range2() +; CHECK-NEXT: [[RVAL:%.*]] = icmp ult i32 [[VAL2]], [[VAL1]] ; CHECK-NEXT: ret i1 [[RVAL]] ; %val1 = call range(i32 5, 10) i32 @create_range1() @@ -296,7 +296,7 @@ define i1 @test_two_return_attribute_ranges_not_simplified() { define i1 @test_two_return_attribute_ranges_one_in_call() { ; CHECK-LABEL: @test_two_return_attribute_ranges_one_in_call( ; CHECK-NEXT: [[VAL1:%.*]] = call range(i32 1, 6) i32 @create_range1() -; CHECK-NEXT: [[ARG1:%.*]] = call i32 @create_range2() +; CHECK-NEXT: [[VAL2:%.*]] = call i32 @create_range2() ; CHECK-NEXT: ret i1 false ; %val1 = call range(i32 1, 6) i32 @create_range1() @@ -309,7 +309,7 @@ define i1 @test_two_return_attribute_ranges_one_in_call() { define i1 @test_two_return_attribute_ranges() { ; CHECK-LABEL: @test_two_return_attribute_ranges( ; CHECK-NEXT: [[VAL1:%.*]] = call i32 @create_range3() -; CHECK-NEXT: [[ARG1:%.*]] = call i32 @create_range2() +; CHECK-NEXT: [[VAL2:%.*]] = call i32 @create_range2() ; CHECK-NEXT: ret i1 false ; %val1 = call i32 @create_range3() @@ -370,7 +370,7 @@ define <2 x i1> @ult_zext(<2 x i1> %b, <2 x i8> %p) { define i1 @uge_zext(i1 %b, i8 %x) { ; CHECK-LABEL: @uge_zext( ; CHECK-NEXT: [[Z:%.*]] = zext i1 [[B:%.*]] to i8 -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[Z]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X:%.*]], [[Z]] ; CHECK-NEXT: ret i1 [[R]] ; %z = zext i1 %b to i8 @@ -399,7 +399,7 @@ define i1 @ugt_zext_use(i1 %b, i8 %x) { ; CHECK-LABEL: @ugt_zext_use( ; CHECK-NEXT: [[Z:%.*]] = zext i1 [[B:%.*]] to i8 ; CHECK-NEXT: call void @use(i8 [[Z]]) -; 
CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[Z]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[X:%.*]], [[Z]] ; CHECK-NEXT: ret i1 [[R]] ; %z = zext i1 %b to i8 @@ -413,7 +413,7 @@ define i1 @ugt_zext_use(i1 %b, i8 %x) { define i1 @ult_zext_not_i1(i2 %b, i8 %x) { ; CHECK-LABEL: @ult_zext_not_i1( ; CHECK-NEXT: [[Z:%.*]] = zext i2 [[B:%.*]] to i8 -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[Z]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[X:%.*]], [[Z]] ; CHECK-NEXT: ret i1 [[R]] ; %z = zext i2 %b to i8 @@ -600,7 +600,7 @@ define <2 x i1> @ule_sext(<2 x i1> %b, <2 x i8> %p) { define i1 @ugt_sext(i1 %b, i8 %x) { ; CHECK-LABEL: @ugt_sext( ; CHECK-NEXT: [[S:%.*]] = sext i1 [[B:%.*]] to i8 -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[S]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[X:%.*]], [[S]] ; CHECK-NEXT: ret i1 [[R]] ; %s = sext i1 %b to i8 @@ -629,7 +629,7 @@ define i1 @uge_sext_use(i1 %b, i8 %x) { ; CHECK-LABEL: @uge_sext_use( ; CHECK-NEXT: [[S:%.*]] = sext i1 [[B:%.*]] to i8 ; CHECK-NEXT: call void @use(i8 [[S]]) -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[S]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X:%.*]], [[S]] ; CHECK-NEXT: ret i1 [[R]] ; %s = sext i1 %b to i8 @@ -643,7 +643,7 @@ define i1 @uge_sext_use(i1 %b, i8 %x) { define i1 @ule_sext_not_i1(i2 %b, i8 %x) { ; CHECK-LABEL: @ule_sext_not_i1( ; CHECK-NEXT: [[S:%.*]] = sext i2 [[B:%.*]] to i8 -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[S]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X:%.*]], [[S]] ; CHECK-NEXT: ret i1 [[R]] ; %s = sext i2 %b to i8 @@ -869,7 +869,7 @@ define i1 @zext_sext_add_icmp_i128(i1 %a, i1 %b) { define i1 @zext_sext_add_icmp_eq_minus1(i1 %a, i1 %b) { ; CHECK-LABEL: @zext_sext_add_icmp_eq_minus1( ; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[A:%.*]], true -; CHECK-NEXT: [[R:%.*]] = and i1 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i1 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %zext.a = zext i1 %a to i8 @@ -885,7 +885,7 @@ define i1 
@zext_sext_add_icmp_eq_minus1(i1 %a, i1 %b) { define i1 @zext_sext_add_icmp_ne_minus1(i1 %a, i1 %b) { ; CHECK-LABEL: @zext_sext_add_icmp_ne_minus1( ; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[B:%.*]], true -; CHECK-NEXT: [[R:%.*]] = or i1 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[R:%.*]] = or i1 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %zext.a = zext i1 %a to i8 @@ -899,8 +899,8 @@ define i1 @zext_sext_add_icmp_ne_minus1(i1 %a, i1 %b) { define i1 @zext_sext_add_icmp_sgt_minus1(i1 %a, i1 %b) { ; CHECK-LABEL: @zext_sext_add_icmp_sgt_minus1( -; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[B:%.*]], true -; CHECK-NEXT: [[R:%.*]] = or i1 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[B_NOT:%.*]] = xor i1 [[B:%.*]], true +; CHECK-NEXT: [[R:%.*]] = or i1 [[A:%.*]], [[B_NOT]] ; CHECK-NEXT: ret i1 [[R]] ; %zext.a = zext i1 %a to i8 @@ -915,7 +915,7 @@ define i1 @zext_sext_add_icmp_sgt_minus1(i1 %a, i1 %b) { define i1 @zext_sext_add_icmp_ult_minus1(i1 %a, i1 %b) { ; CHECK-LABEL: @zext_sext_add_icmp_ult_minus1( ; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[B:%.*]], true -; CHECK-NEXT: [[R:%.*]] = or i1 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[R:%.*]] = or i1 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %zext.a = zext i1 %a to i8 @@ -930,7 +930,7 @@ define i1 @zext_sext_add_icmp_ult_minus1(i1 %a, i1 %b) { define i1 @zext_sext_add_icmp_sgt_0(i1 %a, i1 %b) { ; CHECK-LABEL: @zext_sext_add_icmp_sgt_0( ; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[B:%.*]], true -; CHECK-NEXT: [[R:%.*]] = and i1 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i1 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %zext.a = zext i1 %a to i8 @@ -945,8 +945,8 @@ define i1 @zext_sext_add_icmp_sgt_0(i1 %a, i1 %b) { define i1 @zext_sext_add_icmp_slt_0(i1 %a, i1 %b) { ; CHECK-LABEL: @zext_sext_add_icmp_slt_0( ; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[A:%.*]], true -; CHECK-NEXT: [[R:%.*]] = and i1 [[TMP1]], [[B:%.*]] -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[B:%.*]], [[TMP1]] +; CHECK-NEXT: ret i1 [[TMP2]] ; %zext.a = zext 
i1 %a to i8 %sext.b = sext i1 %b to i8 @@ -960,7 +960,7 @@ define i1 @zext_sext_add_icmp_slt_0(i1 %a, i1 %b) { define i1 @zext_sext_add_icmp_eq_1(i1 %a, i1 %b) { ; CHECK-LABEL: @zext_sext_add_icmp_eq_1( ; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[B:%.*]], true -; CHECK-NEXT: [[R:%.*]] = and i1 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i1 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %zext.a = zext i1 %a to i8 @@ -975,7 +975,7 @@ define i1 @zext_sext_add_icmp_eq_1(i1 %a, i1 %b) { define i1 @zext_sext_add_icmp_ne_1(i1 %a, i1 %b) { ; CHECK-LABEL: @zext_sext_add_icmp_ne_1( ; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[A:%.*]], true -; CHECK-NEXT: [[R:%.*]] = or i1 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[R:%.*]] = or i1 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %zext.a = zext i1 %a to i8 @@ -990,7 +990,7 @@ define i1 @zext_sext_add_icmp_ne_1(i1 %a, i1 %b) { define i1 @zext_sext_add_icmp_slt_1(i1 %a, i1 %b) { ; CHECK-LABEL: @zext_sext_add_icmp_slt_1( ; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[A:%.*]], true -; CHECK-NEXT: [[R:%.*]] = or i1 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[R:%.*]] = or i1 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %zext.a = zext i1 %a to i8 @@ -1005,8 +1005,8 @@ define i1 @zext_sext_add_icmp_slt_1(i1 %a, i1 %b) { define i1 @zext_sext_add_icmp_ugt_1(i1 %a, i1 %b) { ; CHECK-LABEL: @zext_sext_add_icmp_ugt_1( ; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[A:%.*]], true -; CHECK-NEXT: [[R:%.*]] = and i1 [[TMP1]], [[B:%.*]] -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[B:%.*]], [[TMP1]] +; CHECK-NEXT: ret i1 [[TMP2]] ; %zext.a = zext i1 %a to i8 %sext.b = sext i1 %b to i8 @@ -1018,7 +1018,7 @@ define i1 @zext_sext_add_icmp_ugt_1(i1 %a, i1 %b) { define <2 x i1> @vector_zext_sext_add_icmp_slt_1(<2 x i1> %a, <2 x i1> %b) { ; CHECK-LABEL: @vector_zext_sext_add_icmp_slt_1( ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i1> [[A:%.*]], -; CHECK-NEXT: [[R:%.*]] = or <2 x i1> [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[R:%.*]] = or <2 x i1> [[B:%.*]], [[TMP1]] ; 
CHECK-NEXT: ret <2 x i1> [[R]] ; %zext.a = zext <2 x i1> %a to <2 x i8> @@ -1601,7 +1601,7 @@ define i1 @icmp_ne_sext_sgt_zero_nofold(i32 %a) { ; CHECK-LABEL: @icmp_ne_sext_sgt_zero_nofold( ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 0 ; CHECK-NEXT: [[CONV:%.*]] = sext i1 [[CMP]] to i32 -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[CONV]], [[A]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[A]], [[CONV]] ; CHECK-NEXT: ret i1 [[CMP1]] ; %cmp = icmp sgt i32 %a, 0 @@ -1614,7 +1614,7 @@ define i1 @icmp_slt_sext_ne_zero_nofold(i32 %a) { ; CHECK-LABEL: @icmp_slt_sext_ne_zero_nofold( ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A:%.*]], 0 ; CHECK-NEXT: [[CONV:%.*]] = sext i1 [[CMP]] to i32 -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[CONV]], [[A]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[A]], [[CONV]] ; CHECK-NEXT: ret i1 [[CMP1]] ; %cmp = icmp ne i32 %a, 0 @@ -1627,7 +1627,7 @@ define i1 @icmp_ne_sext_slt_allones_nofold(i32 %a) { ; CHECK-LABEL: @icmp_ne_sext_slt_allones_nofold( ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], -1 ; CHECK-NEXT: [[CONV:%.*]] = sext i1 [[CMP]] to i32 -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[CONV]], [[A]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[A]], [[CONV]] ; CHECK-NEXT: ret i1 [[CMP1]] ; %cmp = icmp slt i32 %a, -1 @@ -1640,7 +1640,7 @@ define i1 @icmp_slt_sext_ne_allones_nofold(i32 %a) { ; CHECK-LABEL: @icmp_slt_sext_ne_allones_nofold( ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A:%.*]], -1 ; CHECK-NEXT: [[CONV:%.*]] = sext i1 [[CMP]] to i32 -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[CONV]], [[A]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[A]], [[CONV]] ; CHECK-NEXT: ret i1 [[CMP1]] ; %cmp = icmp ne i32 %a, -1 @@ -1653,7 +1653,7 @@ define i1 @icmp_ne_sext_slt_otherwise_nofold(i32 %a) { ; CHECK-LABEL: @icmp_ne_sext_slt_otherwise_nofold( ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 2 ; CHECK-NEXT: [[CONV:%.*]] = sext i1 [[CMP]] to i32 -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[CONV]], [[A]] +; CHECK-NEXT: [[CMP1:%.*]] 
= icmp ne i32 [[A]], [[CONV]] ; CHECK-NEXT: ret i1 [[CMP1]] ; %cmp = icmp slt i32 %a, 2 @@ -1666,7 +1666,7 @@ define i1 @icmp_slt_sext_ne_otherwise_nofold(i32 %a) { ; CHECK-LABEL: @icmp_slt_sext_ne_otherwise_nofold( ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A:%.*]], 2 ; CHECK-NEXT: [[CONV:%.*]] = sext i1 [[CMP]] to i32 -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[CONV]], [[A]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[A]], [[CONV]] ; CHECK-NEXT: ret i1 [[CMP1]] ; %cmp = icmp ne i32 %a, 2 diff --git a/llvm/test/Transforms/InstCombine/icmp-rotate.ll b/llvm/test/Transforms/InstCombine/icmp-rotate.ll index 2580bb6a865c7b..eeaa1c78610976 100644 --- a/llvm/test/Transforms/InstCombine/icmp-rotate.ll +++ b/llvm/test/Transforms/InstCombine/icmp-rotate.ll @@ -213,7 +213,7 @@ define i1 @amounts_mismatch(i8 %x, i8 %y, i8 %z, i8 %w) { ; CHECK-LABEL: @amounts_mismatch( ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 [[Z:%.*]], [[W:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.fshl.i8(i8 [[X:%.*]], i8 [[X]], i8 [[TMP1]]) -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i1 [[R]] ; %f = tail call i8 @llvm.fshl.i8(i8 %x, i8 %x, i8 %z) diff --git a/llvm/test/Transforms/InstCombine/icmp-select-implies-common-op.ll b/llvm/test/Transforms/InstCombine/icmp-select-implies-common-op.ll index 8d393a7ae28c9a..d23634f8caf555 100644 --- a/llvm/test/Transforms/InstCombine/icmp-select-implies-common-op.ll +++ b/llvm/test/Transforms/InstCombine/icmp-select-implies-common-op.ll @@ -4,9 +4,9 @@ define i1 @sgt_3_impliesF_eq_2(i8 %x, i8 %y) { ; CHECK-LABEL: @sgt_3_impliesF_eq_2( ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 4 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[SEL:%.*]], [[X]] -; CHECK-NEXT: [[CMP3:%.*]] = select i1 [[CMP]], i1 [[CMP2]], i1 false -; CHECK-NEXT: ret i1 [[CMP3]] +; CHECK-NEXT: [[CMP21:%.*]] = icmp eq i8 [[Y:%.*]], [[X]] +; CHECK-NEXT: [[CMP2:%.*]] = select i1 [[CMP]], i1 [[CMP21]], i1 false +; 
CHECK-NEXT: ret i1 [[CMP2]] ; %cmp = icmp sgt i8 %x, 3 %sel = select i1 %cmp, i8 2, i8 %y @@ -17,9 +17,9 @@ define i1 @sgt_3_impliesF_eq_2(i8 %x, i8 %y) { define i1 @sgt_3_impliesT_sgt_2(i8 %x, i8 %y) { ; CHECK-LABEL: @sgt_3_impliesT_sgt_2( ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 4 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i8 [[SEL:%.*]], [[X]] -; CHECK-NEXT: [[CMP3:%.*]] = select i1 [[CMP]], i1 [[CMP2]], i1 false -; CHECK-NEXT: ret i1 [[CMP3]] +; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i8 [[Y:%.*]], [[X]] +; CHECK-NEXT: [[CMP2:%.*]] = select i1 [[CMP]], i1 [[CMP21]], i1 false +; CHECK-NEXT: ret i1 [[CMP2]] ; %cmp = icmp sgt i8 %x, 3 %sel = select i1 %cmp, i8 2, i8 %y @@ -30,9 +30,9 @@ define i1 @sgt_3_impliesT_sgt_2(i8 %x, i8 %y) { define i1 @sgt_x_impliesF_eq_smin_todo(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @sgt_x_impliesF_eq_smin_todo( ; CHECK-NEXT: [[CMP:%.*]] = icmp sle i8 [[X:%.*]], [[Z:%.*]] -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[SEL:%.*]], [[X]] -; CHECK-NEXT: [[CMP3:%.*]] = select i1 [[CMP]], i1 [[CMP2]], i1 false -; CHECK-NEXT: ret i1 [[CMP3]] +; CHECK-NEXT: [[CMP21:%.*]] = icmp eq i8 [[Y:%.*]], [[X]] +; CHECK-NEXT: [[CMP2:%.*]] = select i1 [[CMP]], i1 [[CMP21]], i1 false +; CHECK-NEXT: ret i1 [[CMP2]] ; %cmp = icmp sgt i8 %x, %z %sel = select i1 %cmp, i8 -128, i8 %y @@ -43,9 +43,9 @@ define i1 @sgt_x_impliesF_eq_smin_todo(i8 %x, i8 %y, i8 %z) { define i1 @slt_x_impliesT_ne_smin_todo(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @slt_x_impliesT_ne_smin_todo( ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], [[Z:%.*]] -; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i8 [[SEL:%.*]], [[X]] -; CHECK-NEXT: [[CMP3:%.*]] = select i1 [[CMP]], i1 true, i1 [[CMP2]] -; CHECK-NEXT: ret i1 [[CMP3]] +; CHECK-NEXT: [[CMP21:%.*]] = icmp ne i8 [[Y:%.*]], [[X]] +; CHECK-NEXT: [[CMP2:%.*]] = select i1 [[CMP]], i1 true, i1 [[CMP21]] +; CHECK-NEXT: ret i1 [[CMP2]] ; %cmp = icmp slt i8 %x, %z %sel = select i1 %cmp, i8 127, i8 %y @@ -56,9 +56,9 @@ define i1 
@slt_x_impliesT_ne_smin_todo(i8 %x, i8 %y, i8 %z) { define i1 @ult_x_impliesT_eq_umax_todo(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @ult_x_impliesT_eq_umax_todo( ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]] -; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i8 [[SEL:%.*]], [[X]] -; CHECK-NEXT: [[CMP3:%.*]] = select i1 [[CMP]], i1 true, i1 [[CMP2]] -; CHECK-NEXT: ret i1 [[CMP3]] +; CHECK-NEXT: [[CMP21:%.*]] = icmp ne i8 [[Y:%.*]], [[X]] +; CHECK-NEXT: [[CMP2:%.*]] = select i1 [[CMP]], i1 true, i1 [[CMP21]] +; CHECK-NEXT: ret i1 [[CMP2]] ; %cmp = icmp ugt i8 %z, %x %sel = select i1 %cmp, i8 255, i8 %y @@ -68,10 +68,10 @@ define i1 @ult_x_impliesT_eq_umax_todo(i8 %x, i8 %y, i8 %z) { define i1 @ult_1_impliesF_eq_1(i8 %x, i8 %y) { ; CHECK-LABEL: @ult_1_impliesF_eq_1( -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[SEL:%.*]], 0 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[X:%.*]], [[SEL]] -; CHECK-NEXT: [[CMP3:%.*]] = select i1 [[CMP]], i1 [[CMP2]], i1 false -; CHECK-NEXT: ret i1 [[CMP3]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[X:%.*]], 0 +; CHECK-NEXT: [[CMP21:%.*]] = icmp eq i8 [[Y:%.*]], [[X]] +; CHECK-NEXT: [[CMP2:%.*]] = select i1 [[CMP]], i1 [[CMP21]], i1 false +; CHECK-NEXT: ret i1 [[CMP2]] ; %cmp = icmp ult i8 %x, 1 %sel = select i1 %cmp, i8 1, i8 %y @@ -83,7 +83,7 @@ define i1 @ugt_x_impliesF_eq_umin_todo(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @ugt_x_impliesF_eq_umin_todo( ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]] ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 0, i8 [[Y:%.*]] -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[SEL]], [[X]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[X]], [[SEL]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %cmp = icmp ugt i8 %z, %x diff --git a/llvm/test/Transforms/InstCombine/icmp-select.ll b/llvm/test/Transforms/InstCombine/icmp-select.ll index 59d2a1b165c0f8..fb68c6ee942075 100644 --- a/llvm/test/Transforms/InstCombine/icmp-select.ll +++ b/llvm/test/Transforms/InstCombine/icmp-select.ll @@ -35,7 +35,7 @@ define i1 
@icmp_select_var_commuted(i8 %x, i8 %y, i8 %_z) { ; CHECK-LABEL: @icmp_select_var_commuted( ; CHECK-NEXT: [[Z:%.*]] = udiv i8 42, [[_Z:%.*]] ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[X:%.*]], 0 -; CHECK-NEXT: [[CMP21:%.*]] = icmp eq i8 [[Z]], [[Y:%.*]] +; CHECK-NEXT: [[CMP21:%.*]] = icmp eq i8 [[Y:%.*]], [[Z]] ; CHECK-NEXT: [[CMP2:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP21]] ; CHECK-NEXT: ret i1 [[CMP2]] ; @@ -122,7 +122,7 @@ define i1 @icmp_select_var_pred_ult(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @icmp_select_var_pred_ult( ; CHECK-NEXT: [[Z1:%.*]] = add nuw i8 [[Z:%.*]], 2 ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[X:%.*]], 0 -; CHECK-NEXT: [[CMP21:%.*]] = icmp ugt i8 [[Z1]], [[Y:%.*]] +; CHECK-NEXT: [[CMP21:%.*]] = icmp ult i8 [[Y:%.*]], [[Z1]] ; CHECK-NEXT: [[CMP2:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP21]] ; CHECK-NEXT: ret i1 [[CMP2]] ; @@ -137,7 +137,7 @@ define i1 @icmp_select_var_pred_uge(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @icmp_select_var_pred_uge( ; CHECK-NEXT: [[Z1:%.*]] = add nuw i8 [[Z:%.*]], 2 ; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i8 [[X:%.*]], 0 -; CHECK-NEXT: [[CMP21:%.*]] = icmp ule i8 [[Z1]], [[Y:%.*]] +; CHECK-NEXT: [[CMP21:%.*]] = icmp uge i8 [[Y:%.*]], [[Z1]] ; CHECK-NEXT: [[CMP2:%.*]] = select i1 [[CMP1]], i1 [[CMP21]], i1 false ; CHECK-NEXT: ret i1 [[CMP2]] ; @@ -152,7 +152,7 @@ define i1 @icmp_select_var_pred_uge_commuted(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @icmp_select_var_pred_uge_commuted( ; CHECK-NEXT: [[Z1:%.*]] = add nuw i8 [[Z:%.*]], 2 ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[X:%.*]], 0 -; CHECK-NEXT: [[CMP21:%.*]] = icmp uge i8 [[Z1]], [[Y:%.*]] +; CHECK-NEXT: [[CMP21:%.*]] = icmp ule i8 [[Y:%.*]], [[Z1]] ; CHECK-NEXT: [[CMP2:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP21]] ; CHECK-NEXT: ret i1 [[CMP2]] ; diff --git a/llvm/test/Transforms/InstCombine/icmp-sub.ll b/llvm/test/Transforms/InstCombine/icmp-sub.ll index 5645dededf2e4b..8cb3c1c181cec7 100644 --- a/llvm/test/Transforms/InstCombine/icmp-sub.ll +++ 
b/llvm/test/Transforms/InstCombine/icmp-sub.ll @@ -622,7 +622,7 @@ define i1 @PR60818_eq_multi_use(i32 %a) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SUB:%.*]] = sub i32 0, [[A:%.*]] ; CHECK-NEXT: call void @use(i32 [[SUB]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[SUB]], [[A]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], [[SUB]] ; CHECK-NEXT: ret i1 [[CMP]] ; entry: @@ -637,7 +637,7 @@ define i1 @PR60818_sgt(i32 %a) { ; CHECK-LABEL: @PR60818_sgt( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SUB:%.*]] = sub i32 0, [[A:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[SUB]], [[A]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[SUB]] ; CHECK-NEXT: ret i1 [[CMP]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll index 27b02c8c6e9366..ba47ed02edbdf4 100644 --- a/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll @@ -138,7 +138,7 @@ define i1 @oneuse1(i8 %val, i8 %bits) { ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[BITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[T1]], [[VAL:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[VAL:%.*]], [[T1]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = shl i8 -1, %bits @@ -154,7 +154,7 @@ define i1 @oneuse2(i8 %val, i8 %bits) { ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[T1]], [[VAL:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[VAL:%.*]], [[T1]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = shl i8 -1, %bits @@ -173,7 +173,7 @@ define i1 @n0(i8 %val, i8 
%bits) { ; CHECK-LABEL: @n0( ; CHECK-NEXT: [[T0:%.*]] = shl nuw i8 1, [[BITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[T1]], [[VAL:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[VAL:%.*]], [[T1]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = shl i8 1, %bits ; constant is not -1 @@ -199,7 +199,7 @@ define <2 x i1> @n2_vec_nonsplat(<2 x i8> %val, <2 x i8> %bits) { ; CHECK-LABEL: @n2_vec_nonsplat( ; CHECK-NEXT: [[T0:%.*]] = shl <2 x i8> , [[BITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor <2 x i8> [[T0]], -; CHECK-NEXT: [[R:%.*]] = icmp uge <2 x i8> [[T1]], [[VAL:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule <2 x i8> [[VAL:%.*]], [[T1]] ; CHECK-NEXT: ret <2 x i1> [[R]] ; %t0 = shl <2 x i8> , %bits ; again, wrong constant @@ -225,7 +225,7 @@ define i1 @n3(i8 %val, i8 %bits) { ; CHECK-LABEL: @n3( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[BITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[T1]], [[VAL:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[VAL:%.*]], [[T1]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = shl i8 -1, %bits diff --git a/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll index 8441033d4857ea..37aa85202e5622 100644 --- a/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll @@ -138,7 +138,7 @@ define i1 @oneuse1(i8 %val, i8 %bits) { ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[BITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[T1]], [[VAL:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[VAL:%.*]], [[T1]] ; CHECK-NEXT: ret i1 [[R]] ; 
%t0 = shl i8 -1, %bits @@ -154,7 +154,7 @@ define i1 @oneuse2(i8 %val, i8 %bits) { ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 ; CHECK-NEXT: call void @use8(i8 [[T1]]) -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[T1]], [[VAL:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[VAL:%.*]], [[T1]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = shl i8 -1, %bits @@ -173,7 +173,7 @@ define i1 @n0(i8 %val, i8 %bits) { ; CHECK-LABEL: @n0( ; CHECK-NEXT: [[T0:%.*]] = shl nuw i8 1, [[BITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[T1]], [[VAL:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[VAL:%.*]], [[T1]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = shl i8 1, %bits ; constant is not -1 @@ -199,7 +199,7 @@ define <2 x i1> @n2_vec_nonsplat(<2 x i8> %val, <2 x i8> %bits) { ; CHECK-LABEL: @n2_vec_nonsplat( ; CHECK-NEXT: [[T0:%.*]] = shl <2 x i8> , [[BITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor <2 x i8> [[T0]], -; CHECK-NEXT: [[R:%.*]] = icmp ult <2 x i8> [[T1]], [[VAL:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt <2 x i8> [[VAL:%.*]], [[T1]] ; CHECK-NEXT: ret <2 x i1> [[R]] ; %t0 = shl <2 x i8> , %bits ; again, wrong constant @@ -225,7 +225,7 @@ define i1 @n3(i8 %val, i8 %bits) { ; CHECK-LABEL: @n3( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[BITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i8 [[T0]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[T1]], [[VAL:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[VAL:%.*]], [[T1]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = shl i8 -1, %bits diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll index 8fc4a40141931d..e492055fea8b8d 100644 --- a/llvm/test/Transforms/InstCombine/icmp.ll +++ b/llvm/test/Transforms/InstCombine/icmp.ll @@ -581,7 +581,7 @@ define i1 @test28_extra_uses(i32 %x, i32 %y, i32 %z) { define i1 @ugt_sub(i32 %xsrc, i32 %y) { ; CHECK-LABEL: @ugt_sub( ; CHECK-NEXT: [[X:%.*]] = udiv i32 [[XSRC:%.*]], 42 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 
[[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP]] ; %x = udiv i32 %xsrc, 42 ; thwart complexity-based canonicalization @@ -1266,7 +1266,7 @@ define i1 @test62_as1(ptr addrspace(1) %a) { define i1 @low_mask_eq_zext(i8 %a, i32 %b) { ; CHECK-LABEL: @low_mask_eq_zext( ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B:%.*]] to i8 -; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %z = zext i8 %a to i32 @@ -1278,7 +1278,7 @@ define i1 @low_mask_eq_zext(i8 %a, i32 %b) { define i1 @low_mask_eq_zext_commute(i8 %a, i32 %b) { ; CHECK-LABEL: @low_mask_eq_zext_commute( ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B:%.*]] to i8 -; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %t = and i32 %b, 255 @@ -1322,7 +1322,7 @@ define i1 @low_mask_eq_zext_use1(i8 %a, i32 %b) { ; CHECK-NEXT: [[T:%.*]] = and i32 [[B:%.*]], 255 ; CHECK-NEXT: call void @use_i32(i32 [[T]]) ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B]] to i8 -; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %t = and i32 %b, 255 @@ -1337,7 +1337,7 @@ define i1 @low_mask_eq_zext_use2(i8 %a, i32 %b) { ; CHECK-NEXT: [[Z:%.*]] = zext i8 [[A:%.*]] to i32 ; CHECK-NEXT: call void @use_i32(i32 [[Z]]) ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B:%.*]] to i8 -; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[TMP1]], [[A]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %t = and i32 %b, 255 @@ -1367,7 +1367,7 @@ define i1 @low_mask_eq_zext_use3(i8 %a, i32 %b) { define <2 x i1> @low_mask_eq_zext_vec_splat(<2 x i8> %a, <2 x i32> %b) { ; CHECK-LABEL: @low_mask_eq_zext_vec_splat( ; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[B:%.*]] to <2 x i8> -; CHECK-NEXT: [[C:%.*]] = icmp eq <2 x i8> [[TMP1]], 
[[A:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp eq <2 x i8> [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[C]] ; %t = and <2 x i32> %b, @@ -1769,7 +1769,7 @@ define i1 @icmp_mul0_ne0(i32 %x) { define i1 @icmp_add20_eq_add57(i32 %x, i32 %y) { ; CHECK-LABEL: @icmp_add20_eq_add57( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[Y:%.*]], 37 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %1 = add i32 %x, 20 @@ -1781,7 +1781,7 @@ define i1 @icmp_add20_eq_add57(i32 %x, i32 %y) { define <2 x i1> @icmp_add20_eq_add57_splat(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @icmp_add20_eq_add57_splat( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[Y:%.*]], -; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %1 = add <2 x i32> %x, @@ -1793,7 +1793,7 @@ define <2 x i1> @icmp_add20_eq_add57_splat(<2 x i32> %x, <2 x i32> %y) { define <2 x i1> @icmp_add20_eq_add57_poison(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @icmp_add20_eq_add57_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[Y:%.*]], -; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %1 = add <2 x i32> %x, @@ -1805,7 +1805,7 @@ define <2 x i1> @icmp_add20_eq_add57_poison(<2 x i32> %x, <2 x i32> %y) { define <2 x i1> @icmp_add20_eq_add57_vec_nonsplat(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @icmp_add20_eq_add57_vec_nonsplat( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[Y:%.*]], -; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %1 = add <2 x i32> %x, @@ -1853,7 +1853,7 @@ define <2 x i1> @icmp_sub57_ne_sub20_vec_poison(<2 x i32> %x, <2 x i32> %y) { define <2 x i1> 
@icmp_sub57_ne_sub20_vec_nonsplat(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @icmp_sub57_ne_sub20_vec_nonsplat( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[Y:%.*]], -; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i32> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i32> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %1 = add <2 x i32> %x, @@ -1905,7 +1905,7 @@ define i1 @icmp_add1_sle(i32 %x, i32 %y) { define i1 @icmp_add20_sge_add57(i32 %x, i32 %y) { ; CHECK-LABEL: @icmp_add20_sge_add57( ; CHECK-NEXT: [[TMP1:%.*]] = add nsw i32 [[Y:%.*]], 37 -; CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; %1 = add nsw i32 %x, 20 @@ -1917,7 +1917,7 @@ define i1 @icmp_add20_sge_add57(i32 %x, i32 %y) { define <2 x i1> @icmp_add20_sge_add57_splat(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @icmp_add20_sge_add57_splat( ; CHECK-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[Y:%.*]], -; CHECK-NEXT: [[CMP:%.*]] = icmp sle <2 x i32> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sge <2 x i32> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %1 = add nsw <2 x i32> %x, @@ -1929,7 +1929,7 @@ define <2 x i1> @icmp_add20_sge_add57_splat(<2 x i32> %x, <2 x i32> %y) { define <2 x i1> @icmp_add20_sge_add57_poison(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @icmp_add20_sge_add57_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[Y:%.*]], -; CHECK-NEXT: [[CMP:%.*]] = icmp sle <2 x i32> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sge <2 x i32> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %1 = add nsw <2 x i32> %x, @@ -3192,7 +3192,7 @@ define i1 @icmp_and_or_lshr(i32 %x, i32 %y) { ; CHECK-LABEL: @icmp_and_or_lshr( ; CHECK-NEXT: [[SHF1:%.*]] = shl nuw i32 1, [[Y:%.*]] ; CHECK-NEXT: [[OR2:%.*]] = or i32 [[SHF1]], 1 -; CHECK-NEXT: [[AND3:%.*]] = and i32 [[OR2]], [[X:%.*]] +; CHECK-NEXT: [[AND3:%.*]] = and i32 [[X:%.*]], [[OR2]] ; 
CHECK-NEXT: [[RET:%.*]] = icmp ne i32 [[AND3]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; @@ -3634,7 +3634,7 @@ define i1 @f10(i16 %p) { define i1 @cmp_sgt_rhs_dec(float %x, i32 %i) { ; CHECK-LABEL: @cmp_sgt_rhs_dec( ; CHECK-NEXT: [[CONV:%.*]] = fptosi float [[X:%.*]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[CONV]], [[I:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 [[I:%.*]], [[CONV]] ; CHECK-NEXT: ret i1 [[CMP]] ; %conv = fptosi float %x to i32 @@ -3646,7 +3646,7 @@ define i1 @cmp_sgt_rhs_dec(float %x, i32 %i) { define i1 @cmp_sle_rhs_dec(float %x, i32 %i) { ; CHECK-LABEL: @cmp_sle_rhs_dec( ; CHECK-NEXT: [[CONV:%.*]] = fptosi float [[X:%.*]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[CONV]], [[I:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[I:%.*]], [[CONV]] ; CHECK-NEXT: ret i1 [[CMP]] ; %conv = fptosi float %x to i32 @@ -3658,7 +3658,7 @@ define i1 @cmp_sle_rhs_dec(float %x, i32 %i) { define i1 @cmp_sge_rhs_inc(float %x, i32 %i) { ; CHECK-LABEL: @cmp_sge_rhs_inc( ; CHECK-NEXT: [[CONV:%.*]] = fptosi float [[X:%.*]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[I:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I:%.*]], [[CONV]] ; CHECK-NEXT: ret i1 [[CMP]] ; %conv = fptosi float %x to i32 @@ -3670,7 +3670,7 @@ define i1 @cmp_sge_rhs_inc(float %x, i32 %i) { define i1 @cmp_slt_rhs_inc(float %x, i32 %i) { ; CHECK-LABEL: @cmp_slt_rhs_inc( ; CHECK-NEXT: [[CONV:%.*]] = fptosi float [[X:%.*]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 [[CONV]], [[I:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[I:%.*]], [[CONV]] ; CHECK-NEXT: ret i1 [[CMP]] ; %conv = fptosi float %x to i32 @@ -3823,7 +3823,7 @@ define i1 @icmp_add1_ule(i32 %x, i32 %y) { define i1 @cmp_uge_rhs_inc(float %x, i32 %i) { ; CHECK-LABEL: @cmp_uge_rhs_inc( ; CHECK-NEXT: [[CONV:%.*]] = fptosi float [[X:%.*]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CONV]], [[I:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I:%.*]], [[CONV]] ; CHECK-NEXT: ret i1 [[CMP]] ; %conv 
= fptosi float %x to i32 @@ -3835,7 +3835,7 @@ define i1 @cmp_uge_rhs_inc(float %x, i32 %i) { define i1 @cmp_ult_rhs_inc(float %x, i32 %i) { ; CHECK-LABEL: @cmp_ult_rhs_inc( ; CHECK-NEXT: [[CONV:%.*]] = fptosi float [[X:%.*]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp ule i32 [[CONV]], [[I:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[I:%.*]], [[CONV]] ; CHECK-NEXT: ret i1 [[CMP]] ; %conv = fptosi float %x to i32 @@ -4655,7 +4655,7 @@ define <2 x i1> @zext_bool_and_eq1(<2 x i1> %x, <2 x i8> %y) { define i1 @zext_bool_or_eq0(i1 %x, i8 %y) { ; CHECK-LABEL: @zext_bool_or_eq0( ; CHECK-NEXT: [[ZX:%.*]] = zext i1 [[X:%.*]] to i8 -; CHECK-NEXT: [[A:%.*]] = or i8 [[ZX]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = or i8 [[Y:%.*]], [[ZX]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[A]], 0 ; CHECK-NEXT: ret i1 [[R]] ; @@ -4671,7 +4671,7 @@ define i1 @zext_bool_and_eq0_use(i1 %x, i64 %y) { ; CHECK-LABEL: @zext_bool_and_eq0_use( ; CHECK-NEXT: [[ZX:%.*]] = zext i1 [[X:%.*]] to i64 ; CHECK-NEXT: call void @use_i64(i64 [[ZX]]) -; CHECK-NEXT: [[A:%.*]] = and i64 [[ZX]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = and i64 [[Y:%.*]], [[ZX]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i64 [[A]], 0 ; CHECK-NEXT: ret i1 [[R]] ; @@ -4704,7 +4704,7 @@ define i1 @zext_bool_and_ne0_use(i1 %x, i64 %y) { define i1 @zext_notbool_and_ne0(i2 %x, i8 %y) { ; CHECK-LABEL: @zext_notbool_and_ne0( ; CHECK-NEXT: [[ZX:%.*]] = zext i2 [[X:%.*]] to i8 -; CHECK-NEXT: [[A:%.*]] = and i8 [[ZX]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = and i8 [[Y:%.*]], [[ZX]] ; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[A]], 0 ; CHECK-NEXT: ret i1 [[R]] ; @@ -5055,7 +5055,7 @@ define i1 @or_positive_sgt_zero_multi_use(i8 %a) { define i1 @disjoint_or_sgt_1(i8 %a, i8 %b) { ; CHECK-LABEL: @disjoint_or_sgt_1( ; CHECK-NEXT: [[B1:%.*]] = add nsw i8 [[B:%.*]], 2 -; CHECK-NEXT: [[ICMP_:%.*]] = icmp sle i8 [[B1]], [[A:%.*]] +; CHECK-NEXT: [[ICMP_:%.*]] = icmp sge i8 [[A:%.*]], [[B1]] ; CHECK-NEXT: ret i1 [[ICMP_]] ; %a1 = or disjoint i8 %a, 1 @@ -5093,7 +5093,7 @@ 
define i1 @disjoint_or_sgt_3(i8 %a, i8 %b) { define i1 @disjoint_or_ugt_1(i8 %a, i8 %b) { ; CHECK-LABEL: @disjoint_or_ugt_1( ; CHECK-NEXT: [[B1:%.*]] = add nsw i8 [[B:%.*]], 2 -; CHECK-NEXT: [[ICMP_:%.*]] = icmp ule i8 [[B1]], [[A:%.*]] +; CHECK-NEXT: [[ICMP_:%.*]] = icmp uge i8 [[A:%.*]], [[B1]] ; CHECK-NEXT: ret i1 [[ICMP_]] ; %a1 = or disjoint i8 %a, 1 @@ -5146,7 +5146,7 @@ define i1 @deduce_nuw_flag_2(i8 %a, i8 %b) { ; CHECK-LABEL: @deduce_nuw_flag_2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = add nuw i8 [[B:%.*]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP0]], [[A:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[CMP]] ; entry: @@ -5174,7 +5174,7 @@ define i1 @dont_deduce_nuw_flag_2(i8 %a, i8 %b) { ; CHECK-LABEL: @dont_deduce_nuw_flag_2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = add i8 [[B:%.*]], -1 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP0]], [[A:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A:%.*]], [[TMP0]] ; CHECK-NEXT: ret i1 [[CMP]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/implies.ll b/llvm/test/Transforms/InstCombine/implies.ll index c02d84d3f83711..047b2aa816e0bf 100644 --- a/llvm/test/Transforms/InstCombine/implies.ll +++ b/llvm/test/Transforms/InstCombine/implies.ll @@ -137,7 +137,7 @@ F: define i1 @src_or_distjoint_implies_sle_fail(i8 %x, i8 %y, i1 %other) { ; CHECK-LABEL: @src_or_distjoint_implies_sle_fail( ; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24 -; CHECK-NEXT: [[COND_NOT:%.*]] = icmp slt i8 [[X2]], [[Y:%.*]] +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[Y:%.*]], [[X2]] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] ; CHECK: T: ; CHECK-NEXT: [[X1:%.*]] = or disjoint i8 [[X]], 23 @@ -268,7 +268,7 @@ F: define i1 @src_or_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) { ; CHECK-LABEL: @src_or_implies_ule( ; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], [[X:%.*]] -; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[OR]], [[Z:%.*]] +; CHECK-NEXT: 
[[COND_NOT:%.*]] = icmp ult i8 [[Z:%.*]], [[OR]] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] ; CHECK: T: ; CHECK-NEXT: ret i1 true diff --git a/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-scalar.ll b/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-scalar.ll index fff05a416dece9..abb36b6a785e5c 100644 --- a/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-scalar.ll +++ b/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-scalar.ll @@ -53,7 +53,7 @@ define i4 @in_constant_varx_6_invmask(i4 %x, i4 %mask) { define i4 @in_constant_mone_vary_invmask(i4 %y, i4 %mask) { ; CHECK-LABEL: @in_constant_mone_vary_invmask( ; CHECK-NEXT: [[MASK_NOT:%.*]] = xor i4 [[MASK:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = or i4 [[MASK_NOT]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = or i4 [[Y:%.*]], [[MASK_NOT]] ; CHECK-NEXT: ret i4 [[R]] ; %notmask = xor i4 %mask, -1 diff --git a/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-vector.ll b/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-vector.ll index a76662c4bc4395..0440199dadb873 100644 --- a/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-vector.ll +++ b/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-vector.ll @@ -95,7 +95,7 @@ define <3 x i4> @in_constant_varx_6_invmask_poison(<3 x i4> %x, <3 x i4> %mask) define <2 x i4> @in_constant_mone_vary_invmask(<2 x i4> %y, <2 x i4> %mask) { ; CHECK-LABEL: @in_constant_mone_vary_invmask( ; CHECK-NEXT: [[MASK_NOT:%.*]] = xor <2 x i4> [[MASK:%.*]], -; CHECK-NEXT: [[R:%.*]] = or <2 x i4> [[MASK_NOT]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = or <2 x i4> [[Y:%.*]], [[MASK_NOT]] ; CHECK-NEXT: ret <2 x i4> [[R]] ; %notmask = xor <2 x i4> %mask, diff --git a/llvm/test/Transforms/InstCombine/ispow2.ll b/llvm/test/Transforms/InstCombine/ispow2.ll index 3f2c31d05f3ed9..c21ad95f83a1c4 100644 --- 
a/llvm/test/Transforms/InstCombine/ispow2.ll +++ b/llvm/test/Transforms/InstCombine/ispow2.ll @@ -161,7 +161,7 @@ define i1 @is_pow2or0_negate_op_extra_use1(i32 %x) { define i1 @is_pow2or0_negate_op_extra_use2(i32 %x) { ; CHECK-LABEL: @is_pow2or0_negate_op_extra_use2( ; CHECK-NEXT: [[NEG:%.*]] = sub i32 0, [[X:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and i32 [[NEG]], [[X]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[NEG]] ; CHECK-NEXT: call void @use(i32 [[AND]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], [[X]] ; CHECK-NEXT: ret i1 [[CMP]] @@ -1190,7 +1190,7 @@ define <2 x i1> @isnot_pow2nor0_wrong_pred3_ctpop_commute_vec(<2 x i8> %x) { define i1 @is_pow2_fail_pr63327(i32 %x) { ; CHECK-LABEL: @is_pow2_fail_pr63327( ; CHECK-NEXT: [[NX:%.*]] = sub i32 0, [[X:%.*]] -; CHECK-NEXT: [[X_AND_NX:%.*]] = and i32 [[NX]], [[X]] +; CHECK-NEXT: [[X_AND_NX:%.*]] = and i32 [[X]], [[NX]] ; CHECK-NEXT: [[R:%.*]] = icmp sge i32 [[X_AND_NX]], [[X]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -1244,7 +1244,7 @@ define i1 @blsmsk_is_p2_or_z_fail(i32 %xx, i32 %yy) { define i1 @blsmsk_isnt_p2_or_z_fail(i32 %x) { ; CHECK-LABEL: @blsmsk_isnt_p2_or_z_fail( ; CHECK-NEXT: [[XM1:%.*]] = add i32 [[X:%.*]], -1 -; CHECK-NEXT: [[Y:%.*]] = xor i32 [[XM1]], [[X]] +; CHECK-NEXT: [[Y:%.*]] = xor i32 [[X]], [[XM1]] ; CHECK-NEXT: [[R:%.*]] = icmp ule i32 [[Y]], [[X]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -1259,7 +1259,7 @@ declare void @use.i32(i32) define i1 @blsmsk_isnt_p2_or_z_fail_multiuse(i32 %x) { ; CHECK-LABEL: @blsmsk_isnt_p2_or_z_fail_multiuse( ; CHECK-NEXT: [[XM1:%.*]] = add i32 [[X:%.*]], -1 -; CHECK-NEXT: [[Y:%.*]] = xor i32 [[XM1]], [[X]] +; CHECK-NEXT: [[Y:%.*]] = xor i32 [[X]], [[XM1]] ; CHECK-NEXT: call void @use.i32(i32 [[Y]]) ; CHECK-NEXT: [[R:%.*]] = icmp ult i32 [[Y]], [[X]] ; CHECK-NEXT: ret i1 [[R]] @@ -1274,7 +1274,7 @@ define i1 @blsmsk_isnt_p2_or_z_fail_multiuse(i32 %x) { define i1 @blsmsk_isnt_p2_or_z_fail_wrong_add(i32 %x, i32 %z) { ; CHECK-LABEL: @blsmsk_isnt_p2_or_z_fail_wrong_add( ; 
CHECK-NEXT: [[XM1:%.*]] = add i32 [[Z:%.*]], -1 -; CHECK-NEXT: [[Y:%.*]] = xor i32 [[XM1]], [[X:%.*]] +; CHECK-NEXT: [[Y:%.*]] = xor i32 [[X:%.*]], [[XM1]] ; CHECK-NEXT: [[R:%.*]] = icmp ult i32 [[Y]], [[X]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -1288,7 +1288,7 @@ define i1 @blsmsk_isnt_p2_or_z_fail_wrong_add(i32 %x, i32 %z) { define i1 @blsmsk_isnt_p2_or_z_fail_bad_xor(i32 %x, i32 %z) { ; CHECK-LABEL: @blsmsk_isnt_p2_or_z_fail_bad_xor( ; CHECK-NEXT: [[XM1:%.*]] = add i32 [[X:%.*]], -1 -; CHECK-NEXT: [[Y:%.*]] = xor i32 [[XM1]], [[Z:%.*]] +; CHECK-NEXT: [[Y:%.*]] = xor i32 [[Z:%.*]], [[XM1]] ; CHECK-NEXT: [[R:%.*]] = icmp ult i32 [[Y]], [[X]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -1302,7 +1302,7 @@ define i1 @blsmsk_isnt_p2_or_z_fail_bad_xor(i32 %x, i32 %z) { define i1 @blsmsk_is_p2_or_z_fail_bad_cmp(i32 %x, i32 %z) { ; CHECK-LABEL: @blsmsk_is_p2_or_z_fail_bad_cmp( ; CHECK-NEXT: [[XM1:%.*]] = add i32 [[X:%.*]], -1 -; CHECK-NEXT: [[Y:%.*]] = xor i32 [[XM1]], [[X]] +; CHECK-NEXT: [[Y:%.*]] = xor i32 [[X]], [[XM1]] ; CHECK-NEXT: [[R:%.*]] = icmp uge i32 [[Y]], [[Z:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll index c7445a6ce2fe24..3482a8e9759929 100644 --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -1018,7 +1018,7 @@ define i1 @extract_value_sadd_fail(i8 %xx, i8 %yy) { define i1 @extract_value_usub(i8 %x, i8 %zz) { ; CHECK-LABEL: @extract_value_usub( ; CHECK-NEXT: [[Z:%.*]] = add nuw i8 [[ZZ:%.*]], 1 -; CHECK-NEXT: [[Y:%.*]] = add i8 [[Z]], [[X:%.*]] +; CHECK-NEXT: [[Y:%.*]] = add i8 [[X:%.*]], [[Z]] ; CHECK-NEXT: [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]]) ; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0 ; CHECK-NEXT: [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1 @@ -1062,7 +1062,7 @@ define i1 @extract_value_usub_fail(i8 %x, i8 %z) { define i1 
@extract_value_ssub(i8 %x, i8 %zz) { ; CHECK-LABEL: @extract_value_ssub( ; CHECK-NEXT: [[Z:%.*]] = add nuw i8 [[ZZ:%.*]], 1 -; CHECK-NEXT: [[Y:%.*]] = add i8 [[Z]], [[X:%.*]] +; CHECK-NEXT: [[Y:%.*]] = add i8 [[X:%.*]], [[Z]] ; CHECK-NEXT: [[SUB_SOV:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[Y]], i8 [[X]]) ; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_SOV]], 0 ; CHECK-NEXT: [[SOV:%.*]] = extractvalue { i8, i1 } [[SUB_SOV]], 1 @@ -1586,7 +1586,7 @@ define i32 @test_qnan_quiet_bit2(float nofpclass(sub norm inf snan) %x) { define i16 @test_simplify_mask(i32 %ui, float %x) { ; CHECK-LABEL: @test_simplify_mask( ; CHECK-NEXT: [[CONV:%.*]] = uitofp i32 [[UI:%.*]] to float -; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[CONV]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[CONV]] ; CHECK-NEXT: br i1 [[CMP]], label [[IF_ELSE:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: ret i16 31744 diff --git a/llvm/test/Transforms/InstCombine/known-never-nan.ll b/llvm/test/Transforms/InstCombine/known-never-nan.ll index 82075b37b4361f..1ca24671d65c49 100644 --- a/llvm/test/Transforms/InstCombine/known-never-nan.ll +++ b/llvm/test/Transforms/InstCombine/known-never-nan.ll @@ -62,7 +62,7 @@ define i1 @nnan_fadd(double %arg0, double %arg1) { define i1 @nnan_fadd_maybe_nan_lhs(double %arg0, double %arg1) { ; CHECK-LABEL: @nnan_fadd_maybe_nan_lhs( ; CHECK-NEXT: [[NNAN_ARG1:%.*]] = fadd nnan double [[ARG1:%.*]], 1.000000e+00 -; CHECK-NEXT: [[OP:%.*]] = fadd double [[NNAN_ARG1]], [[ARG0:%.*]] +; CHECK-NEXT: [[OP:%.*]] = fadd double [[ARG0:%.*]], [[NNAN_ARG1]] ; CHECK-NEXT: [[TMP:%.*]] = fcmp ord double [[OP]], 0.000000e+00 ; CHECK-NEXT: ret i1 [[TMP]] ; diff --git a/llvm/test/Transforms/InstCombine/ldexp-ext.ll b/llvm/test/Transforms/InstCombine/ldexp-ext.ll index 4608553eb88743..58710005d6cce0 100644 --- a/llvm/test/Transforms/InstCombine/ldexp-ext.ll +++ b/llvm/test/Transforms/InstCombine/ldexp-ext.ll @@ -4,7 +4,7 @@ define float 
@ldexp_zext_float(float %x, i1 %bool) { ; CHECK-LABEL: @ldexp_zext_float( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[BOOL:%.*]], float 2.000000e+00, float 1.000000e+00 -; CHECK-NEXT: [[LDEXP:%.*]] = fmul float [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[LDEXP:%.*]] = fmul float [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[LDEXP]] ; %zext = zext i1 %bool to i32 @@ -26,7 +26,7 @@ define float @ldexp_zext_float_negative(float %x, i8 %y) { define double @ldexp_zext_double(double %x, i1 %bool) { ; CHECK-LABEL: @ldexp_zext_double( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[BOOL:%.*]], double 2.000000e+00, double 1.000000e+00 -; CHECK-NEXT: [[LDEXP:%.*]] = fmul double [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[LDEXP:%.*]] = fmul double [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret double [[LDEXP]] ; %zext = zext i1 %bool to i32 @@ -37,7 +37,7 @@ define double @ldexp_zext_double(double %x, i1 %bool) { define double @ldexp_zext_double_fast_math(double %x, i1 %bool) { ; CHECK-LABEL: @ldexp_zext_double_fast_math( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[BOOL:%.*]], double 2.000000e+00, double 1.000000e+00 -; CHECK-NEXT: [[LDEXP:%.*]] = fmul reassoc double [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[LDEXP:%.*]] = fmul reassoc double [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret double [[LDEXP]] ; %zext = zext i1 %bool to i32 @@ -48,7 +48,7 @@ define double @ldexp_zext_double_fast_math(double %x, i1 %bool) { define <2 x float> @ldexp_zext_float_vector(<2 x float> %x, <2 x i1> %bool) { ; CHECK-LABEL: @ldexp_zext_float_vector( ; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[BOOL:%.*]], <2 x float> , <2 x float> -; CHECK-NEXT: [[LDEXP:%.*]] = fmul <2 x float> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[LDEXP:%.*]] = fmul <2 x float> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x float> [[LDEXP]] ; %zext = zext <2 x i1> %bool to <2 x i32> @@ -59,7 +59,7 @@ define <2 x float> @ldexp_zext_float_vector(<2 x float> %x, <2 x i1> %bool) { define float @ldexp_sext_float(float %x, i1 %bool) { ; CHECK-LABEL: @ldexp_sext_float( ; CHECK-NEXT: 
[[TMP1:%.*]] = select i1 [[BOOL:%.*]], float 5.000000e-01, float 1.000000e+00 -; CHECK-NEXT: [[LDEXP:%.*]] = fmul float [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[LDEXP:%.*]] = fmul float [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[LDEXP]] ; %sext = sext i1 %bool to i32 @@ -81,7 +81,7 @@ define float @ldexp_sext_float_negative(float %x, i8 %y) { define double @ldexp_sext_double(double %x, i1 %bool) { ; CHECK-LABEL: @ldexp_sext_double( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[BOOL:%.*]], double 5.000000e-01, double 1.000000e+00 -; CHECK-NEXT: [[LDEXP:%.*]] = fmul double [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[LDEXP:%.*]] = fmul double [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret double [[LDEXP]] ; %sext = sext i1 %bool to i32 @@ -92,7 +92,7 @@ define double @ldexp_sext_double(double %x, i1 %bool) { define double @ldexp_sext_double_fast_math(double %x, i1 %bool) { ; CHECK-LABEL: @ldexp_sext_double_fast_math( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[BOOL:%.*]], double 5.000000e-01, double 1.000000e+00 -; CHECK-NEXT: [[LDEXP:%.*]] = fmul reassoc double [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[LDEXP:%.*]] = fmul reassoc double [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret double [[LDEXP]] ; %sext = sext i1 %bool to i32 @@ -103,7 +103,7 @@ define double @ldexp_sext_double_fast_math(double %x, i1 %bool) { define <2 x float> @ldexp_sext_float_vector(<2 x float> %x, <2 x i1> %bool) { ; CHECK-LABEL: @ldexp_sext_float_vector( ; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[BOOL:%.*]], <2 x float> , <2 x float> -; CHECK-NEXT: [[LDEXP:%.*]] = fmul <2 x float> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[LDEXP:%.*]] = fmul <2 x float> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x float> [[LDEXP]] ; %sext = sext <2 x i1> %bool to <2 x i32> diff --git a/llvm/test/Transforms/InstCombine/log-pow.ll b/llvm/test/Transforms/InstCombine/log-pow.ll index 1dfe5c944eee75..b628e7cc57f15f 100644 --- a/llvm/test/Transforms/InstCombine/log-pow.ll +++ b/llvm/test/Transforms/InstCombine/log-pow.ll @@ -4,7 +4,7 @@ define double 
@log_pow(double %x, double %y) { ; CHECK-LABEL: @log_pow( ; CHECK-NEXT: [[LOG1:%.*]] = call fast double @llvm.log.f64(double [[X:%.*]]) -; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[LOG1]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[Y:%.*]], [[LOG1]] ; CHECK-NEXT: ret double [[MUL]] ; %pow = call fast double @pow(double %x, double %y) @@ -84,7 +84,7 @@ define double @log_powi_not_fast(double %x, i32 %y) { define float @log10f_powf(float %x, float %y) { ; CHECK-LABEL: @log10f_powf( ; CHECK-NEXT: [[LOG1:%.*]] = call fast float @llvm.log10.f32(float [[X:%.*]]) -; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[LOG1]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[Y:%.*]], [[LOG1]] ; CHECK-NEXT: ret float [[MUL]] ; %pow = call fast float @powf(float %x, float %y) @@ -95,7 +95,7 @@ define float @log10f_powf(float %x, float %y) { define <2 x double> @log2v_powv(<2 x double> %x, <2 x double> %y) { ; CHECK-LABEL: @log2v_powv( ; CHECK-NEXT: [[LOG1:%.*]] = call fast <2 x double> @llvm.log2.v2f64(<2 x double> [[X:%.*]]) -; CHECK-NEXT: [[MUL:%.*]] = fmul fast <2 x double> [[LOG1]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = fmul fast <2 x double> [[Y:%.*]], [[LOG1]] ; CHECK-NEXT: ret <2 x double> [[MUL]] ; %pow = call fast <2 x double> @llvm.pow.v2f64(<2 x double> %x, <2 x double> %y) diff --git a/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll b/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll index 20d60206ebcdff..cf0dc350328846 100644 --- a/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll @@ -4,8 +4,8 @@ define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[E:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[J:%.*]] = select i1 [[E]], i32 [[C:%.*]], i32 [[D:%.*]] +; CHECK-NEXT: [[E_NOT:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[J:%.*]] = select i1 [[E_NOT]], i32 [[C:%.*]], i32 
[[D:%.*]] ; CHECK-NEXT: ret i32 [[J]] ; %e = icmp slt i32 %a, %b @@ -19,8 +19,8 @@ define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) { define i32 @bar(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: @bar( -; CHECK-NEXT: [[E:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[J:%.*]] = select i1 [[E]], i32 [[C:%.*]], i32 [[D:%.*]] +; CHECK-NEXT: [[E_NOT:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[J:%.*]] = select i1 [[E_NOT]], i32 [[C:%.*]], i32 [[D:%.*]] ; CHECK-NEXT: ret i32 [[J]] ; %e = icmp slt i32 %a, %b @@ -34,8 +34,8 @@ define i32 @bar(i32 %a, i32 %b, i32 %c, i32 %d) { define i32 @goo(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: @goo( -; CHECK-NEXT: [[T0:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[T3:%.*]] = select i1 [[T0]], i32 [[C:%.*]], i32 [[D:%.*]] +; CHECK-NEXT: [[T0_NOT:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T0_NOT]], i32 [[C:%.*]], i32 [[D:%.*]] ; CHECK-NEXT: ret i32 [[T3]] ; %t0 = icmp slt i32 %a, %b @@ -141,8 +141,8 @@ define <2 x i32> @fold_inverted_icmp_vector_preds(<2 x i32> %a, <2 x i32> %b, <2 define i32 @par(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: @par( -; CHECK-NEXT: [[T0:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[T3:%.*]] = select i1 [[T0]], i32 [[C:%.*]], i32 [[D:%.*]] +; CHECK-NEXT: [[T0_NOT:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T0_NOT]], i32 [[C:%.*]], i32 [[D:%.*]] ; CHECK-NEXT: ret i32 [[T3]] ; %t0 = icmp slt i32 %a, %b @@ -343,10 +343,10 @@ define <2 x i64> @bitcast_select_multi_uses(<4 x i1> %cmp, <2 x i64> %a, <2 x i6 ; CHECK-LABEL: @bitcast_select_multi_uses( ; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP:%.*]] to <4 x i32> ; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[SEXT]] to <2 x i64> -; CHECK-NEXT: [[AND1:%.*]] = and <2 x i64> [[BC1]], [[A:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and <2 x i64> [[A:%.*]], [[BC1]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SEXT]] to <2 x 
i64> ; CHECK-NEXT: [[BC2:%.*]] = xor <2 x i64> [[TMP1]], -; CHECK-NEXT: [[AND2:%.*]] = and <2 x i64> [[BC2]], [[B:%.*]] +; CHECK-NEXT: [[AND2:%.*]] = and <2 x i64> [[B:%.*]], [[BC2]] ; CHECK-NEXT: [[OR:%.*]] = or <2 x i64> [[AND2]], [[AND1]] ; CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[AND2]], [[BC2]] ; CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[OR]], [[ADD]] @@ -393,7 +393,7 @@ define i1 @bools_logical(i1 %a, i1 %b, i1 %c) { define i1 @bools_multi_uses1(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @bools_multi_uses1( ; CHECK-NEXT: [[NOT:%.*]] = xor i1 [[C:%.*]], true -; CHECK-NEXT: [[AND1:%.*]] = and i1 [[NOT]], [[A:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i1 [[A:%.*]], [[NOT]] ; CHECK-NEXT: [[OR:%.*]] = select i1 [[C]], i1 [[B:%.*]], i1 [[A]] ; CHECK-NEXT: [[XOR:%.*]] = xor i1 [[OR]], [[AND1]] ; CHECK-NEXT: ret i1 [[XOR]] diff --git a/llvm/test/Transforms/InstCombine/logical-select.ll b/llvm/test/Transforms/InstCombine/logical-select.ll index 6e2ed6bf796d08..62a63839704a44 100644 --- a/llvm/test/Transforms/InstCombine/logical-select.ll +++ b/llvm/test/Transforms/InstCombine/logical-select.ll @@ -9,8 +9,8 @@ declare void @use2(<2 x i1>) define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[E:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[J:%.*]] = select i1 [[E]], i32 [[C:%.*]], i32 [[D:%.*]] +; CHECK-NEXT: [[E_NOT:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[J:%.*]] = select i1 [[E_NOT]], i32 [[C:%.*]], i32 [[D:%.*]] ; CHECK-NEXT: ret i32 [[J]] ; %e = icmp slt i32 %a, %b @@ -24,8 +24,8 @@ define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) { define i32 @bar(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: @bar( -; CHECK-NEXT: [[E:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[J:%.*]] = select i1 [[E]], i32 [[C:%.*]], i32 [[D:%.*]] +; CHECK-NEXT: [[E_NOT:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[J:%.*]] = select i1 [[E_NOT]], i32 [[C:%.*]], i32 [[D:%.*]] ; CHECK-NEXT: ret i32 [[J]] ; %e = 
icmp slt i32 %a, %b @@ -39,8 +39,8 @@ define i32 @bar(i32 %a, i32 %b, i32 %c, i32 %d) { define i32 @goo(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: @goo( -; CHECK-NEXT: [[T0:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[T3:%.*]] = select i1 [[T0]], i32 [[C:%.*]], i32 [[D:%.*]] +; CHECK-NEXT: [[T0_NOT:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T0_NOT]], i32 [[C:%.*]], i32 [[D:%.*]] ; CHECK-NEXT: ret i32 [[T3]] ; %t0 = icmp slt i32 %a, %b @@ -146,8 +146,8 @@ define <2 x i32> @fold_inverted_icmp_vector_preds(<2 x i32> %a, <2 x i32> %b, <2 define i32 @par(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: @par( -; CHECK-NEXT: [[T0:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[T3:%.*]] = select i1 [[T0]], i32 [[C:%.*]], i32 [[D:%.*]] +; CHECK-NEXT: [[T0_NOT:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T0_NOT]], i32 [[C:%.*]], i32 [[D:%.*]] ; CHECK-NEXT: ret i32 [[T3]] ; %t0 = icmp slt i32 %a, %b @@ -348,10 +348,10 @@ define <2 x i64> @bitcast_select_multi_uses(<4 x i1> %cmp, <2 x i64> %a, <2 x i6 ; CHECK-LABEL: @bitcast_select_multi_uses( ; CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP:%.*]] to <4 x i32> ; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[SEXT]] to <2 x i64> -; CHECK-NEXT: [[AND1:%.*]] = and <2 x i64> [[BC1]], [[A:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and <2 x i64> [[A:%.*]], [[BC1]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SEXT]] to <2 x i64> ; CHECK-NEXT: [[BC2:%.*]] = xor <2 x i64> [[TMP1]], -; CHECK-NEXT: [[AND2:%.*]] = and <2 x i64> [[BC2]], [[B:%.*]] +; CHECK-NEXT: [[AND2:%.*]] = and <2 x i64> [[B:%.*]], [[BC2]] ; CHECK-NEXT: [[OR:%.*]] = or <2 x i64> [[AND2]], [[AND1]] ; CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[AND2]], [[BC2]] ; CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[OR]], [[ADD]] @@ -398,7 +398,7 @@ define i1 @bools_logical(i1 %a, i1 %b, i1 %c) { define i1 @bools_multi_uses1(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @bools_multi_uses1( ; 
CHECK-NEXT: [[NOT:%.*]] = xor i1 [[C:%.*]], true -; CHECK-NEXT: [[AND1:%.*]] = and i1 [[NOT]], [[A:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i1 [[A:%.*]], [[NOT]] ; CHECK-NEXT: [[OR:%.*]] = select i1 [[C]], i1 [[B:%.*]], i1 [[A]] ; CHECK-NEXT: [[XOR:%.*]] = xor i1 [[OR]], [[AND1]] ; CHECK-NEXT: ret i1 [[XOR]] @@ -766,7 +766,7 @@ define <8 x i3> @bitcast_vec_cond_commute1(<3 x i1> noundef %cond, <8 x i3> %pc, ; CHECK-NEXT: [[T9:%.*]] = bitcast <3 x i8> [[S]] to <8 x i3> ; CHECK-NEXT: [[NOTT9:%.*]] = xor <8 x i3> [[T9]], ; CHECK-NEXT: [[T11:%.*]] = and <8 x i3> [[C]], [[NOTT9]] -; CHECK-NEXT: [[T12:%.*]] = and <8 x i3> [[T9]], [[D:%.*]] +; CHECK-NEXT: [[T12:%.*]] = and <8 x i3> [[D:%.*]], [[T9]] ; CHECK-NEXT: [[R:%.*]] = or disjoint <8 x i3> [[T11]], [[T12]] ; CHECK-NEXT: ret <8 x i3> [[R]] ; @@ -831,8 +831,8 @@ define <2 x i64> @bitcast_fp_vec_cond(<2 x double> noundef %s, <2 x i64> %c, <2 ; CHECK-LABEL: @bitcast_fp_vec_cond( ; CHECK-NEXT: [[T9:%.*]] = bitcast <2 x double> [[S:%.*]] to <2 x i64> ; CHECK-NEXT: [[NOTT9:%.*]] = xor <2 x i64> [[T9]], -; CHECK-NEXT: [[T11:%.*]] = and <2 x i64> [[NOTT9]], [[C:%.*]] -; CHECK-NEXT: [[T12:%.*]] = and <2 x i64> [[T9]], [[D:%.*]] +; CHECK-NEXT: [[T11:%.*]] = and <2 x i64> [[C:%.*]], [[NOTT9]] +; CHECK-NEXT: [[T12:%.*]] = and <2 x i64> [[D:%.*]], [[T9]] ; CHECK-NEXT: [[R:%.*]] = or disjoint <2 x i64> [[T11]], [[T12]] ; CHECK-NEXT: ret <2 x i64> [[R]] ; @@ -851,8 +851,8 @@ define <2 x i64> @bitcast_int_vec_cond(i1 noundef %b, <2 x i64> %c, <2 x i64> %d ; CHECK-NEXT: [[S:%.*]] = sext i1 [[B:%.*]] to i128 ; CHECK-NEXT: [[T9:%.*]] = bitcast i128 [[S]] to <2 x i64> ; CHECK-NEXT: [[NOTT9:%.*]] = xor <2 x i64> [[T9]], -; CHECK-NEXT: [[T11:%.*]] = and <2 x i64> [[NOTT9]], [[C:%.*]] -; CHECK-NEXT: [[T12:%.*]] = and <2 x i64> [[T9]], [[D:%.*]] +; CHECK-NEXT: [[T11:%.*]] = and <2 x i64> [[C:%.*]], [[NOTT9]] +; CHECK-NEXT: [[T12:%.*]] = and <2 x i64> [[D:%.*]], [[T9]] ; CHECK-NEXT: [[R:%.*]] = or disjoint <2 x i64> [[T11]], [[T12]] ; 
CHECK-NEXT: ret <2 x i64> [[R]] ; @@ -1126,7 +1126,7 @@ define i1 @not_d_bools_negative_use2(i1 %c, i1 %x, i1 %y) { define i1 @logical_and_or_with_not_op(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @logical_and_or_with_not_op( ; CHECK-NEXT: [[NOT:%.*]] = xor i1 [[C:%.*]], true -; CHECK-NEXT: [[OR:%.*]] = or i1 [[NOT]], [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i1 [[B:%.*]], [[NOT]] ; CHECK-NEXT: [[AND:%.*]] = select i1 [[A:%.*]], i1 [[OR]], i1 false ; CHECK-NEXT: ret i1 [[AND]] ; @@ -1217,7 +1217,7 @@ define i1 @logical_and_or_with_common_not_op_variant5(i1 %a) { define i1 @logical_or_and_with_not_op(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @logical_or_and_with_not_op( ; CHECK-NEXT: [[NOT:%.*]] = xor i1 [[C:%.*]], true -; CHECK-NEXT: [[AND:%.*]] = and i1 [[NOT]], [[B:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i1 [[B:%.*]], [[NOT]] ; CHECK-NEXT: [[OR:%.*]] = select i1 [[A:%.*]], i1 true, i1 [[AND]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -1325,9 +1325,9 @@ define i1 @reduce_logical_and2(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @reduce_logical_and2( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[C:%.*]], true -; CHECK-NEXT: [[B:%.*]] = and i1 [[TMP0]], [[B1:%.*]] -; CHECK-NEXT: [[AND3:%.*]] = select i1 [[AND2:%.*]], i1 [[B]], i1 false -; CHECK-NEXT: ret i1 [[AND3]] +; CHECK-NEXT: [[TMP1:%.*]] = and i1 [[B:%.*]], [[TMP0]] +; CHECK-NEXT: [[AND2:%.*]] = select i1 [[A:%.*]], i1 [[TMP1]], i1 false +; CHECK-NEXT: ret i1 [[AND2]] ; bb: %or = xor i1 %c, %b @@ -1373,9 +1373,9 @@ bb: define i1 @reduce_logical_or2(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @reduce_logical_or2( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[B:%.*]] = or i1 [[C:%.*]], [[B1:%.*]] -; CHECK-NEXT: [[AND3:%.*]] = select i1 [[AND2:%.*]], i1 true, i1 [[B]] -; CHECK-NEXT: ret i1 [[AND3]] +; CHECK-NEXT: [[TMP0:%.*]] = or i1 [[C:%.*]], [[B:%.*]] +; CHECK-NEXT: [[AND2:%.*]] = select i1 [[A:%.*]], i1 true, i1 [[TMP0]] +; CHECK-NEXT: ret i1 [[AND2]] ; bb: %or = xor i1 %c, %b @@ -1493,7 +1493,7 @@ define i1 @reduce_bitwise_and1(i1 %a, i32 
%b, i32 %c) { ; CHECK-NEXT: bb: ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[B:%.*]], 6 ; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[C:%.*]], [[B]] -; CHECK-NEXT: [[AND1:%.*]] = or i1 [[CMP1]], [[A:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = or i1 [[A:%.*]], [[CMP1]] ; CHECK-NEXT: [[AND2:%.*]] = and i1 [[AND1]], [[CMP]] ; CHECK-NEXT: ret i1 [[AND2]] ; diff --git a/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll b/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll index 5d058b20be7207..89522a00d78949 100644 --- a/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll +++ b/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll @@ -252,7 +252,7 @@ define i1 @scalar_i32_lshr_and_negC_eq_nonzero(i32 %x, i32 %y) { define i1 @scalar_i8_lshr_and_negC_eq_not_negatedPowerOf2(i8 %x, i8 %y) { ; CHECK-LABEL: @scalar_i8_lshr_and_negC_eq_not_negatedPowerOf2( ; CHECK-NEXT: [[TMP1:%.*]] = shl i8 -3, [[Y:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/lshr.ll b/llvm/test/Transforms/InstCombine/lshr.ll index 01e07985ba6ab5..4360714c78caa6 100644 --- a/llvm/test/Transforms/InstCombine/lshr.ll +++ b/llvm/test/Transforms/InstCombine/lshr.ll @@ -742,7 +742,7 @@ define i32 @mul_splat_fold_wrong_lshr_const(i32 %x) { define i32 @mul_splat_fold_no_nuw(i32 %x) { ; CHECK-LABEL: @mul_splat_fold_no_nuw( ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 16 -; CHECK-NEXT: [[T:%.*]] = add nsw i32 [[TMP1]], [[X]] +; CHECK-NEXT: [[T:%.*]] = add nsw i32 [[X]], [[TMP1]] ; CHECK-NEXT: ret i32 [[T]] ; %m = mul nsw i32 %x, 65537 @@ -1406,7 +1406,7 @@ define i2 @bool_add_lshr(i1 %a, i1 %b) { define i4 @not_bool_add_lshr(i2 %a, i2 %b) { ; CHECK-LABEL: @not_bool_add_lshr( ; CHECK-NEXT: [[TMP1:%.*]] = xor i2 [[A:%.*]], -1 -; CHECK-NEXT: [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i2 [[TMP1]], 
[[B:%.*]] +; CHECK-NEXT: [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ugt i2 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[LSHR:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i4 ; CHECK-NEXT: ret i4 [[LSHR]] ; diff --git a/llvm/test/Transforms/InstCombine/masked-merge-add.ll b/llvm/test/Transforms/InstCombine/masked-merge-add.ll index 0484369e99d6a5..5ef53ad5150137 100644 --- a/llvm/test/Transforms/InstCombine/masked-merge-add.ll +++ b/llvm/test/Transforms/InstCombine/masked-merge-add.ll @@ -20,7 +20,7 @@ define i32 @p(i32 %x, i32 %y, i32 noundef %m) { ; CHECK-LABEL: @p( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND]], [[AND1]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -35,7 +35,7 @@ define <2 x i32> @p_splatvec(<2 x i32> %x, <2 x i32> %y, <2 x i32> noundef %m) { ; CHECK-LABEL: @p_splatvec( ; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor <2 x i32> [[M]], -; CHECK-NEXT: [[AND1:%.*]] = and <2 x i32> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and <2 x i32> [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint <2 x i32> [[AND]], [[AND1]] ; CHECK-NEXT: ret <2 x i32> [[RET]] ; @@ -65,7 +65,7 @@ define <3 x i32> @p_vec_poison(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) ; CHECK-LABEL: @p_vec_poison( ; CHECK-NEXT: [[AND:%.*]] = and <3 x i32> [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor <3 x i32> [[M]], -; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint <3 x i32> [[AND]], [[AND1]] ; CHECK-NEXT: ret <3 x i32> [[RET]] ; @@ -199,7 +199,7 @@ define i32 @p_commutative0(i32 %x, i32 %y, i32 noundef %m) { ; CHECK-LABEL: @p_commutative0( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[M:%.*]], [[X:%.*]] ; CHECK-NEXT: 
[[NEG:%.*]] = xor i32 [[M]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND]], [[AND1]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -231,7 +231,7 @@ define i32 @p_commutative2(i32 %x, i32 %y, i32 noundef %m) { ; CHECK-LABEL: @p_commutative2( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND1]], [[AND]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -263,7 +263,7 @@ define i32 @p_commutative4(i32 %x, i32 %y, i32 noundef %m) { ; CHECK-LABEL: @p_commutative4( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[M:%.*]], [[X:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND1]], [[AND]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -333,7 +333,7 @@ define i32 @n0_oneuse(i32 %x, i32 %y, i32 noundef %m) { ; CHECK-LABEL: @n0_oneuse( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND]], [[AND1]] ; CHECK-NEXT: call void @use32(i32 [[AND]]) ; CHECK-NEXT: call void @use32(i32 [[NEG]]) @@ -390,7 +390,7 @@ define i32 @n2_badmask(i32 %x, i32 %y, i32 %m1, i32 %m2) { ; CHECK-LABEL: @n2_badmask( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[M1:%.*]], [[X:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M2:%.*]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = add i32 [[AND]], [[AND1]] ; CHECK-NEXT: ret i32 [[RET]] ; diff --git 
a/llvm/test/Transforms/InstCombine/masked-merge-and-of-ors.ll b/llvm/test/Transforms/InstCombine/masked-merge-and-of-ors.ll index dc76743c565ed4..639478dfcc6fe8 100644 --- a/llvm/test/Transforms/InstCombine/masked-merge-and-of-ors.ll +++ b/llvm/test/Transforms/InstCombine/masked-merge-and-of-ors.ll @@ -17,7 +17,7 @@ define i32 @p(i32 %x, i32 %y, i32 %m) { ; CHECK-LABEL: @p( ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y:%.*]], [[M]] ; CHECK-NEXT: [[RET:%.*]] = and i32 [[OR]], [[OR1]] ; CHECK-NEXT: ret i32 [[RET]] @@ -32,7 +32,7 @@ define i32 @p(i32 %x, i32 %y, i32 %m) { define <2 x i32> @p_splatvec(<2 x i32> %x, <2 x i32> %y, <2 x i32> %m) { ; CHECK-LABEL: @p_splatvec( ; CHECK-NEXT: [[NEG:%.*]] = xor <2 x i32> [[M:%.*]], -; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[NEG]], [[X:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[X:%.*]], [[NEG]] ; CHECK-NEXT: [[OR1:%.*]] = or <2 x i32> [[Y:%.*]], [[M]] ; CHECK-NEXT: [[RET:%.*]] = and <2 x i32> [[OR]], [[OR1]] ; CHECK-NEXT: ret <2 x i32> [[RET]] @@ -125,7 +125,7 @@ declare i32 @gen32() define i32 @p_commutative0(i32 %x, i32 %y, i32 %m) { ; CHECK-LABEL: @p_commutative0( ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y:%.*]], [[M]] ; CHECK-NEXT: [[RET:%.*]] = and i32 [[OR]], [[OR1]] ; CHECK-NEXT: ret i32 [[RET]] @@ -141,8 +141,8 @@ define i32 @p_commutative1(i32 %x, i32 %m) { ; CHECK-LABEL: @p_commutative1( ; CHECK-NEXT: [[Y:%.*]] = call i32 @gen32() ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X:%.*]] -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y]], [[M]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[M]], [[Y]] ; CHECK-NEXT: [[RET:%.*]] = and 
i32 [[OR]], [[OR1]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -157,7 +157,7 @@ define i32 @p_commutative1(i32 %x, i32 %m) { define i32 @p_commutative2(i32 %x, i32 %y, i32 %m) { ; CHECK-LABEL: @p_commutative2( ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y:%.*]], [[M]] ; CHECK-NEXT: [[RET:%.*]] = and i32 [[OR1]], [[OR]] ; CHECK-NEXT: ret i32 [[RET]] @@ -173,8 +173,8 @@ define i32 @p_commutative3(i32 %x, i32 %m) { ; CHECK-LABEL: @p_commutative3( ; CHECK-NEXT: [[Y:%.*]] = call i32 @gen32() ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X:%.*]] -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y]], [[M]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[M]], [[Y]] ; CHECK-NEXT: [[RET:%.*]] = and i32 [[OR]], [[OR1]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -189,7 +189,7 @@ define i32 @p_commutative3(i32 %x, i32 %m) { define i32 @p_commutative4(i32 %x, i32 %y, i32 %m) { ; CHECK-LABEL: @p_commutative4( ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y:%.*]], [[M]] ; CHECK-NEXT: [[RET:%.*]] = and i32 [[OR1]], [[OR]] ; CHECK-NEXT: ret i32 [[RET]] @@ -205,8 +205,8 @@ define i32 @p_commutative5(i32 %x, i32 %m) { ; CHECK-LABEL: @p_commutative5( ; CHECK-NEXT: [[Y:%.*]] = call i32 @gen32() ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X:%.*]] -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y]], [[M]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[M]], [[Y]] ; CHECK-NEXT: [[RET:%.*]] = and i32 [[OR1]], [[OR]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -222,8 +222,8 @@ define i32 @p_commutative6(i32 %x, i32 %m) { ; CHECK-LABEL: @p_commutative6( ; 
CHECK-NEXT: [[Y:%.*]] = call i32 @gen32() ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X:%.*]] -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y]], [[M]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[M]], [[Y]] ; CHECK-NEXT: [[RET:%.*]] = and i32 [[OR1]], [[OR]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -259,7 +259,7 @@ declare void @use32(i32) define i32 @n0_oneuse_of_neg_is_ok_0(i32 %x, i32 %y, i32 %m) { ; CHECK-LABEL: @n0_oneuse_of_neg_is_ok_0( ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y:%.*]], [[M]] ; CHECK-NEXT: [[RET:%.*]] = and i32 [[OR]], [[OR1]] ; CHECK-NEXT: call void @use32(i32 [[NEG]]) @@ -276,7 +276,7 @@ define i32 @n0_oneuse_of_neg_is_ok_0(i32 %x, i32 %y, i32 %m) { define i32 @n0_oneuse_1(i32 %x, i32 %y, i32 %m) { ; CHECK-LABEL: @n0_oneuse_1( ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y:%.*]], [[M]] ; CHECK-NEXT: [[RET:%.*]] = and i32 [[OR]], [[OR1]] ; CHECK-NEXT: call void @use32(i32 [[OR]]) @@ -293,7 +293,7 @@ define i32 @n0_oneuse_1(i32 %x, i32 %y, i32 %m) { define i32 @n0_oneuse_2(i32 %x, i32 %y, i32 %m) { ; CHECK-LABEL: @n0_oneuse_2( ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y:%.*]], [[M]] ; CHECK-NEXT: [[RET:%.*]] = and i32 [[OR]], [[OR1]] ; CHECK-NEXT: call void @use32(i32 [[OR1]]) @@ -310,7 +310,7 @@ define i32 @n0_oneuse_2(i32 %x, i32 %y, i32 %m) { define i32 @n0_oneuse_3(i32 %x, i32 %y, i32 %m) { ; CHECK-LABEL: @n0_oneuse_3( ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 
[[NEG]], [[X:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y:%.*]], [[M]] ; CHECK-NEXT: [[RET:%.*]] = and i32 [[OR]], [[OR1]] ; CHECK-NEXT: call void @use32(i32 [[NEG]]) @@ -329,7 +329,7 @@ define i32 @n0_oneuse_3(i32 %x, i32 %y, i32 %m) { define i32 @n0_oneuse_4(i32 %x, i32 %y, i32 %m) { ; CHECK-LABEL: @n0_oneuse_4( ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y:%.*]], [[M]] ; CHECK-NEXT: [[RET:%.*]] = and i32 [[OR]], [[OR1]] ; CHECK-NEXT: call void @use32(i32 [[NEG]]) @@ -348,7 +348,7 @@ define i32 @n0_oneuse_4(i32 %x, i32 %y, i32 %m) { define i32 @n0_oneuse_5(i32 %x, i32 %y, i32 %m) { ; CHECK-LABEL: @n0_oneuse_5( ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y:%.*]], [[M]] ; CHECK-NEXT: [[RET:%.*]] = and i32 [[OR]], [[OR1]] ; CHECK-NEXT: call void @use32(i32 [[NEG]]) @@ -369,7 +369,7 @@ define i32 @n0_oneuse_5(i32 %x, i32 %y, i32 %m) { define i32 @n0_oneuse_6(i32 %x, i32 %y, i32 %m) { ; CHECK-LABEL: @n0_oneuse_6( ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y:%.*]], [[M]] ; CHECK-NEXT: [[RET:%.*]] = and i32 [[OR]], [[OR1]] ; CHECK-NEXT: call void @use32(i32 [[OR]]) @@ -456,7 +456,7 @@ define i32 @n1_badxor(i32 %x, i32 %y, i32 %m) { define i32 @n2_badmask(i32 %x, i32 %y, i32 %m1, i32 %m2) { ; CHECK-LABEL: @n2_badmask( ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M2:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[NEG]] ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[M1:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[RET:%.*]] = and i32 [[OR]], 
[[OR1]] ; CHECK-NEXT: ret i32 [[RET]] diff --git a/llvm/test/Transforms/InstCombine/masked-merge-or.ll b/llvm/test/Transforms/InstCombine/masked-merge-or.ll index 0531a532fc7e0a..dd2ac6dfe51091 100644 --- a/llvm/test/Transforms/InstCombine/masked-merge-or.ll +++ b/llvm/test/Transforms/InstCombine/masked-merge-or.ll @@ -20,7 +20,7 @@ define i32 @p(i32 %x, i32 %y, i32 noundef %m) { ; CHECK-LABEL: @p( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND]], [[AND1]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -35,7 +35,7 @@ define <2 x i32> @p_splatvec(<2 x i32> %x, <2 x i32> %y, <2 x i32> noundef %m) { ; CHECK-LABEL: @p_splatvec( ; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor <2 x i32> [[M]], -; CHECK-NEXT: [[AND1:%.*]] = and <2 x i32> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and <2 x i32> [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint <2 x i32> [[AND]], [[AND1]] ; CHECK-NEXT: ret <2 x i32> [[RET]] ; @@ -65,7 +65,7 @@ define <3 x i32> @p_vec_poison(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) ; CHECK-LABEL: @p_vec_poison( ; CHECK-NEXT: [[AND:%.*]] = and <3 x i32> [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor <3 x i32> [[M]], -; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint <3 x i32> [[AND]], [[AND1]] ; CHECK-NEXT: ret <3 x i32> [[RET]] ; @@ -199,7 +199,7 @@ define i32 @p_commutative0(i32 %x, i32 %y, i32 noundef %m) { ; CHECK-LABEL: @p_commutative0( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[M:%.*]], [[X:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; 
CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND]], [[AND1]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -231,7 +231,7 @@ define i32 @p_commutative2(i32 %x, i32 %y, i32 noundef %m) { ; CHECK-LABEL: @p_commutative2( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND1]], [[AND]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -263,7 +263,7 @@ define i32 @p_commutative4(i32 %x, i32 %y, i32 noundef %m) { ; CHECK-LABEL: @p_commutative4( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[M:%.*]], [[X:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND1]], [[AND]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -333,7 +333,7 @@ define i32 @n0_oneuse(i32 %x, i32 %y, i32 noundef %m) { ; CHECK-LABEL: @n0_oneuse( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND]], [[AND1]] ; CHECK-NEXT: call void @use32(i32 [[AND]]) ; CHECK-NEXT: call void @use32(i32 [[NEG]]) @@ -390,7 +390,7 @@ define i32 @n2_badmask(i32 %x, i32 %y, i32 %m1, i32 %m2) { ; CHECK-LABEL: @n2_badmask( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[M1:%.*]], [[X:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M2:%.*]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or i32 [[AND]], [[AND1]] ; CHECK-NEXT: ret i32 [[RET]] ; diff --git a/llvm/test/Transforms/InstCombine/masked-merge-xor.ll b/llvm/test/Transforms/InstCombine/masked-merge-xor.ll index 74cc7625aebff5..7ed1f3fdfdab64 100644 --- 
a/llvm/test/Transforms/InstCombine/masked-merge-xor.ll +++ b/llvm/test/Transforms/InstCombine/masked-merge-xor.ll @@ -20,7 +20,7 @@ define i32 @p(i32 %x, i32 %y, i32 noundef %m) { ; CHECK-LABEL: @p( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND]], [[AND1]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -35,7 +35,7 @@ define <2 x i32> @p_splatvec(<2 x i32> %x, <2 x i32> %y, <2 x i32> noundef %m) { ; CHECK-LABEL: @p_splatvec( ; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor <2 x i32> [[M]], -; CHECK-NEXT: [[AND1:%.*]] = and <2 x i32> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and <2 x i32> [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint <2 x i32> [[AND]], [[AND1]] ; CHECK-NEXT: ret <2 x i32> [[RET]] ; @@ -65,7 +65,7 @@ define <3 x i32> @p_vec_poison(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) ; CHECK-LABEL: @p_vec_poison( ; CHECK-NEXT: [[AND:%.*]] = and <3 x i32> [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor <3 x i32> [[M]], -; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint <3 x i32> [[AND]], [[AND1]] ; CHECK-NEXT: ret <3 x i32> [[RET]] ; @@ -84,8 +84,8 @@ define i32 @p_constmask(i32 %x, i32 %y) { ; CHECK-LABEL: @p_constmask( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 65280 ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], -65281 -; CHECK-NEXT: [[RET1:%.*]] = or disjoint i32 [[AND]], [[AND1]] -; CHECK-NEXT: ret i32 [[RET1]] +; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND]], [[AND1]] +; CHECK-NEXT: ret i32 [[RET]] ; %and = and i32 %x, 65280 %and1 = and i32 %y, -65281 @@ -97,8 +97,8 @@ define <2 x i32> @p_constmask_splatvec(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: 
@p_constmask_splatvec( ; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[AND1:%.*]] = and <2 x i32> [[Y:%.*]], -; CHECK-NEXT: [[RET1:%.*]] = or disjoint <2 x i32> [[AND]], [[AND1]] -; CHECK-NEXT: ret <2 x i32> [[RET1]] +; CHECK-NEXT: [[RET:%.*]] = or disjoint <2 x i32> [[AND]], [[AND1]] +; CHECK-NEXT: ret <2 x i32> [[RET]] ; %and = and <2 x i32> %x, %and1 = and <2 x i32> %y, @@ -140,8 +140,8 @@ define i32 @p_constmask2(i32 %x, i32 %y) { ; CHECK-LABEL: @p_constmask2( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 61440 ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], -65281 -; CHECK-NEXT: [[RET1:%.*]] = or disjoint i32 [[AND]], [[AND1]] -; CHECK-NEXT: ret i32 [[RET1]] +; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND]], [[AND1]] +; CHECK-NEXT: ret i32 [[RET]] ; %and = and i32 %x, 61440 %and1 = and i32 %y, -65281 @@ -153,8 +153,8 @@ define <2 x i32> @p_constmask2_splatvec(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @p_constmask2_splatvec( ; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[AND1:%.*]] = and <2 x i32> [[Y:%.*]], -; CHECK-NEXT: [[RET1:%.*]] = or disjoint <2 x i32> [[AND]], [[AND1]] -; CHECK-NEXT: ret <2 x i32> [[RET1]] +; CHECK-NEXT: [[RET:%.*]] = or disjoint <2 x i32> [[AND]], [[AND1]] +; CHECK-NEXT: ret <2 x i32> [[RET]] ; %and = and <2 x i32> %x, %and1 = and <2 x i32> %y, @@ -199,7 +199,7 @@ define i32 @p_commutative0(i32 %x, i32 %y, i32 noundef %m) { ; CHECK-LABEL: @p_commutative0( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[M:%.*]], [[X:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND]], [[AND1]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -231,7 +231,7 @@ define i32 @p_commutative2(i32 %x, i32 %y, i32 noundef %m) { ; CHECK-LABEL: @p_commutative2( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M]], -1 -; CHECK-NEXT: 
[[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND1]], [[AND]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -263,7 +263,7 @@ define i32 @p_commutative4(i32 %x, i32 %y, i32 noundef %m) { ; CHECK-LABEL: @p_commutative4( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[M:%.*]], [[X:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND1]], [[AND]] ; CHECK-NEXT: ret i32 [[RET]] ; @@ -312,8 +312,8 @@ define i32 @p_constmask_commutative(i32 %x, i32 %y) { ; CHECK-LABEL: @p_constmask_commutative( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 65280 ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], -65281 -; CHECK-NEXT: [[RET1:%.*]] = or disjoint i32 [[AND1]], [[AND]] -; CHECK-NEXT: ret i32 [[RET1]] +; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND1]], [[AND]] +; CHECK-NEXT: ret i32 [[RET]] ; %and = and i32 %x, 65280 %and1 = and i32 %y, -65281 @@ -333,7 +333,7 @@ define i32 @n0_oneuse(i32 %x, i32 %y, i32 noundef %m) { ; CHECK-LABEL: @n0_oneuse( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND]], [[AND1]] ; CHECK-NEXT: call void @use32(i32 [[AND]]) ; CHECK-NEXT: call void @use32(i32 [[NEG]]) @@ -354,10 +354,10 @@ define i32 @n0_constmask_oneuse(i32 %x, i32 %y) { ; CHECK-LABEL: @n0_constmask_oneuse( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 65280 ; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], -65281 -; CHECK-NEXT: [[RET1:%.*]] = or disjoint i32 [[AND]], [[AND1]] +; CHECK-NEXT: [[RET:%.*]] = or disjoint i32 [[AND]], [[AND1]] ; CHECK-NEXT: call void @use32(i32 [[AND]]) ; CHECK-NEXT: call void @use32(i32 [[AND1]]) -; 
CHECK-NEXT: ret i32 [[RET1]] +; CHECK-NEXT: ret i32 [[RET]] ; %and = and i32 %x, 65280 %and1 = and i32 %y, -65281 @@ -390,7 +390,7 @@ define i32 @n2_badmask(i32 %x, i32 %y, i32 %m1, i32 %m2) { ; CHECK-LABEL: @n2_badmask( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[M1:%.*]], [[X:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor i32 [[M2:%.*]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: [[RET:%.*]] = xor i32 [[AND]], [[AND1]] ; CHECK-NEXT: ret i32 [[RET]] ; diff --git a/llvm/test/Transforms/InstCombine/minmax-fold.ll b/llvm/test/Transforms/InstCombine/minmax-fold.ll index 3e870c695cf1a5..26cd4996e687d5 100644 --- a/llvm/test/Transforms/InstCombine/minmax-fold.ll +++ b/llvm/test/Transforms/InstCombine/minmax-fold.ll @@ -99,7 +99,7 @@ define i32 @t8(i64 %a, i32 %b) { ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.smin.i64(i64 [[A:%.*]], i64 -32767) ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i32 [[B:%.*]], 42 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[B]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[B]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 true, i1 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 ; CHECK-NEXT: ret i32 [[TMP6]] @@ -1360,11 +1360,11 @@ define i8 @PR14613_smax(i8 %x) { define i8 @PR46271(<2 x i8> %x) { ; CHECK-LABEL: @PR46271( -; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[A_INV:%.*]] = icmp slt <2 x i8> [[X]], zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[A_INV]], <2 x i8> , <2 x i8> [[TMP3]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i8> [[TMP1]], i64 1 -; CHECK-NEXT: ret i8 [[TMP2]] +; CHECK-NEXT: [[NOT:%.*]] = select <2 x i1> [[A_INV]], <2 x i8> , <2 x i8> [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = extractelement <2 x i8> [[NOT]], i64 1 +; CHECK-NEXT: ret i8 [[R]] ; %a = icmp sgt <2 x i8> %x, %b = 
select <2 x i1> %a, <2 x i8> %x, <2 x i8> diff --git a/llvm/test/Transforms/InstCombine/minmax-of-xor-x.ll b/llvm/test/Transforms/InstCombine/minmax-of-xor-x.ll index b8430da451f9a9..8b896632b8adcc 100644 --- a/llvm/test/Transforms/InstCombine/minmax-of-xor-x.ll +++ b/llvm/test/Transforms/InstCombine/minmax-of-xor-x.ll @@ -76,8 +76,8 @@ define i8 @smin_xor_Cpow2_neg(i8 %x) { define i8 @umax_xor_pow2(i8 %x, i8 %y) { ; CHECK-LABEL: @umax_xor_pow2( ; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[YP2:%.*]] = and i8 [[NY]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = or i8 [[YP2]], [[X:%.*]] +; CHECK-NEXT: [[YP2:%.*]] = and i8 [[Y]], [[NY]] +; CHECK-NEXT: [[R:%.*]] = or i8 [[X:%.*]], [[YP2]] ; CHECK-NEXT: ret i8 [[R]] ; %ny = sub i8 0, %y @@ -90,9 +90,9 @@ define i8 @umax_xor_pow2(i8 %x, i8 %y) { define <2 x i8> @umin_xor_pow2(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @umin_xor_pow2( ; CHECK-NEXT: [[NY:%.*]] = sub <2 x i8> zeroinitializer, [[Y:%.*]] -; CHECK-NEXT: [[YP2:%.*]] = and <2 x i8> [[NY]], [[Y]] +; CHECK-NEXT: [[YP2:%.*]] = and <2 x i8> [[Y]], [[NY]] ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> [[YP2]], -; CHECK-NEXT: [[R:%.*]] = and <2 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = and <2 x i8> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %ny = sub <2 x i8> , %y @@ -105,8 +105,8 @@ define <2 x i8> @umin_xor_pow2(<2 x i8> %x, <2 x i8> %y) { define i8 @smax_xor_pow2_unk(i8 %x, i8 %y) { ; CHECK-LABEL: @smax_xor_pow2_unk( ; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[YP2:%.*]] = and i8 [[NY]], [[Y]] -; CHECK-NEXT: [[X_XOR:%.*]] = xor i8 [[YP2]], [[X:%.*]] +; CHECK-NEXT: [[YP2:%.*]] = and i8 [[Y]], [[NY]] +; CHECK-NEXT: [[X_XOR:%.*]] = xor i8 [[X:%.*]], [[YP2]] ; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smax.i8(i8 [[X]], i8 [[X_XOR]]) ; CHECK-NEXT: ret i8 [[R]] ; @@ -120,8 +120,8 @@ define i8 @smax_xor_pow2_unk(i8 %x, i8 %y) { define <2 x i8> @smin_xor_pow2_unk(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @smin_xor_pow2_unk( ; 
CHECK-NEXT: [[NY:%.*]] = sub <2 x i8> zeroinitializer, [[Y:%.*]] -; CHECK-NEXT: [[YP2:%.*]] = and <2 x i8> [[NY]], [[Y]] -; CHECK-NEXT: [[X_XOR:%.*]] = xor <2 x i8> [[YP2]], [[X:%.*]] +; CHECK-NEXT: [[YP2:%.*]] = and <2 x i8> [[Y]], [[NY]] +; CHECK-NEXT: [[X_XOR:%.*]] = xor <2 x i8> [[X:%.*]], [[YP2]] ; CHECK-NEXT: [[R:%.*]] = call <2 x i8> @llvm.smin.v2i8(<2 x i8> [[X]], <2 x i8> [[X_XOR]]) ; CHECK-NEXT: ret <2 x i8> [[R]] ; @@ -159,12 +159,12 @@ pos: define i8 @smin_xor_pow2_pos(i8 %x, i8 %y) { ; CHECK-LABEL: @smin_xor_pow2_pos( ; CHECK-NEXT: [[NY:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[YP2:%.*]] = and i8 [[NY]], [[Y]] +; CHECK-NEXT: [[YP2:%.*]] = and i8 [[Y]], [[NY]] ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[YP2]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[NEG:%.*]], label [[POS:%.*]] ; CHECK: neg: ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[YP2]], -1 -; CHECK-NEXT: [[R:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[R]] ; CHECK: pos: ; CHECK-NEXT: call void @barrier() diff --git a/llvm/test/Transforms/InstCombine/mul-masked-bits.ll b/llvm/test/Transforms/InstCombine/mul-masked-bits.ll index e940ae3fec163b..fd8ad88764f592 100644 --- a/llvm/test/Transforms/InstCombine/mul-masked-bits.ll +++ b/llvm/test/Transforms/InstCombine/mul-masked-bits.ll @@ -182,7 +182,7 @@ define i33 @squared_demanded_3_low_bits(i33 %x) { define i64 @scalar_mul_bit_x0_y0(i64 %x, i64 %y) { ; CHECK-LABEL: @scalar_mul_bit_x0_y0( ; CHECK-NEXT: [[AND2:%.*]] = and i64 [[Y:%.*]], 1 -; CHECK-NEXT: [[MUL:%.*]] = and i64 [[AND2]], [[X:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = and i64 [[X:%.*]], [[AND2]] ; CHECK-NEXT: ret i64 [[MUL]] ; %and1 = and i64 %x, 1 @@ -199,7 +199,7 @@ define i64 @scalar_mul_bit_x0_y0_uses(i64 %x, i64 %y) { ; CHECK-NEXT: call void @use(i64 [[AND1]]) ; CHECK-NEXT: [[AND2:%.*]] = and i64 [[Y:%.*]], 1 ; CHECK-NEXT: call void @use(i64 [[AND2]]) -; CHECK-NEXT: [[MUL:%.*]] = and i64 [[AND2]], [[X]] +; CHECK-NEXT: [[MUL:%.*]] = 
and i64 [[X]], [[AND2]] ; CHECK-NEXT: ret i64 [[MUL]] ; %and1 = and i64 %x, 1 @@ -241,7 +241,7 @@ define i64 @scalar_mul_bit_x0_yC(i64 %x, i64 %y, i64 %c) { define <2 x i64> @vector_mul_bit_x0_y0(<2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: @vector_mul_bit_x0_y0( ; CHECK-NEXT: [[AND2:%.*]] = and <2 x i64> [[Y:%.*]], -; CHECK-NEXT: [[MUL:%.*]] = and <2 x i64> [[AND2]], [[X:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = and <2 x i64> [[X:%.*]], [[AND2]] ; CHECK-NEXT: ret <2 x i64> [[MUL]] ; %and1 = and <2 x i64> %x, diff --git a/llvm/test/Transforms/InstCombine/mul-pow2.ll b/llvm/test/Transforms/InstCombine/mul-pow2.ll index c16fd710f309b1..bc172f0152fe5a 100644 --- a/llvm/test/Transforms/InstCombine/mul-pow2.ll +++ b/llvm/test/Transforms/InstCombine/mul-pow2.ll @@ -107,7 +107,7 @@ define <2 x i8> @mul_x_selectp2_vec(<2 x i8> %xx, i1 %c) { define i8 @shl_add_log_may_cause_poison_pr62175_fail(i8 %x, i8 %y) { ; CHECK-LABEL: @shl_add_log_may_cause_poison_pr62175_fail( ; CHECK-NEXT: [[SHL:%.*]] = shl i8 4, [[X:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[SHL]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[Y:%.*]], [[SHL]] ; CHECK-NEXT: ret i8 [[MUL]] ; %shl = shl i8 4, %x diff --git a/llvm/test/Transforms/InstCombine/mul.ll b/llvm/test/Transforms/InstCombine/mul.ll index 66455479feaaa6..8c528e340bc6ce 100644 --- a/llvm/test/Transforms/InstCombine/mul.ll +++ b/llvm/test/Transforms/InstCombine/mul.ll @@ -289,7 +289,7 @@ define i32 @shl1_decrement_use(i32 %x, i32 %y) { ; CHECK-NEXT: [[NOTMASK:%.*]] = shl nsw i32 -1, [[X:%.*]] ; CHECK-NEXT: [[X1:%.*]] = xor i32 [[NOTMASK]], -1 ; CHECK-NEXT: call void @use32(i32 [[X1]]) -; CHECK-NEXT: [[M:%.*]] = mul i32 [[X1]], [[Y:%.*]] +; CHECK-NEXT: [[M:%.*]] = mul i32 [[Y:%.*]], [[X1]] ; CHECK-NEXT: ret i32 [[M]] ; %pow2x = shl i32 1, %x @@ -1413,7 +1413,7 @@ define i32 @mul_nsw_shl_nsw_neg_onearg(i32 %x) { define i32 @mul_use_mul_neg(i32 %x,i32 %y) { ; CHECK-LABEL: @mul_use_mul_neg( ; CHECK-NEXT: [[NEG:%.*]] = sub i32 0, [[X:%.*]] -; CHECK-NEXT: 
[[MUL:%.*]] = mul i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[Y:%.*]], [[NEG]] ; CHECK-NEXT: call void @use32(i32 [[MUL]]) ; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL]], [[NEG]] ; CHECK-NEXT: ret i32 [[MUL2]] @@ -2121,7 +2121,7 @@ define i32 @test_mul_sext_bool_commuted(i1 %x, i32 %y) { define i32 @test_mul_sext_nonbool(i2 %x, i32 %y) { ; CHECK-LABEL: @test_mul_sext_nonbool( ; CHECK-NEXT: [[SEXT:%.*]] = sext i2 [[X:%.*]] to i32 -; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[SEXT]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[Y:%.*]], [[SEXT]] ; CHECK-NEXT: ret i32 [[MUL]] ; %sext = sext i2 %x to i32 @@ -2133,7 +2133,7 @@ define i32 @test_mul_sext_multiuse(i1 %x, i32 %y) { ; CHECK-LABEL: @test_mul_sext_multiuse( ; CHECK-NEXT: [[SEXT:%.*]] = sext i1 [[X:%.*]] to i32 ; CHECK-NEXT: tail call void @use(i32 [[SEXT]]) -; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[SEXT]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[Y:%.*]], [[SEXT]] ; CHECK-NEXT: ret i32 [[MUL]] ; %sext = sext i1 %x to i32 diff --git a/llvm/test/Transforms/InstCombine/mul_fold.ll b/llvm/test/Transforms/InstCombine/mul_fold.ll index a1fdec3c68cc4b..e4a21db8a6ece7 100644 --- a/llvm/test/Transforms/InstCombine/mul_fold.ll +++ b/llvm/test/Transforms/InstCombine/mul_fold.ll @@ -55,7 +55,7 @@ define i8 @mul8_low_A0_B1(i8 %p, i8 %in1) { define i8 @mul8_low_A0_B2(i8 %in0, i8 %p) { ; CHECK-LABEL: @mul8_low_A0_B2( ; CHECK-NEXT: [[IN1:%.*]] = call i8 @use8(i8 [[P:%.*]]) -; CHECK-NEXT: [[RETLO:%.*]] = mul i8 [[IN1]], [[IN0:%.*]] +; CHECK-NEXT: [[RETLO:%.*]] = mul i8 [[IN0:%.*]], [[IN1]] ; CHECK-NEXT: ret i8 [[RETLO]] ; @@ -262,7 +262,7 @@ define i32 @mul32_low_A2_B2(i32 %in0, i32 %p) { ; CHECK-NEXT: [[IN1HI:%.*]] = lshr i32 [[IN1]], 16 ; CHECK-NEXT: [[M10:%.*]] = mul nuw i32 [[IN0LO]], [[IN1HI]] ; CHECK-NEXT: call void @use32(i32 [[M10]]) -; CHECK-NEXT: [[RETLO:%.*]] = mul i32 [[IN1]], [[IN0]] +; CHECK-NEXT: [[RETLO:%.*]] = mul i32 [[IN0]], [[IN1]] ; CHECK-NEXT: ret i32 [[RETLO]] ; %in1 = call i32 
@use32(i32 %p) ; thwart complexity-based canonicalization @@ -287,7 +287,7 @@ define i32 @mul32_low_A2_B3(i32 %in0, i32 %p) { ; CHECK-NEXT: [[IN1HI:%.*]] = lshr i32 [[IN1]], 16 ; CHECK-NEXT: [[M10:%.*]] = mul nuw i32 [[IN1HI]], [[IN0LO]] ; CHECK-NEXT: call void @use32(i32 [[M10]]) -; CHECK-NEXT: [[RETLO:%.*]] = mul i32 [[IN1]], [[IN0]] +; CHECK-NEXT: [[RETLO:%.*]] = mul i32 [[IN0]], [[IN1]] ; CHECK-NEXT: ret i32 [[RETLO]] ; %in1 = call i32 @use32(i32 %p) ; thwart complexity-based canonicalization @@ -639,7 +639,7 @@ define i64 @mul64_low_no_and(i64 %in0, i64 %in1) { ; CHECK-NEXT: [[IN0HI:%.*]] = lshr i64 [[IN0:%.*]], 32 ; CHECK-NEXT: [[IN1HI:%.*]] = lshr i64 [[IN1:%.*]], 32 ; CHECK-NEXT: [[M10:%.*]] = mul i64 [[IN1HI]], [[IN0]] -; CHECK-NEXT: [[M01:%.*]] = mul i64 [[IN0HI]], [[IN1]] +; CHECK-NEXT: [[M01:%.*]] = mul i64 [[IN1]], [[IN0HI]] ; CHECK-NEXT: [[M00:%.*]] = mul i64 [[IN1]], [[IN0]] ; CHECK-NEXT: [[ADDC:%.*]] = add i64 [[M10]], [[M01]] ; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[ADDC]], 32 @@ -719,7 +719,7 @@ define i32 @mul32_low_extra_shl_use(i32 %in0, i32 %in1) { ; CHECK-NEXT: [[IN0HI:%.*]] = lshr i32 [[IN0:%.*]], 16 ; CHECK-NEXT: [[IN1HI:%.*]] = lshr i32 [[IN1:%.*]], 16 ; CHECK-NEXT: [[M10:%.*]] = mul i32 [[IN1HI]], [[IN0]] -; CHECK-NEXT: [[M01:%.*]] = mul i32 [[IN0HI]], [[IN1]] +; CHECK-NEXT: [[M01:%.*]] = mul i32 [[IN1]], [[IN0HI]] ; CHECK-NEXT: [[ADDC:%.*]] = add i32 [[M10]], [[M01]] ; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[ADDC]], 16 ; CHECK-NEXT: call void @use32(i32 [[SHL]]) @@ -738,4 +738,4 @@ define i32 @mul32_low_extra_shl_use(i32 %in0, i32 %in1) { call void @use32(i32 %shl) %retLo = add i32 %shl, %m00 ret i32 %retLo -} \ No newline at end of file +} diff --git a/llvm/test/Transforms/InstCombine/mul_full_64.ll b/llvm/test/Transforms/InstCombine/mul_full_64.ll index 7cddb63b9ba63e..1bec5bb9276041 100644 --- a/llvm/test/Transforms/InstCombine/mul_full_64.ll +++ b/llvm/test/Transforms/InstCombine/mul_full_64.ll @@ -459,7 +459,7 @@ define i64 @mullo(i64 %x, 
i64 %y) { ; CHECK-NEXT: [[YL:%.*]] = and i64 [[Y:%.*]], 4294967295 ; CHECK-NEXT: [[YH:%.*]] = lshr i64 [[Y]], 32 ; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[YL]], [[XL]] -; CHECK-NEXT: [[T1:%.*]] = mul i64 [[XH]], [[Y]] +; CHECK-NEXT: [[T1:%.*]] = mul i64 [[Y]], [[XH]] ; CHECK-NEXT: [[T2:%.*]] = mul i64 [[YH]], [[X]] ; CHECK-NEXT: [[T0L:%.*]] = and i64 [[T0]], 4294967295 ; CHECK-NEXT: [[T0H:%.*]] = lshr i64 [[T0]], 32 @@ -526,7 +526,7 @@ define i64 @mullo_duplicate(i64 %x, i64 %y) { ; CHECK-NEXT: [[YL:%.*]] = and i64 [[Y]], 4294967295 ; CHECK-NEXT: [[YH:%.*]] = lshr i64 [[Y]], 32 ; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[YL]], [[XL]] -; CHECK-NEXT: [[T1:%.*]] = mul i64 [[XH]], [[Y]] +; CHECK-NEXT: [[T1:%.*]] = mul i64 [[Y]], [[XH]] ; CHECK-NEXT: [[T2:%.*]] = mul i64 [[YH]], [[X]] ; CHECK-NEXT: [[T0L:%.*]] = and i64 [[T0]], 4294967295 ; CHECK-NEXT: [[T0H:%.*]] = lshr i64 [[T0]], 32 diff --git a/llvm/test/Transforms/InstCombine/not-add.ll b/llvm/test/Transforms/InstCombine/not-add.ll index 5c600c991de586..ecbd11dbdc620b 100644 --- a/llvm/test/Transforms/InstCombine/not-add.ll +++ b/llvm/test/Transforms/InstCombine/not-add.ll @@ -42,7 +42,7 @@ define i8 @basic_use_xor(i8 %x, i8 %y) { define i8 @basic_use_add(i8 %x, i8 %y) { ; CHECK-LABEL: @basic_use_add( ; CHECK-NEXT: [[NOTX:%.*]] = xor i8 [[X:%.*]], -1 -; CHECK-NEXT: [[A:%.*]] = add i8 [[NOTX]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = add i8 [[Y:%.*]], [[NOTX]] ; CHECK-NEXT: call void @use(i8 [[A]]) ; CHECK-NEXT: [[NOTA:%.*]] = sub i8 [[X]], [[Y]] ; CHECK-NEXT: ret i8 [[NOTA]] @@ -58,7 +58,7 @@ define i8 @basic_use_both(i8 %x, i8 %y) { ; CHECK-LABEL: @basic_use_both( ; CHECK-NEXT: [[NOTX:%.*]] = xor i8 [[X:%.*]], -1 ; CHECK-NEXT: call void @use(i8 [[NOTX]]) -; CHECK-NEXT: [[A:%.*]] = add i8 [[NOTX]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = add i8 [[Y:%.*]], [[NOTX]] ; CHECK-NEXT: call void @use(i8 [[A]]) ; CHECK-NEXT: [[NOTA:%.*]] = sub i8 [[X]], [[Y]] ; CHECK-NEXT: ret i8 [[NOTA]] @@ -143,8 +143,8 @@ define i32 @pr50308(i1 
%c1, i32 %v1, i32 %v2, i32 %v3) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C1:%.*]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]] ; CHECK: cond.true: -; CHECK-NEXT: [[ADD_NOT:%.*]] = sub i32 -2, [[V1:%.*]] -; CHECK-NEXT: [[ADD1_NEG:%.*]] = xor i32 [[ADD_NOT]], [[V2:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = sub i32 -2, [[V1:%.*]] +; CHECK-NEXT: [[ADD1_NEG:%.*]] = xor i32 [[TMP0]], [[V2:%.*]] ; CHECK-NEXT: br label [[COND_END]] ; CHECK: cond.end: ; CHECK-NEXT: [[COND_NEG:%.*]] = phi i32 [ [[ADD1_NEG]], [[COND_TRUE]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/InstCombine/not.ll b/llvm/test/Transforms/InstCombine/not.ll index 0c2c6195e32407..3679976d9dc393 100644 --- a/llvm/test/Transforms/InstCombine/not.ll +++ b/llvm/test/Transforms/InstCombine/not.ll @@ -442,7 +442,7 @@ define i8 @not_or_neg_use1(i8 %x, i8 %y) { ; CHECK-LABEL: @not_or_neg_use1( ; CHECK-NEXT: [[S:%.*]] = sub i8 0, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[S]]) -; CHECK-NEXT: [[O:%.*]] = or i8 [[S]], [[X:%.*]] +; CHECK-NEXT: [[O:%.*]] = or i8 [[X:%.*]], [[S]] ; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[O]], -1 ; CHECK-NEXT: ret i8 [[NOT]] ; @@ -458,7 +458,7 @@ define i8 @not_or_neg_use1(i8 %x, i8 %y) { define i8 @not_or_neg_use2(i8 %x, i8 %y) { ; CHECK-LABEL: @not_or_neg_use2( ; CHECK-NEXT: [[S:%.*]] = sub i8 0, [[Y:%.*]] -; CHECK-NEXT: [[O:%.*]] = or i8 [[S]], [[X:%.*]] +; CHECK-NEXT: [[O:%.*]] = or i8 [[X:%.*]], [[S]] ; CHECK-NEXT: call void @use8(i8 [[O]]) ; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[O]], -1 ; CHECK-NEXT: ret i8 [[NOT]] @@ -850,7 +850,7 @@ define i32 @test_zext(i32 %a, i32 %b){ ; CHECK-LABEL: @test_zext( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 ; CHECK-NEXT: [[SEXT:%.*]] = zext i1 [[CMP]] to i32 -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEXT]], [[B:%.*]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[B:%.*]], [[SEXT]] ; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[ADD]], -1 ; CHECK-NEXT: ret i32 [[NOT]] ; @@ -864,11 +864,11 @@ define i32 @test_zext(i32 %a, i32 %b){ define void 
@test_invert_demorgan_or(i32 %a, i32 %b, i1 %cond) { ; CHECK-LABEL: @test_invert_demorgan_or( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 0 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i32 [[B1:%.*]], 0 -; CHECK-NEXT: [[OR_NOT1:%.*]] = and i1 [[CMP2]], [[CMP3]] -; CHECK-NEXT: [[MERGE:%.*]] = and i1 [[OR_NOT1]], [[COND:%.*]] -; CHECK-NEXT: br i1 [[MERGE]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[A:%.*]], 0 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[B:%.*]], 0 +; CHECK-NEXT: [[OR_NOT1:%.*]] = and i1 [[CMP1]], [[CMP2]] +; CHECK-NEXT: [[MERGE_NOT:%.*]] = and i1 [[OR_NOT1]], [[COND:%.*]] +; CHECK-NEXT: br i1 [[MERGE_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: call void @f1() ; CHECK-NEXT: unreachable @@ -897,8 +897,8 @@ define i1 @test_invert_demorgan_or2(i64 %a, i64 %b, i64 %c) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[B:%.*]], 60 ; CHECK-NEXT: [[OR1_NOT1:%.*]] = and i1 [[CMP1]], [[CMP2]] ; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[C:%.*]], 60 -; CHECK-NEXT: [[NOT:%.*]] = and i1 [[OR1_NOT1]], [[CMP3]] -; CHECK-NEXT: ret i1 [[NOT]] +; CHECK-NEXT: [[OR2_NOT:%.*]] = and i1 [[OR1_NOT1]], [[CMP3]] +; CHECK-NEXT: ret i1 [[OR2_NOT]] ; %cmp1 = icmp ugt i64 %a, 23 %cmp2 = icmp ugt i64 %b, 59 @@ -920,8 +920,8 @@ define i1 @test_invert_demorgan_or3(i32 %a, i32 %b) { ; CHECK-NEXT: [[CMP4:%.*]] = icmp ult i32 [[TMP3]], -196112 ; CHECK-NEXT: [[OR1_NOT2:%.*]] = and i1 [[CMP1]], [[CMP2]] ; CHECK-NEXT: [[OR2_NOT1:%.*]] = and i1 [[OR1_NOT2]], [[CMP3]] -; CHECK-NEXT: [[NOT:%.*]] = and i1 [[OR2_NOT1]], [[CMP4]] -; CHECK-NEXT: ret i1 [[NOT]] +; CHECK-NEXT: [[OR3_NOT:%.*]] = and i1 [[OR2_NOT1]], [[CMP4]] +; CHECK-NEXT: ret i1 [[OR3_NOT]] ; %cmp1 = icmp eq i32 %a, 178206 %v1 = add i32 %b, -195102 @@ -943,8 +943,8 @@ define i1 @test_invert_demorgan_logical_or(i64 %x, i64 %y) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i64 [[Y:%.*]], 0 ; CHECK-NEXT: [[SEL_NOT1:%.*]] = select i1 
[[CMP1]], i1 [[CMP2]], i1 false ; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i64 [[X]], 0 -; CHECK-NEXT: [[NOT:%.*]] = and i1 [[CMP3]], [[SEL_NOT1]] -; CHECK-NEXT: ret i1 [[NOT]] +; CHECK-NEXT: [[OR_NOT:%.*]] = and i1 [[CMP3]], [[SEL_NOT1]] +; CHECK-NEXT: ret i1 [[OR_NOT]] ; %cmp1 = icmp eq i64 %x, 27 %cmp2 = icmp eq i64 %y, 0 @@ -958,11 +958,11 @@ define i1 @test_invert_demorgan_logical_or(i64 %x, i64 %y) { define i1 @test_invert_demorgan_and(i32 %a, i32 %b, i1 %cond) { ; CHECK-LABEL: @test_invert_demorgan_and( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 0 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i32 [[B1:%.*]], 0 -; CHECK-NEXT: [[AND_NOT1:%.*]] = or i1 [[CMP2]], [[CMP3]] -; CHECK-NEXT: [[MERGE:%.*]] = or i1 [[AND_NOT1]], [[COND:%.*]] -; CHECK-NEXT: br i1 [[MERGE]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[A:%.*]], 0 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[B:%.*]], 0 +; CHECK-NEXT: [[AND_NOT1:%.*]] = or i1 [[CMP1]], [[CMP2]] +; CHECK-NEXT: [[MERGE_NOT:%.*]] = or i1 [[AND_NOT1]], [[COND:%.*]] +; CHECK-NEXT: br i1 [[MERGE_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: call void @f1() ; CHECK-NEXT: unreachable @@ -999,9 +999,9 @@ define i64 @test_invert_demorgan_and2(i64 %x) { define i1 @test_invert_demorgan_and3(i32 %a, i32 %b) { ; CHECK-LABEL: @test_invert_demorgan_and3( -; CHECK-NEXT: [[ADD:%.*]] = sub i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and i32 [[ADD]], 4095 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 4095 +; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 4095 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], 4095 ; CHECK-NEXT: ret i1 [[CMP]] ; %not = xor i32 %a, -1 @@ -1017,8 +1017,8 @@ define i1 @test_invert_demorgan_logical_and(i64 %x, i64 %y) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i64 [[Y:%.*]], 0 ; CHECK-NEXT: [[SEL_NOT1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP2]] ; 
CHECK-NEXT: [[CMP3:%.*]] = icmp ne i64 [[X]], 0 -; CHECK-NEXT: [[NOT:%.*]] = and i1 [[CMP3]], [[SEL_NOT1]] -; CHECK-NEXT: ret i1 [[NOT]] +; CHECK-NEXT: [[OR_NOT:%.*]] = and i1 [[CMP3]], [[SEL_NOT1]] +; CHECK-NEXT: ret i1 [[OR_NOT]] ; %cmp1 = icmp eq i64 %x, 27 %cmp2 = icmp eq i64 %y, 0 diff --git a/llvm/test/Transforms/InstCombine/onehot_merge.ll b/llvm/test/Transforms/InstCombine/onehot_merge.ll index 228ad233c97634..d8ef66a4dd7818 100644 --- a/llvm/test/Transforms/InstCombine/onehot_merge.ll +++ b/llvm/test/Transforms/InstCombine/onehot_merge.ll @@ -48,7 +48,7 @@ define i1 @foo1_and(i32 %k, i32 %c1, i32 %c2) { ; CHECK-NEXT: [[T:%.*]] = shl nuw i32 1, [[C1:%.*]] ; CHECK-NEXT: [[T4:%.*]] = shl nuw i32 1, [[C2:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[T]], [[T4]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[K:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[K:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -68,7 +68,7 @@ define i1 @foo1_and_logical(i32 %k, i32 %c1, i32 %c2) { ; CHECK-NEXT: [[T4:%.*]] = shl nuw i32 1, [[C2:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[T4]] ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[T]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], [[K:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[K:%.*]], [[TMP2]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -87,7 +87,7 @@ define <2 x i1> @foo1_and_vector(<2 x i32> %k, <2 x i32> %c1, <2 x i32> %c2) { ; CHECK-NEXT: [[T:%.*]] = shl nuw <2 x i32> , [[C1:%.*]] ; CHECK-NEXT: [[T4:%.*]] = shl nuw <2 x i32> , [[C2:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[T]], [[T4]] -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], [[K:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[K:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne <2 x i32> [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[OR]] ; @@ -213,7 +213,7 @@ define i1 @foo1_or(i32 %k, i32 %c1, i32 %c2) { ; CHECK-NEXT: [[T:%.*]] = 
shl nuw i32 1, [[C1:%.*]] ; CHECK-NEXT: [[T4:%.*]] = shl nuw i32 1, [[C2:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[T]], [[T4]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[K:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[K:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = icmp eq i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -233,7 +233,7 @@ define i1 @foo1_or_logical(i32 %k, i32 %c1, i32 %c2) { ; CHECK-NEXT: [[T4:%.*]] = shl nuw i32 1, [[C2:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[T4]] ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[T]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], [[K:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[K:%.*]], [[TMP2]] ; CHECK-NEXT: [[OR:%.*]] = icmp eq i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -252,7 +252,7 @@ define <2 x i1> @foo1_or_vector(<2 x i32> %k, <2 x i32> %c1, <2 x i32> %c2) { ; CHECK-NEXT: [[T:%.*]] = shl nuw <2 x i32> , [[C1:%.*]] ; CHECK-NEXT: [[T4:%.*]] = shl nuw <2 x i32> , [[C2:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[T]], [[T4]] -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], [[K:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[K:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = icmp eq <2 x i32> [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[OR]] ; @@ -336,7 +336,7 @@ define i1 @foo1_and_signbit_lshr(i32 %k, i32 %c1, i32 %c2) { ; CHECK-NEXT: [[T:%.*]] = shl nuw i32 1, [[C1:%.*]] ; CHECK-NEXT: [[T4:%.*]] = lshr exact i32 -2147483648, [[C2:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[T]], [[T4]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[K:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[K:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -356,7 +356,7 @@ define i1 @foo1_and_signbit_lshr_logical(i32 %k, i32 %c1, i32 %c2) { ; CHECK-NEXT: [[T4:%.*]] = lshr exact i32 -2147483648, [[C2:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[T4]] ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[T]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = 
and i32 [[TMP2]], [[K:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[K:%.*]], [[TMP2]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -375,7 +375,7 @@ define <2 x i1> @foo1_and_signbit_lshr_vector(<2 x i32> %k, <2 x i32> %c1, <2 x ; CHECK-NEXT: [[T:%.*]] = shl nuw <2 x i32> , [[C1:%.*]] ; CHECK-NEXT: [[T4:%.*]] = lshr exact <2 x i32> , [[C2:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[T]], [[T4]] -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], [[K:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[K:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne <2 x i32> [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[OR]] ; @@ -394,7 +394,7 @@ define i1 @foo1_or_signbit_lshr(i32 %k, i32 %c1, i32 %c2) { ; CHECK-NEXT: [[T:%.*]] = shl nuw i32 1, [[C1:%.*]] ; CHECK-NEXT: [[T4:%.*]] = lshr exact i32 -2147483648, [[C2:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[T]], [[T4]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[K:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[K:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = icmp eq i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -414,7 +414,7 @@ define i1 @foo1_or_signbit_lshr_logical(i32 %k, i32 %c1, i32 %c2) { ; CHECK-NEXT: [[T4:%.*]] = lshr exact i32 -2147483648, [[C2:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[T4]] ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[T]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], [[K:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[K:%.*]], [[TMP2]] ; CHECK-NEXT: [[OR:%.*]] = icmp eq i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -433,7 +433,7 @@ define <2 x i1> @foo1_or_signbit_lshr_vector(<2 x i32> %k, <2 x i32> %c1, <2 x i ; CHECK-NEXT: [[T:%.*]] = shl nuw <2 x i32> , [[C1:%.*]] ; CHECK-NEXT: [[T4:%.*]] = lshr exact <2 x i32> , [[C2:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[T]], [[T4]] -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], [[K:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[K:%.*]], [[TMP1]] ; 
CHECK-NEXT: [[OR:%.*]] = icmp eq <2 x i32> [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[OR]] ; @@ -618,7 +618,7 @@ define i1 @foo1_and_extra_use_shl(i32 %k, i32 %c1, i32 %c2, ptr %p) { ; CHECK-NEXT: store i32 [[T0]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: [[T1:%.*]] = shl nuw i32 1, [[C2:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[T0]], [[T1]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[K:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[K:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -640,7 +640,7 @@ define i1 @foo1_and_extra_use_shl_logical(i32 %k, i32 %c1, i32 %c2, ptr %p) { ; CHECK-NEXT: [[T1:%.*]] = shl nuw i32 1, [[C2:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[T1]] ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[T0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], [[K:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[K:%.*]], [[TMP2]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -663,7 +663,7 @@ define i1 @foo1_and_extra_use_and(i32 %k, i32 %c1, i32 %c2, ptr %p) { ; CHECK-NEXT: [[T2:%.*]] = and i32 [[T0]], [[K:%.*]] ; CHECK-NEXT: store i32 [[T2]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[T0]], [[T1]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[K]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[K]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -686,7 +686,7 @@ define i1 @foo1_and_extra_use_and_logical(i32 %k, i32 %c1, i32 %c2, ptr %p) { ; CHECK-NEXT: store i32 [[T2]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[T1]] ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[T0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], [[K]] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[K]], [[TMP2]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -710,7 +710,7 @@ define i1 @foo1_and_extra_use_cmp(i32 %k, i32 %c1, i32 %c2, ptr %p) { ; 
CHECK-NEXT: [[T3:%.*]] = icmp eq i32 [[T2]], 0 ; CHECK-NEXT: store i1 [[T3]], ptr [[P:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[T0]], [[T1]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[K]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[K]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -734,7 +734,7 @@ define i1 @foo1_and_extra_use_cmp_logical(i32 %k, i32 %c1, i32 %c2, ptr %p) { ; CHECK-NEXT: store i1 [[T3]], ptr [[P:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[T1]] ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[T0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], [[K]] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[K]], [[TMP2]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -756,7 +756,7 @@ define i1 @foo1_and_extra_use_shl2(i32 %k, i32 %c1, i32 %c2, ptr %p) { ; CHECK-NEXT: [[T1:%.*]] = shl nuw i32 1, [[C2:%.*]] ; CHECK-NEXT: store i32 [[T1]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[T0]], [[T1]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[K:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[K:%.*]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -778,7 +778,7 @@ define i1 @foo1_and_extra_use_shl2_logical(i32 %k, i32 %c1, i32 %c2, ptr %p) { ; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[T1]] ; CHECK-NEXT: store i32 [[TMP1]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[T0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], [[K:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[K:%.*]], [[TMP2]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -801,7 +801,7 @@ define i1 @foo1_and_extra_use_and2(i32 %k, i32 %c1, i32 %c2, ptr %p) { ; CHECK-NEXT: [[T4:%.*]] = and i32 [[T1]], [[K:%.*]] ; CHECK-NEXT: store i32 [[T4]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[T0]], [[T1]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 
[[TMP1]], [[K]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[K]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -824,7 +824,7 @@ define i1 @foo1_and_extra_use_and2_logical(i32 %k, i32 %c1, i32 %c2, ptr %p) { ; CHECK-NEXT: [[T4:%.*]] = and i32 [[TMP1]], [[K:%.*]] ; CHECK-NEXT: store i32 [[T4]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[T0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], [[K]] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[K]], [[TMP2]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -848,7 +848,7 @@ define i1 @foo1_and_extra_use_cmp2(i32 %k, i32 %c1, i32 %c2, ptr %p) { ; CHECK-NEXT: [[T5:%.*]] = icmp eq i32 [[T4]], 0 ; CHECK-NEXT: store i1 [[T5]], ptr [[P:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[T0]], [[T1]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[K]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[K]], [[TMP1]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; @@ -872,7 +872,7 @@ define i1 @foo1_and_extra_use_cmp2_logical(i32 %k, i32 %c1, i32 %c2, ptr %p) { ; CHECK-NEXT: [[T5:%.*]] = icmp eq i32 [[T4]], 0 ; CHECK-NEXT: store i1 [[T5]], ptr [[P:%.*]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[T0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], [[K]] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[K]], [[TMP2]] ; CHECK-NEXT: [[OR:%.*]] = icmp ne i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: ret i1 [[OR]] ; diff --git a/llvm/test/Transforms/InstCombine/or-xor-xor.ll b/llvm/test/Transforms/InstCombine/or-xor-xor.ll index 327d5f8d6220a5..c3f1aedb1879a0 100644 --- a/llvm/test/Transforms/InstCombine/or-xor-xor.ll +++ b/llvm/test/Transforms/InstCombine/or-xor-xor.ll @@ -98,7 +98,7 @@ define i3 @or_xor_xor_normal_multiple_uses_and(i3 %a, i3 %b) { define i32 @or_xor_xor_negative_multiple_uses_xor1(i32 %a, i32 %b) { ; CHECK-LABEL: @or_xor_xor_negative_multiple_uses_xor1( ; CHECK-NEXT: [[AND1:%.*]] 
= xor i32 [[A:%.*]], -1 -; CHECK-NEXT: [[XOR1:%.*]] = and i32 [[AND1]], [[B:%.*]] +; CHECK-NEXT: [[XOR1:%.*]] = and i32 [[B:%.*]], [[AND1]] ; CHECK-NEXT: call void @use.i32(i32 [[XOR1]]) ; CHECK-NEXT: [[OR:%.*]] = xor i32 [[A]], [[B]] ; CHECK-NEXT: ret i32 [[OR]] @@ -114,7 +114,7 @@ define i32 @or_xor_xor_negative_multiple_uses_xor1(i32 %a, i32 %b) { define i5 @or_xor_xor_negative_multiple_uses_xor2(i5 %a, i5 %b) { ; CHECK-LABEL: @or_xor_xor_negative_multiple_uses_xor2( ; CHECK-NEXT: [[A1:%.*]] = xor i5 [[B:%.*]], -1 -; CHECK-NEXT: [[XOR2:%.*]] = and i5 [[A1]], [[A:%.*]] +; CHECK-NEXT: [[XOR2:%.*]] = and i5 [[A:%.*]], [[A1]] ; CHECK-NEXT: call void @use.i5(i5 [[XOR2]]) ; CHECK-NEXT: [[OR:%.*]] = xor i5 [[A]], [[B]] ; CHECK-NEXT: ret i5 [[OR]] diff --git a/llvm/test/Transforms/InstCombine/or-xor.ll b/llvm/test/Transforms/InstCombine/or-xor.ll index cf6b9000182d22..f4ddbb5abc4639 100644 --- a/llvm/test/Transforms/InstCombine/or-xor.ll +++ b/llvm/test/Transforms/InstCombine/or-xor.ll @@ -8,7 +8,7 @@ declare void @use(i8) define i32 @test1(i32 %x, i32 %y) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[Y:%.*]], -1 -; CHECK-NEXT: [[Z:%.*]] = or i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[Z:%.*]] = or i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[Z]] ; %or = or i32 %x, %y @@ -23,7 +23,7 @@ define i32 @test1(i32 %x, i32 %y) { define i32 @test2(i32 %x, i32 %y) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[Z:%.*]] = or i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[Z:%.*]] = or i32 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[Z]] ; %or = or i32 %x, %y @@ -37,7 +37,7 @@ define i32 @test2(i32 %x, i32 %y) { define i32 @test3(i32 %x, i32 %y) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[Y:%.*]], -1 -; CHECK-NEXT: [[Z:%.*]] = or i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[Z:%.*]] = or i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[Z]] ; %xor = xor i32 %x, %y @@ -52,7 +52,7 @@ define i32 @test3(i32 %x, i32 %y) 
{ define i32 @test4(i32 %x, i32 %y) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[Z:%.*]] = or i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[Z:%.*]] = or i32 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[Z]] ; %xor = xor i32 %x, %y @@ -206,7 +206,7 @@ define i8 @xor_common_op_commute3(i8 %p, i8 %q) { define i32 @test8(i32 %x, i32 %y) { ; CHECK-LABEL: @test8( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[Z:%.*]] = or i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[Z:%.*]] = or i32 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[Z]] ; %not = xor i32 %y, -1 @@ -218,7 +218,7 @@ define i32 @test8(i32 %x, i32 %y) { define i32 @test9(i32 %x, i32 %y) { ; CHECK-LABEL: @test9( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[Y:%.*]], -1 -; CHECK-NEXT: [[Z:%.*]] = or i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[Z:%.*]] = or i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[Z]] ; %not = xor i32 %x, -1 @@ -306,7 +306,7 @@ define i32 @test10_canonical(i32 %A, i32 %B) { ; (x | y) & ((~x) ^ y) -> (x & y) define i32 @test11(i32 %x, i32 %y) { ; CHECK-LABEL: @test11( -; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret i32 [[AND]] ; %or = or i32 %x, %y @@ -319,7 +319,7 @@ define i32 @test11(i32 %x, i32 %y) { ; ((~x) ^ y) & (x | y) -> (x & y) define i32 @test12(i32 %x, i32 %y) { ; CHECK-LABEL: @test12( -; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret i32 [[AND]] ; %neg = xor i32 %x, -1 @@ -331,7 +331,7 @@ define i32 @test12(i32 %x, i32 %y) { define i32 @test12_commuted(i32 %x, i32 %y) { ; CHECK-LABEL: @test12_commuted( -; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret i32 [[AND]] ; %neg = xor i32 %x, -1 @@ -344,7 +344,7 @@ define i32 @test12_commuted(i32 %x, i32 %y) { ; ((x | y) ^ (x ^ y)) -> (x & y) 
define i32 @test13(i32 %x, i32 %y) { ; CHECK-LABEL: @test13( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret i32 [[TMP1]] ; %1 = xor i32 %y, %x @@ -800,7 +800,7 @@ define i4 @or_not_xor_common_op_commute0(i4 %x, i4 %y, i4 %z) { ; CHECK-LABEL: @or_not_xor_common_op_commute0( ; CHECK-NEXT: [[TMP1:%.*]] = and i4 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[NAND:%.*]] = xor i4 [[TMP1]], -1 -; CHECK-NEXT: [[O2:%.*]] = or i4 [[NAND]], [[Z:%.*]] +; CHECK-NEXT: [[O2:%.*]] = or i4 [[Z:%.*]], [[NAND]] ; CHECK-NEXT: ret i4 [[O2]] ; %notx = xor i4 %x, -1 @@ -816,7 +816,7 @@ define i8 @or_not_xor_common_op_commute1(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: call void @use(i8 [[NOTX]]) ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], [[Y:%.*]] ; CHECK-NEXT: [[NAND:%.*]] = xor i8 [[TMP1]], -1 -; CHECK-NEXT: [[O2:%.*]] = or i8 [[NAND]], [[Z:%.*]] +; CHECK-NEXT: [[O2:%.*]] = or i8 [[Z:%.*]], [[NAND]] ; CHECK-NEXT: ret i8 [[O2]] ; %notx = xor i8 %x, -1 @@ -863,7 +863,7 @@ define <2 x i4> @or_not_xor_common_op_commute4(<2 x i4> %x, <2 x i4> %y, <2 x i4 ; CHECK-LABEL: @or_not_xor_common_op_commute4( ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i4> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[NAND:%.*]] = xor <2 x i4> [[TMP1]], -; CHECK-NEXT: [[O2:%.*]] = or <2 x i4> [[NAND]], [[Z:%.*]] +; CHECK-NEXT: [[O2:%.*]] = or <2 x i4> [[Z:%.*]], [[NAND]] ; CHECK-NEXT: ret <2 x i4> [[O2]] ; %notx = xor <2 x i4> %x, @@ -877,7 +877,7 @@ define i8 @or_not_xor_common_op_commute5(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @or_not_xor_common_op_commute5( ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[NAND:%.*]] = xor i8 [[TMP1]], -1 -; CHECK-NEXT: [[O2:%.*]] = or i8 [[NAND]], [[Z:%.*]] +; CHECK-NEXT: [[O2:%.*]] = or i8 [[Z:%.*]], [[NAND]] ; CHECK-NEXT: ret i8 [[O2]] ; %notx = xor i8 %x, -1 @@ -926,7 +926,7 @@ define i8 @or_not_xor_common_op_use1(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[NOTX:%.*]] = xor i8 [[X:%.*]], -1 ; 
CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X]], [[Y:%.*]] ; CHECK-NEXT: call void @use(i8 [[XOR]]) -; CHECK-NEXT: [[O1:%.*]] = or i8 [[NOTX]], [[Z:%.*]] +; CHECK-NEXT: [[O1:%.*]] = or i8 [[Z:%.*]], [[NOTX]] ; CHECK-NEXT: [[O2:%.*]] = or i8 [[XOR]], [[O1]] ; CHECK-NEXT: ret i8 [[O2]] ; @@ -944,7 +944,7 @@ define i8 @or_not_xor_common_op_use2(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @or_not_xor_common_op_use2( ; CHECK-NEXT: [[NOTX:%.*]] = xor i8 [[X:%.*]], -1 ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X]], [[Y:%.*]] -; CHECK-NEXT: [[O1:%.*]] = or i8 [[NOTX]], [[Z:%.*]] +; CHECK-NEXT: [[O1:%.*]] = or i8 [[Z:%.*]], [[NOTX]] ; CHECK-NEXT: call void @use(i8 [[O1]]) ; CHECK-NEXT: [[O2:%.*]] = or i8 [[XOR]], [[O1]] ; CHECK-NEXT: ret i8 [[O2]] @@ -1098,7 +1098,7 @@ define i32 @PR75692_3(i32 %x, i32 %y) { define i32 @or_xor_not(i32 %x, i32 %y) { ; CHECK-LABEL: @or_xor_not( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[OR1]] ; %not = xor i32 %y, -1 @@ -1112,7 +1112,7 @@ define i32 @or_xor_not_uses1(i32 %x, i32 %y) { ; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use(i32 [[NOT]]) ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i32 [[OR1]] ; %not = xor i32 %y, -1 @@ -1127,7 +1127,7 @@ define i32 @or_xor_not_uses2(i32 %x, i32 %y) { ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1 ; CHECK-NEXT: call void @use(i32 [[XOR]]) -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[XOR]], [[Y]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[Y]], [[XOR]] ; CHECK-NEXT: ret i32 [[OR1]] ; %not = xor i32 %y, -1 diff --git a/llvm/test/Transforms/InstCombine/or.ll b/llvm/test/Transforms/InstCombine/or.ll index 6e2085a8bb6c7d..9bcad034b363e7 100644 --- 
a/llvm/test/Transforms/InstCombine/or.ll +++ b/llvm/test/Transforms/InstCombine/or.ll @@ -696,7 +696,7 @@ define i32 @test39d(i32 %a, float %b) { define i32 @test40(i32 %a, i32 %b) { ; CHECK-LABEL: @test40( ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[XOR]], [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[B:%.*]], [[XOR]] ; CHECK-NEXT: ret i32 [[OR]] ; %and = and i32 %a, %b @@ -708,7 +708,7 @@ define i32 @test40(i32 %a, i32 %b) { define i32 @test40b(i32 %a, i32 %b) { ; CHECK-LABEL: @test40b( ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[XOR]], [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[B:%.*]], [[XOR]] ; CHECK-NEXT: ret i32 [[OR]] ; %and = and i32 %b, %a @@ -720,7 +720,7 @@ define i32 @test40b(i32 %a, i32 %b) { define i32 @test40c(i32 %a, i32 %b) { ; CHECK-LABEL: @test40c( ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[XOR]], [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[B:%.*]], [[XOR]] ; CHECK-NEXT: ret i32 [[OR]] ; %and = and i32 %b, %a @@ -732,7 +732,7 @@ define i32 @test40c(i32 %a, i32 %b) { define i32 @test40d(i32 %a, i32 %b) { ; CHECK-LABEL: @test40d( ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A:%.*]], -1 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[XOR]], [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[B:%.*]], [[XOR]] ; CHECK-NEXT: ret i32 [[OR]] ; %and = and i32 %a, %b @@ -743,7 +743,7 @@ define i32 @test40d(i32 %a, i32 %b) { define i32 @test45(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @test45( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[Z:%.*]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], [[Z:%.*]] ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: ret i32 [[OR1]] ; @@ -757,7 +757,7 @@ define i32 @test45_uses1(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @test45_uses1( ; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[Z:%.*]] ; CHECK-NEXT: call void @use(i32 [[OR]]) -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[Z]], [[X:%.*]] +; 
CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], [[Z]] ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[TMP1]], [[Y]] ; CHECK-NEXT: ret i32 [[OR1]] ; @@ -771,7 +771,7 @@ define i32 @test45_uses1(i32 %x, i32 %y, i32 %z) { define i32 @test45_uses2(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @test45_uses2( ; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[Z:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and i32 [[OR]], [[X:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[OR]] ; CHECK-NEXT: call void @use(i32 [[AND]]) ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[AND]], [[Y]] ; CHECK-NEXT: ret i32 [[OR1]] @@ -1605,7 +1605,7 @@ define i32 @mul_no_common_bits_commute2(i32 %p1, i32 %p2) { define i32 @mul_no_common_bits_disjoint(i32 %x, i32 %y) { ; CHECK-LABEL: @mul_no_common_bits_disjoint( ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[Y:%.*]], 1 -; CHECK-NEXT: [[R:%.*]] = mul i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = mul i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[R]] ; %m = mul i32 %x, %y @@ -1976,7 +1976,7 @@ define i32 @or_xor_and_uses1(i32 %x, i32 %y, i32 %z) { define i32 @or_xor_and_uses2(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @or_xor_and_uses2( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[Z:%.*]] -; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[AND]], [[X:%.*]] +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X:%.*]], [[AND]] ; CHECK-NEXT: call void @use(i32 [[XOR]]) ; CHECK-NEXT: [[OR1:%.*]] = or i32 [[X]], [[Y]] ; CHECK-NEXT: ret i32 [[OR1]] @@ -2019,7 +2019,7 @@ define i32 @or_xor_and_commuted2(i32 %x, i32 %y, i32 %z) { define i32 @or_xor_and_commuted3(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @or_xor_and_commuted3( ; CHECK-NEXT: [[YY:%.*]] = mul i32 [[Y:%.*]], [[Y]] -; CHECK-NEXT: [[OR1:%.*]] = or i32 [[YY]], [[X:%.*]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[X:%.*]], [[YY]] ; CHECK-NEXT: ret i32 [[OR1]] ; %yy = mul i32 %y, %y ; thwart complexity-based ordering diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll 
b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll index 5ed7d641df65be..469375633b60e1 100644 --- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll +++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll @@ -210,7 +210,7 @@ define i32 @n5_extrause0(i64 %x, i32 %nbits) { ; CHECK-NEXT: call void @use64(i64 [[T2]]) ; CHECK-NEXT: call void @use64(i64 [[T3]]) ; CHECK-NEXT: call void @use32(i32 [[T4]]) -; CHECK-NEXT: [[T5:%.*]] = and i64 [[T3]], [[X:%.*]] +; CHECK-NEXT: [[T5:%.*]] = and i64 [[X:%.*]], [[T3]] ; CHECK-NEXT: call void @use64(i64 [[T5]]) ; CHECK-NEXT: [[T6:%.*]] = trunc i64 [[T5]] to i32 ; CHECK-NEXT: [[T7:%.*]] = shl i32 [[T6]], [[T4]] @@ -246,7 +246,7 @@ define i32 @n6_extrause1(i64 %x, i32 %nbits) { ; CHECK-NEXT: call void @use64(i64 [[T2]]) ; CHECK-NEXT: call void @use64(i64 [[T3]]) ; CHECK-NEXT: call void @use32(i32 [[T4]]) -; CHECK-NEXT: [[T5:%.*]] = and i64 [[T3]], [[X:%.*]] +; CHECK-NEXT: [[T5:%.*]] = and i64 [[X:%.*]], [[T3]] ; CHECK-NEXT: [[T6:%.*]] = trunc i64 [[T5]] to i32 ; CHECK-NEXT: call void @use32(i32 [[T6]]) ; CHECK-NEXT: [[T7:%.*]] = shl i32 [[T6]], [[T4]] @@ -282,7 +282,7 @@ define i32 @n7_extrause2(i64 %x, i32 %nbits) { ; CHECK-NEXT: call void @use64(i64 [[T2]]) ; CHECK-NEXT: call void @use64(i64 [[T3]]) ; CHECK-NEXT: call void @use32(i32 [[T4]]) -; CHECK-NEXT: [[T5:%.*]] = and i64 [[T3]], [[X:%.*]] +; CHECK-NEXT: [[T5:%.*]] = and i64 [[X:%.*]], [[T3]] ; CHECK-NEXT: call void @use64(i64 [[T5]]) ; CHECK-NEXT: [[T6:%.*]] = trunc i64 [[T5]] to i32 ; CHECK-NEXT: call void @use32(i32 [[T6]]) diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll index 1debf111b18cd7..bce2a1c3f7e505 100644 --- 
a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll +++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll @@ -132,7 +132,7 @@ define i32 @n3_extrause(i32 %x, i32 %nbits) { ; CHECK-NEXT: [[T0:%.*]] = add i32 [[NBITS:%.*]], -1 ; CHECK-NEXT: [[T1:%.*]] = shl nsw i32 -1, [[T0]] ; CHECK-NEXT: [[T2:%.*]] = xor i32 [[T1]], -1 -; CHECK-NEXT: [[T3:%.*]] = and i32 [[T2]], [[X:%.*]] +; CHECK-NEXT: [[T3:%.*]] = and i32 [[X:%.*]], [[T2]] ; CHECK-NEXT: [[T4:%.*]] = sub i32 32, [[NBITS]] ; CHECK-NEXT: call void @use32(i32 [[T0]]) ; CHECK-NEXT: call void @use32(i32 [[T1]]) diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll index 2673b1d74bb6fb..e03e45312687bc 100644 --- a/llvm/test/Transforms/InstCombine/phi.ll +++ b/llvm/test/Transforms/InstCombine/phi.ll @@ -1416,7 +1416,7 @@ define i1 @phi_knownnonzero_eq_oricmp_commuted(i32 %n, i32 %s, ptr %P, i32 %val) ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: ; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 1, [[IF_THEN]] ], [ [[N]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ORPHI:%.*]] = or i32 [[PHI]], [[VAL:%.*]] +; CHECK-NEXT: [[ORPHI:%.*]] = or i32 [[VAL:%.*]], [[PHI]] ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[ORPHI]], 0 ; CHECK-NEXT: ret i1 [[CMP1]] ; @@ -1506,7 +1506,7 @@ define i1 @phi_knownnonzero_ne_oricmp_commuted(i32 %n, i32 %s, ptr %P, i32 %val) ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: ; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 1, [[IF_THEN]] ], [ [[N]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ORPHI:%.*]] = or i32 [[PHI]], [[VAL:%.*]] +; CHECK-NEXT: [[ORPHI:%.*]] = or i32 [[VAL:%.*]], [[PHI]] ; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[ORPHI]], 0 ; CHECK-NEXT: ret i1 [[CMP1]] ; @@ -1580,7 +1580,7 @@ define i1 @phi_knownnonzero_ne_multiuse_oricmp_commuted(i32 %n, i32 %s, ptr %P, ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: ; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 1, [[IF_THEN]] ], [ [[N]], [[ENTRY:%.*]] ] -; 
CHECK-NEXT: [[ORPHI:%.*]] = or i32 [[PHI]], [[VAL:%.*]] +; CHECK-NEXT: [[ORPHI:%.*]] = or i32 [[VAL:%.*]], [[PHI]] ; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[ORPHI]], 0 ; CHECK-NEXT: br i1 [[CMP1]], label [[NEXT:%.*]], label [[CLEANUP:%.*]] ; CHECK: next: @@ -1622,7 +1622,7 @@ define i1 @phi_knownnonzero_eq_multiuse_andicmp(i32 %n, i32 %s, ptr %P, i32 %val ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LOAD]], [[N]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[N]], [[LOAD]] ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 1, i32 2 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: @@ -1669,7 +1669,7 @@ define i1 @phi_knownnonzero_ne_multiuse_andicmp(i32 %n, i32 %s, ptr %P, i32 %val ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LOAD]], [[N]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[N]], [[LOAD]] ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 1, i32 2 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: diff --git a/llvm/test/Transforms/InstCombine/pr44242.ll b/llvm/test/Transforms/InstCombine/pr44242.ll index e86c17057fe27c..bce22734127da2 100644 --- a/llvm/test/Transforms/InstCombine/pr44242.ll +++ b/llvm/test/Transforms/InstCombine/pr44242.ll @@ -12,7 +12,7 @@ define float @sitofp(float %x) { ; CHECK: loop_header: ; CHECK-NEXT: [[VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VAL_INCR_CASTED:%.*]], [[LOOP:%.*]] ] ; CHECK-NEXT: [[VAL_CASTED:%.*]] = bitcast i32 [[VAL]] to float -; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[VAL_CASTED]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[VAL_CASTED]] ; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[LOOP]] ; CHECK: loop: ; CHECK-NEXT: [[VAL_INCR:%.*]] = fadd float [[VAL_CASTED]], 1.000000e+00 
@@ -46,7 +46,7 @@ define <2 x i16> @bitcast(float %x) { ; CHECK: loop_header: ; CHECK-NEXT: [[VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VAL_INCR_CASTED:%.*]], [[LOOP:%.*]] ] ; CHECK-NEXT: [[VAL_CASTED:%.*]] = bitcast i32 [[VAL]] to float -; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[VAL_CASTED]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[VAL_CASTED]] ; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[LOOP]] ; CHECK: loop: ; CHECK-NEXT: [[VAL_INCR:%.*]] = fadd float [[VAL_CASTED]], 1.000000e+00 @@ -82,7 +82,7 @@ define void @store_volatile(float %x) { ; CHECK: loop_header: ; CHECK-NEXT: [[VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VAL_INCR_CASTED:%.*]], [[LOOP:%.*]] ] ; CHECK-NEXT: [[VAL_CASTED:%.*]] = bitcast i32 [[VAL]] to float -; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[VAL_CASTED]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[VAL_CASTED]] ; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[LOOP]] ; CHECK: loop: ; CHECK-NEXT: [[VAL_INCR:%.*]] = fadd float [[VAL_CASTED]], 1.000000e+00 @@ -149,7 +149,7 @@ define i32 @multiple_phis(float %x) { ; CHECK: loop_header: ; CHECK-NEXT: [[VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VAL2:%.*]], [[LOOP_END:%.*]] ] ; CHECK-NEXT: [[VAL_CASTED:%.*]] = bitcast i32 [[VAL]] to float -; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[VAL_CASTED]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[X:%.*]], [[VAL_CASTED]] ; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[VAL_CASTED]], 2.000000e+00 diff --git a/llvm/test/Transforms/InstCombine/pr49688.ll b/llvm/test/Transforms/InstCombine/pr49688.ll index 284b098b02afa0..902aea262f537a 100644 --- a/llvm/test/Transforms/InstCombine/pr49688.ll +++ b/llvm/test/Transforms/InstCombine/pr49688.ll @@ -7,7 +7,7 @@ define i1 @f(i32 %i1) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I1:%.*]], 0 ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 
7, [[I1]] -; CHECK-NEXT: [[CMP4:%.*]] = icmp slt i32 [[SHR]], [[I1]] +; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[I1]], [[SHR]] ; CHECK-NEXT: [[I2:%.*]] = select i1 [[CMP]], i1 true, i1 [[CMP4]] ; CHECK-NEXT: ret i1 [[I2]] ; @@ -24,7 +24,7 @@ define i32 @f2(i32 signext %g, i32 zeroext %h) { ; CHECK-LABEL: @f2( ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[G:%.*]], 0 ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 7, [[H:%.*]] -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[SHR]], [[G]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[G]], [[SHR]] ; CHECK-NEXT: [[DOT0:%.*]] = select i1 [[CMP]], i1 true, i1 [[CMP1]] ; CHECK-NEXT: [[LOR_EXT:%.*]] = zext i1 [[DOT0]] to i32 ; CHECK-NEXT: ret i32 [[LOR_EXT]] diff --git a/llvm/test/Transforms/InstCombine/pr75369.ll b/llvm/test/Transforms/InstCombine/pr75369.ll index 2f90753504b36d..3855880047d6b7 100644 --- a/llvm/test/Transforms/InstCombine/pr75369.ll +++ b/llvm/test/Transforms/InstCombine/pr75369.ll @@ -5,7 +5,7 @@ define i32 @main(ptr %a, i8 %a0, i32 %conv, i8 %a1) { ; CHECK-LABEL: define i32 @main( ; CHECK-SAME: ptr [[A:%.*]], i8 [[A0:%.*]], i32 [[CONV:%.*]], i8 [[A1:%.*]]) { ; CHECK-NEXT: [[A3:%.*]] = trunc i32 [[CONV]] to i8 -; CHECK-NEXT: [[OR11:%.*]] = or i8 [[A3]], [[A0]] +; CHECK-NEXT: [[OR11:%.*]] = or i8 [[A0]], [[A3]] ; CHECK-NEXT: store i8 [[OR11]], ptr [[A]], align 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[A1]], 0 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) diff --git a/llvm/test/Transforms/InstCombine/ptr-int-ptr-icmp.ll b/llvm/test/Transforms/InstCombine/ptr-int-ptr-icmp.ll index 5249aa4269e877..eec78063805a1a 100644 --- a/llvm/test/Transforms/InstCombine/ptr-int-ptr-icmp.ll +++ b/llvm/test/Transforms/InstCombine/ptr-int-ptr-icmp.ll @@ -8,7 +8,7 @@ target triple = "x86_64-unknown-linux-gnu" define i1 @func(ptr %X, ptr %Y) { ; CHECK-LABEL: @func( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret i1 [[CMP]] ; %i = ptrtoint ptr %X 
to i64 @@ -19,7 +19,7 @@ define i1 @func(ptr %X, ptr %Y) { define <2 x i1> @func_vec(<2 x ptr> %X, <2 x ptr> %Y) { ; CHECK-LABEL: @func_vec( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x ptr> [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x ptr> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %i = ptrtoint <2 x ptr> %X to <2 x i64> @@ -30,7 +30,7 @@ define <2 x i1> @func_vec(<2 x ptr> %X, <2 x ptr> %Y) { define @func_svec( %X, %Y) { ; CHECK-LABEL: @func_svec( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret [[CMP]] ; %i = ptrtoint %X to @@ -41,7 +41,7 @@ define @func_svec( %X, %Y define i1 @func_pointer_different_types(ptr %X, ptr %Y) { ; CHECK-LABEL: @func_pointer_different_types( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret i1 [[CMP]] ; %i = ptrtoint ptr %X to i64 @@ -72,7 +72,7 @@ define i1 @func_integer_type_too_small(ptr %X, ptr %Y) { ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[X:%.*]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 4294967295 ; CHECK-NEXT: [[P:%.*]] = inttoptr i64 [[TMP2]] to ptr -; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[Y:%.*]], [[P]] ; CHECK-NEXT: ret i1 [[CMP]] ; %i = ptrtoint ptr %X to i32 @@ -87,7 +87,7 @@ define i1 @func_ptr_different_addrspace(ptr %X, ptr addrspace(3) %Y){ ; CHECK-LABEL: @func_ptr_different_addrspace( ; CHECK-NEXT: [[I:%.*]] = ptrtoint ptr [[X:%.*]] to i64 ; CHECK-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr addrspace(3) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(3) [[P]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(3) [[Y:%.*]], [[P]] ; CHECK-NEXT: ret i1 [[CMP]] ; %i = ptrtoint ptr %X to i64 @@ -103,7 +103,7 @@ define i1 @func_ptr_different_addrspace1(ptr addrspace(2) %X, ptr %Y){ ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr 
addrspace(2) [[X:%.*]] to i32 ; CHECK-NEXT: [[I:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr -; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[Y:%.*]], [[P]] ; CHECK-NEXT: ret i1 [[CMP]] ; %i = ptrtoint ptr addrspace(2) %X to i64 diff --git a/llvm/test/Transforms/InstCombine/ptrmask.ll b/llvm/test/Transforms/InstCombine/ptrmask.ll index 4631b81cd1ce1f..24777b1b7f2085 100644 --- a/llvm/test/Transforms/InstCombine/ptrmask.ll +++ b/llvm/test/Transforms/InstCombine/ptrmask.ll @@ -155,7 +155,7 @@ define i64 @ptrtoint_of_ptrmask(ptr %p, i64 %m) { ; CHECK-LABEL: define i64 @ptrtoint_of_ptrmask ; CHECK-SAME: (ptr [[P:%.*]], i64 [[M:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 -; CHECK-NEXT: [[R:%.*]] = and i64 [[TMP1]], [[M]] +; CHECK-NEXT: [[R:%.*]] = and i64 [[M]], [[TMP1]] ; CHECK-NEXT: ret i64 [[R]] ; %pm = call ptr @llvm.ptrmask.p0.i64(ptr %p, i64 %m) @@ -168,7 +168,7 @@ define i32 @ptrtoint_of_ptrmask2(ptr %p, i64 %m) { ; CHECK-LABEL: define i32 @ptrtoint_of_ptrmask2 ; CHECK-SAME: (ptr [[P:%.*]], i64 [[M:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], [[M]] +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[M]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = trunc i64 [[TMP2]] to i32 ; CHECK-NEXT: ret i32 [[R]] ; @@ -181,7 +181,7 @@ define <2 x i64> @ptrtoint_of_ptrmask_vec(<2 x ptr> %p, <2 x i64> %m) { ; CHECK-LABEL: define <2 x i64> @ptrtoint_of_ptrmask_vec ; CHECK-SAME: (<2 x ptr> [[P:%.*]], <2 x i64> [[M:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <2 x ptr> [[P]] to <2 x i64> -; CHECK-NEXT: [[R:%.*]] = and <2 x i64> [[TMP1]], [[M]] +; CHECK-NEXT: [[R:%.*]] = and <2 x i64> [[M]], [[TMP1]] ; CHECK-NEXT: ret <2 x i64> [[R]] ; %pm = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %p, <2 x i64> %m) @@ -193,7 +193,7 @@ define <2 x i32> @ptrtoint_of_ptrmask_vec2(<2 x ptr> %p, <2 x i64> %m) { ; CHECK-LABEL: define <2 
x i32> @ptrtoint_of_ptrmask_vec2 ; CHECK-SAME: (<2 x ptr> [[P:%.*]], <2 x i64> [[M:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <2 x ptr> [[P]] to <2 x i64> -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[TMP1]], [[M]] +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[M]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[R]] ; @@ -374,10 +374,10 @@ define ptr @ptrmask_to_modified_gep6(ptr align 16 %p) { define ptr @ptrmask_to_modified_gep_indirect0(ptr align 16 %p) { ; CHECK-LABEL: define ptr @ptrmask_to_modified_gep_indirect0 ; CHECK-SAME: (ptr align 16 [[P:%.*]]) { -; 44 from 4*sizeof(i32) + (31 & -4) -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[P]], i64 44 -; CHECK-NEXT: ret ptr [[GEP1]] +; CHECK-NEXT: [[GEP11:%.*]] = getelementptr i8, ptr [[P]], i64 44 +; CHECK-NEXT: ret ptr [[GEP11]] ; +; 44 from 4*sizeof(i32) + (31 & -4) %gep0 = getelementptr i32, ptr %p, i32 4 %gep1 = getelementptr i8, ptr %gep0, i32 31 %pm = call ptr @llvm.ptrmask.p0.i64(ptr %gep1, i64 -4) @@ -387,11 +387,11 @@ define ptr @ptrmask_to_modified_gep_indirect0(ptr align 16 %p) { define ptr @ptrmask_to_modified_gep_indirect1(ptr %p) { ; CHECK-LABEL: define ptr @ptrmask_to_modified_gep_indirect1 ; CHECK-SAME: (ptr [[P:%.*]]) { - -; CHECK-NEXT: [[R:%.*]] = call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[P]], i64 -16) -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[R]], i64 32 -; CHECK-NEXT: ret ptr [[GEP]] +; CHECK-NEXT: [[PM0:%.*]] = call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[P]], i64 -16) +; CHECK-NEXT: [[PGEP1:%.*]] = getelementptr i8, ptr [[PM0]], i64 32 +; CHECK-NEXT: ret ptr [[PGEP1]] ; + %pm0 = call ptr @llvm.ptrmask.p0.i64(ptr %p, i64 -16) %pgep = getelementptr i8, ptr %pm0, i64 33 %r = call ptr @llvm.ptrmask.p0.i64(ptr %pgep, i64 -16) diff --git a/llvm/test/Transforms/InstCombine/range-check.ll b/llvm/test/Transforms/InstCombine/range-check.ll index 210e57c1d1fe4c..ebb310fb7c1f8f 100644 --- 
a/llvm/test/Transforms/InstCombine/range-check.ll +++ b/llvm/test/Transforms/InstCombine/range-check.ll @@ -7,7 +7,7 @@ define i1 @test_and1(i32 %x, i32 %n) { ; CHECK-LABEL: @test_and1( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: ret i1 [[C]] ; %nn = and i32 %n, 2147483647 @@ -21,7 +21,7 @@ define i1 @test_and1_logical(i32 %x, i32 %n) { ; CHECK-LABEL: @test_and1_logical( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 ; CHECK-NEXT: [[A:%.*]] = icmp sgt i32 [[X:%.*]], -1 -; CHECK-NEXT: [[B:%.*]] = icmp sgt i32 [[NN]], [[X]] +; CHECK-NEXT: [[B:%.*]] = icmp slt i32 [[X]], [[NN]] ; CHECK-NEXT: [[C:%.*]] = select i1 [[A]], i1 [[B]], i1 false ; CHECK-NEXT: ret i1 [[C]] ; @@ -35,7 +35,7 @@ define i1 @test_and1_logical(i32 %x, i32 %n) { define i1 @test_and2(i32 %x, i32 %n) { ; CHECK-LABEL: @test_and2( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[C:%.*]] = icmp uge i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ule i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: ret i1 [[C]] ; %nn = and i32 %n, 2147483647 @@ -49,7 +49,7 @@ define i1 @test_and2_logical(i32 %x, i32 %n) { ; CHECK-LABEL: @test_and2_logical( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 ; CHECK-NEXT: [[A:%.*]] = icmp sgt i32 [[X:%.*]], -1 -; CHECK-NEXT: [[B:%.*]] = icmp sge i32 [[NN]], [[X]] +; CHECK-NEXT: [[B:%.*]] = icmp sle i32 [[X]], [[NN]] ; CHECK-NEXT: [[C:%.*]] = select i1 [[A]], i1 [[B]], i1 false ; CHECK-NEXT: ret i1 [[C]] ; @@ -63,7 +63,7 @@ define i1 @test_and2_logical(i32 %x, i32 %n) { define i1 @test_and3(i32 %x, i32 %n) { ; CHECK-LABEL: @test_and3( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: ret i1 [[C]] ; %nn = and i32 %n, 2147483647 @@ -76,7 +76,7 @@ define i1 
@test_and3(i32 %x, i32 %n) { define i1 @test_and3_logical(i32 %x, i32 %n) { ; CHECK-LABEL: @test_and3_logical( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: ret i1 [[C]] ; %nn = and i32 %n, 2147483647 @@ -89,7 +89,7 @@ define i1 @test_and3_logical(i32 %x, i32 %n) { define i1 @test_and4(i32 %x, i32 %n) { ; CHECK-LABEL: @test_and4( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[C:%.*]] = icmp uge i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ule i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: ret i1 [[C]] ; %nn = and i32 %n, 2147483647 @@ -102,7 +102,7 @@ define i1 @test_and4(i32 %x, i32 %n) { define i1 @test_and4_logical(i32 %x, i32 %n) { ; CHECK-LABEL: @test_and4_logical( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[C:%.*]] = icmp uge i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ule i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: ret i1 [[C]] ; %nn = and i32 %n, 2147483647 @@ -115,7 +115,7 @@ define i1 @test_and4_logical(i32 %x, i32 %n) { define i1 @test_or1(i32 %x, i32 %n) { ; CHECK-LABEL: @test_or1( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[C:%.*]] = icmp ule i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp uge i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: ret i1 [[C]] ; %nn = and i32 %n, 2147483647 @@ -129,7 +129,7 @@ define i1 @test_or1_logical(i32 %x, i32 %n) { ; CHECK-LABEL: @test_or1_logical( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 ; CHECK-NEXT: [[A:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[B:%.*]] = icmp sle i32 [[NN]], [[X]] +; CHECK-NEXT: [[B:%.*]] = icmp sge i32 [[X]], [[NN]] ; CHECK-NEXT: [[C:%.*]] = select i1 [[A]], i1 true, i1 [[B]] ; CHECK-NEXT: ret i1 [[C]] ; @@ -143,7 +143,7 @@ define i1 @test_or1_logical(i32 %x, i32 %n) { define i1 @test_or2(i32 %x, i32 %n) { ; CHECK-LABEL: @test_or2( ; CHECK-NEXT: [[NN:%.*]] 
= and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: ret i1 [[C]] ; %nn = and i32 %n, 2147483647 @@ -157,7 +157,7 @@ define i1 @test_or2_logical(i32 %x, i32 %n) { ; CHECK-LABEL: @test_or2_logical( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 ; CHECK-NEXT: [[A:%.*]] = icmp slt i32 [[X:%.*]], 0 -; CHECK-NEXT: [[B:%.*]] = icmp slt i32 [[NN]], [[X]] +; CHECK-NEXT: [[B:%.*]] = icmp sgt i32 [[X]], [[NN]] ; CHECK-NEXT: [[C:%.*]] = select i1 [[A]], i1 true, i1 [[B]] ; CHECK-NEXT: ret i1 [[C]] ; @@ -171,7 +171,7 @@ define i1 @test_or2_logical(i32 %x, i32 %n) { define i1 @test_or3(i32 %x, i32 %n) { ; CHECK-LABEL: @test_or3( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[C:%.*]] = icmp ule i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp uge i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: ret i1 [[C]] ; %nn = and i32 %n, 2147483647 @@ -184,7 +184,7 @@ define i1 @test_or3(i32 %x, i32 %n) { define i1 @test_or3_logical(i32 %x, i32 %n) { ; CHECK-LABEL: @test_or3_logical( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[C:%.*]] = icmp ule i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp uge i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: ret i1 [[C]] ; %nn = and i32 %n, 2147483647 @@ -197,7 +197,7 @@ define i1 @test_or3_logical(i32 %x, i32 %n) { define i1 @test_or4(i32 %x, i32 %n) { ; CHECK-LABEL: @test_or4( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: ret i1 [[C]] ; %nn = and i32 %n, 2147483647 @@ -210,7 +210,7 @@ define i1 @test_or4(i32 %x, i32 %n) { define i1 @test_or4_logical(i32 %x, i32 %n) { ; CHECK-LABEL: @test_or4_logical( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 
[[X:%.*]], [[NN]] ; CHECK-NEXT: ret i1 [[C]] ; %nn = and i32 %n, 2147483647 @@ -225,7 +225,7 @@ define i1 @test_or4_logical(i32 %x, i32 %n) { define i1 @negative1(i32 %x, i32 %n) { ; CHECK-LABEL: @negative1( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[A:%.*]] = icmp sgt i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[A:%.*]] = icmp slt i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: [[B:%.*]] = icmp sgt i32 [[X]], 0 ; CHECK-NEXT: [[C:%.*]] = and i1 [[A]], [[B]] ; CHECK-NEXT: ret i1 [[C]] @@ -240,7 +240,7 @@ define i1 @negative1(i32 %x, i32 %n) { define i1 @negative1_logical(i32 %x, i32 %n) { ; CHECK-LABEL: @negative1_logical( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[A:%.*]] = icmp sgt i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[A:%.*]] = icmp slt i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: [[B:%.*]] = icmp sgt i32 [[X]], 0 ; CHECK-NEXT: [[C:%.*]] = and i1 [[A]], [[B]] ; CHECK-NEXT: ret i1 [[C]] @@ -281,7 +281,7 @@ define i1 @negative2_logical(i32 %x, i32 %n) { define i1 @negative3(i32 %x, i32 %y, i32 %n) { ; CHECK-LABEL: @negative3( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[A:%.*]] = icmp sgt i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[A:%.*]] = icmp slt i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: [[B:%.*]] = icmp sgt i32 [[Y:%.*]], -1 ; CHECK-NEXT: [[C:%.*]] = and i1 [[A]], [[B]] ; CHECK-NEXT: ret i1 [[C]] @@ -296,7 +296,7 @@ define i1 @negative3(i32 %x, i32 %y, i32 %n) { define i1 @negative3_logical(i32 %x, i32 %y, i32 %n) { ; CHECK-LABEL: @negative3_logical( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[A:%.*]] = icmp sgt i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[A:%.*]] = icmp slt i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: [[B:%.*]] = icmp sgt i32 [[Y:%.*]], -1 ; CHECK-NEXT: [[C:%.*]] = select i1 [[A]], i1 [[B]], i1 false ; CHECK-NEXT: ret i1 [[C]] @@ -311,7 +311,7 @@ define i1 @negative3_logical(i32 %x, i32 %y, i32 %n) { define i1 @negative4(i32 %x, i32 %n) { ; CHECK-LABEL: @negative4( ; 
CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[A:%.*]] = icmp ne i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[A:%.*]] = icmp ne i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: [[B:%.*]] = icmp sgt i32 [[X]], -1 ; CHECK-NEXT: [[C:%.*]] = and i1 [[A]], [[B]] ; CHECK-NEXT: ret i1 [[C]] @@ -326,7 +326,7 @@ define i1 @negative4(i32 %x, i32 %n) { define i1 @negative4_logical(i32 %x, i32 %n) { ; CHECK-LABEL: @negative4_logical( ; CHECK-NEXT: [[NN:%.*]] = and i32 [[N:%.*]], 2147483647 -; CHECK-NEXT: [[A:%.*]] = icmp ne i32 [[NN]], [[X:%.*]] +; CHECK-NEXT: [[A:%.*]] = icmp ne i32 [[X:%.*]], [[NN]] ; CHECK-NEXT: [[B:%.*]] = icmp sgt i32 [[X]], -1 ; CHECK-NEXT: [[C:%.*]] = and i1 [[A]], [[B]] ; CHECK-NEXT: ret i1 [[C]] diff --git a/llvm/test/Transforms/InstCombine/reassociate-nuw.ll b/llvm/test/Transforms/InstCombine/reassociate-nuw.ll index 9718739ed8ab25..99f07c0a8e0ad1 100644 --- a/llvm/test/Transforms/InstCombine/reassociate-nuw.ll +++ b/llvm/test/Transforms/InstCombine/reassociate-nuw.ll @@ -132,7 +132,7 @@ define i32 @tryFactorization_add_nuw_mul(i32 %x) { define i32 @tryFactorization_add_nuw_mul_nuw_mul_nuw_var(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @tryFactorization_add_nuw_mul_nuw_mul_nuw_var( ; CHECK-NEXT: [[MUL21:%.*]] = add i32 [[Y:%.*]], [[Z:%.*]] -; CHECK-NEXT: [[ADD1:%.*]] = mul nuw i32 [[MUL21]], [[X:%.*]] +; CHECK-NEXT: [[ADD1:%.*]] = mul nuw i32 [[X:%.*]], [[MUL21]] ; CHECK-NEXT: ret i32 [[ADD1]] ; %mul1 = mul nuw i32 %x, %y @@ -144,7 +144,7 @@ define i32 @tryFactorization_add_nuw_mul_nuw_mul_nuw_var(i32 %x, i32 %y, i32 %z) define i32 @tryFactorization_add_nuw_mul_mul_nuw_var(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @tryFactorization_add_nuw_mul_mul_nuw_var( ; CHECK-NEXT: [[MUL21:%.*]] = add i32 [[Y:%.*]], [[Z:%.*]] -; CHECK-NEXT: [[ADD1:%.*]] = mul i32 [[MUL21]], [[X:%.*]] +; CHECK-NEXT: [[ADD1:%.*]] = mul i32 [[X:%.*]], [[MUL21]] ; CHECK-NEXT: ret i32 [[ADD1]] ; %mul1 = mul i32 %x, %y @@ -156,7 +156,7 @@ define i32 
@tryFactorization_add_nuw_mul_mul_nuw_var(i32 %x, i32 %y, i32 %z) { define i32 @tryFactorization_add_nuw_mul_nuw_mul_var(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @tryFactorization_add_nuw_mul_nuw_mul_var( ; CHECK-NEXT: [[MUL21:%.*]] = add i32 [[Y:%.*]], [[Z:%.*]] -; CHECK-NEXT: [[ADD1:%.*]] = mul i32 [[MUL21]], [[X:%.*]] +; CHECK-NEXT: [[ADD1:%.*]] = mul i32 [[X:%.*]], [[MUL21]] ; CHECK-NEXT: ret i32 [[ADD1]] ; %mul1 = mul nuw i32 %x, %y @@ -168,7 +168,7 @@ define i32 @tryFactorization_add_nuw_mul_nuw_mul_var(i32 %x, i32 %y, i32 %z) { define i32 @tryFactorization_add_mul_nuw_mul_var(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @tryFactorization_add_mul_nuw_mul_var( ; CHECK-NEXT: [[MUL21:%.*]] = add i32 [[Y:%.*]], [[Z:%.*]] -; CHECK-NEXT: [[ADD1:%.*]] = mul i32 [[MUL21]], [[X:%.*]] +; CHECK-NEXT: [[ADD1:%.*]] = mul i32 [[X:%.*]], [[MUL21]] ; CHECK-NEXT: ret i32 [[ADD1]] ; %mul1 = mul nuw i32 %x, %y diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-b.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-b.ll index cb6775e689b8cb..8c61e24a97f1d0 100644 --- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-b.ll +++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-b.ll @@ -20,7 +20,7 @@ define i32 @t0_basic(i64 %x, i32 %nbits) { ; CHECK-NEXT: [[T1:%.*]] = shl nsw i64 -1, [[T0]] ; CHECK-NEXT: [[T2:%.*]] = xor i64 [[T1]], -1 ; CHECK-NEXT: [[T3:%.*]] = sub i32 32, [[NBITS]] -; CHECK-NEXT: [[T4:%.*]] = and i64 [[T2]], [[X:%.*]] +; CHECK-NEXT: [[T4:%.*]] = and i64 [[X:%.*]], [[T2]] ; CHECK-NEXT: call void @use32(i32 [[NBITS]]) ; CHECK-NEXT: call void @use64(i64 [[T0]]) ; CHECK-NEXT: call void @use64(i64 [[T1]]) @@ -60,7 +60,7 @@ define <8 x i32> @t1_vec_splat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: [[T1:%.*]] = shl nsw <8 x i64> , [[T0]] ; CHECK-NEXT: [[T2:%.*]] = xor <8 x i64> 
[[T1]], ; CHECK-NEXT: [[T3:%.*]] = sub <8 x i32> , [[NBITS]] -; CHECK-NEXT: [[T4:%.*]] = and <8 x i64> [[T2]], [[X:%.*]] +; CHECK-NEXT: [[T4:%.*]] = and <8 x i64> [[X:%.*]], [[T2]] ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[NBITS]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) @@ -95,7 +95,7 @@ define <8 x i32> @t2_vec_splat_poison(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: [[T1:%.*]] = shl nsw <8 x i64> , [[T0]] ; CHECK-NEXT: [[T2:%.*]] = xor <8 x i64> [[T1]], ; CHECK-NEXT: [[T3:%.*]] = sub <8 x i32> , [[NBITS]] -; CHECK-NEXT: [[T4:%.*]] = and <8 x i64> [[T2]], [[X:%.*]] +; CHECK-NEXT: [[T4:%.*]] = and <8 x i64> [[X:%.*]], [[T2]] ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[NBITS]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) @@ -131,7 +131,7 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: [[T2:%.*]] = shl nsw <8 x i64> , [[T1]] ; CHECK-NEXT: [[T3:%.*]] = xor <8 x i64> [[T2]], ; CHECK-NEXT: [[T4:%.*]] = sub <8 x i32> , [[NBITS]] -; CHECK-NEXT: [[T5:%.*]] = and <8 x i64> [[T3]], [[X:%.*]] +; CHECK-NEXT: [[T5:%.*]] = and <8 x i64> [[X:%.*]], [[T3]] ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T2]]) @@ -206,7 +206,7 @@ define i32 @n5_extrause(i64 %x, i32 %nbits) { ; CHECK-NEXT: [[T1:%.*]] = shl nsw i64 -1, [[T0]] ; CHECK-NEXT: [[T2:%.*]] = xor i64 [[T1]], -1 ; CHECK-NEXT: [[T3:%.*]] = sub i32 32, [[NBITS]] -; CHECK-NEXT: [[T4:%.*]] = and i64 [[T2]], [[X:%.*]] +; CHECK-NEXT: [[T4:%.*]] = and i64 [[X:%.*]], [[T2]] ; CHECK-NEXT: call void @use32(i32 [[NBITS]]) ; CHECK-NEXT: call void @use64(i64 [[T0]]) ; CHECK-NEXT: call void @use64(i64 [[T1]]) diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-b.ll 
b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-b.ll index 4b955a894fcfe6..e3c09813891163 100644 --- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-b.ll +++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-b.ll @@ -19,7 +19,7 @@ define i32 @t0_basic(i32 %x, i32 %nbits) { ; CHECK-LABEL: @t0_basic( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[NBITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i32 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = and i32 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = and i32 [[X:%.*]], [[T1]] ; CHECK-NEXT: [[T3:%.*]] = sub i32 32, [[NBITS]] ; CHECK-NEXT: call void @use32(i32 [[T0]]) ; CHECK-NEXT: call void @use32(i32 [[T1]]) @@ -44,7 +44,7 @@ define i32 @t1_bigger_shift(i32 %x, i32 %nbits) { ; CHECK-LABEL: @t1_bigger_shift( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[NBITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i32 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = and i32 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = and i32 [[X:%.*]], [[T1]] ; CHECK-NEXT: [[T3:%.*]] = sub i32 33, [[NBITS]] ; CHECK-NEXT: call void @use32(i32 [[T0]]) ; CHECK-NEXT: call void @use32(i32 [[T1]]) @@ -70,7 +70,7 @@ define i32 @t2_bigger_mask(i32 %x, i32 %nbits) { ; CHECK-NEXT: [[T0:%.*]] = add i32 [[NBITS:%.*]], 1 ; CHECK-NEXT: [[T1:%.*]] = shl nsw i32 -1, [[T0]] ; CHECK-NEXT: [[T2:%.*]] = xor i32 [[T1]], -1 -; CHECK-NEXT: [[T3:%.*]] = and i32 [[T2]], [[X:%.*]] +; CHECK-NEXT: [[T3:%.*]] = and i32 [[X:%.*]], [[T2]] ; CHECK-NEXT: [[T4:%.*]] = sub i32 32, [[NBITS]] ; CHECK-NEXT: call void @use32(i32 [[T0]]) ; CHECK-NEXT: call void @use32(i32 [[T1]]) @@ -102,7 +102,7 @@ define <3 x i32> @t3_vec_splat(<3 x i32> %x, <3 x i32> %nbits) { ; CHECK-LABEL: @t3_vec_splat( ; CHECK-NEXT: [[T1:%.*]] = shl nsw <3 x i32> , [[NBITS:%.*]] ; CHECK-NEXT: [[T2:%.*]] = xor <3 x i32> [[T1]], -; CHECK-NEXT: [[T3:%.*]] = and <3 x i32> [[T2]], [[X:%.*]] +; CHECK-NEXT: [[T3:%.*]] = and <3 x i32> [[X:%.*]], [[T2]] ; CHECK-NEXT: 
[[T4:%.*]] = sub <3 x i32> , [[NBITS]] ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[NBITS]]) ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T1]]) @@ -131,7 +131,7 @@ define <3 x i32> @t4_vec_nonsplat(<3 x i32> %x, <3 x i32> %nbits) { ; CHECK-NEXT: [[T0:%.*]] = add <3 x i32> [[NBITS:%.*]], ; CHECK-NEXT: [[T1:%.*]] = shl nsw <3 x i32> , [[T0]] ; CHECK-NEXT: [[T2:%.*]] = xor <3 x i32> [[T1]], -; CHECK-NEXT: [[T3:%.*]] = and <3 x i32> [[T2]], [[X:%.*]] +; CHECK-NEXT: [[T3:%.*]] = and <3 x i32> [[X:%.*]], [[T2]] ; CHECK-NEXT: [[T4:%.*]] = sub <3 x i32> , [[NBITS]] ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T0]]) ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T1]]) @@ -159,7 +159,7 @@ define <3 x i32> @t5_vec_poison(<3 x i32> %x, <3 x i32> %nbits) { ; CHECK-LABEL: @t5_vec_poison( ; CHECK-NEXT: [[T1:%.*]] = shl nsw <3 x i32> , [[NBITS:%.*]] ; CHECK-NEXT: [[T2:%.*]] = xor <3 x i32> [[T1]], -; CHECK-NEXT: [[T3:%.*]] = and <3 x i32> [[T2]], [[X:%.*]] +; CHECK-NEXT: [[T3:%.*]] = and <3 x i32> [[X:%.*]], [[T2]] ; CHECK-NEXT: [[T4:%.*]] = sub <3 x i32> , [[NBITS]] ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[NBITS]]) ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T1]]) @@ -285,7 +285,7 @@ define i32 @t9_nuw(i32 %x, i32 %nbits) { ; CHECK-LABEL: @t9_nuw( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[NBITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i32 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = and i32 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = and i32 [[X:%.*]], [[T1]] ; CHECK-NEXT: [[T3:%.*]] = sub i32 32, [[NBITS]] ; CHECK-NEXT: call void @use32(i32 [[T0]]) ; CHECK-NEXT: call void @use32(i32 [[T1]]) @@ -310,7 +310,7 @@ define i32 @t10_nsw(i32 %x, i32 %nbits) { ; CHECK-LABEL: @t10_nsw( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[NBITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i32 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = and i32 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = and i32 [[X:%.*]], [[T1]] ; CHECK-NEXT: [[T3:%.*]] = sub i32 32, [[NBITS]] ; CHECK-NEXT: call void @use32(i32 [[T0]]) ; 
CHECK-NEXT: call void @use32(i32 [[T1]]) @@ -335,7 +335,7 @@ define i32 @t11_nuw_nsw(i32 %x, i32 %nbits) { ; CHECK-LABEL: @t11_nuw_nsw( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i32 -1, [[NBITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i32 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = and i32 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = and i32 [[X:%.*]], [[T1]] ; CHECK-NEXT: [[T3:%.*]] = sub i32 32, [[NBITS]] ; CHECK-NEXT: call void @use32(i32 [[T0]]) ; CHECK-NEXT: call void @use32(i32 [[T1]]) @@ -362,7 +362,7 @@ define i32 @n12_not_minus_one(i32 %x, i32 %nbits) { ; CHECK-LABEL: @n12_not_minus_one( ; CHECK-NEXT: [[T0:%.*]] = shl i32 -2, [[NBITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = xor i32 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = and i32 [[T1]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = and i32 [[X:%.*]], [[T1]] ; CHECK-NEXT: [[T3:%.*]] = sub i32 32, [[NBITS]] ; CHECK-NEXT: call void @use32(i32 [[T0]]) ; CHECK-NEXT: call void @use32(i32 [[T1]]) diff --git a/llvm/test/Transforms/InstCombine/rem.ll b/llvm/test/Transforms/InstCombine/rem.ll index de484fe6df8573..05ff214f91b8ce 100644 --- a/llvm/test/Transforms/InstCombine/rem.ll +++ b/llvm/test/Transforms/InstCombine/rem.ll @@ -239,7 +239,7 @@ define <2 x i1> @test3a_vec(<2 x i32> %A) { define i32 @test4(i32 %X, i1 %C) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[C:%.*]], i32 0, i32 7 -; CHECK-NEXT: [[R:%.*]] = and i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[R]] ; %V = select i1 %C, i32 1, i32 8 @@ -252,7 +252,7 @@ define i32 @test5(i32 %X, i8 %B) { ; CHECK-NEXT: [[SHIFT_UPGRD_1:%.*]] = zext nneg i8 [[B:%.*]] to i32 ; CHECK-NEXT: [[AMT:%.*]] = shl nuw i32 32, [[SHIFT_UPGRD_1]] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[AMT]], -1 -; CHECK-NEXT: [[V:%.*]] = and i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[V:%.*]] = and i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[V]] ; %shift.upgrd.1 = zext i8 %B to i32 @@ -340,7 +340,7 @@ define i64 @test14(i64 %x, i32 %y) { ; CHECK-NEXT: 
[[SHL:%.*]] = shl nuw i32 1, [[Y:%.*]] ; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SHL]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[ZEXT]], -1 -; CHECK-NEXT: [[UREM:%.*]] = and i64 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[UREM:%.*]] = and i64 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i64 [[UREM]] ; %shl = shl i32 1, %y @@ -353,7 +353,7 @@ define i64 @test15(i32 %x, i32 %y) { ; CHECK-LABEL: @test15( ; CHECK-NEXT: [[NOTMASK:%.*]] = shl nsw i32 -1, [[Y:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[NOTMASK]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[UREM:%.*]] = zext nneg i32 [[TMP2]] to i64 ; CHECK-NEXT: ret i64 [[UREM]] ; @@ -369,7 +369,7 @@ define i32 @test16(i32 %x, i32 %y) { ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[Y:%.*]], 11 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[SHR]], 4 ; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i32 [[AND]], 3 -; CHECK-NEXT: [[REM:%.*]] = and i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[REM:%.*]] = and i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[REM]] ; %shr = lshr i32 %y, 11 @@ -394,7 +394,7 @@ define i32 @test18(i16 %x, i32 %y) { ; CHECK-NEXT: [[TMP1:%.*]] = and i16 [[X:%.*]], 4 ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i16 [[TMP1]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[DOTNOT]], i32 63, i32 31 -; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i32 [[TMP3]] ; %1 = and i16 %x, 4 @@ -411,7 +411,7 @@ define i32 @test19(i32 %x, i32 %y) { ; CHECK-NEXT: [[C:%.*]] = and i32 [[A]], [[B]] ; CHECK-NEXT: [[D:%.*]] = add i32 [[C]], [[A]] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[D]], -1 -; CHECK-NEXT: [[E:%.*]] = and i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[E:%.*]] = and i32 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i32 [[E]] ; %A = shl i32 1, %x @@ -429,7 +429,7 @@ define i32 @test19_commutative0(i32 %x, i32 %y) { ; CHECK-NEXT: [[C:%.*]] = and i32 [[B]], [[A]] ; CHECK-NEXT: [[D:%.*]] = add i32 [[C]], 
[[A]] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[D]], -1 -; CHECK-NEXT: [[E:%.*]] = and i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[E:%.*]] = and i32 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i32 [[E]] ; %A = shl i32 1, %x @@ -447,7 +447,7 @@ define i32 @test19_commutative1(i32 %x, i32 %y) { ; CHECK-NEXT: [[C:%.*]] = and i32 [[A]], [[B]] ; CHECK-NEXT: [[D:%.*]] = add i32 [[A]], [[C]] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[D]], -1 -; CHECK-NEXT: [[E:%.*]] = and i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[E:%.*]] = and i32 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i32 [[E]] ; %A = shl i32 1, %x @@ -465,7 +465,7 @@ define i32 @test19_commutative2(i32 %x, i32 %y) { ; CHECK-NEXT: [[C:%.*]] = and i32 [[B]], [[A]] ; CHECK-NEXT: [[D:%.*]] = add i32 [[A]], [[C]] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[D]], -1 -; CHECK-NEXT: [[E:%.*]] = and i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[E:%.*]] = and i32 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i32 [[E]] ; %A = shl i32 1, %x @@ -726,7 +726,7 @@ define i1 @test26(i32 %A, i32 %B) { ; CHECK-LABEL: @test26( ; CHECK-NEXT: [[NOTMASK:%.*]] = shl nsw i32 -1, [[B:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[NOTMASK]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: [[E:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[E]] ; diff --git a/llvm/test/Transforms/InstCombine/result-of-add-of-negative-is-non-zero-and-no-underflow.ll b/llvm/test/Transforms/InstCombine/result-of-add-of-negative-is-non-zero-and-no-underflow.ll index 107ef291bf4393..8103d366d444d0 100644 --- a/llvm/test/Transforms/InstCombine/result-of-add-of-negative-is-non-zero-and-no-underflow.ll +++ b/llvm/test/Transforms/InstCombine/result-of-add-of-negative-is-non-zero-and-no-underflow.ll @@ -49,7 +49,7 @@ define i1 @t1(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp 
ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 @@ -70,7 +70,7 @@ define i1 @t1_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 @@ -92,7 +92,7 @@ define i1 @t2(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE:%.*]], [[OFFSET]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %offset, 0 @@ -113,7 +113,7 @@ define i1 @t2_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE:%.*]], [[OFFSET]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %offset, 0 @@ -137,7 +137,7 @@ define i1 @t3_oneuse0(i8 %base, i8 %offset) { ; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0 ; CHECK-NEXT: call void @use1(i1 [[NOT_NULL]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 @@ -161,7 +161,7 @@ define i1 @t3_oneuse0_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0 ; CHECK-NEXT: call void @use1(i1 [[NOT_NULL]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = 
icmp ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 @@ -184,7 +184,7 @@ define i1 @t4_oneuse1(i8 %base, i8 %offset) { ; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ult i8 [[ADJUSTED]], [[BASE]] ; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 @@ -208,7 +208,7 @@ define i1 @t4_oneuse1_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ult i8 [[ADJUSTED]], [[BASE]] ; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 @@ -281,7 +281,7 @@ define i1 @t6_commutativity0(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 @@ -302,7 +302,7 @@ define i1 @t6_commutativity0_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 @@ -322,7 +322,7 @@ define i1 @t7_commutativity1(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) 
; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 @@ -343,7 +343,7 @@ define i1 @t7_commutativity1_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 @@ -363,7 +363,7 @@ define i1 @t7_commutativity3(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 @@ -384,7 +384,7 @@ define i1 @t7_commutativity3_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 @@ -406,7 +406,7 @@ define i1 @t8(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 @@ -427,7 +427,7 @@ define i1 @t8_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE]], [[OFFSET:%.*]] ; 
CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 @@ -449,7 +449,7 @@ define i1 @t9(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 @@ -470,7 +470,7 @@ define i1 @t9_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp = icmp slt i8 %base, 0 diff --git a/llvm/test/Transforms/InstCombine/result-of-add-of-negative-or-zero-is-non-zero-and-no-underflow.ll b/llvm/test/Transforms/InstCombine/result-of-add-of-negative-or-zero-is-non-zero-and-no-underflow.ll index 0be4457ad3fc04..f967fcac367bb9 100644 --- a/llvm/test/Transforms/InstCombine/result-of-add-of-negative-or-zero-is-non-zero-and-no-underflow.ll +++ b/llvm/test/Transforms/InstCombine/result-of-add-of-negative-or-zero-is-non-zero-and-no-underflow.ll @@ -11,7 +11,7 @@ define i1 @t0(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE:%.*]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset @@ -27,7 +27,7 @@ define i1 @t0_logical(i8 %base, i8 %offset) { 
; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE:%.*]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset @@ -46,7 +46,7 @@ define i1 @t1_oneuse0(i8 %base, i8 %offset) { ; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0 ; CHECK-NEXT: call void @use1(i1 [[NOT_NULL]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset @@ -65,7 +65,7 @@ define i1 @t1_oneuse0_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0 ; CHECK-NEXT: call void @use1(i1 [[NOT_NULL]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset @@ -83,7 +83,7 @@ define i1 @t2_oneuse1(i8 %base, i8 %offset) { ; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ule i8 [[ADJUSTED]], [[BASE]] ; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset @@ -102,7 +102,7 @@ define i1 @t2_oneuse1_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ule i8 [[ADJUSTED]], [[BASE]] ; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset @@ 
-160,7 +160,7 @@ define i1 @t4_commutativity0(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE:%.*]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset @@ -176,7 +176,7 @@ define i1 @t4_commutativity0_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE:%.*]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset @@ -191,7 +191,7 @@ define i1 @t5_commutativity1(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE:%.*]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset @@ -207,7 +207,7 @@ define i1 @t5_commutativity1_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE:%.*]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset @@ -222,7 +222,7 @@ define i1 @t6_commutativity3(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE:%.*]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: 
[[R:%.*]] = icmp ugt i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset @@ -238,7 +238,7 @@ define i1 @t6_commutativity3_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE:%.*]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset @@ -255,7 +255,7 @@ define i1 @t7(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE:%.*]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset @@ -271,7 +271,7 @@ define i1 @t7_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE:%.*]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[OFFSET]] -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[TMP1]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[BASE]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset @@ -288,7 +288,7 @@ define i1 @t8(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE:%.*]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset @@ -304,7 +304,7 @@ define i1 @t8_logical(i8 %base, i8 %offset) { ; CHECK-NEXT: [[ADJUSTED:%.*]] = add i8 [[BASE:%.*]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 0, [[BASE]] -; CHECK-NEXT: 
[[R:%.*]] = icmp ult i8 [[TMP1]], [[OFFSET]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[OFFSET]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = add i8 %base, %offset diff --git a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll index a8be8180b9118c..30a5072c7edc82 100644 --- a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll +++ b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll @@ -509,11 +509,11 @@ define i1 @t9_commutative(i8 %base, i8 %offset) { ; CHECK-LABEL: @t9_commutative( ; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) -; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ugt i8 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]]) ; CHECK-NEXT: [[NULL:%.*]] = icmp eq i8 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NULL]]) -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = sub i8 %base, %offset @@ -530,11 +530,11 @@ define i1 @t9_commutative_logical(i8 %base, i8 %offset) { ; CHECK-LABEL: @t9_commutative_logical( ; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]] ; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]]) -; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ugt i8 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]]) ; CHECK-NEXT: [[NULL:%.*]] = icmp eq i8 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NULL]]) -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]] ; CHECK-NEXT: ret i1 [[R]] ; %adjusted = sub i8 %base, %offset @@ -554,11 +554,11 @@ define i1 @t10(i64 
%base, ptr nonnull %offsetptr) { ; CHECK-NEXT: [[OFFSET:%.*]] = ptrtoint ptr [[OFFSETPTR:%.*]] to i64 ; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i64 [[BASE:%.*]], [[OFFSET]] ; CHECK-NEXT: call void @use64(i64 [[ADJUSTED]]) -; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ule i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp uge i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]]) -; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NOT_NULL]]) -; CHECK-NEXT: [[R:%.*]] = icmp ult i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: ret i1 [[R]] ; %offset = ptrtoint ptr %offsetptr to i64 @@ -578,11 +578,11 @@ define i1 @t10_logical(i64 %base, ptr nonnull %offsetptr) { ; CHECK-NEXT: [[OFFSET:%.*]] = ptrtoint ptr [[OFFSETPTR:%.*]] to i64 ; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i64 [[BASE:%.*]], [[OFFSET]] ; CHECK-NEXT: call void @use64(i64 [[ADJUSTED]]) -; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ule i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp uge i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]]) -; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NOT_NULL]]) -; CHECK-NEXT: [[R:%.*]] = icmp ult i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: ret i1 [[R]] ; %offset = ptrtoint ptr %offsetptr to i64 @@ -601,11 +601,11 @@ define i1 @t11_commutative(i64 %base, ptr nonnull %offsetptr) { ; CHECK-NEXT: [[OFFSET:%.*]] = ptrtoint ptr [[OFFSETPTR:%.*]] to i64 ; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i64 [[BASE:%.*]], [[OFFSET]] ; CHECK-NEXT: call void @use64(i64 [[ADJUSTED]]) -; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ule i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = 
icmp uge i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]]) -; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NOT_NULL]]) -; CHECK-NEXT: [[R:%.*]] = icmp ult i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: ret i1 [[R]] ; %offset = ptrtoint ptr %offsetptr to i64 @@ -625,11 +625,11 @@ define i1 @t11_commutative_logical(i64 %base, ptr nonnull %offsetptr) { ; CHECK-NEXT: [[OFFSET:%.*]] = ptrtoint ptr [[OFFSETPTR:%.*]] to i64 ; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i64 [[BASE:%.*]], [[OFFSET]] ; CHECK-NEXT: call void @use64(i64 [[ADJUSTED]]) -; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ule i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp uge i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]]) -; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NOT_NULL]]) -; CHECK-NEXT: [[R:%.*]] = icmp ult i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: ret i1 [[R]] ; %offset = ptrtoint ptr %offsetptr to i64 @@ -649,11 +649,11 @@ define i1 @t12(i64 %base, ptr nonnull %offsetptr) { ; CHECK-NEXT: [[OFFSET:%.*]] = ptrtoint ptr [[OFFSETPTR:%.*]] to i64 ; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i64 [[BASE:%.*]], [[OFFSET]] ; CHECK-NEXT: call void @use64(i64 [[ADJUSTED]]) -; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ugt i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ult i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]]) -; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp eq i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp eq i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NOT_NULL]]) -; CHECK-NEXT: [[R:%.*]] = icmp uge i64 [[OFFSET]], [[BASE]] 
+; CHECK-NEXT: [[R:%.*]] = icmp ule i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: ret i1 [[R]] ; %offset = ptrtoint ptr %offsetptr to i64 @@ -673,11 +673,11 @@ define i1 @t12_logical(i64 %base, ptr nonnull %offsetptr) { ; CHECK-NEXT: [[OFFSET:%.*]] = ptrtoint ptr [[OFFSETPTR:%.*]] to i64 ; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i64 [[BASE:%.*]], [[OFFSET]] ; CHECK-NEXT: call void @use64(i64 [[ADJUSTED]]) -; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ugt i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ult i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]]) -; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp eq i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp eq i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NOT_NULL]]) -; CHECK-NEXT: [[R:%.*]] = icmp uge i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: ret i1 [[R]] ; %offset = ptrtoint ptr %offsetptr to i64 @@ -696,11 +696,11 @@ define i1 @t13(i64 %base, ptr nonnull %offsetptr) { ; CHECK-NEXT: [[OFFSET:%.*]] = ptrtoint ptr [[OFFSETPTR:%.*]] to i64 ; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i64 [[BASE:%.*]], [[OFFSET]] ; CHECK-NEXT: call void @use64(i64 [[ADJUSTED]]) -; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ugt i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ult i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]]) -; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp eq i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp eq i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NOT_NULL]]) -; CHECK-NEXT: [[R:%.*]] = icmp uge i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: ret i1 [[R]] ; %offset = ptrtoint ptr %offsetptr to i64 @@ -720,11 +720,11 @@ define i1 @t13_logical(i64 %base, ptr nonnull %offsetptr) { ; CHECK-NEXT: [[OFFSET:%.*]] = ptrtoint ptr [[OFFSETPTR:%.*]] to i64 ; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i64 [[BASE:%.*]], 
[[OFFSET]] ; CHECK-NEXT: call void @use64(i64 [[ADJUSTED]]) -; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ugt i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = icmp ult i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]]) -; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp eq i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp eq i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: call void @use1(i1 [[NOT_NULL]]) -; CHECK-NEXT: [[R:%.*]] = icmp uge i64 [[OFFSET]], [[BASE]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i64 [[BASE]], [[OFFSET]] ; CHECK-NEXT: ret i1 [[R]] ; %offset = ptrtoint ptr %offsetptr to i64 diff --git a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll index d23f8d48e0c710..a88fd3cc21f1bc 100644 --- a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll +++ b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll @@ -1809,7 +1809,7 @@ define i32 @not_uadd_sat2(i32 %x, i32 %y) { define i32 @uadd_sat_not(i32 %x, i32 %y) { ; CHECK-LABEL: @uadd_sat_not( ; CHECK-NEXT: [[NOTX:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[NOTX]], i32 [[Y:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[NOTX]]) ; CHECK-NEXT: ret i32 [[R]] ; %notx = xor i32 %x, -1 @@ -1822,7 +1822,7 @@ define i32 @uadd_sat_not(i32 %x, i32 %y) { define i32 @uadd_sat_not_nonstrict(i32 %x, i32 %y) { ; CHECK-LABEL: @uadd_sat_not_nonstrict( ; CHECK-NEXT: [[NOTX:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[NOTX]], i32 [[Y:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[NOTX]]) ; CHECK-NEXT: ret i32 [[R]] ; %notx = xor i32 %x, -1 @@ -1852,7 +1852,7 @@ define i32 @uadd_sat_not_commute_add(i32 %xp, i32 %yp) { define i32 @uadd_sat_not_ugt(i32 %x, i32 %y) { ; CHECK-LABEL: @uadd_sat_not_ugt( ; CHECK-NEXT: [[NOTX:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = 
call i32 @llvm.uadd.sat.i32(i32 [[NOTX]], i32 [[Y:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[NOTX]]) ; CHECK-NEXT: ret i32 [[R]] ; %notx = xor i32 %x, -1 @@ -1865,7 +1865,7 @@ define i32 @uadd_sat_not_ugt(i32 %x, i32 %y) { define i32 @uadd_sat_not_uge(i32 %x, i32 %y) { ; CHECK-LABEL: @uadd_sat_not_uge( ; CHECK-NEXT: [[NOTX:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[NOTX]], i32 [[Y:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[NOTX]]) ; CHECK-NEXT: ret i32 [[R]] ; %notx = xor i32 %x, -1 @@ -1893,7 +1893,7 @@ define <2 x i32> @uadd_sat_not_ugt_commute_add(<2 x i32> %x, <2 x i32> %yp) { define i32 @uadd_sat_not_commute_select(i32 %x, i32 %y) { ; CHECK-LABEL: @uadd_sat_not_commute_select( ; CHECK-NEXT: [[NOTX:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[NOTX]], i32 [[Y:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[NOTX]]) ; CHECK-NEXT: ret i32 [[R]] ; %notx = xor i32 %x, -1 @@ -1906,7 +1906,7 @@ define i32 @uadd_sat_not_commute_select(i32 %x, i32 %y) { define i32 @uadd_sat_not_commute_select_nonstrict(i32 %x, i32 %y) { ; CHECK-LABEL: @uadd_sat_not_commute_select_nonstrict( ; CHECK-NEXT: [[NOTX:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[NOTX]], i32 [[Y:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[NOTX]]) ; CHECK-NEXT: ret i32 [[R]] ; %notx = xor i32 %x, -1 @@ -1951,7 +1951,7 @@ define <2 x i32> @uadd_sat_not_commute_select_ugt(<2 x i32> %xp, <2 x i32> %yp) define i32 @uadd_sat_not_commute_select_ugt_commute_add(i32 %x, i32 %y) { ; CHECK-LABEL: @uadd_sat_not_commute_select_ugt_commute_add( ; CHECK-NEXT: [[NOTX:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[NOTX]], i32 [[Y:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.uadd.sat.i32(i32 
[[Y:%.*]], i32 [[NOTX]]) ; CHECK-NEXT: ret i32 [[R]] ; %notx = xor i32 %x, -1 @@ -1964,7 +1964,7 @@ define i32 @uadd_sat_not_commute_select_ugt_commute_add(i32 %x, i32 %y) { define i32 @uadd_sat_not_commute_select_uge_commute_add(i32 %x, i32 %y) { ; CHECK-LABEL: @uadd_sat_not_commute_select_uge_commute_add( ; CHECK-NEXT: [[NOTX:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[NOTX]], i32 [[Y:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[NOTX]]) ; CHECK-NEXT: ret i32 [[R]] ; %notx = xor i32 %x, -1 @@ -2138,7 +2138,7 @@ define i32 @unsigned_sat_variable_using_wrong_min(i32 %x) { ; CHECK-LABEL: @unsigned_sat_variable_using_wrong_min( ; CHECK-NEXT: [[Y:%.*]] = call i32 @get_i32() ; CHECK-NEXT: [[NOTY:%.*]] = xor i32 [[Y]], -1 -; CHECK-NEXT: [[S:%.*]] = call i32 @llvm.smin.i32(i32 [[NOTY]], i32 [[X:%.*]]) +; CHECK-NEXT: [[S:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 [[NOTY]]) ; CHECK-NEXT: [[R:%.*]] = add i32 [[Y]], [[S]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -2156,8 +2156,8 @@ define i32 @unsigned_sat_variable_using_wrong_value(i32 %x, i32 %z) { ; CHECK-LABEL: @unsigned_sat_variable_using_wrong_value( ; CHECK-NEXT: [[Y:%.*]] = call i32 @get_i32() ; CHECK-NEXT: [[NOTY:%.*]] = xor i32 [[Y]], -1 -; CHECK-NEXT: [[S:%.*]] = call i32 @llvm.umin.i32(i32 [[NOTY]], i32 [[X:%.*]]) -; CHECK-NEXT: [[R:%.*]] = add i32 [[S]], [[Z:%.*]] +; CHECK-NEXT: [[S:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 [[NOTY]]) +; CHECK-NEXT: [[R:%.*]] = add i32 [[Z:%.*]], [[S]] ; CHECK-NEXT: ret i32 [[R]] ; %y = call i32 @get_i32() ; thwart complexity-based canonicalization @@ -2268,7 +2268,7 @@ define i32 @uadd_sat_via_add_swapped_cmp(i32 %x, i32 %y) { define i32 @uadd_sat_via_add_swapped_cmp_nonstrict(i32 %x, i32 %y) { ; CHECK-LABEL: @uadd_sat_via_add_swapped_cmp_nonstrict( ; CHECK-NEXT: [[A:%.*]] = add i32 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[C_NOT:%.*]] = icmp ugt i32 [[A]], [[Y]] +; CHECK-NEXT: [[C_NOT:%.*]] 
= icmp ult i32 [[Y]], [[A]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[C_NOT]], i32 [[A]], i32 -1 ; CHECK-NEXT: ret i32 [[R]] ; @@ -2292,7 +2292,7 @@ define i32 @uadd_sat_via_add_swapped_cmp_nonstric(i32 %x, i32 %y) { define i32 @uadd_sat_via_add_swapped_cmp_select_nonstrict(i32 %x, i32 %y) { ; CHECK-LABEL: @uadd_sat_via_add_swapped_cmp_select_nonstrict( ; CHECK-NEXT: [[A:%.*]] = add i32 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[A]], [[Y]] +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[Y]], [[A]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[C]], i32 [[A]], i32 -1 ; CHECK-NEXT: ret i32 [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll b/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll index 424470aa929e1d..29c0ac415ce7c5 100644 --- a/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll @@ -184,8 +184,8 @@ define float @extract_element_load(<4 x float> %x, ptr %ptr) { ; ; CHECK-LABEL: @extract_element_load( ; CHECK-NEXT: [[LOAD:%.*]] = load <4 x float>, ptr [[PTR:%.*]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[LOAD]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[X:%.*]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[LOAD]], i64 2 ; CHECK-NEXT: [[R:%.*]] = fadd float [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret float [[R]] ; @@ -200,7 +200,7 @@ define float @extract_element_multi_Use_load(<4 x float> %x, ptr %ptr0, ptr %ptr ; CHECK-LABEL: @extract_element_multi_Use_load( ; CHECK-NEXT: [[LOAD:%.*]] = load <4 x float>, ptr [[PTR0:%.*]], align 16 ; CHECK-NEXT: store <4 x float> [[LOAD]], ptr [[PTR1:%.*]], align 16 -; CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[LOAD]], [[X:%.*]] +; CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[X:%.*]], [[LOAD]] ; CHECK-NEXT: [[R:%.*]] = extractelement <4 x float> [[ADD]], i64 2 ; 
CHECK-NEXT: ret float [[R]] ; @@ -227,7 +227,7 @@ define float @extelt_binop_insertelt(<4 x float> %A, <4 x float> %B, float %f) { ; ; CHECK-LABEL: @extelt_binop_insertelt( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 -; CHECK-NEXT: [[E:%.*]] = fmul nnan float [[TMP1]], [[F:%.*]] +; CHECK-NEXT: [[E:%.*]] = fmul nnan float [[F:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[E]] ; %C = insertelement <4 x float> %A, float %f, i32 0 @@ -243,7 +243,7 @@ define i32 @extelt_binop_binop_insertelt(<4 x i32> %A, <4 x i32> %B, i32 %f) { ; ; CHECK-LABEL: @extelt_binop_binop_insertelt( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[F:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[F:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[B]], i64 0 ; CHECK-NEXT: [[E:%.*]] = mul nsw i32 [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret i32 [[E]] @@ -348,7 +348,7 @@ define i1 @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use(<2 x float> % ; CHECK-LABEL: @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use( ; CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[ARG1:%.*]], [[ARG2:%.*]] ; CHECK-NEXT: store volatile <2 x float> [[ADD]], ptr undef, align 8 -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq <2 x float> [[ADD]], [[ARG0:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq <2 x float> [[ARG0:%.*]], [[ADD]] ; CHECK-NEXT: [[EXT:%.*]] = extractelement <2 x i1> [[CMP]], i64 0 ; CHECK-NEXT: ret i1 [[EXT]] ; diff --git a/llvm/test/Transforms/InstCombine/scalarization.ll b/llvm/test/Transforms/InstCombine/scalarization.ll index 2f539ece88320e..591437b72c1fc4 100644 --- a/llvm/test/Transforms/InstCombine/scalarization.ll +++ b/llvm/test/Transforms/InstCombine/scalarization.ll @@ -212,8 +212,8 @@ define float @extract_element_load(<4 x float> %x, ptr %ptr) { ; ; CHECK-LABEL: @extract_element_load( ; CHECK-NEXT: [[LOAD:%.*]] = load <4 x float>, ptr [[PTR:%.*]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = 
extractelement <4 x float> [[LOAD]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[X:%.*]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[LOAD]], i64 2 ; CHECK-NEXT: [[R:%.*]] = fadd float [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret float [[R]] ; @@ -228,7 +228,7 @@ define float @extract_element_multi_Use_load(<4 x float> %x, ptr %ptr0, ptr %ptr ; CHECK-LABEL: @extract_element_multi_Use_load( ; CHECK-NEXT: [[LOAD:%.*]] = load <4 x float>, ptr [[PTR0:%.*]], align 16 ; CHECK-NEXT: store <4 x float> [[LOAD]], ptr [[PTR1:%.*]], align 16 -; CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[LOAD]], [[X:%.*]] +; CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[X:%.*]], [[LOAD]] ; CHECK-NEXT: [[R:%.*]] = extractelement <4 x float> [[ADD]], i64 2 ; CHECK-NEXT: ret float [[R]] ; @@ -255,7 +255,7 @@ define float @extelt_binop_insertelt(<4 x float> %A, <4 x float> %B, float %f) { ; ; CHECK-LABEL: @extelt_binop_insertelt( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 -; CHECK-NEXT: [[E:%.*]] = fmul nnan float [[TMP1]], [[F:%.*]] +; CHECK-NEXT: [[E:%.*]] = fmul nnan float [[F:%.*]], [[TMP1]] ; CHECK-NEXT: ret float [[E]] ; %C = insertelement <4 x float> %A, float %f, i32 0 @@ -269,7 +269,7 @@ define i32 @extelt_binop_binop_insertelt(<4 x i32> %A, <4 x i32> %B, i32 %f) { ; ; CHECK-LABEL: @extelt_binop_binop_insertelt( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[B:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[F:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[F:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[B]], i64 0 ; CHECK-NEXT: [[E:%.*]] = mul nsw i32 [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret i32 [[E]] @@ -385,7 +385,7 @@ define i1 @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use(<2 x float> % ; CHECK-LABEL: @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use( ; CHECK-NEXT: [[ADD:%.*]] = fadd <2 x 
float> [[ARG1:%.*]], [[ARG2:%.*]] ; CHECK-NEXT: store volatile <2 x float> [[ADD]], ptr undef, align 8 -; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq <2 x float> [[ADD]], [[ARG0:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq <2 x float> [[ARG0:%.*]], [[ADD]] ; CHECK-NEXT: [[EXT:%.*]] = extractelement <2 x i1> [[CMP]], i64 0 ; CHECK-NEXT: ret i1 [[EXT]] ; diff --git a/llvm/test/Transforms/InstCombine/select-and-or.ll b/llvm/test/Transforms/InstCombine/select-and-or.ll index c4c279361d2a69..68bd28cf234b47 100644 --- a/llvm/test/Transforms/InstCombine/select-and-or.ll +++ b/llvm/test/Transforms/InstCombine/select-and-or.ll @@ -509,7 +509,7 @@ define i1 @and_or2_commuted(i1 %a, i1 %b, i1 %c) { define i1 @and_or1_multiuse(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @and_or1_multiuse( ; CHECK-NEXT: [[NOTA:%.*]] = xor i1 [[A:%.*]], true -; CHECK-NEXT: [[COND:%.*]] = or i1 [[NOTA]], [[C:%.*]] +; CHECK-NEXT: [[COND:%.*]] = or i1 [[C:%.*]], [[NOTA]] ; CHECK-NEXT: call void @use(i1 [[COND]]) ; CHECK-NEXT: [[R:%.*]] = select i1 [[COND]], i1 [[A]], i1 [[B:%.*]] ; CHECK-NEXT: ret i1 [[R]] @@ -524,7 +524,7 @@ define i1 @and_or1_multiuse(i1 %a, i1 %b, i1 %c) { define i1 @and_or2_multiuse(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @and_or2_multiuse( ; CHECK-NEXT: [[NOTC:%.*]] = xor i1 [[C:%.*]], true -; CHECK-NEXT: [[COND:%.*]] = and i1 [[NOTC]], [[B:%.*]] +; CHECK-NEXT: [[COND:%.*]] = and i1 [[B:%.*]], [[NOTC]] ; CHECK-NEXT: call void @use(i1 [[COND]]) ; CHECK-NEXT: [[R:%.*]] = select i1 [[COND]], i1 [[A:%.*]], i1 [[B]] ; CHECK-NEXT: ret i1 [[R]] @@ -595,7 +595,7 @@ define <2 x i1> @and_or2_vec_commuted(<2 x i1> %a, <2 x i1> %b) { define i1 @and_or1_wrong_operand(i1 %a, i1 %b, i1 %c, i1 %d) { ; CHECK-LABEL: @and_or1_wrong_operand( ; CHECK-NEXT: [[NOTA:%.*]] = xor i1 [[A:%.*]], true -; CHECK-NEXT: [[COND:%.*]] = or i1 [[NOTA]], [[C:%.*]] +; CHECK-NEXT: [[COND:%.*]] = or i1 [[C:%.*]], [[NOTA]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[COND]], i1 [[D:%.*]], i1 [[B:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -608,7 
+608,7 @@ define i1 @and_or1_wrong_operand(i1 %a, i1 %b, i1 %c, i1 %d) { define i1 @and_or2_wrong_operand(i1 %a, i1 %b, i1 %c, i1 %d) { ; CHECK-LABEL: @and_or2_wrong_operand( ; CHECK-NEXT: [[NOTC:%.*]] = xor i1 [[C:%.*]], true -; CHECK-NEXT: [[COND:%.*]] = and i1 [[NOTC]], [[B:%.*]] +; CHECK-NEXT: [[COND:%.*]] = and i1 [[B:%.*]], [[NOTC]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[COND]], i1 [[A:%.*]], i1 [[D:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -658,7 +658,7 @@ define i1 @and_or3_not_free_to_invert(i1 %a, i1 %b, i1 %c) { define i1 @and_or3_multiuse(i1 %a, i1 %b, i32 %x, i32 %y) { ; CHECK-LABEL: @and_or3_multiuse( ; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[COND:%.*]] = and i1 [[C]], [[B:%.*]] +; CHECK-NEXT: [[COND:%.*]] = and i1 [[B:%.*]], [[C]] ; CHECK-NEXT: call void @use(i1 [[COND]]) ; CHECK-NEXT: [[R:%.*]] = select i1 [[COND]], i1 [[A:%.*]], i1 [[B]] ; CHECK-NEXT: ret i1 [[R]] @@ -699,7 +699,7 @@ define <2 x i1> @and_or3_vec_commuted(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 define i1 @and_or3_wrong_operand(i1 %a, i1 %b, i32 %x, i32 %y, i1 %d) { ; CHECK-LABEL: @and_or3_wrong_operand( ; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[COND:%.*]] = and i1 [[C]], [[B:%.*]] +; CHECK-NEXT: [[COND:%.*]] = and i1 [[B:%.*]], [[C]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[COND]], i1 [[A:%.*]], i1 [[D:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -760,7 +760,7 @@ define i1 @or_and2_commuted(i1 %a, i1 %b, i1 %c) { define i1 @or_and1_multiuse(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @or_and1_multiuse( ; CHECK-NEXT: [[NOTB:%.*]] = xor i1 [[B:%.*]], true -; CHECK-NEXT: [[COND:%.*]] = and i1 [[NOTB]], [[C:%.*]] +; CHECK-NEXT: [[COND:%.*]] = and i1 [[C:%.*]], [[NOTB]] ; CHECK-NEXT: call void @use(i1 [[COND]]) ; CHECK-NEXT: [[R:%.*]] = select i1 [[COND]], i1 [[A:%.*]], i1 [[B]] ; CHECK-NEXT: ret i1 [[R]] @@ -775,7 +775,7 @@ define i1 @or_and1_multiuse(i1 %a, i1 %b, i1 %c) { define i1 @or_and2_multiuse(i1 %a, i1 %b, i1 %c) { ; 
CHECK-LABEL: @or_and2_multiuse( ; CHECK-NEXT: [[NOTC:%.*]] = xor i1 [[C:%.*]], true -; CHECK-NEXT: [[COND:%.*]] = or i1 [[NOTC]], [[A:%.*]] +; CHECK-NEXT: [[COND:%.*]] = or i1 [[A:%.*]], [[NOTC]] ; CHECK-NEXT: call void @use(i1 [[COND]]) ; CHECK-NEXT: [[R:%.*]] = select i1 [[COND]], i1 [[A]], i1 [[B:%.*]] ; CHECK-NEXT: ret i1 [[R]] @@ -846,7 +846,7 @@ define <2 x i1> @or_and2_vec_commuted(<2 x i1> %a, <2 x i1> %b) { define i1 @or_and1_wrong_operand(i1 %a, i1 %b, i1 %c, i1 %d) { ; CHECK-LABEL: @or_and1_wrong_operand( ; CHECK-NEXT: [[NOTB:%.*]] = xor i1 [[B:%.*]], true -; CHECK-NEXT: [[COND:%.*]] = and i1 [[NOTB]], [[C:%.*]] +; CHECK-NEXT: [[COND:%.*]] = and i1 [[C:%.*]], [[NOTB]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[COND]], i1 [[A:%.*]], i1 [[D:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -859,7 +859,7 @@ define i1 @or_and1_wrong_operand(i1 %a, i1 %b, i1 %c, i1 %d) { define i1 @or_and2_wrong_operand(i1 %a, i1 %b, i1 %c, i1 %d) { ; CHECK-LABEL: @or_and2_wrong_operand( ; CHECK-NEXT: [[NOTC:%.*]] = xor i1 [[C:%.*]], true -; CHECK-NEXT: [[COND:%.*]] = or i1 [[NOTC]], [[A:%.*]] +; CHECK-NEXT: [[COND:%.*]] = or i1 [[A:%.*]], [[NOTC]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[COND]], i1 [[D:%.*]], i1 [[B:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -922,7 +922,7 @@ define i1 @or_and3_not_free_to_invert(i1 %a, i1 %b, i1 %c) { define i1 @or_and3_multiuse(i1 %a, i1 %b, i32 %x, i32 %y) { ; CHECK-LABEL: @or_and3_multiuse( ; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[COND:%.*]] = or i1 [[C]], [[A:%.*]] +; CHECK-NEXT: [[COND:%.*]] = or i1 [[A:%.*]], [[C]] ; CHECK-NEXT: call void @use(i1 [[COND]]) ; CHECK-NEXT: [[R:%.*]] = select i1 [[COND]], i1 [[A]], i1 [[B:%.*]] ; CHECK-NEXT: ret i1 [[R]] @@ -963,7 +963,7 @@ define <2 x i1> @or_and3_vec_commuted(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 define i1 @or_and3_wrong_operand(i1 %a, i1 %b, i32 %x, i32 %y, i1 %d) { ; CHECK-LABEL: @or_and3_wrong_operand( ; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]] -; 
CHECK-NEXT: [[COND:%.*]] = or i1 [[C]], [[A:%.*]] +; CHECK-NEXT: [[COND:%.*]] = or i1 [[A:%.*]], [[C]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[COND]], i1 [[D:%.*]], i1 [[B:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -1223,7 +1223,7 @@ define i8 @test_or_eq_different_operands(i8 %a, i8 %b, i8 %c) { define i8 @test_or_eq_a_b_multi_use(i1 %other_cond, i8 %a, i8 %b) { ; CHECK-LABEL: @test_or_eq_a_b_multi_use( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[COND:%.*]] = or i1 [[CMP]], [[OTHER_COND:%.*]] +; CHECK-NEXT: [[COND:%.*]] = or i1 [[OTHER_COND:%.*]], [[CMP]] ; CHECK-NEXT: call void @use(i1 [[CMP]]) ; CHECK-NEXT: call void @use(i1 [[COND]]) ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[OTHER_COND]], i8 [[A]], i8 [[B]] diff --git a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll index fb56764598e2dc..647287ef5ebad1 100644 --- a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll +++ b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll @@ -1210,7 +1210,7 @@ define i32 @select_replace_nested(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @select_replace_nested( ; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[ADD:%.*]] = select i1 [[C]], i32 [[Z:%.*]], i32 0 -; CHECK-NEXT: [[S:%.*]] = add i32 [[ADD]], [[Y:%.*]] +; CHECK-NEXT: [[S:%.*]] = add i32 [[Y:%.*]], [[ADD]] ; CHECK-NEXT: ret i32 [[S]] ; %c = icmp eq i32 %x, 0 diff --git a/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll b/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll index 77ff16a8b2e3d8..e5ad312bb85c17 100644 --- a/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll +++ b/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll @@ -15,7 +15,7 @@ define float @select_maybe_nan_fadd(i1 %cond, float %A, float %B) { define float @select_fpclass_fadd(i1 %cond, float nofpclass(nan) %A, float %B) { ; CHECK-LABEL: @select_fpclass_fadd( ; 
CHECK-NEXT: [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00 -; CHECK-NEXT: [[D:%.*]] = fadd float [[C]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = fadd float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; %C = fadd float %A, %B @@ -26,7 +26,7 @@ define float @select_fpclass_fadd(i1 %cond, float nofpclass(nan) %A, float %B) { define float @select_nnan_fadd(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fadd( ; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00 -; CHECK-NEXT: [[D:%.*]] = fadd float [[C]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = fadd float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; %C = fadd float %A, %B @@ -37,7 +37,7 @@ define float @select_nnan_fadd(i1 %cond, float %A, float %B) { define float @select_nnan_fadd_swapped(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fadd_swapped( ; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float -0.000000e+00, float [[B:%.*]] -; CHECK-NEXT: [[D:%.*]] = fadd float [[C]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = fadd float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; %C = fadd float %A, %B @@ -48,7 +48,7 @@ define float @select_nnan_fadd_swapped(i1 %cond, float %A, float %B) { define float @select_nnan_fadd_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fadd_fast_math( ; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00 -; CHECK-NEXT: [[D:%.*]] = fadd fast float [[C]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = fadd fast float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; %C = fadd fast float %A, %B @@ -59,7 +59,7 @@ define float @select_nnan_fadd_fast_math(i1 %cond, float %A, float %B) { define float @select_nnan_fadd_swapped_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fadd_swapped_fast_math( ; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float -0.000000e+00, float [[B:%.*]] -; CHECK-NEXT: [[D:%.*]] = fadd fast float [[C]], 
[[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = fadd fast float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; %C = fadd fast float %A, %B @@ -70,7 +70,7 @@ define float @select_nnan_fadd_swapped_fast_math(i1 %cond, float %A, float %B) { define <4 x float> @select_nnan_nsz_fadd_v4f32(<4 x i1> %cond, <4 x float> %A, <4 x float> %B) { ; CHECK-LABEL: @select_nnan_nsz_fadd_v4f32( ; CHECK-NEXT: [[C:%.*]] = select nnan nsz <4 x i1> [[COND:%.*]], <4 x float> [[B:%.*]], <4 x float> zeroinitializer -; CHECK-NEXT: [[D:%.*]] = fadd nnan nsz <4 x float> [[C]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = fadd nnan nsz <4 x float> [[A:%.*]], [[C]] ; CHECK-NEXT: ret <4 x float> [[D]] ; %C = fadd nsz nnan <4 x float> %A, %B @@ -81,7 +81,7 @@ define <4 x float> @select_nnan_nsz_fadd_v4f32(<4 x i1> %cond, <4 x float> %A, < define @select_nnan_nsz_fadd_nxv4f32( %cond, %A, %B) { ; CHECK-LABEL: @select_nnan_nsz_fadd_nxv4f32( ; CHECK-NEXT: [[C:%.*]] = select nnan nsz [[COND:%.*]], [[B:%.*]], zeroinitializer -; CHECK-NEXT: [[D:%.*]] = fadd nnan nsz [[C]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = fadd nnan nsz [[A:%.*]], [[C]] ; CHECK-NEXT: ret [[D]] ; %C = fadd nnan nsz %A, %B @@ -92,7 +92,7 @@ define @select_nnan_nsz_fadd_nxv4f32( %con define @select_nnan_nsz_fadd_nxv4f32_swapops( %cond, %A, %B) { ; CHECK-LABEL: @select_nnan_nsz_fadd_nxv4f32_swapops( ; CHECK-NEXT: [[C:%.*]] = select fast [[COND:%.*]], zeroinitializer, [[B:%.*]] -; CHECK-NEXT: [[D:%.*]] = fadd fast [[C]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = fadd fast [[A:%.*]], [[C]] ; CHECK-NEXT: ret [[D]] ; %C = fadd fast %A, %B @@ -103,7 +103,7 @@ define @select_nnan_nsz_fadd_nxv4f32_swapops( @icmp_ne_common_op11(<3 x i1> %c, <3 x i17> %x, <3 x i17> %y, <3 x i17> %z) { ; CHECK-LABEL: @icmp_ne_common_op11( ; CHECK-NEXT: [[R_V:%.*]] = select <3 x i1> [[C:%.*]], <3 x i17> [[Y:%.*]], <3 x i17> [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ne <3 x i17> [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ne <3 x i17> [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret <3 x i1> 
[[R]] ; %cmp1 = icmp ne <3 x i17> %y, %x @@ -62,7 +62,7 @@ define <3 x i1> @icmp_ne_common_op11(<3 x i1> %c, <3 x i17> %x, <3 x i17> %y, <3 define i1 @icmp_eq_common_op00(i1 %c, i5 %x, i5 %y, i5 %z) { ; CHECK-LABEL: @icmp_eq_common_op00( ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i5 [[Y:%.*]], i5 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp eq i5 [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i5 [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp eq i5 %x, %y @@ -74,7 +74,7 @@ define i1 @icmp_eq_common_op00(i1 %c, i5 %x, i5 %y, i5 %z) { define <5 x i1> @icmp_eq_common_op01(<5 x i1> %c, <5 x i7> %x, <5 x i7> %y, <5 x i7> %z) { ; CHECK-LABEL: @icmp_eq_common_op01( ; CHECK-NEXT: [[R_V:%.*]] = select <5 x i1> [[C:%.*]], <5 x i7> [[Y:%.*]], <5 x i7> [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp eq <5 x i7> [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq <5 x i7> [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret <5 x i1> [[R]] ; %cmp1 = icmp eq <5 x i7> %x, %y @@ -86,7 +86,7 @@ define <5 x i1> @icmp_eq_common_op01(<5 x i1> %c, <5 x i7> %x, <5 x i7> %y, <5 x define i1 @icmp_eq_common_op10(i1 %c, i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @icmp_eq_common_op10( ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i32 [[Y:%.*]], i32 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp eq i32 [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i32 [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp eq i32 %y, %x @@ -98,7 +98,7 @@ define i1 @icmp_eq_common_op10(i1 %c, i32 %x, i32 %y, i32 %z) { define i1 @icmp_eq_common_op11(i1 %c, i64 %x, i64 %y, i64 %z) { ; CHECK-LABEL: @icmp_eq_common_op11( ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i64 [[Y:%.*]], i64 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp eq i64 [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i64 [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp eq i64 %y, %x @@ -112,7 +112,7 @@ define i1 @icmp_common_one_use_1(i1 %c, i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[Y:%.*]], 
[[X:%.*]] ; CHECK-NEXT: call void @use(i1 [[CMP1]]) ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i8 [[Y]], i8 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[R_V]], [[X]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[X]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp eq i8 %y, %x @@ -125,7 +125,7 @@ define i1 @icmp_common_one_use_1(i1 %c, i8 %x, i8 %y, i8 %z) { define i1 @icmp_slt_common(i1 %c, i6 %x, i6 %y, i6 %z) { ; CHECK-LABEL: @icmp_slt_common( ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i6 [[Y:%.*]], i6 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp sgt i6 [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp slt i6 [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp slt i6 %x, %y @@ -137,7 +137,7 @@ define i1 @icmp_slt_common(i1 %c, i6 %x, i6 %y, i6 %z) { define i1 @icmp_sgt_common(i1 %c, i6 %x, i6 %y, i6 %z) { ; CHECK-LABEL: @icmp_sgt_common( ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i6 [[Y:%.*]], i6 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp slt i6 [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp sgt i6 [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp sgt i6 %x, %y @@ -149,7 +149,7 @@ define i1 @icmp_sgt_common(i1 %c, i6 %x, i6 %y, i6 %z) { define i1 @icmp_sle_common(i1 %c, i6 %x, i6 %y, i6 %z) { ; CHECK-LABEL: @icmp_sle_common( ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i6 [[Y:%.*]], i6 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp sle i6 [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp sge i6 [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp sle i6 %y, %x @@ -161,7 +161,7 @@ define i1 @icmp_sle_common(i1 %c, i6 %x, i6 %y, i6 %z) { define i1 @icmp_sge_common(i1 %c, i6 %x, i6 %y, i6 %z) { ; CHECK-LABEL: @icmp_sge_common( ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i6 [[Y:%.*]], i6 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp sge i6 [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp sle i6 [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp sge i6 %y, %x @@ -173,7 +173,7 @@ define i1 
@icmp_sge_common(i1 %c, i6 %x, i6 %y, i6 %z) { define i1 @icmp_slt_sgt_common(i1 %c, i6 %x, i6 %y, i6 %z) { ; CHECK-LABEL: @icmp_slt_sgt_common( ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i6 [[Y:%.*]], i6 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp sgt i6 [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp slt i6 [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp slt i6 %x, %y @@ -185,7 +185,7 @@ define i1 @icmp_slt_sgt_common(i1 %c, i6 %x, i6 %y, i6 %z) { define i1 @icmp_sle_sge_common(i1 %c, i6 %x, i6 %y, i6 %z) { ; CHECK-LABEL: @icmp_sle_sge_common( ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i6 [[Y:%.*]], i6 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp sle i6 [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp sge i6 [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp sle i6 %y, %x @@ -197,7 +197,7 @@ define i1 @icmp_sle_sge_common(i1 %c, i6 %x, i6 %y, i6 %z) { define i1 @icmp_ult_common(i1 %c, i6 %x, i6 %y, i6 %z) { ; CHECK-LABEL: @icmp_ult_common( ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i6 [[Y:%.*]], i6 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ugt i6 [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ult i6 [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp ult i6 %x, %y @@ -209,7 +209,7 @@ define i1 @icmp_ult_common(i1 %c, i6 %x, i6 %y, i6 %z) { define i1 @icmp_ule_common(i1 %c, i6 %x, i6 %y, i6 %z) { ; CHECK-LABEL: @icmp_ule_common( ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i6 [[Y:%.*]], i6 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ule i6 [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp uge i6 [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp ule i6 %y, %x @@ -221,7 +221,7 @@ define i1 @icmp_ule_common(i1 %c, i6 %x, i6 %y, i6 %z) { define i1 @icmp_ugt_common(i1 %c, i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @icmp_ugt_common( ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ult i8 
[[X:%.*]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp ugt i8 %y, %x @@ -233,7 +233,7 @@ define i1 @icmp_ugt_common(i1 %c, i8 %x, i8 %y, i8 %z) { define i1 @icmp_uge_common(i1 %c, i6 %x, i6 %y, i6 %z) { ; CHECK-LABEL: @icmp_uge_common( ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i6 [[Y:%.*]], i6 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp uge i6 [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i6 [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp uge i6 %y, %x @@ -245,7 +245,7 @@ define i1 @icmp_uge_common(i1 %c, i6 %x, i6 %y, i6 %z) { define i1 @icmp_ult_ugt_common(i1 %c, i6 %x, i6 %y, i6 %z) { ; CHECK-LABEL: @icmp_ult_ugt_common( ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i6 [[Y:%.*]], i6 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ugt i6 [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ult i6 [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp ult i6 %x, %y @@ -257,7 +257,7 @@ define i1 @icmp_ult_ugt_common(i1 %c, i6 %x, i6 %y, i6 %z) { define i1 @icmp_ule_uge_common(i1 %c, i6 %x, i6 %y, i6 %z) { ; CHECK-LABEL: @icmp_ule_uge_common( ; CHECK-NEXT: [[R_V:%.*]] = select i1 [[C:%.*]], i6 [[Y:%.*]], i6 [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ule i6 [[R_V]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp uge i6 [[X:%.*]], [[R_V]] ; CHECK-NEXT: ret i1 [[R]] ; %cmp1 = icmp ule i6 %y, %x @@ -348,7 +348,7 @@ define i1 @icmp_no_common(i1 %c, i8 %x, i8 %y, i8 %z) { define i1 @test_select_inverse_eq(i64 %x, i1 %y) { ; CHECK-LABEL: @test_select_inverse_eq( ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 [[X:%.*]], 0 -; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[CMP2]], [[Y:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[Y:%.*]], [[CMP2]] ; CHECK-NEXT: ret i1 [[SEL]] ; %cmp1 = icmp ne i64 %x, 0 @@ -360,7 +360,7 @@ define i1 @test_select_inverse_eq(i64 %x, i1 %y) { define i1 @test_select_inverse_signed(i64 %x, i1 %y) { ; CHECK-LABEL: @test_select_inverse_signed( ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i64 [[X:%.*]], 0 -; CHECK-NEXT: [[SEL:%.*]] = xor i1 
[[CMP2]], [[Y:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[Y:%.*]], [[CMP2]] ; CHECK-NEXT: ret i1 [[SEL]] ; %cmp1 = icmp sgt i64 %x, -1 @@ -372,7 +372,7 @@ define i1 @test_select_inverse_signed(i64 %x, i1 %y) { define i1 @test_select_inverse_unsigned(i64 %x, i1 %y) { ; CHECK-LABEL: @test_select_inverse_unsigned( ; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i64 [[X:%.*]], 10 -; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[CMP2]], [[Y:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[Y:%.*]], [[CMP2]] ; CHECK-NEXT: ret i1 [[SEL]] ; %cmp1 = icmp ult i64 %x, 11 @@ -384,7 +384,7 @@ define i1 @test_select_inverse_unsigned(i64 %x, i1 %y) { define i1 @test_select_inverse_eq_ptr(ptr %x, i1 %y) { ; CHECK-LABEL: @test_select_inverse_eq_ptr( ; CHECK-NEXT: [[CMP2:%.*]] = icmp ne ptr [[X:%.*]], null -; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[CMP2]], [[Y:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[Y:%.*]], [[CMP2]] ; CHECK-NEXT: ret i1 [[SEL]] ; %cmp1 = icmp eq ptr %x, null @@ -409,7 +409,7 @@ define i1 @test_select_inverse_fail(i64 %x, i1 %y) { define <2 x i1> @test_select_inverse_vec(<2 x i64> %x, <2 x i1> %y) { ; CHECK-LABEL: @test_select_inverse_vec( ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq <2 x i64> [[X:%.*]], zeroinitializer -; CHECK-NEXT: [[SEL:%.*]] = xor <2 x i1> [[CMP2]], [[Y:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = xor <2 x i1> [[Y:%.*]], [[CMP2]] ; CHECK-NEXT: ret <2 x i1> [[SEL]] ; %cmp1 = icmp ne <2 x i64> %x, zeroinitializer @@ -434,7 +434,7 @@ define <2 x i1> @test_select_inverse_vec_fail(<2 x i64> %x, i1 %y) { define i1 @test_select_inverse_nonconst1(i64 %x, i64 %y, i1 %cond) { ; CHECK-LABEL: @test_select_inverse_nonconst1( ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[CMP2]], [[COND:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[COND:%.*]], [[CMP2]] ; CHECK-NEXT: ret i1 [[SEL]] ; %cmp1 = icmp ne i64 %x, %y @@ -446,7 +446,7 @@ define i1 @test_select_inverse_nonconst1(i64 %x, i64 %y, i1 %cond) { define i1 @test_select_inverse_nonconst2(i64 %x, i64 %y, 
i1 %cond) { ; CHECK-LABEL: @test_select_inverse_nonconst2( ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 [[Y:%.*]], [[X:%.*]] -; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[CMP2]], [[COND:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[COND:%.*]], [[CMP2]] ; CHECK-NEXT: ret i1 [[SEL]] ; %cmp1 = icmp ne i64 %x, %y @@ -458,7 +458,7 @@ define i1 @test_select_inverse_nonconst2(i64 %x, i64 %y, i1 %cond) { define i1 @test_select_inverse_nonconst3(i64 %x, i64 %y, i1 %cond) { ; CHECK-LABEL: @test_select_inverse_nonconst3( ; CHECK-NEXT: [[CMP2:%.*]] = icmp uge i64 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[CMP2]], [[COND:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[COND:%.*]], [[CMP2]] ; CHECK-NEXT: ret i1 [[SEL]] ; %cmp1 = icmp ult i64 %x, %y diff --git a/llvm/test/Transforms/InstCombine/select-ctlz-to-cttz.ll b/llvm/test/Transforms/InstCombine/select-ctlz-to-cttz.ll index 59d33ee3b39df5..cc8f5d53fddddd 100644 --- a/llvm/test/Transforms/InstCombine/select-ctlz-to-cttz.ll +++ b/llvm/test/Transforms/InstCombine/select-ctlz-to-cttz.ll @@ -154,10 +154,10 @@ define i32 @select_clz_to_ctz_wrong_sub(i32 %a) { define i64 @select_clz_to_ctz_i64_wrong_xor(i64 %a) { ; CHECK-LABEL: @select_clz_to_ctz_i64_wrong_xor( ; CHECK-NEXT: [[SUB:%.*]] = sub i64 0, [[A:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and i64 [[SUB]], [[A]] +; CHECK-NEXT: [[AND:%.*]] = and i64 [[A]], [[SUB]] ; CHECK-NEXT: [[LZ:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[AND]], i1 true) -; CHECK-NEXT: [[SUB11:%.*]] = or disjoint i64 [[LZ]], 64 -; CHECK-NEXT: ret i64 [[SUB11]] +; CHECK-NEXT: [[SUB1:%.*]] = or disjoint i64 [[LZ]], 64 +; CHECK-NEXT: ret i64 [[SUB1]] ; %sub = sub i64 0, %a %and = and i64 %sub, %a @@ -187,7 +187,7 @@ define i64 @select_clz_to_ctz_i64_wrong_icmp_cst(i64 %a) { define i64 @select_clz_to_ctz_i64_wrong_icmp_pred(i64 %a) { ; CHECK-LABEL: @select_clz_to_ctz_i64_wrong_icmp_pred( ; CHECK-NEXT: [[SUB:%.*]] = sub i64 0, [[A:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and i64 [[SUB]], [[A]] +; CHECK-NEXT: 
[[AND:%.*]] = and i64 [[A]], [[SUB]] ; CHECK-NEXT: [[LZ:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[AND]], i1 true) ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp slt i64 [[A]], 0 ; CHECK-NEXT: [[SUB1:%.*]] = xor i64 [[LZ]], 63 @@ -206,7 +206,7 @@ define i64 @select_clz_to_ctz_i64_wrong_icmp_pred(i64 %a) { define <2 x i32> @select_clz_to_ctz_vec_with_undef(<2 x i32> %a) { ; CHECK-LABEL: @select_clz_to_ctz_vec_with_undef( ; CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> zeroinitializer, [[A:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[SUB]], [[A]] +; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[A]], [[SUB]] ; CHECK-NEXT: [[LZ:%.*]] = tail call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[AND]], i1 true) ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq <2 x i32> [[A]], zeroinitializer ; CHECK-NEXT: [[SUB1:%.*]] = xor <2 x i32> [[LZ]], @@ -225,7 +225,7 @@ define <2 x i32> @select_clz_to_ctz_vec_with_undef(<2 x i32> %a) { define i32 @select_clz_to_ctz_wrong_constant_for_zero(i32 %a) { ; CHECK-LABEL: @select_clz_to_ctz_wrong_constant_for_zero( ; CHECK-NEXT: [[SUB:%.*]] = sub i32 0, [[A:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and i32 [[SUB]], [[A]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[A]], [[SUB]] ; CHECK-NEXT: [[LZ:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[AND]], i1 false) ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[A]], 0 ; CHECK-NEXT: [[SUB1:%.*]] = xor i32 [[LZ]], 31 diff --git a/llvm/test/Transforms/InstCombine/select-divrem.ll b/llvm/test/Transforms/InstCombine/select-divrem.ll index e0c460c37451db..e11afd7b543b20 100644 --- a/llvm/test/Transforms/InstCombine/select-divrem.ll +++ b/llvm/test/Transforms/InstCombine/select-divrem.ll @@ -311,7 +311,7 @@ define i8 @rem_euclid_non_const_pow2(i8 %0, i8 %1) { ; CHECK-LABEL: @rem_euclid_non_const_pow2( ; CHECK-NEXT: [[NOTMASK:%.*]] = shl nsw i8 -1, [[TMP0:%.*]] ; CHECK-NEXT: [[TMP3:%.*]] = xor i8 [[NOTMASK]], -1 -; CHECK-NEXT: [[SEL:%.*]] = and i8 [[TMP3]], [[TMP1:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = and i8 
[[TMP1:%.*]], [[TMP3]] ; CHECK-NEXT: ret i8 [[SEL]] ; %pow2 = shl i8 1, %0 diff --git a/llvm/test/Transforms/InstCombine/select-factorize.ll b/llvm/test/Transforms/InstCombine/select-factorize.ll index 386c8e522759e2..ab9d9f6b24754d 100644 --- a/llvm/test/Transforms/InstCombine/select-factorize.ll +++ b/llvm/test/Transforms/InstCombine/select-factorize.ll @@ -230,7 +230,7 @@ define i1 @and_logic_and_logic_or_5(i1 %c, i1 %a, i1 %b) { define i1 @and_logic_and_logic_or_6(i1 %c, i1 %a, i1 %b) { ; CHECK-LABEL: @and_logic_and_logic_or_6( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[B:%.*]], i1 true, i1 [[A:%.*]] -; CHECK-NEXT: [[OR:%.*]] = and i1 [[TMP1]], [[C:%.*]] +; CHECK-NEXT: [[OR:%.*]] = and i1 [[C:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; %ac = and i1 %c, %a @@ -254,7 +254,7 @@ define i1 @and_logic_and_logic_or_7(i1 %c, i1 %a, i1 %b) { define i1 @and_logic_and_logic_or_8(i1 %c, i1 %a, i1 %b) { ; CHECK-LABEL: @and_logic_and_logic_or_8( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[B:%.*]], i1 true, i1 [[A:%.*]] -; CHECK-NEXT: [[OR:%.*]] = and i1 [[TMP1]], [[C:%.*]] +; CHECK-NEXT: [[OR:%.*]] = and i1 [[C:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; %ac = and i1 %a, %c @@ -319,7 +319,7 @@ define i1 @and_logic_and_logic_or_not_one_use(i1 %c, i1 %a, i1 %b) { define i1 @and_and_logic_or_1(i1 %c, i1 %a, i1 %b) { ; CHECK-LABEL: @and_and_logic_or_1( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[A:%.*]], i1 true, i1 [[B:%.*]] -; CHECK-NEXT: [[OR:%.*]] = and i1 [[TMP1]], [[C:%.*]] +; CHECK-NEXT: [[OR:%.*]] = and i1 [[C:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; %ac = and i1 %c, %a @@ -331,7 +331,7 @@ define i1 @and_and_logic_or_1(i1 %c, i1 %a, i1 %b) { define i1 @and_and_logic_or_2(i1 %c, i1 %a, i1 %b) { ; CHECK-LABEL: @and_and_logic_or_2( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[B:%.*]], i1 true, i1 [[A:%.*]] -; CHECK-NEXT: [[OR:%.*]] = and i1 [[TMP1]], [[C:%.*]] +; CHECK-NEXT: [[OR:%.*]] = and i1 [[C:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; %ac = and i1 %a, %c @@ -343,7 
+343,7 @@ define i1 @and_and_logic_or_2(i1 %c, i1 %a, i1 %b) { define <3 x i1> @and_and_logic_or_vector(<3 x i1> %c, <3 x i1> %a, <3 x i1> %b) { ; CHECK-LABEL: @and_and_logic_or_vector( ; CHECK-NEXT: [[TMP1:%.*]] = select <3 x i1> [[A:%.*]], <3 x i1> , <3 x i1> [[B:%.*]] -; CHECK-NEXT: [[OR:%.*]] = and <3 x i1> [[TMP1]], [[C:%.*]] +; CHECK-NEXT: [[OR:%.*]] = and <3 x i1> [[C:%.*]], [[TMP1]] ; CHECK-NEXT: ret <3 x i1> [[OR]] ; %ac = and <3 x i1> %c, %a @@ -355,7 +355,7 @@ define <3 x i1> @and_and_logic_or_vector(<3 x i1> %c, <3 x i1> %a, <3 x i1> %b) define <3 x i1> @and_and_logic_or_vector_poison(<3 x i1> %c, <3 x i1> %a, <3 x i1> %b) { ; CHECK-LABEL: @and_and_logic_or_vector_poison( ; CHECK-NEXT: [[TMP1:%.*]] = select <3 x i1> [[A:%.*]], <3 x i1> , <3 x i1> [[B:%.*]] -; CHECK-NEXT: [[OR:%.*]] = and <3 x i1> [[TMP1]], [[C:%.*]] +; CHECK-NEXT: [[OR:%.*]] = and <3 x i1> [[C:%.*]], [[TMP1]] ; CHECK-NEXT: ret <3 x i1> [[OR]] ; %ac = and <3 x i1> %c, %a @@ -584,7 +584,7 @@ define i1 @or_logic_or_logic_and_3(i1 %c, i1 %a, i1 %b) { define i1 @or_logic_or_logic_and_4(i1 %c, i1 %a, i1 %b) { ; CHECK-LABEL: @or_logic_or_logic_and_4( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[B:%.*]], i1 [[A:%.*]], i1 false -; CHECK-NEXT: [[OR:%.*]] = or i1 [[TMP1]], [[C:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i1 [[C:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; %ac = or i1 %c, %a @@ -632,7 +632,7 @@ define i1 @or_logic_or_logic_and_7(i1 %c, i1 %a, i1 %b) { define i1 @or_logic_or_logic_and_8(i1 %c, i1 %a, i1 %b) { ; CHECK-LABEL: @or_logic_or_logic_and_8( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[B:%.*]], i1 [[A:%.*]], i1 false -; CHECK-NEXT: [[OR:%.*]] = or i1 [[TMP1]], [[C:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i1 [[C:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; %ac = or i1 %a, %c @@ -697,7 +697,7 @@ define i1 @or_logic_or_logic_and_not_one_use(i1 %c, i1 %a, i1 %b) { define i1 @or_or_logic_and_1(i1 %c, i1 %a, i1 %b) { ; CHECK-LABEL: @or_or_logic_and_1( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 
[[A:%.*]], i1 [[B:%.*]], i1 false -; CHECK-NEXT: [[OR:%.*]] = or i1 [[TMP1]], [[C:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i1 [[C:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; %ac = or i1 %c, %a @@ -709,7 +709,7 @@ define i1 @or_or_logic_and_1(i1 %c, i1 %a, i1 %b) { define i1 @or_or_logic_and_2(i1 %c, i1 %a, i1 %b) { ; CHECK-LABEL: @or_or_logic_and_2( ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[B:%.*]], i1 [[A:%.*]], i1 false -; CHECK-NEXT: [[OR:%.*]] = or i1 [[TMP1]], [[C:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i1 [[C:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[OR]] ; %ac = or i1 %c, %a @@ -721,7 +721,7 @@ define i1 @or_or_logic_and_2(i1 %c, i1 %a, i1 %b) { define <3 x i1> @or_or_logic_and_vector(<3 x i1> %c, <3 x i1> %a, <3 x i1> %b) { ; CHECK-LABEL: @or_or_logic_and_vector( ; CHECK-NEXT: [[TMP1:%.*]] = select <3 x i1> [[A:%.*]], <3 x i1> [[B:%.*]], <3 x i1> zeroinitializer -; CHECK-NEXT: [[OR:%.*]] = or <3 x i1> [[TMP1]], [[C:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or <3 x i1> [[C:%.*]], [[TMP1]] ; CHECK-NEXT: ret <3 x i1> [[OR]] ; %ac = or <3 x i1> %c, %a @@ -733,7 +733,7 @@ define <3 x i1> @or_or_logic_and_vector(<3 x i1> %c, <3 x i1> %a, <3 x i1> %b) { define <3 x i1> @or_or_logic_and_vector_poison(<3 x i1> %c, <3 x i1> %a, <3 x i1> %b) { ; CHECK-LABEL: @or_or_logic_and_vector_poison( ; CHECK-NEXT: [[TMP1:%.*]] = select <3 x i1> [[A:%.*]], <3 x i1> [[B:%.*]], <3 x i1> zeroinitializer -; CHECK-NEXT: [[OR:%.*]] = or <3 x i1> [[TMP1]], [[C:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or <3 x i1> [[C:%.*]], [[TMP1]] ; CHECK-NEXT: ret <3 x i1> [[OR]] ; %ac = or <3 x i1> %c, %a diff --git a/llvm/test/Transforms/InstCombine/select-masked_gather.ll b/llvm/test/Transforms/InstCombine/select-masked_gather.ll index 70d798ecd5085e..a232bdbca0df4b 100644 --- a/llvm/test/Transforms/InstCombine/select-masked_gather.ll +++ b/llvm/test/Transforms/InstCombine/select-masked_gather.ll @@ -95,7 +95,7 @@ define @masked_gather_and_zero_inactive_7( define @masked_gather_and_zero_inactive_8( %ptr, %inv_mask, %cond) 
{ ; CHECK-LABEL: @masked_gather_and_zero_inactive_8( ; CHECK-NEXT: [[MASK:%.*]] = xor [[INV_MASK:%.*]], shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer) -; CHECK-NEXT: [[PG:%.*]] = and [[MASK]], [[COND:%.*]] +; CHECK-NEXT: [[PG:%.*]] = and [[COND:%.*]], [[MASK]] ; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.masked.gather.nxv2f32.nxv2p0( [[PTR:%.*]], i32 4, [[PG]], zeroinitializer) ; CHECK-NEXT: ret [[GATHER]] ; diff --git a/llvm/test/Transforms/InstCombine/select-masked_load.ll b/llvm/test/Transforms/InstCombine/select-masked_load.ll index 0e82def113e967..51525e5ee83467 100644 --- a/llvm/test/Transforms/InstCombine/select-masked_load.ll +++ b/llvm/test/Transforms/InstCombine/select-masked_load.ll @@ -92,7 +92,7 @@ define <4 x i32> @masked_load_and_zero_inactive_7(ptr %ptr, <4 x i1> %mask1, <4 define <4 x float> @masked_load_and_zero_inactive_8(ptr %ptr, <4 x i1> %inv_mask, <4 x i1> %cond) { ; CHECK-LABEL: @masked_load_and_zero_inactive_8( ; CHECK-NEXT: [[MASK:%.*]] = xor <4 x i1> [[INV_MASK:%.*]], -; CHECK-NEXT: [[PG:%.*]] = and <4 x i1> [[MASK]], [[COND:%.*]] +; CHECK-NEXT: [[PG:%.*]] = and <4 x i1> [[COND:%.*]], [[MASK]] ; CHECK-NEXT: [[LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[PG]], <4 x float> zeroinitializer) ; CHECK-NEXT: ret <4 x float> [[LOAD]] ; diff --git a/llvm/test/Transforms/InstCombine/select-of-bittest.ll b/llvm/test/Transforms/InstCombine/select-of-bittest.ll index 50d3c87f199c30..0c7624018cb02c 100644 --- a/llvm/test/Transforms/InstCombine/select-of-bittest.ll +++ b/llvm/test/Transforms/InstCombine/select-of-bittest.ll @@ -158,7 +158,7 @@ define <3 x i32> @and_and_vec_poison(<3 x i32> %arg) { define i32 @f_var0(i32 %arg, i32 %arg1) { ; CHECK-LABEL: @f_var0( ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[ARG1:%.*]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[ARG:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ARG:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 
[[TMP2]], 0 ; CHECK-NEXT: [[T5:%.*]] = zext i1 [[TMP3]] to i32 ; CHECK-NEXT: ret i32 [[T5]] @@ -175,7 +175,7 @@ define i32 @f_var0(i32 %arg, i32 %arg1) { define i32 @f_var0_commutative_and(i32 %arg, i32 %arg1) { ; CHECK-LABEL: @f_var0_commutative_and( ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[ARG1:%.*]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[ARG:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ARG:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK-NEXT: [[T5:%.*]] = zext i1 [[TMP3]] to i32 ; CHECK-NEXT: ret i32 [[T5]] @@ -191,7 +191,7 @@ define i32 @f_var0_commutative_and(i32 %arg, i32 %arg1) { define <2 x i32> @f_var0_splatvec(<2 x i32> %arg, <2 x i32> %arg1) { ; CHECK-LABEL: @f_var0_splatvec( ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[ARG1:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], [[ARG:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[ARG:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[T5:%.*]] = zext <2 x i1> [[TMP3]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[T5]] @@ -207,7 +207,7 @@ define <2 x i32> @f_var0_splatvec(<2 x i32> %arg, <2 x i32> %arg1) { define <2 x i32> @f_var0_vec(<2 x i32> %arg, <2 x i32> %arg1) { ; CHECK-LABEL: @f_var0_vec( ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[ARG1:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], [[ARG:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[ARG:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[T5:%.*]] = zext <2 x i1> [[TMP3]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[T5]] @@ -223,7 +223,7 @@ define <2 x i32> @f_var0_vec(<2 x i32> %arg, <2 x i32> %arg1) { define <3 x i32> @f_var0_vec_poison(<3 x i32> %arg, <3 x i32> %arg1) { ; CHECK-LABEL: @f_var0_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = or <3 x i32> [[ARG1:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[ARG:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> 
[[ARG:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[T5:%.*]] = zext <3 x i1> [[TMP3]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[T5]] @@ -240,7 +240,7 @@ define <3 x i32> @f_var0_vec_poison(<3 x i32> %arg, <3 x i32> %arg1) { define i32 @f_var1(i32 %arg, i32 %arg1) { ; CHECK-LABEL: @f_var1( ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[ARG1:%.*]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[ARG:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ARG:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK-NEXT: [[T4:%.*]] = zext i1 [[TMP3]] to i32 ; CHECK-NEXT: ret i32 [[T4]] @@ -256,7 +256,7 @@ define i32 @f_var1(i32 %arg, i32 %arg1) { define i32 @f_var1_commutative_and(i32 %arg, i32 %arg1) { ; CHECK-LABEL: @f_var1_commutative_and( ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[ARG1:%.*]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[ARG:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ARG:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK-NEXT: [[T4:%.*]] = zext i1 [[TMP3]] to i32 ; CHECK-NEXT: ret i32 [[T4]] @@ -271,7 +271,7 @@ define i32 @f_var1_commutative_and(i32 %arg, i32 %arg1) { define <2 x i32> @f_var1_vec(<2 x i32> %arg, <2 x i32> %arg1) { ; CHECK-LABEL: @f_var1_vec( ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[ARG1:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], [[ARG:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[ARG:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[T4:%.*]] = zext <2 x i1> [[TMP3]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[T4]] @@ -286,7 +286,7 @@ define <2 x i32> @f_var1_vec(<2 x i32> %arg, <2 x i32> %arg1) { define <3 x i32> @f_var1_vec_poison(<3 x i32> %arg, <3 x i32> %arg1) { ; CHECK-LABEL: @f_var1_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = or <3 x i32> [[ARG1:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[ARG:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <3 x 
i32> [[ARG:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[T4:%.*]] = zext <3 x i1> [[TMP3]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[T4]] diff --git a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll index 70e62712333212..e0306972e48e2c 100644 --- a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll +++ b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll @@ -194,7 +194,7 @@ define i1 @andn_or_cmp_2_logical(i16 %a, i16 %b, i1 %y) { define i1 @andn_or_cmp_2_partial_logical(i16 %a, i16 %b, i1 %y) { ; CHECK-LABEL: @andn_or_cmp_2_partial_logical( ; CHECK-NEXT: [[X_INV:%.*]] = icmp slt i16 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and i1 [[X_INV]], [[Y:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i1 [[Y:%.*]], [[X_INV]] ; CHECK-NEXT: ret i1 [[AND]] ; %x = icmp sge i16 %a, %b @@ -735,7 +735,7 @@ define i1 @orn_and_cmp_2_logical(i16 %a, i16 %b, i1 %y) { define i1 @orn_and_cmp_2_partial_logical(i16 %a, i16 %b, i1 %y) { ; CHECK-LABEL: @orn_and_cmp_2_partial_logical( ; CHECK-NEXT: [[X_INV:%.*]] = icmp slt i16 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[OR:%.*]] = or i1 [[X_INV]], [[Y:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i1 [[Y:%.*]], [[X_INV]] ; CHECK-NEXT: ret i1 [[OR]] ; %x = icmp sge i16 %a, %b diff --git a/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll b/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll index 416a6d71055b62..1647233595b37e 100644 --- a/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll +++ b/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll @@ -10,7 +10,7 @@ define i32 @select_icmp_eq_and_1_0_or_2(i32 %x, i32 %y) { ; CHECK-LABEL: @select_icmp_eq_and_1_0_or_2( ; CHECK-NEXT: [[AND:%.*]] = shl i32 [[X:%.*]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y:%.*]], [[TMP1]] ; 
CHECK-NEXT: ret i32 [[SELECT]] ; %and = and i32 %x, 1 @@ -24,7 +24,7 @@ define <2 x i32> @select_icmp_eq_and_1_0_or_2_vec(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @select_icmp_eq_and_1_0_or_2_vec( ; CHECK-NEXT: [[AND:%.*]] = shl <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[AND]], -; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i32> [[SELECT]] ; %and = and <2 x i32> %x, @@ -38,7 +38,7 @@ define <2 x i32> @select_icmp_eq_and_1_0_or_2_vec_poison1(<2 x i32> %x, <2 x i32 ; CHECK-LABEL: @select_icmp_eq_and_1_0_or_2_vec_poison1( ; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw <2 x i32> [[AND]], -; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i32> [[SELECT]] ; %and = and <2 x i32> %x, @@ -52,7 +52,7 @@ define <2 x i32> @select_icmp_eq_and_1_0_or_2_vec_poison2(<2 x i32> %x, <2 x i32 ; CHECK-LABEL: @select_icmp_eq_and_1_0_or_2_vec_poison2( ; CHECK-NEXT: [[AND:%.*]] = shl <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[AND]], -; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i32> [[SELECT]] ; %and = and <2 x i32> %x, @@ -66,7 +66,7 @@ define <2 x i32> @select_icmp_eq_and_1_0_or_2_vec_poison3(<2 x i32> %x, <2 x i32 ; CHECK-LABEL: @select_icmp_eq_and_1_0_or_2_vec_poison3( ; CHECK-NEXT: [[AND:%.*]] = shl <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[AND]], -; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i32> [[SELECT]] ; %and = and <2 x i32> %x, @@ -80,7 +80,7 @@ define i32 @select_icmp_eq_and_1_0_xor_2(i32 %x, i32 %y) { ; CHECK-LABEL: 
@select_icmp_eq_and_1_0_xor_2( ; CHECK-NEXT: [[AND:%.*]] = shl i32 [[X:%.*]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[SELECT]] ; %and = and i32 %x, 1 @@ -109,7 +109,7 @@ define i32 @select_icmp_eq_and_32_0_or_8(i32 %x, i32 %y) { ; CHECK-LABEL: @select_icmp_eq_and_32_0_or_8( ; CHECK-NEXT: [[AND:%.*]] = lshr i32 [[X:%.*]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 8 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[SELECT]] ; %and = and i32 %x, 32 @@ -123,7 +123,7 @@ define <2 x i32> @select_icmp_eq_and_32_0_or_8_vec(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @select_icmp_eq_and_32_0_or_8_vec( ; CHECK-NEXT: [[AND:%.*]] = lshr <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[AND]], -; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i32> [[SELECT]] ; %and = and <2 x i32> %x, @@ -137,7 +137,7 @@ define i32 @select_icmp_eq_and_32_0_xor_8(i32 %x, i32 %y) { ; CHECK-LABEL: @select_icmp_eq_and_32_0_xor_8( ; CHECK-NEXT: [[AND:%.*]] = lshr i32 [[X:%.*]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 8 -; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[SELECT]] ; %and = and i32 %x, 32 @@ -166,7 +166,7 @@ define i32 @select_icmp_ne_0_and_4096_or_4096(i32 %x, i32 %y) { ; CHECK-LABEL: @select_icmp_ne_0_and_4096_or_4096( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[AND]], 4096 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[SELECT]] ; %and = and i32 %x, 4096 @@ -180,7 +180,7 @@ define 
<2 x i32> @select_icmp_ne_0_and_4096_or_4096_vec(<2 x i32> %x, <2 x i32> ; CHECK-LABEL: @select_icmp_ne_0_and_4096_or_4096_vec( ; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[AND]], -; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i32> [[SELECT]] ; %and = and <2 x i32> %x, @@ -222,7 +222,7 @@ define i32 @select_icmp_ne_0_and_4096_and_not_4096(i32 %x, i32 %y) { define i32 @select_icmp_eq_and_4096_0_or_4096(i32 %x, i32 %y) { ; CHECK-LABEL: @select_icmp_eq_and_4096_0_or_4096( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[AND]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y:%.*]], [[AND]] ; CHECK-NEXT: ret i32 [[SELECT]] ; %and = and i32 %x, 4096 @@ -235,7 +235,7 @@ define i32 @select_icmp_eq_and_4096_0_or_4096(i32 %x, i32 %y) { define <2 x i32> @select_icmp_eq_and_4096_0_or_4096_vec(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @select_icmp_eq_and_4096_0_or_4096_vec( ; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[AND]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[Y:%.*]], [[AND]] ; CHECK-NEXT: ret <2 x i32> [[SELECT]] ; %and = and <2 x i32> %x, @@ -248,7 +248,7 @@ define <2 x i32> @select_icmp_eq_and_4096_0_or_4096_vec(<2 x i32> %x, <2 x i32> define i32 @select_icmp_eq_and_4096_0_xor_4096(i32 %x, i32 %y) { ; CHECK-LABEL: @select_icmp_eq_and_4096_0_xor_4096( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 -; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[AND]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[Y:%.*]], [[AND]] ; CHECK-NEXT: ret i32 [[SELECT]] ; %and = and i32 %x, 4096 @@ -277,7 +277,7 @@ define i32 @select_icmp_eq_0_and_1_or_1(i64 %x, i32 %y) { ; CHECK-LABEL: @select_icmp_eq_0_and_1_or_1( ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[X:%.*]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = and i32 
[[TMP1]], 1 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i32 [[SELECT]] ; %and = and i64 %x, 1 @@ -291,7 +291,7 @@ define <2 x i32> @select_icmp_eq_0_and_1_or_1_vec(<2 x i64> %x, <2 x i32> %y) { ; CHECK-LABEL: @select_icmp_eq_0_and_1_or_1_vec( ; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i64> [[X:%.*]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret <2 x i32> [[SELECT]] ; %and = and <2 x i64> %x, @@ -305,7 +305,7 @@ define i32 @select_icmp_eq_0_and_1_xor_1(i64 %x, i32 %y) { ; CHECK-LABEL: @select_icmp_eq_0_and_1_xor_1( ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[X:%.*]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 1 -; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i32 [[SELECT]] ; %and = and i64 %x, 1 @@ -335,7 +335,7 @@ define i32 @select_icmp_ne_0_and_4096_or_32(i32 %x, i32 %y) { ; CHECK-NEXT: [[AND:%.*]] = lshr i32 [[X:%.*]], 7 ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 32 ; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 32 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i32 [[SELECT]] ; %and = and i32 %x, 4096 @@ -380,7 +380,7 @@ define i32 @select_icmp_ne_0_and_32_or_4096(i32 %x, i32 %y) { ; CHECK-NEXT: [[AND:%.*]] = shl i32 [[X:%.*]], 7 ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 4096 ; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 4096 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i32 [[SELECT]] ; %and = and i32 %x, 32 @@ -395,7 +395,7 @@ define <2 x i32> @select_icmp_ne_0_and_32_or_4096_vec(<2 x i32> %x, <2 x i32> %y ; CHECK-NEXT: 
[[AND:%.*]] = shl <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[AND]], ; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[TMP1]], -; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret <2 x i32> [[SELECT]] ; %and = and <2 x i32> %x, @@ -570,7 +570,7 @@ define i64 @select_icmp_x_and_8_eq_0_y_xor_8(i32 %x, i64 %y) { ; CHECK-LABEL: @select_icmp_x_and_8_eq_0_y_xor_8( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 8 ; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[AND]] to i64 -; CHECK-NEXT: [[Y_XOR:%.*]] = xor i64 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[Y_XOR:%.*]] = xor i64 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i64 [[Y_XOR]] ; %and = and i32 %x, 8 @@ -585,7 +585,7 @@ define i64 @select_icmp_x_and_8_ne_0_y_xor_8(i32 %x, i64 %y) { ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 8 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[AND]], 8 ; CHECK-NEXT: [[TMP2:%.*]] = zext nneg i32 [[TMP1]] to i64 -; CHECK-NEXT: [[XOR_Y:%.*]] = xor i64 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[XOR_Y:%.*]] = xor i64 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i64 [[XOR_Y]] ; %and = and i32 %x, 8 @@ -600,7 +600,7 @@ define i64 @select_icmp_x_and_8_ne_0_y_or_8(i32 %x, i64 %y) { ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 8 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[AND]], 8 ; CHECK-NEXT: [[TMP2:%.*]] = zext nneg i32 [[TMP1]] to i64 -; CHECK-NEXT: [[OR_Y:%.*]] = or i64 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[OR_Y:%.*]] = or i64 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i64 [[OR_Y]] ; %and = and i32 %x, 8 @@ -615,7 +615,7 @@ define <2 x i64> @select_icmp_x_and_8_ne_0_y_or_8_vec(<2 x i32> %x, <2 x i64> %y ; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[AND]], ; CHECK-NEXT: [[TMP2:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> -; CHECK-NEXT: [[OR_Y:%.*]] = or <2 x i64> [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[OR_Y:%.*]] = or <2 x i64> [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret <2 x 
i64> [[OR_Y]] ; %and = and <2 x i32> %x, @@ -680,7 +680,7 @@ define i32 @test68(i32 %x, i32 %y) { ; CHECK-LABEL: @test68( ; CHECK-NEXT: [[AND:%.*]] = lshr i32 [[X:%.*]], 6 ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[SELECT]] ; %and = and i32 %x, 128 @@ -694,7 +694,7 @@ define <2 x i32> @test68vec(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @test68vec( ; CHECK-NEXT: [[AND:%.*]] = lshr <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[AND]], -; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i32> [[SELECT]] ; %and = and <2 x i32> %x, @@ -708,7 +708,7 @@ define i32 @test68_xor(i32 %x, i32 %y) { ; CHECK-LABEL: @test68_xor( ; CHECK-NEXT: [[AND:%.*]] = lshr i32 [[X:%.*]], 6 ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[SELECT]] ; %and = and i32 %x, 128 @@ -738,7 +738,7 @@ define i32 @test69(i32 %x, i32 %y) { ; CHECK-NEXT: [[AND:%.*]] = lshr i32 [[X:%.*]], 6 ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i32 [[SELECT]] ; %and = and i32 %x, 128 @@ -753,7 +753,7 @@ define <2 x i32> @test69vec(<2 x i32> %x, <2 x i32> %y) { ; CHECK-NEXT: [[AND:%.*]] = lshr <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[AND]], ; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[TMP1]], -; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i32> [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret <2 x i32> [[SELECT]] ; %and = and <2 x i32> %x, @@ -797,7 +797,7 
@@ define i8 @test70(i8 %x, i8 %y) { ; CHECK-LABEL: @test70( ; CHECK-NEXT: [[TMP1:%.*]] = lshr i8 [[X:%.*]], 6 ; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = or i8 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or i8 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i8 [[SELECT]] ; %cmp = icmp slt i8 %x, 0 @@ -826,7 +826,7 @@ define i32 @shift_no_xor_multiuse_or(i32 %x, i32 %y) { ; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], 2 ; CHECK-NEXT: [[AND:%.*]] = shl i32 [[X:%.*]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y]], [[TMP1]] ; CHECK-NEXT: [[RES:%.*]] = mul i32 [[SELECT]], [[OR]] ; CHECK-NEXT: ret i32 [[RES]] ; @@ -843,7 +843,7 @@ define i32 @shift_no_xor_multiuse_xor(i32 %x, i32 %y) { ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y:%.*]], 2 ; CHECK-NEXT: [[AND:%.*]] = shl i32 [[X:%.*]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[Y]], [[TMP1]] ; CHECK-NEXT: [[RES:%.*]] = mul i32 [[SELECT]], [[XOR]] ; CHECK-NEXT: ret i32 [[RES]] ; @@ -876,7 +876,7 @@ define i32 @no_shift_no_xor_multiuse_or(i32 %x, i32 %y) { ; CHECK-LABEL: @no_shift_no_xor_multiuse_or( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], 4096 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[AND]], [[Y]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y]], [[AND]] ; CHECK-NEXT: [[RES:%.*]] = mul i32 [[SELECT]], [[OR]] ; CHECK-NEXT: ret i32 [[RES]] ; @@ -892,7 +892,7 @@ define i32 @no_shift_no_xor_multiuse_xor(i32 %x, i32 %y) { ; CHECK-LABEL: @no_shift_no_xor_multiuse_xor( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y:%.*]], 4096 -; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[AND]], [[Y]] +; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[Y]], [[AND]] ; CHECK-NEXT: [[RES:%.*]] = mul i32 [[SELECT]], [[XOR]] ; 
CHECK-NEXT: ret i32 [[RES]] ; @@ -926,7 +926,7 @@ define i32 @no_shift_xor_multiuse_or(i32 %x, i32 %y) { ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], 4096 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[AND]], 4096 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[TMP1]], [[Y]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y]], [[TMP1]] ; CHECK-NEXT: [[RES:%.*]] = mul i32 [[SELECT]], [[OR]] ; CHECK-NEXT: ret i32 [[RES]] ; @@ -1028,7 +1028,7 @@ define i32 @shift_no_xor_multiuse_cmp(i32 %x, i32 %y, i32 %z, i32 %w) { ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[AND]], 1 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP]], i32 [[Z:%.*]], i32 [[W:%.*]] ; CHECK-NEXT: [[RES:%.*]] = mul i32 [[SELECT]], [[SELECT2]] ; CHECK-NEXT: ret i32 [[RES]] @@ -1047,7 +1047,7 @@ define i32 @shift_no_xor_multiuse_cmp_with_xor(i32 %x, i32 %y, i32 %z, i32 %w) { ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[AND]], 1 -; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP]], i32 [[Z:%.*]], i32 [[W:%.*]] ; CHECK-NEXT: [[RES:%.*]] = mul i32 [[SELECT]], [[SELECT2]] ; CHECK-NEXT: ret i32 [[RES]] @@ -1084,7 +1084,7 @@ define i32 @no_shift_no_xor_multiuse_cmp(i32 %x, i32 %y, i32 %z, i32 %w) { ; CHECK-LABEL: @no_shift_no_xor_multiuse_cmp( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[AND]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y:%.*]], [[AND]] ; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP]], i32 [[Z:%.*]], i32 [[W:%.*]] ; CHECK-NEXT: 
[[RES:%.*]] = mul i32 [[SELECT]], [[SELECT2]] ; CHECK-NEXT: ret i32 [[RES]] @@ -1102,7 +1102,7 @@ define i32 @no_shift_no_xor_multiuse_cmp_with_xor(i32 %x, i32 %y, i32 %z, i32 %w ; CHECK-LABEL: @no_shift_no_xor_multiuse_cmp_with_xor( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0 -; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[AND]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[Y:%.*]], [[AND]] ; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP]], i32 [[Z:%.*]], i32 [[W:%.*]] ; CHECK-NEXT: [[RES:%.*]] = mul i32 [[SELECT]], [[SELECT2]] ; CHECK-NEXT: ret i32 [[RES]] @@ -1140,7 +1140,7 @@ define i32 @no_shift_xor_multiuse_cmp(i32 %x, i32 %y, i32 %z, i32 %w) { ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[AND]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[AND]], 4096 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP_NOT]], i32 [[W:%.*]], i32 [[Z:%.*]] ; CHECK-NEXT: [[RES:%.*]] = mul i32 [[SELECT]], [[SELECT2]] ; CHECK-NEXT: ret i32 [[RES]] @@ -1317,7 +1317,7 @@ define i32 @no_shift_no_xor_multiuse_cmp_or(i32 %x, i32 %y, i32 %z, i32 %w) { ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], 4096 -; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[AND]], [[Y]] +; CHECK-NEXT: [[SELECT:%.*]] = or i32 [[Y]], [[AND]] ; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP]], i32 [[Z:%.*]], i32 [[W:%.*]] ; CHECK-NEXT: [[RES:%.*]] = mul i32 [[SELECT]], [[SELECT2]] ; CHECK-NEXT: [[RES2:%.*]] = mul i32 [[RES]], [[OR]] @@ -1338,7 +1338,7 @@ define i32 @no_shift_no_xor_multiuse_cmp_xor(i32 %x, i32 %y, i32 %z, i32 %w) { ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 4096 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y:%.*]], 4096 -; CHECK-NEXT: 
[[SELECT:%.*]] = xor i32 [[AND]], [[Y]] +; CHECK-NEXT: [[SELECT:%.*]] = xor i32 [[Y]], [[AND]] ; CHECK-NEXT: [[SELECT2:%.*]] = select i1 [[CMP]], i32 [[Z:%.*]], i32 [[W:%.*]] ; CHECK-NEXT: [[RES:%.*]] = mul i32 [[SELECT]], [[SELECT2]] ; CHECK-NEXT: [[RES2:%.*]] = mul i32 [[RES]], [[XOR]] @@ -1641,7 +1641,7 @@ define i64 @xor_i8_to_i64_shl_save_and_ne(i8 %x, i64 %y) { ; CHECK-LABEL: @xor_i8_to_i64_shl_save_and_ne( ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 63 -; CHECK-NEXT: [[R:%.*]] = xor i64 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = xor i64 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i64 [[R]] ; %xx = and i8 %x, 1 diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index 1369be305ec13a..7d62b419424405 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -221,7 +221,7 @@ define i32 @test11(i32 %a) { define i32 @test12(i1 %cond, i32 %a) { ; CHECK-LABEL: @test12( ; CHECK-NEXT: [[B:%.*]] = zext i1 [[COND:%.*]] to i32 -; CHECK-NEXT: [[C:%.*]] = or i32 [[B]], [[A:%.*]] +; CHECK-NEXT: [[C:%.*]] = or i32 [[A:%.*]], [[B]] ; CHECK-NEXT: ret i32 [[C]] ; %b = or i32 %a, 1 @@ -232,7 +232,7 @@ define i32 @test12(i1 %cond, i32 %a) { define <2 x i32> @test12vec(<2 x i1> %cond, <2 x i32> %a) { ; CHECK-LABEL: @test12vec( ; CHECK-NEXT: [[B:%.*]] = zext <2 x i1> [[COND:%.*]] to <2 x i32> -; CHECK-NEXT: [[C:%.*]] = or <2 x i32> [[B]], [[A:%.*]] +; CHECK-NEXT: [[C:%.*]] = or <2 x i32> [[A:%.*]], [[B]] ; CHECK-NEXT: ret <2 x i32> [[C]] ; %b = or <2 x i32> %a, @@ -686,7 +686,7 @@ define i1 @test40(i1 %cond) { define i32 @test41(i1 %cond, i32 %x, i32 %y) { ; CHECK-LABEL: @test41( -; CHECK-NEXT: [[R:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = and i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret i32 [[R]] ; %z = and i32 %x, %y @@ -699,7 +699,7 @@ define i32 @test42(i32 %x, i32 %y) { ; CHECK-LABEL: @test42( ; 
CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[B:%.*]] = sext i1 [[COND]] to i32 -; CHECK-NEXT: [[C:%.*]] = add i32 [[B]], [[Y:%.*]] +; CHECK-NEXT: [[C:%.*]] = add i32 [[Y:%.*]], [[B]] ; CHECK-NEXT: ret i32 [[C]] ; %b = add i32 %y, -1 @@ -712,7 +712,7 @@ define <2 x i32> @test42vec(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @test42vec( ; CHECK-NEXT: [[COND:%.*]] = icmp eq <2 x i32> [[X:%.*]], zeroinitializer ; CHECK-NEXT: [[B:%.*]] = sext <2 x i1> [[COND]] to <2 x i32> -; CHECK-NEXT: [[C:%.*]] = add <2 x i32> [[B]], [[Y:%.*]] +; CHECK-NEXT: [[C:%.*]] = add <2 x i32> [[Y:%.*]], [[B]] ; CHECK-NEXT: ret <2 x i32> [[C]] ; %b = add <2 x i32> %y, @@ -1569,7 +1569,7 @@ define i8 @test88(i1 %cond, i8 %w, i8 %x, i8 %y, i8 %z) { ; select(C, Z, binop(W, select(C, X, Y))) -> select(C, binop(X, W), Z) define i8 @test89(i1 %cond, i8 %w, i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @test89( -; CHECK-NEXT: [[B:%.*]] = and i8 [[X:%.*]], [[W:%.*]] +; CHECK-NEXT: [[B:%.*]] = and i8 [[W:%.*]], [[X:%.*]] ; CHECK-NEXT: [[C:%.*]] = select i1 [[COND:%.*]], i8 [[B]], i8 [[Z:%.*]] ; CHECK-NEXT: ret i8 [[C]] ; @@ -1582,7 +1582,7 @@ define i8 @test89(i1 %cond, i8 %w, i8 %x, i8 %y, i8 %z) { ; select(C, Z, binop(W, select(C, X, Y))) -> select(C, Z, binop(W, Y)) define i8 @test90(i1 %cond, i8 %w, i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @test90( -; CHECK-NEXT: [[B:%.*]] = or i8 [[Y:%.*]], [[W:%.*]] +; CHECK-NEXT: [[B:%.*]] = or i8 [[W:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[C:%.*]] = select i1 [[COND:%.*]], i8 [[Z:%.*]], i8 [[B]] ; CHECK-NEXT: ret i8 [[C]] ; @@ -2889,7 +2889,7 @@ define i8 @select_replacement_sub_noundef(i8 %x, i8 noundef %y, i8 %z) { define i8 @select_replacement_sub_noundef_but_may_be_poison(i8 %x, i8 noundef %yy, i8 %z) { ; CHECK-LABEL: @select_replacement_sub_noundef_but_may_be_poison( ; CHECK-NEXT: [[Y:%.*]] = shl nuw i8 [[YY:%.*]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[Y]], [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X:%.*]], [[Y]] ; CHECK-NEXT: 
[[SEL:%.*]] = select i1 [[CMP]], i8 0, i8 [[Z:%.*]] ; CHECK-NEXT: ret i8 [[SEL]] ; @@ -2975,7 +2975,7 @@ define i8 @select_replacement_loop3(i32 noundef %x) { ; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[X:%.*]] to i8 ; CHECK-NEXT: [[REV:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[TRUNC]]) ; CHECK-NEXT: [[EXT:%.*]] = zext i8 [[REV]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[EXT]], [[X]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], [[EXT]] ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 [[TRUNC]], i8 0 ; CHECK-NEXT: ret i8 [[SEL]] ; @@ -3016,7 +3016,7 @@ define ptr @select_replacement_gep_inbounds(ptr %base, i64 %offset) { define i8 @replace_false_op_eq_shl_or_disjoint(i8 %x) { ; CHECK-LABEL: @replace_false_op_eq_shl_or_disjoint( ; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 3 -; CHECK-NEXT: [[OR:%.*]] = or i8 [[SHL]], [[X]] +; CHECK-NEXT: [[OR:%.*]] = or i8 [[X]], [[SHL]] ; CHECK-NEXT: ret i8 [[OR]] ; %eq0 = icmp eq i8 %x, -1 @@ -3057,7 +3057,7 @@ define <2 x i1> @partial_false_undef_condval(<2 x i1> %x) { define i32 @mul_select_eq_zero(i32 %x, i32 %y) { ; CHECK-LABEL: @mul_select_eq_zero( ; CHECK-NEXT: [[Y_FR:%.*]] = freeze i32 [[Y:%.*]] -; CHECK-NEXT: [[M:%.*]] = mul i32 [[Y_FR]], [[X:%.*]] +; CHECK-NEXT: [[M:%.*]] = mul i32 [[X:%.*]], [[Y_FR]] ; CHECK-NEXT: ret i32 [[M]] ; %c = icmp eq i32 %x, 0 @@ -3083,7 +3083,7 @@ define i32 @mul_select_eq_zero_commute(i32 %x, i32 %y) { define i32 @mul_select_eq_zero_copy_flags(i32 %x, i32 %y) { ; CHECK-LABEL: @mul_select_eq_zero_copy_flags( ; CHECK-NEXT: [[Y_FR:%.*]] = freeze i32 [[Y:%.*]] -; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[Y_FR]], [[X:%.*]] +; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[X:%.*]], [[Y_FR]] ; CHECK-NEXT: ret i32 [[M]] ; %c = icmp eq i32 %x, 0 @@ -3098,7 +3098,7 @@ define i32 @mul_select_ne_zero(i32 %x, i32 %y) { ; CHECK-LABEL: @mul_select_ne_zero( ; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[X:%.*]], 0 ; CHECK-NEXT: [[Y_FR:%.*]] = freeze i32 [[Y:%.*]] -; CHECK-NEXT: [[M:%.*]] = mul i32 [[Y_FR]], 
[[X]] +; CHECK-NEXT: [[M:%.*]] = mul i32 [[X]], [[Y_FR]] ; CHECK-NEXT: call void @use(i1 [[C]]) ; CHECK-NEXT: ret i32 [[M]] ; @@ -3115,7 +3115,7 @@ define i32 @mul_select_ne_zero(i32 %x, i32 %y) { define i32 @mul_select_eq_zero_sel_undef(i32 %x, i32 %y) { ; CHECK-LABEL: @mul_select_eq_zero_sel_undef( ; CHECK-NEXT: [[Y_FR:%.*]] = freeze i32 [[Y:%.*]] -; CHECK-NEXT: [[M:%.*]] = mul i32 [[Y_FR]], [[X:%.*]] +; CHECK-NEXT: [[M:%.*]] = mul i32 [[X:%.*]], [[Y_FR]] ; CHECK-NEXT: ret i32 [[M]] ; %c = icmp eq i32 %x, 0 @@ -3129,7 +3129,7 @@ define i32 @mul_select_eq_zero_sel_undef(i32 %x, i32 %y) { define i32 @mul_select_eq_zero_multiple_users(i32 %x, i32 %y) { ; CHECK-LABEL: @mul_select_eq_zero_multiple_users( ; CHECK-NEXT: [[Y_FR:%.*]] = freeze i32 [[Y:%.*]] -; CHECK-NEXT: [[M:%.*]] = mul i32 [[Y_FR]], [[X:%.*]] +; CHECK-NEXT: [[M:%.*]] = mul i32 [[X:%.*]], [[Y_FR]] ; CHECK-NEXT: call void @use_i32(i32 [[M]]) ; CHECK-NEXT: call void @use_i32(i32 [[M]]) ; CHECK-NEXT: call void @use_i32(i32 [[M]]) @@ -3163,7 +3163,7 @@ define i32 @mul_select_eq_zero_unrelated_condition(i32 %x, i32 %y, i32 %z) { define <4 x i32> @mul_select_eq_zero_vector(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: @mul_select_eq_zero_vector( ; CHECK-NEXT: [[Y_FR:%.*]] = freeze <4 x i32> [[Y:%.*]] -; CHECK-NEXT: [[M:%.*]] = mul <4 x i32> [[Y_FR]], [[X:%.*]] +; CHECK-NEXT: [[M:%.*]] = mul <4 x i32> [[X:%.*]], [[Y_FR]] ; CHECK-NEXT: ret <4 x i32> [[M]] ; %c = icmp eq <4 x i32> %x, zeroinitializer @@ -3194,7 +3194,7 @@ define <2 x i32> @mul_select_eq_poison_vector(<2 x i32> %x, <2 x i32> %y) { define <2 x i32> @mul_select_eq_zero_sel_poison_vector(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @mul_select_eq_zero_sel_poison_vector( ; CHECK-NEXT: [[Y_FR:%.*]] = freeze <2 x i32> [[Y:%.*]] -; CHECK-NEXT: [[M:%.*]] = mul <2 x i32> [[Y_FR]], [[X:%.*]] +; CHECK-NEXT: [[M:%.*]] = mul <2 x i32> [[X:%.*]], [[Y_FR]] ; CHECK-NEXT: ret <2 x i32> [[M]] ; %c = icmp eq <2 x i32> %x, zeroinitializer @@ -4028,7 +4028,7 @@ define 
i32 @src_or_eq_C_and_andnotxorC(i32 %x, i32 %y, i32 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[TMP0]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT]], [[C:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[C:%.*]], [[NOT]] ; CHECK-NEXT: ret i32 [[AND1]] ; entry: @@ -4064,7 +4064,7 @@ define i32 @src_or_eq_C_xor_andnotandC(i32 %x, i32 %y, i32 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[AND]], -1 -; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT]], [[C:%.*]] +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[C:%.*]], [[NOT]] ; CHECK-NEXT: ret i32 [[AND1]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/select_meta.ll b/llvm/test/Transforms/InstCombine/select_meta.ll index 3898fd9fa1f578..d8f945b8d1b32f 100644 --- a/llvm/test/Transforms/InstCombine/select_meta.ll +++ b/llvm/test/Transforms/InstCombine/select_meta.ll @@ -6,7 +6,7 @@ define i32 @foo(i32) local_unnamed_addr #0 { ; CHECK-LABEL: @foo( ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[TMP0:%.*]], 2 ; CHECK-NEXT: [[DOTV:%.*]] = select i1 [[TMP2]], i32 20, i32 -20, !prof [[PROF0:![0-9]+]] -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[DOTV]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[DOTV]] ; CHECK-NEXT: ret i32 [[TMP3]] ; %2 = icmp sgt i32 %0, 2 @@ -51,7 +51,7 @@ define i32 @foo2(i32, i32) local_unnamed_addr #0 { ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP0:%.*]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP1:%.*]] ; CHECK-NEXT: [[DOTP:%.*]] = select i1 [[TMP3]], i32 [[TMP1]], i32 [[TMP4]], !prof [[PROF0]] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[DOTP]], [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP0]], [[DOTP]] ; CHECK-NEXT: ret i32 [[TMP5]] ; %3 = icmp sgt i32 %0, 2 @@ -317,7 +317,7 @@ define <2 x i32> @not_cond_vec_poison(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) define i64 @select_add(i1 %cond, i64 %x, i64 %y) { ; CHECK-LABEL: @select_add( ; 
CHECK-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i64 [[Y:%.*]], i64 0, !prof [[PROF0]], !unpredictable [[META2:![0-9]+]] -; CHECK-NEXT: [[RET:%.*]] = add i64 [[OP]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = add i64 [[X:%.*]], [[OP]] ; CHECK-NEXT: ret i64 [[RET]] ; %op = add i64 %x, %y @@ -328,7 +328,7 @@ define i64 @select_add(i1 %cond, i64 %x, i64 %y) { define <2 x i32> @select_or(<2 x i1> %cond, <2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @select_or( ; CHECK-NEXT: [[OP:%.*]] = select <2 x i1> [[COND:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> zeroinitializer, !prof [[PROF0]], !unpredictable [[META2]] -; CHECK-NEXT: [[RET:%.*]] = or <2 x i32> [[OP]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = or <2 x i32> [[X:%.*]], [[OP]] ; CHECK-NEXT: ret <2 x i32> [[RET]] ; %op = or <2 x i32> %x, %y @@ -361,7 +361,7 @@ define i128 @select_ashr(i1 %cond, i128 %x, i128 %y) { define double @select_fmul(i1 %cond, double %x, double %y) { ; CHECK-LABEL: @select_fmul( ; CHECK-NEXT: [[OP:%.*]] = select nnan i1 [[COND:%.*]], double [[Y:%.*]], double 1.000000e+00, !prof [[PROF0]], !unpredictable [[META2]] -; CHECK-NEXT: [[RET:%.*]] = fmul double [[OP]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = fmul double [[X:%.*]], [[OP]] ; CHECK-NEXT: ret double [[RET]] ; %op = fmul double %x, %y diff --git a/llvm/test/Transforms/InstCombine/set.ll b/llvm/test/Transforms/InstCombine/set.ll index 50329ddf7caacd..f44ac83f7f5916 100644 --- a/llvm/test/Transforms/InstCombine/set.ll +++ b/llvm/test/Transforms/InstCombine/set.ll @@ -135,7 +135,7 @@ define i1 @test12(i1 %A) { define i1 @test13(i1 %A, i1 %B) { ; CHECK-LABEL: @test13( ; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[B:%.*]], true -; CHECK-NEXT: [[C:%.*]] = or i1 [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[C:%.*]] = or i1 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[C]] ; %C = icmp uge i1 %A, %B @@ -145,7 +145,7 @@ define i1 @test13(i1 %A, i1 %B) { define <2 x i1> @test13vec(<2 x i1> %A, <2 x i1> %B) { ; CHECK-LABEL: @test13vec( ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i1> 
[[B:%.*]], -; CHECK-NEXT: [[C:%.*]] = or <2 x i1> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[C:%.*]] = or <2 x i1> [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[C]] ; %C = icmp uge <2 x i1> %A, %B diff --git a/llvm/test/Transforms/InstCombine/shift-add.ll b/llvm/test/Transforms/InstCombine/shift-add.ll index 7f948848844c5a..016f877a9efb51 100644 --- a/llvm/test/Transforms/InstCombine/shift-add.ll +++ b/llvm/test/Transforms/InstCombine/shift-add.ll @@ -505,7 +505,7 @@ define i2 @ashr_2_add_zext_basic(i1 %a, i1 %b) { define i32 @lshr_16_add_zext_basic(i16 %a, i16 %b) { ; CHECK-LABEL: @lshr_16_add_zext_basic( ; CHECK-NEXT: [[TMP1:%.*]] = xor i16 [[A:%.*]], -1 -; CHECK-NEXT: [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i16 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ugt i16 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[LSHR:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i32 ; CHECK-NEXT: ret i32 [[LSHR]] ; @@ -566,7 +566,7 @@ define i32 @lshr_16_add_not_known_16_leading_zeroes(i32 %a, i32 %b) { define i64 @lshr_32_add_zext_basic(i32 %a, i32 %b) { ; CHECK-LABEL: @lshr_32_add_zext_basic( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], -1 -; CHECK-NEXT: [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ugt i32 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[LSHR:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i64 ; CHECK-NEXT: ret i64 [[LSHR]] ; @@ -623,7 +623,7 @@ define i64 @lshr_33_i32_add_zext_basic(i32 %a, i32 %b) { define i64 @lshr_16_to_64_add_zext_basic(i16 %a, i16 %b) { ; CHECK-LABEL: @lshr_16_to_64_add_zext_basic( ; CHECK-NEXT: [[TMP1:%.*]] = xor i16 [[A:%.*]], -1 -; CHECK-NEXT: [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i16 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ugt i16 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[LSHR:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i64 ; CHECK-NEXT: ret i64 [[LSHR]] ; @@ -668,7 +668,7 @@ define i64 @lshr_32_add_not_known_32_leading_zeroes(i64 %a, 
i64 %b) { define i32 @ashr_16_add_zext_basic(i16 %a, i16 %b) { ; CHECK-LABEL: @ashr_16_add_zext_basic( ; CHECK-NEXT: [[TMP1:%.*]] = xor i16 [[A:%.*]], -1 -; CHECK-NEXT: [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i16 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ugt i16 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[LSHR:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i32 ; CHECK-NEXT: ret i32 [[LSHR]] ; @@ -682,7 +682,7 @@ define i32 @ashr_16_add_zext_basic(i16 %a, i16 %b) { define i64 @ashr_32_add_zext_basic(i32 %a, i32 %b) { ; CHECK-LABEL: @ashr_32_add_zext_basic( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], -1 -; CHECK-NEXT: [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ugt i32 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[LSHR:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i64 ; CHECK-NEXT: ret i64 [[LSHR]] ; @@ -696,7 +696,7 @@ define i64 @ashr_32_add_zext_basic(i32 %a, i32 %b) { define i64 @ashr_16_to_64_add_zext_basic(i16 %a, i16 %b) { ; CHECK-LABEL: @ashr_16_to_64_add_zext_basic( ; CHECK-NEXT: [[TMP1:%.*]] = xor i16 [[A:%.*]], -1 -; CHECK-NEXT: [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i16 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ugt i16 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[LSHR:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i64 ; CHECK-NEXT: ret i64 [[LSHR]] ; diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-lshr.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-lshr.ll index a0a3c8edfb4b5d..c4260f4cb2bf88 100644 --- a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-lshr.ll +++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-lshr.ll @@ -294,7 +294,7 @@ define i1 @t10_almost_highest_bit(i32 %x, i64 %y, i32 %len) { define i1 @t11_no_shift(i32 %x, i64 %y, i32 %len) { ; CHECK-LABEL: 
@t11_no_shift( ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[X:%.*]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: [[T5:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[T5]] ; diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-shl.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-shl.ll index 3a85f19d8a0375..6e9552e2af4cce 100644 --- a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-shl.ll +++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest-with-truncation-shl.ll @@ -17,7 +17,7 @@ define i1 @t0_const_after_fold_lshr_shl_ne(i32 %x, i64 %y, i32 %len) { ; CHECK-LABEL: @t0_const_after_fold_lshr_shl_ne( ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 31 ; CHECK-NEXT: [[TMP2:%.*]] = zext nneg i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: [[T5:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: ret i1 [[T5]] ; @@ -40,7 +40,7 @@ define <2 x i1> @t1_vec_splat(<2 x i32> %x, <2 x i64> %y, <2 x i32> %len) { ; CHECK-LABEL: @t1_vec_splat( ; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i64> [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: [[T5:%.*]] = icmp ne <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[T5]] ; @@ -212,7 +212,7 @@ define i1 @t6_oneuse3(i32 %x, i64 %y, i32 %len) { ; CHECK-NEXT: call void @use64(i64 [[T3]]) ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 31 ; CHECK-NEXT: [[TMP2:%.*]] = zext nneg i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], [[Y]] +; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[Y]], [[TMP2]] ; CHECK-NEXT: 
[[T5:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: ret i1 [[T5]] ; @@ -244,7 +244,7 @@ define i1 @t7_oneuse4(i32 %x, i64 %y, i32 %len) { ; CHECK-NEXT: call void @use32(i32 [[T3_TRUNC]]) ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 31 ; CHECK-NEXT: [[TMP2:%.*]] = zext nneg i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], [[Y]] +; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[Y]], [[TMP2]] ; CHECK-NEXT: [[T5:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: ret i1 [[T5]] ; diff --git a/llvm/test/Transforms/InstCombine/shift-direction-in-bit-test.ll b/llvm/test/Transforms/InstCombine/shift-direction-in-bit-test.ll index a8f4644f1ae42f..ebb53e36a3f216 100644 --- a/llvm/test/Transforms/InstCombine/shift-direction-in-bit-test.ll +++ b/llvm/test/Transforms/InstCombine/shift-direction-in-bit-test.ll @@ -239,7 +239,7 @@ define i1 @t13_shift_of_const1(i32 %x, i32 %y, i32 %z) { define i1 @t14_and_with_const0(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @t14_and_with_const0( ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 1, [[Y:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[T2:%.*]] = icmp eq i32 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[T2]] ; @@ -251,7 +251,7 @@ define i1 @t14_and_with_const0(i32 %x, i32 %y, i32 %z) { define i1 @t15_and_with_const1(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @t15_and_with_const1( ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 1, [[Y:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: [[T2:%.*]] = icmp eq i32 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[T2]] ; diff --git a/llvm/test/Transforms/InstCombine/shift-logic.ll b/llvm/test/Transforms/InstCombine/shift-logic.ll index 3d4547e0bb9caa..593a22bec6490b 100644 --- a/llvm/test/Transforms/InstCombine/shift-logic.ll +++ b/llvm/test/Transforms/InstCombine/shift-logic.ll @@ -189,7 +189,7 @@ define i32 @ashr_xor(i32 %x, i32 %py) { define i32 
@shr_mismatch_xor(i32 %x, i32 %y) { ; CHECK-LABEL: @shr_mismatch_xor( ; CHECK-NEXT: [[SH0:%.*]] = ashr i32 [[X:%.*]], 5 -; CHECK-NEXT: [[R:%.*]] = xor i32 [[SH0]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = xor i32 [[Y:%.*]], [[SH0]] ; CHECK-NEXT: [[SH1:%.*]] = lshr i32 [[R]], 7 ; CHECK-NEXT: ret i32 [[SH1]] ; @@ -202,7 +202,7 @@ define i32 @shr_mismatch_xor(i32 %x, i32 %y) { define i32 @ashr_overshift_xor(i32 %x, i32 %y) { ; CHECK-LABEL: @ashr_overshift_xor( ; CHECK-NEXT: [[SH0:%.*]] = ashr i32 [[X:%.*]], 15 -; CHECK-NEXT: [[R:%.*]] = xor i32 [[SH0]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = xor i32 [[Y:%.*]], [[SH0]] ; CHECK-NEXT: [[SH1:%.*]] = ashr i32 [[R]], 17 ; CHECK-NEXT: ret i32 [[SH1]] ; @@ -215,7 +215,7 @@ define i32 @ashr_overshift_xor(i32 %x, i32 %y) { define <2 x i32> @ashr_poison_poison_xor(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @ashr_poison_poison_xor( ; CHECK-NEXT: [[SH0:%.*]] = ashr <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = xor <2 x i32> [[SH0]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = xor <2 x i32> [[Y:%.*]], [[SH0]] ; CHECK-NEXT: [[SH1:%.*]] = ashr <2 x i32> [[R]], ; CHECK-NEXT: ret <2 x i32> [[SH1]] ; diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll index f0bfd0171b265a..558f4ffbfcabe4 100644 --- a/llvm/test/Transforms/InstCombine/shift.ll +++ b/llvm/test/Transforms/InstCombine/shift.ll @@ -1692,7 +1692,7 @@ define i177 @lshr_out_of_range(i177 %Y, ptr %A2, ptr %ptr) { ; CHECK-LABEL: @lshr_out_of_range( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i177 [[Y:%.*]], -1 ; CHECK-NEXT: [[B4:%.*]] = sext i1 [[TMP1]] to i177 -; CHECK-NEXT: [[C8:%.*]] = icmp ult i177 [[B4]], [[Y]] +; CHECK-NEXT: [[C8:%.*]] = icmp ugt i177 [[Y]], [[B4]] ; CHECK-NEXT: [[TMP2:%.*]] = sext i1 [[C8]] to i64 ; CHECK-NEXT: [[G18:%.*]] = getelementptr ptr, ptr [[A2:%.*]], i64 [[TMP2]] ; CHECK-NEXT: store ptr [[G18]], ptr [[PTR:%.*]], align 8 @@ -1810,7 +1810,7 @@ define void @ossfuzz_38078(i32 %arg, i32 %arg1, ptr %ptr, ptr %ptr2, ptr 
%ptr3, ; CHECK-NEXT: bb: ; CHECK-NEXT: [[G1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 -4 ; CHECK-NEXT: [[I2:%.*]] = sub i32 0, [[ARG1:%.*]] -; CHECK-NEXT: [[I5:%.*]] = icmp eq i32 [[I2]], [[ARG:%.*]] +; CHECK-NEXT: [[I5:%.*]] = icmp eq i32 [[ARG:%.*]], [[I2]] ; CHECK-NEXT: call void @llvm.assume(i1 [[I5]]) ; CHECK-NEXT: store volatile i32 2147483647, ptr [[G1]], align 4 ; CHECK-NEXT: br label [[BB:%.*]] @@ -2047,7 +2047,7 @@ define i32 @ashr_sdiv_extra_use(i32 %x) { define i32 @shl1_cttz(i32 %x) { ; CHECK-LABEL: @shl1_cttz( ; CHECK-NEXT: [[NEG:%.*]] = sub i32 0, [[X:%.*]] -; CHECK-NEXT: [[SHL:%.*]] = and i32 [[NEG]], [[X]] +; CHECK-NEXT: [[SHL:%.*]] = and i32 [[X]], [[NEG]] ; CHECK-NEXT: ret i32 [[SHL]] ; %tz = call i32 @llvm.cttz.i32(i32 %x, i1 true) @@ -2058,7 +2058,7 @@ define i32 @shl1_cttz(i32 %x) { define <2 x i8> @shl1_cttz_vec(<2 x i8> %x) { ; CHECK-LABEL: @shl1_cttz_vec( ; CHECK-NEXT: [[NEG:%.*]] = sub <2 x i8> zeroinitializer, [[X:%.*]] -; CHECK-NEXT: [[SHL:%.*]] = and <2 x i8> [[NEG]], [[X]] +; CHECK-NEXT: [[SHL:%.*]] = and <2 x i8> [[X]], [[NEG]] ; CHECK-NEXT: ret <2 x i8> [[SHL]] ; %tz = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %x, i1 false) @@ -2069,7 +2069,7 @@ define <2 x i8> @shl1_cttz_vec(<2 x i8> %x) { define <2 x i8> @shl1_cttz_vec_poison(<2 x i8> %x) { ; CHECK-LABEL: @shl1_cttz_vec_poison( ; CHECK-NEXT: [[NEG:%.*]] = sub <2 x i8> zeroinitializer, [[X:%.*]] -; CHECK-NEXT: [[SHL:%.*]] = and <2 x i8> [[NEG]], [[X]] +; CHECK-NEXT: [[SHL:%.*]] = and <2 x i8> [[X]], [[NEG]] ; CHECK-NEXT: ret <2 x i8> [[SHL]] ; %tz = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %x, i1 false) diff --git a/llvm/test/Transforms/InstCombine/shl-bo.ll b/llvm/test/Transforms/InstCombine/shl-bo.ll index ab6e8c28cf9fc7..356c4a288f9e31 100644 --- a/llvm/test/Transforms/InstCombine/shl-bo.ll +++ b/llvm/test/Transforms/InstCombine/shl-bo.ll @@ -7,7 +7,7 @@ define i8 @lshr_add(i8 %a, i8 %y) { ; CHECK-LABEL: @lshr_add( ; CHECK-NEXT: [[X:%.*]] = srem i8 [[A:%.*]], 42 ; CHECK-NEXT: 
[[B1:%.*]] = shl i8 [[X]], 5 -; CHECK-NEXT: [[R2:%.*]] = add i8 [[B1]], [[Y:%.*]] +; CHECK-NEXT: [[R2:%.*]] = add i8 [[Y:%.*]], [[B1]] ; CHECK-NEXT: [[L:%.*]] = and i8 [[R2]], -32 ; CHECK-NEXT: ret i8 [[L]] ; @@ -22,7 +22,7 @@ define <2 x i8> @lshr_add_commute_splat(<2 x i8> %a, <2 x i8> %y) { ; CHECK-LABEL: @lshr_add_commute_splat( ; CHECK-NEXT: [[X:%.*]] = srem <2 x i8> [[A:%.*]], ; CHECK-NEXT: [[B1:%.*]] = shl <2 x i8> [[X]], -; CHECK-NEXT: [[R2:%.*]] = add <2 x i8> [[B1]], [[Y:%.*]] +; CHECK-NEXT: [[R2:%.*]] = add <2 x i8> [[Y:%.*]], [[B1]] ; CHECK-NEXT: [[L:%.*]] = and <2 x i8> [[R2]], ; CHECK-NEXT: ret <2 x i8> [[L]] ; @@ -67,7 +67,7 @@ define i8 @lshr_and(i8 %a, i8 %y) { ; CHECK-LABEL: @lshr_and( ; CHECK-NEXT: [[X:%.*]] = srem i8 [[A:%.*]], 42 ; CHECK-NEXT: [[B1:%.*]] = shl i8 [[X]], 6 -; CHECK-NEXT: [[R2:%.*]] = and i8 [[B1]], [[Y:%.*]] +; CHECK-NEXT: [[R2:%.*]] = and i8 [[Y:%.*]], [[B1]] ; CHECK-NEXT: ret i8 [[R2]] ; %x = srem i8 %a, 42 ; thwart complexity-based canonicalization @@ -81,7 +81,7 @@ define <2 x i8> @lshr_and_commute_splat(<2 x i8> %a, <2 x i8> %y) { ; CHECK-LABEL: @lshr_and_commute_splat( ; CHECK-NEXT: [[X:%.*]] = srem <2 x i8> [[A:%.*]], ; CHECK-NEXT: [[B1:%.*]] = shl <2 x i8> [[X]], -; CHECK-NEXT: [[R2:%.*]] = and <2 x i8> [[B1]], [[Y:%.*]] +; CHECK-NEXT: [[R2:%.*]] = and <2 x i8> [[Y:%.*]], [[B1]] ; CHECK-NEXT: ret <2 x i8> [[R2]] ; %x = srem <2 x i8> %a, ; thwart complexity-based canonicalization @@ -96,7 +96,7 @@ define i8 @lshr_or(i8 %a, i8 %y) { ; CHECK-NEXT: [[X:%.*]] = srem i8 [[A:%.*]], 42 ; CHECK-NEXT: [[B1:%.*]] = shl i8 [[X]], 4 ; CHECK-NEXT: [[Y_MASKED:%.*]] = and i8 [[Y:%.*]], -16 -; CHECK-NEXT: [[L:%.*]] = or i8 [[B1]], [[Y_MASKED]] +; CHECK-NEXT: [[L:%.*]] = or i8 [[Y_MASKED]], [[B1]] ; CHECK-NEXT: ret i8 [[L]] ; %x = srem i8 %a, 42 ; thwart complexity-based canonicalization @@ -111,7 +111,7 @@ define <2 x i8> @lshr_or_commute_splat(<2 x i8> %a, <2 x i8> %y) { ; CHECK-NEXT: [[X:%.*]] = srem <2 x i8> [[A:%.*]], ; CHECK-NEXT: 
[[B1:%.*]] = shl <2 x i8> [[X]], ; CHECK-NEXT: [[Y_MASKED:%.*]] = and <2 x i8> [[Y:%.*]], -; CHECK-NEXT: [[L:%.*]] = or <2 x i8> [[B1]], [[Y_MASKED]] +; CHECK-NEXT: [[L:%.*]] = or <2 x i8> [[Y_MASKED]], [[B1]] ; CHECK-NEXT: ret <2 x i8> [[L]] ; %x = srem <2 x i8> %a, ; thwart complexity-based canonicalization @@ -126,7 +126,7 @@ define i8 @lshr_xor(i8 %a, i8 %y) { ; CHECK-NEXT: [[X:%.*]] = srem i8 [[A:%.*]], 42 ; CHECK-NEXT: [[B1:%.*]] = shl i8 [[X]], 3 ; CHECK-NEXT: [[Y_MASKED:%.*]] = and i8 [[Y:%.*]], -8 -; CHECK-NEXT: [[L:%.*]] = xor i8 [[B1]], [[Y_MASKED]] +; CHECK-NEXT: [[L:%.*]] = xor i8 [[Y_MASKED]], [[B1]] ; CHECK-NEXT: ret i8 [[L]] ; %x = srem i8 %a, 42 ; thwart complexity-based canonicalization @@ -141,7 +141,7 @@ define <2 x i8> @lshr_xor_commute_splat(<2 x i8> %a, <2 x i8> %y) { ; CHECK-NEXT: [[X:%.*]] = srem <2 x i8> [[A:%.*]], ; CHECK-NEXT: [[B1:%.*]] = shl <2 x i8> [[X]], ; CHECK-NEXT: [[Y_MASKED:%.*]] = and <2 x i8> [[Y:%.*]], -; CHECK-NEXT: [[L:%.*]] = xor <2 x i8> [[B1]], [[Y_MASKED]] +; CHECK-NEXT: [[L:%.*]] = xor <2 x i8> [[Y_MASKED]], [[B1]] ; CHECK-NEXT: ret <2 x i8> [[L]] ; %x = srem <2 x i8> %a, ; thwart complexity-based canonicalization @@ -380,7 +380,7 @@ define i8 @lshr_and_add_use1(i8 %x, i8 %y) { ; CHECK-NEXT: [[R:%.*]] = lshr i8 [[Y:%.*]], 3 ; CHECK-NEXT: call void @use(i8 [[R]]) ; CHECK-NEXT: [[M:%.*]] = and i8 [[R]], 12 -; CHECK-NEXT: [[B:%.*]] = add i8 [[M]], [[X:%.*]] +; CHECK-NEXT: [[B:%.*]] = add i8 [[X:%.*]], [[M]] ; CHECK-NEXT: [[L:%.*]] = shl i8 [[B]], 3 ; CHECK-NEXT: ret i8 [[L]] ; @@ -397,7 +397,7 @@ define i8 @lshr_and_add_use2(i8 %x, i8 %y) { ; CHECK-NEXT: [[R:%.*]] = lshr i8 [[Y:%.*]], 3 ; CHECK-NEXT: [[M:%.*]] = and i8 [[R]], 12 ; CHECK-NEXT: call void @use(i8 [[M]]) -; CHECK-NEXT: [[B:%.*]] = add i8 [[M]], [[X:%.*]] +; CHECK-NEXT: [[B:%.*]] = add i8 [[X:%.*]], [[M]] ; CHECK-NEXT: [[L:%.*]] = shl i8 [[B]], 3 ; CHECK-NEXT: ret i8 [[L]] ; @@ -413,7 +413,7 @@ define i8 @lshr_and_add_use3(i8 %x, i8 %y) { ; CHECK-LABEL: 
@lshr_and_add_use3( ; CHECK-NEXT: [[R:%.*]] = lshr i8 [[Y:%.*]], 3 ; CHECK-NEXT: [[M:%.*]] = and i8 [[R]], 12 -; CHECK-NEXT: [[B:%.*]] = add i8 [[M]], [[X:%.*]] +; CHECK-NEXT: [[B:%.*]] = add i8 [[X:%.*]], [[M]] ; CHECK-NEXT: call void @use(i8 [[B]]) ; CHECK-NEXT: [[L:%.*]] = shl i8 [[B]], 3 ; CHECK-NEXT: ret i8 [[L]] @@ -432,7 +432,7 @@ define i8 @lshr_and_add_use4(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use(i8 [[R]]) ; CHECK-NEXT: [[M:%.*]] = and i8 [[R]], 12 ; CHECK-NEXT: call void @use(i8 [[M]]) -; CHECK-NEXT: [[B:%.*]] = add i8 [[M]], [[X:%.*]] +; CHECK-NEXT: [[B:%.*]] = add i8 [[X:%.*]], [[M]] ; CHECK-NEXT: [[L:%.*]] = shl i8 [[B]], 3 ; CHECK-NEXT: ret i8 [[L]] ; @@ -450,7 +450,7 @@ define i8 @lshr_and_add_use5(i8 %x, i8 %y) { ; CHECK-NEXT: [[R:%.*]] = lshr i8 [[Y:%.*]], 3 ; CHECK-NEXT: [[M:%.*]] = and i8 [[R]], 12 ; CHECK-NEXT: call void @use(i8 [[M]]) -; CHECK-NEXT: [[B:%.*]] = add i8 [[M]], [[X:%.*]] +; CHECK-NEXT: [[B:%.*]] = add i8 [[X:%.*]], [[M]] ; CHECK-NEXT: call void @use(i8 [[B]]) ; CHECK-NEXT: [[L:%.*]] = shl i8 [[B]], 3 ; CHECK-NEXT: ret i8 [[L]] @@ -470,7 +470,7 @@ define i8 @lshr_and_add_use6(i8 %x, i8 %y) { ; CHECK-NEXT: call void @use(i8 [[R]]) ; CHECK-NEXT: [[M:%.*]] = and i8 [[R]], 12 ; CHECK-NEXT: call void @use(i8 [[M]]) -; CHECK-NEXT: [[B:%.*]] = add i8 [[M]], [[X:%.*]] +; CHECK-NEXT: [[B:%.*]] = add i8 [[X:%.*]], [[M]] ; CHECK-NEXT: [[L:%.*]] = shl i8 [[B]], 3 ; CHECK-NEXT: ret i8 [[L]] ; @@ -541,7 +541,7 @@ define <2 x i32> @lshr_add_and_shl_v2i32_undef(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @lshr_add_and_shl_v2i32_undef( ; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = shl <2 x i32> [[TMP3]], ; CHECK-NEXT: ret <2 x i32> [[TMP4]] ; @@ -556,7 +556,7 @@ define <2 x i32> @lshr_add_and_shl_v2i32_nonuniform(<2 x i32> %x, 
<2 x i32> %y) ; CHECK-LABEL: @lshr_add_and_shl_v2i32_nonuniform( ; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[Y:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = shl <2 x i32> [[TMP3]], ; CHECK-NEXT: ret <2 x i32> [[TMP4]] ; diff --git a/llvm/test/Transforms/InstCombine/shuffle-binop.ll b/llvm/test/Transforms/InstCombine/shuffle-binop.ll index 8460f8b2c6cd3d..8ab7f315dbf540 100644 --- a/llvm/test/Transforms/InstCombine/shuffle-binop.ll +++ b/llvm/test/Transforms/InstCombine/shuffle-binop.ll @@ -82,7 +82,7 @@ define <4 x i8> @splat_binop_splat_x_splat_y(<4 x i8> %x, <4 x i8> %y) { ; CHECK-NEXT: call void @use(<4 x i8> [[XSPLAT]]) ; CHECK-NEXT: [[YSPLAT:%.*]] = shufflevector <4 x i8> [[Y:%.*]], <4 x i8> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: call void @use(<4 x i8> [[YSPLAT]]) -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw <4 x i8> [[Y]], [[X]] +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw <4 x i8> [[X]], [[Y]] ; CHECK-NEXT: [[BSPLAT:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: ret <4 x i8> [[BSPLAT]] ; @@ -101,7 +101,7 @@ define <4 x float> @splat_binop_splat_x_splat_y_fmath_flags(<4 x float> %x, <4 x ; CHECK-NEXT: call void @use(<4 x float> [[XSPLAT]]) ; CHECK-NEXT: [[YSPLAT:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: call void @use(<4 x float> [[YSPLAT]]) -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <4 x float> [[Y]], [[X]] +; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <4 x float> [[X]], [[Y]] ; CHECK-NEXT: [[BSPLAT:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: ret <4 x float> [[BSPLAT]] ; diff --git a/llvm/test/Transforms/InstCombine/signed-truncation-check.ll b/llvm/test/Transforms/InstCombine/signed-truncation-check.ll index 
7e762627e5ec02..513fb69ab7463e 100644 --- a/llvm/test/Transforms/InstCombine/signed-truncation-check.ll +++ b/llvm/test/Transforms/InstCombine/signed-truncation-check.ll @@ -612,7 +612,7 @@ define zeroext i1 @oneuse_trunc_sext(i32 %arg) { ; CHECK-NEXT: call void @use8(i8 [[T3]]) ; CHECK-NEXT: [[T4:%.*]] = sext i8 [[T3]] to i32 ; CHECK-NEXT: call void @use32(i32 [[T4]]) -; CHECK-NEXT: [[T5:%.*]] = icmp eq i32 [[T4]], [[ARG]] +; CHECK-NEXT: [[T5:%.*]] = icmp eq i32 [[ARG]], [[T4]] ; CHECK-NEXT: call void @use1(i1 [[T5]]) ; CHECK-NEXT: [[T6:%.*]] = and i1 [[T2]], [[T5]] ; CHECK-NEXT: ret i1 [[T6]] @@ -641,7 +641,7 @@ define zeroext i1 @oneuse_trunc_sext_logical(i32 %arg) { ; CHECK-NEXT: call void @use8(i8 [[T3]]) ; CHECK-NEXT: [[T4:%.*]] = sext i8 [[T3]] to i32 ; CHECK-NEXT: call void @use32(i32 [[T4]]) -; CHECK-NEXT: [[T5:%.*]] = icmp eq i32 [[T4]], [[ARG]] +; CHECK-NEXT: [[T5:%.*]] = icmp eq i32 [[ARG]], [[T4]] ; CHECK-NEXT: call void @use1(i1 [[T5]]) ; CHECK-NEXT: [[T6:%.*]] = select i1 [[T2]], i1 [[T5]], i1 false ; CHECK-NEXT: ret i1 [[T6]] diff --git a/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll index e4fb7764ba9e53..403f3bacf34d89 100644 --- a/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll +++ b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll @@ -16,7 +16,7 @@ define float @ninf_user_select_inf(i1 %cond, float %x, float %y) { ; CHECK-LABEL: define float @ninf_user_select_inf ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[Y:%.*]]) { ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000 -; CHECK-NEXT: [[NINF_USER:%.*]] = fmul ninf float [[SELECT]], [[Y]] +; CHECK-NEXT: [[NINF_USER:%.*]] = fmul ninf float [[Y]], [[SELECT]] ; CHECK-NEXT: ret float [[NINF_USER]] ; %select = select i1 %cond, float %x, float 0x7FF0000000000000 diff --git a/llvm/test/Transforms/InstCombine/sink-not-into-and.ll 
b/llvm/test/Transforms/InstCombine/sink-not-into-and.ll index 9db6440a49ee71..1f3b46cdc386dd 100644 --- a/llvm/test/Transforms/InstCombine/sink-not-into-and.ll +++ b/llvm/test/Transforms/InstCombine/sink-not-into-and.ll @@ -40,7 +40,7 @@ define i1 @n1(i1 %i1, i32 %v2, i32 %v3) { define i1 @n2(i32 %v0, i32 %v1, i1 %i2) { ; CHECK-LABEL: @n2( ; CHECK-NEXT: [[I1:%.*]] = icmp eq i32 [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[I3:%.*]] = and i1 [[I1]], [[I2:%.*]] +; CHECK-NEXT: [[I3:%.*]] = and i1 [[I2:%.*]], [[I1]] ; CHECK-NEXT: [[I4:%.*]] = xor i1 [[I3]], true ; CHECK-NEXT: ret i1 [[I4]] ; diff --git a/llvm/test/Transforms/InstCombine/sink-not-into-or.ll b/llvm/test/Transforms/InstCombine/sink-not-into-or.ll index 0b758112f699e0..8e6c983b71fe35 100644 --- a/llvm/test/Transforms/InstCombine/sink-not-into-or.ll +++ b/llvm/test/Transforms/InstCombine/sink-not-into-or.ll @@ -40,7 +40,7 @@ define i1 @n1(i1 %i1, i32 %v2, i32 %v3) { define i1 @n2(i32 %v0, i32 %v1, i1 %i2) { ; CHECK-LABEL: @n2( ; CHECK-NEXT: [[I1:%.*]] = icmp eq i32 [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[I3:%.*]] = or i1 [[I1]], [[I2:%.*]] +; CHECK-NEXT: [[I3:%.*]] = or i1 [[I2:%.*]], [[I1]] ; CHECK-NEXT: [[I4:%.*]] = xor i1 [[I3]], true ; CHECK-NEXT: ret i1 [[I4]] ; diff --git a/llvm/test/Transforms/InstCombine/smax-icmp.ll b/llvm/test/Transforms/InstCombine/smax-icmp.ll index 022ec6ad4f3466..4c9cbed9d9ebf1 100644 --- a/llvm/test/Transforms/InstCombine/smax-icmp.ll +++ b/llvm/test/Transforms/InstCombine/smax-icmp.ll @@ -95,7 +95,7 @@ define i1 @sle_smax2(i32 %x, i32 %y) { define i1 @sle_smax3(i32 %a, i32 %y) { ; CHECK-LABEL: @sle_smax3( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %x = add i32 %a, 3 ; thwart complexity-based canonicalization @@ -110,7 +110,7 @@ define i1 @sle_smax3(i32 %a, i32 %y) { define i1 @sle_smax4(i32 %a, i32 %y) { ; CHECK-LABEL: 
@sle_smax4( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %x = add i32 %a, 3 ; thwart complexity-based canonicalization @@ -207,7 +207,7 @@ define i1 @sgt_smax2(i32 %x, i32 %y) { define i1 @sgt_smax3(i32 %a, i32 %y) { ; CHECK-LABEL: @sgt_smax3( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %x = add i32 %a, 3 ; thwart complexity-based canonicalization @@ -222,7 +222,7 @@ define i1 @sgt_smax3(i32 %a, i32 %y) { define i1 @sgt_smax4(i32 %a, i32 %y) { ; CHECK-LABEL: @sgt_smax4( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %x = add i32 %a, 3 ; thwart complexity-based canonicalization diff --git a/llvm/test/Transforms/InstCombine/smin-icmp.ll b/llvm/test/Transforms/InstCombine/smin-icmp.ll index c97f29f5eff8d7..d1283d8afc0a74 100644 --- a/llvm/test/Transforms/InstCombine/smin-icmp.ll +++ b/llvm/test/Transforms/InstCombine/smin-icmp.ll @@ -94,7 +94,7 @@ define i1 @sge_smin2(i32 %x, i32 %y) { define i1 @sge_smin3(i32 %a, i32 %y) { ; CHECK-LABEL: @sge_smin3( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %x = add i32 %a, 3 ; thwart complexity-based canonicalization @@ -109,7 +109,7 @@ define i1 @sge_smin3(i32 %a, i32 %y) { define i1 @sge_smin4(i32 %a, i32 %y) { ; CHECK-LABEL: @sge_smin4( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 
[[CMP2]] ; %x = add i32 %a, 3 ; thwart complexity-based canonicalization @@ -206,7 +206,7 @@ define i1 @slt_smin2(i32 %x, i32 %y) { define i1 @slt_smin3(i32 %a, i32 %y) { ; CHECK-LABEL: @slt_smin3( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %x = add i32 %a, 3 ; thwart complexity-based canonicalization @@ -221,7 +221,7 @@ define i1 @slt_smin3(i32 %a, i32 %y) { define i1 @slt_smin4(i32 %a, i32 %y) { ; CHECK-LABEL: @slt_smin4( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %x = add i32 %a, 3 ; thwart complexity-based canonicalization diff --git a/llvm/test/Transforms/InstCombine/sub-ashr-or-to-icmp-select.ll b/llvm/test/Transforms/InstCombine/sub-ashr-or-to-icmp-select.ll index 0379f82f4a7835..e21ca605fc5af5 100644 --- a/llvm/test/Transforms/InstCombine/sub-ashr-or-to-icmp-select.ll +++ b/llvm/test/Transforms/InstCombine/sub-ashr-or-to-icmp-select.ll @@ -242,7 +242,7 @@ define i32 @sub_ashr_or_i32_extra_use_ashr(i32 %x, i32 %y, ptr %p) { ; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[SHR:%.*]] = sext i1 [[TMP1]] to i32 ; CHECK-NEXT: store i32 [[SHR]], ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], [[X]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[SHR]] ; CHECK-NEXT: ret i32 [[OR]] ; %sub = sub nsw i32 %y, %x @@ -268,7 +268,7 @@ define i32 @sub_ashr_or_i32_no_nsw_nuw(i32 %x, i32 %y) { define i32 @neg_or_extra_use_ashr_i32(i32 %x, ptr %p) { ; CHECK-LABEL: @neg_or_extra_use_ashr_i32( ; CHECK-NEXT: [[NEG:%.*]] = sub i32 0, [[X:%.*]] -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[NEG]] ; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[OR]], 31 ; CHECK-NEXT: store i32 [[OR]], ptr 
[[P:%.*]], align 4 ; CHECK-NEXT: ret i32 [[SHR]] diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll index 5130883409b28d..b773d106b2c98a 100644 --- a/llvm/test/Transforms/InstCombine/sub-gep.ll +++ b/llvm/test/Transforms/InstCombine/sub-gep.ll @@ -422,7 +422,7 @@ define i64 @nullptrtoint_scalable_x(i64 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4 -; CHECK-NEXT: [[PTR_IDX:%.*]] = mul nsw i64 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[PTR_IDX:%.*]] = mul nsw i64 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i64 [[PTR_IDX]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/sub-lshr-or-to-icmp-select.ll b/llvm/test/Transforms/InstCombine/sub-lshr-or-to-icmp-select.ll index 5ecf4b8da0c490..33c02d77c45b90 100644 --- a/llvm/test/Transforms/InstCombine/sub-lshr-or-to-icmp-select.ll +++ b/llvm/test/Transforms/InstCombine/sub-lshr-or-to-icmp-select.ll @@ -81,7 +81,7 @@ define i32 @neg_extra_use_or_lshr_i32(i32 %x, ptr %p) { define i32 @neg_or_extra_use_lshr_i32(i32 %x, ptr %p) { ; CHECK-LABEL: @neg_or_extra_use_lshr_i32( ; CHECK-NEXT: [[NEG:%.*]] = sub i32 0, [[X:%.*]] -; CHECK-NEXT: [[OR:%.*]] = or i32 [[NEG]], [[X]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[NEG]] ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[OR]], 31 ; CHECK-NEXT: store i32 [[OR]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: ret i32 [[SHR]] diff --git a/llvm/test/Transforms/InstCombine/sub-minmax.ll b/llvm/test/Transforms/InstCombine/sub-minmax.ll index c9ce165c389886..c5af57449bf719 100644 --- a/llvm/test/Transforms/InstCombine/sub-minmax.ll +++ b/llvm/test/Transforms/InstCombine/sub-minmax.ll @@ -770,7 +770,7 @@ define i8 @sub_add_umin(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: define {{[^@]+}}@sub_add_umin ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[Y]], i8 [[Z]]) -; CHECK-NEXT: [[S:%.*]] = add i8 [[TMP1]], [[X]] +; 
CHECK-NEXT: [[S:%.*]] = add i8 [[X]], [[TMP1]] ; CHECK-NEXT: ret i8 [[S]] ; %a = add i8 %x, %y @@ -783,7 +783,7 @@ define i8 @sub_add_umin_commute_umin(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: define {{[^@]+}}@sub_add_umin_commute_umin ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[Y]], i8 [[Z]]) -; CHECK-NEXT: [[S:%.*]] = add i8 [[TMP1]], [[X]] +; CHECK-NEXT: [[S:%.*]] = add i8 [[X]], [[TMP1]] ; CHECK-NEXT: ret i8 [[S]] ; %a = add i8 %x, %y @@ -796,7 +796,7 @@ define i8 @sub_add_umin_commute_add(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: define {{[^@]+}}@sub_add_umin_commute_add ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[Y]], i8 [[Z]]) -; CHECK-NEXT: [[S:%.*]] = add i8 [[TMP1]], [[X]] +; CHECK-NEXT: [[S:%.*]] = add i8 [[X]], [[TMP1]] ; CHECK-NEXT: ret i8 [[S]] ; %a = add i8 %y, %x @@ -809,7 +809,7 @@ define i8 @sub_add_umin_commute_add_umin(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: define {{[^@]+}}@sub_add_umin_commute_add_umin ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[Y]], i8 [[Z]]) -; CHECK-NEXT: [[S:%.*]] = add i8 [[TMP1]], [[X]] +; CHECK-NEXT: [[S:%.*]] = add i8 [[X]], [[TMP1]] ; CHECK-NEXT: ret i8 [[S]] ; %a = add i8 %y, %x @@ -822,7 +822,7 @@ define <2 x i8> @sub_add_umin_vec(<2 x i8> %x, <2 x i8> %y, <2 x i8> %z) { ; CHECK-LABEL: define {{[^@]+}}@sub_add_umin_vec ; CHECK-SAME: (<2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <2 x i8> [[Z:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> [[Y]], <2 x i8> [[Z]]) -; CHECK-NEXT: [[S:%.*]] = add <2 x i8> [[TMP1]], [[X]] +; CHECK-NEXT: [[S:%.*]] = add <2 x i8> [[X]], [[TMP1]] ; CHECK-NEXT: ret <2 x i8> [[S]] ; %a = add <2 x i8> %x, %y diff --git a/llvm/test/Transforms/InstCombine/sub-not.ll b/llvm/test/Transforms/InstCombine/sub-not.ll index 89ccf5aa3c8f4f..5053319162f0d2 100644 --- 
a/llvm/test/Transforms/InstCombine/sub-not.ll +++ b/llvm/test/Transforms/InstCombine/sub-not.ll @@ -6,7 +6,7 @@ declare void @use(i8) define i8 @sub_not(i8 %x, i8 %y) { ; CHECK-LABEL: @sub_not( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = add i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = add i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[R]] ; %s = sub i8 %x, %y @@ -30,7 +30,7 @@ define i8 @sub_not_extra_use(i8 %x, i8 %y) { define <2 x i8> @sub_not_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @sub_not_vec( ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = add <2 x i8> [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = add <2 x i8> [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %s = sub <2 x i8> %x, %y @@ -41,7 +41,7 @@ define <2 x i8> @sub_not_vec(<2 x i8> %x, <2 x i8> %y) { define i8 @dec_sub(i8 %x, i8 %y) { ; CHECK-LABEL: @dec_sub( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = add i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[R]] ; %s = sub i8 %x, %y @@ -65,7 +65,7 @@ define i8 @dec_sub_extra_use(i8 %x, i8 %y) { define <2 x i8> @dec_sub_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @dec_sub_vec( ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> [[Y:%.*]], -; CHECK-NEXT: [[R:%.*]] = add <2 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = add <2 x i8> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %s = sub <2 x i8> %x, %y @@ -76,7 +76,7 @@ define <2 x i8> @dec_sub_vec(<2 x i8> %x, <2 x i8> %y) { define i8 @sub_inc(i8 %x, i8 %y) { ; CHECK-LABEL: @sub_inc( ; CHECK-NEXT: [[S_NEG:%.*]] = xor i8 [[X:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = add i8 [[S_NEG]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = add i8 [[Y:%.*]], [[S_NEG]] ; CHECK-NEXT: ret i8 [[R]] ; %s = add i8 %x, 1 @@ -100,7 +100,7 @@ define i8 @sub_inc_extra_use(i8 %x, i8 %y) { define <2 x i8> @sub_inc_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @sub_inc_vec( 
; CHECK-NEXT: [[S_NEG:%.*]] = xor <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = add <2 x i8> [[S_NEG]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = add <2 x i8> [[Y:%.*]], [[S_NEG]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %s = add <2 x i8> %x, @@ -111,7 +111,7 @@ define <2 x i8> @sub_inc_vec(<2 x i8> %x, <2 x i8> %y) { define i8 @sub_dec(i8 %x, i8 %y) { ; CHECK-LABEL: @sub_dec( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = add i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[R]] ; %s = add i8 %x, -1 @@ -135,7 +135,7 @@ define i8 @sub_dec_extra_use(i8 %x, i8 %y) { define <2 x i8> @sub_dec_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @sub_dec_vec( ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> [[Y:%.*]], -; CHECK-NEXT: [[R:%.*]] = add <2 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = add <2 x i8> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %s = add <2 x i8> %x, diff --git a/llvm/test/Transforms/InstCombine/sub-of-negatible-inseltpoison.ll b/llvm/test/Transforms/InstCombine/sub-of-negatible-inseltpoison.ll index 76a172302999ac..60607041ad2f90 100644 --- a/llvm/test/Transforms/InstCombine/sub-of-negatible-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/sub-of-negatible-inseltpoison.ll @@ -262,7 +262,7 @@ define i8 @t12(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[T1:%.*]] = sub i8 0, [[Z:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T1]]) ; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y]], [[Z]] -; CHECK-NEXT: [[T3:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[T3:%.*]] = add i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[T3]] ; %t0 = sub i8 0, %y @@ -296,7 +296,7 @@ define i8 @n14(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y]], [[Z]] ; CHECK-NEXT: [[T2:%.*]] = sub i8 0, [[TMP1]] ; CHECK-NEXT: call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[T3:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[T3:%.*]] = add i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[T3]] ; %t0 = sub i8 0, %y 
@@ -399,7 +399,7 @@ define i8 @n16(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @n16( ; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[T1:%.*]] = mul i8 [[T0]], [[Z:%.*]] +; CHECK-NEXT: [[T1:%.*]] = mul i8 [[Z:%.*]], [[T0]] ; CHECK-NEXT: call void @use8(i8 [[T1]]) ; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: ret i8 [[T2]] @@ -535,7 +535,7 @@ define i8 @t20(i8 %x, i16 %y) { ; CHECK-LABEL: @t20( ; CHECK-NEXT: [[T0_NEG:%.*]] = shl i16 42, [[Y:%.*]] ; CHECK-NEXT: [[T1_NEG:%.*]] = trunc i16 [[T0_NEG]] to i8 -; CHECK-NEXT: [[T2:%.*]] = add i8 [[T1_NEG]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = add i8 [[X:%.*]], [[T1_NEG]] ; CHECK-NEXT: ret i8 [[T2]] ; %t0 = shl i16 -42, %y @@ -742,7 +742,7 @@ define i8 @negate_lshr_wrongshift(i8 %x, i8 %y) { define i8 @negate_sext(i8 %x, i1 %y) { ; CHECK-LABEL: @negate_sext( ; CHECK-NEXT: [[T0_NEG:%.*]] = zext i1 [[Y:%.*]] to i8 -; CHECK-NEXT: [[T1:%.*]] = add i8 [[T0_NEG]], [[X:%.*]] +; CHECK-NEXT: [[T1:%.*]] = add i8 [[X:%.*]], [[T0_NEG]] ; CHECK-NEXT: ret i8 [[T1]] ; %t0 = sext i1 %y to i8 @@ -752,7 +752,7 @@ define i8 @negate_sext(i8 %x, i1 %y) { define i8 @negate_zext(i8 %x, i1 %y) { ; CHECK-LABEL: @negate_zext( ; CHECK-NEXT: [[T0_NEG:%.*]] = sext i1 [[Y:%.*]] to i8 -; CHECK-NEXT: [[T1:%.*]] = add i8 [[T0_NEG]], [[X:%.*]] +; CHECK-NEXT: [[T1:%.*]] = add i8 [[X:%.*]], [[T0_NEG]] ; CHECK-NEXT: ret i8 [[T1]] ; %t0 = zext i1 %y to i8 @@ -1009,7 +1009,7 @@ define i8 @negation_of_increment_via_or_with_no_common_bits_set(i8 %x, i8 %y) { ; CHECK-LABEL: @negation_of_increment_via_or_with_no_common_bits_set( ; CHECK-NEXT: [[T0:%.*]] = shl i8 [[Y:%.*]], 1 ; CHECK-NEXT: [[T1_NEG:%.*]] = xor i8 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = add i8 [[T1_NEG]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = add i8 [[X:%.*]], [[T1_NEG]] ; CHECK-NEXT: ret i8 [[T2]] ; %t0 = shl i8 %y, 1 @@ -1312,7 +1312,7 @@ define i8 @negate_nabs(i8 %x, i8 %y) { ; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[X:%.*]] ; 
CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X]], i1 false) -; CHECK-NEXT: [[T3:%.*]] = add i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[T3:%.*]] = add i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[T3]] ; %t0 = sub i8 0, %x diff --git a/llvm/test/Transforms/InstCombine/sub-of-negatible.ll b/llvm/test/Transforms/InstCombine/sub-of-negatible.ll index b2e14ceaca1b08..b19eae4d8f9a41 100644 --- a/llvm/test/Transforms/InstCombine/sub-of-negatible.ll +++ b/llvm/test/Transforms/InstCombine/sub-of-negatible.ll @@ -286,7 +286,7 @@ define i8 @t12(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[T1:%.*]] = sub i8 0, [[Z:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T1]]) ; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y]], [[Z]] -; CHECK-NEXT: [[T3:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[T3:%.*]] = add i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[T3]] ; %t0 = sub i8 0, %y @@ -320,7 +320,7 @@ define i8 @n14(i8 %x, i8 %y, i8 %z) { ; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y]], [[Z]] ; CHECK-NEXT: [[T2:%.*]] = sub i8 0, [[TMP1]] ; CHECK-NEXT: call void @use8(i8 [[T2]]) -; CHECK-NEXT: [[T3:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[T3:%.*]] = add i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[T3]] ; %t0 = sub i8 0, %y @@ -423,7 +423,7 @@ define i8 @n16(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @n16( ; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[T1:%.*]] = mul i8 [[T0]], [[Z:%.*]] +; CHECK-NEXT: [[T1:%.*]] = mul i8 [[Z:%.*]], [[T0]] ; CHECK-NEXT: call void @use8(i8 [[T1]]) ; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] ; CHECK-NEXT: ret i8 [[T2]] @@ -559,7 +559,7 @@ define i8 @t20(i8 %x, i16 %y) { ; CHECK-LABEL: @t20( ; CHECK-NEXT: [[T0_NEG:%.*]] = shl i16 42, [[Y:%.*]] ; CHECK-NEXT: [[T1_NEG:%.*]] = trunc i16 [[T0_NEG]] to i8 -; CHECK-NEXT: [[T2:%.*]] = add i8 [[T1_NEG]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = add i8 [[X:%.*]], [[T1_NEG]] ; CHECK-NEXT: ret i8 [[T2]] ; %t0 = shl i16 -42, 
%y @@ -766,7 +766,7 @@ define i8 @negate_lshr_wrongshift(i8 %x, i8 %y) { define i8 @negate_sext(i8 %x, i1 %y) { ; CHECK-LABEL: @negate_sext( ; CHECK-NEXT: [[T0_NEG:%.*]] = zext i1 [[Y:%.*]] to i8 -; CHECK-NEXT: [[T1:%.*]] = add i8 [[T0_NEG]], [[X:%.*]] +; CHECK-NEXT: [[T1:%.*]] = add i8 [[X:%.*]], [[T0_NEG]] ; CHECK-NEXT: ret i8 [[T1]] ; %t0 = sext i1 %y to i8 @@ -776,7 +776,7 @@ define i8 @negate_sext(i8 %x, i1 %y) { define i8 @negate_zext(i8 %x, i1 %y) { ; CHECK-LABEL: @negate_zext( ; CHECK-NEXT: [[T0_NEG:%.*]] = sext i1 [[Y:%.*]] to i8 -; CHECK-NEXT: [[T1:%.*]] = add i8 [[T0_NEG]], [[X:%.*]] +; CHECK-NEXT: [[T1:%.*]] = add i8 [[X:%.*]], [[T0_NEG]] ; CHECK-NEXT: ret i8 [[T1]] ; %t0 = zext i1 %y to i8 @@ -1033,7 +1033,7 @@ define i8 @negation_of_increment_via_or_with_no_common_bits_set(i8 %x, i8 %y) { ; CHECK-LABEL: @negation_of_increment_via_or_with_no_common_bits_set( ; CHECK-NEXT: [[T0:%.*]] = shl i8 [[Y:%.*]], 1 ; CHECK-NEXT: [[T1_NEG:%.*]] = xor i8 [[T0]], -1 -; CHECK-NEXT: [[T2:%.*]] = add i8 [[T1_NEG]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = add i8 [[X:%.*]], [[T1_NEG]] ; CHECK-NEXT: ret i8 [[T2]] ; %t0 = shl i8 %y, 1 @@ -1071,7 +1071,7 @@ define i8 @negation_of_increment_via_or_common_bits_set(i8 %x, i8 %y) { define i8 @negation_of_increment_via_or_disjoint(i8 %x, i8 %y) { ; CHECK-LABEL: @negation_of_increment_via_or_disjoint( ; CHECK-NEXT: [[T1_NEG:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[T2:%.*]] = add i8 [[T1_NEG]], [[X:%.*]] +; CHECK-NEXT: [[T2:%.*]] = add i8 [[X:%.*]], [[T1_NEG]] ; CHECK-NEXT: ret i8 [[T2]] ; %t1 = or disjoint i8 %y, 1 @@ -1347,7 +1347,7 @@ define i8 @negate_nabs(i8 %x, i8 %y) { ; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[X:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) ; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X]], i1 false) -; CHECK-NEXT: [[T3:%.*]] = add i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[T3:%.*]] = add i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[T3]] ; %t0 = sub i8 0, %x diff --git 
a/llvm/test/Transforms/InstCombine/sub-xor-cmp.ll b/llvm/test/Transforms/InstCombine/sub-xor-cmp.ll index 461c9b0fb1e0c0..acbc29db871e88 100644 --- a/llvm/test/Transforms/InstCombine/sub-xor-cmp.ll +++ b/llvm/test/Transforms/InstCombine/sub-xor-cmp.ll @@ -58,7 +58,7 @@ define i64 @sext_non_bool_xor_sub(i64 %a, i8 %b) { ; CHECK-LABEL: define i64 @sext_non_bool_xor_sub( ; CHECK-SAME: i64 [[A:%.*]], i8 [[B:%.*]]) { ; CHECK-NEXT: [[C:%.*]] = sext i8 [[B]] to i64 -; CHECK-NEXT: [[D:%.*]] = xor i64 [[C]], [[A]] +; CHECK-NEXT: [[D:%.*]] = xor i64 [[A]], [[C]] ; CHECK-NEXT: [[R:%.*]] = sub i64 [[D]], [[C]] ; CHECK-NEXT: ret i64 [[R]] ; @@ -72,7 +72,7 @@ define i64 @sext_non_bool_xor_sub_1(i64 %a, i8 %b) { ; CHECK-LABEL: define i64 @sext_non_bool_xor_sub_1( ; CHECK-SAME: i64 [[A:%.*]], i8 [[B:%.*]]) { ; CHECK-NEXT: [[C:%.*]] = sext i8 [[B]] to i64 -; CHECK-NEXT: [[D:%.*]] = xor i64 [[C]], [[A]] +; CHECK-NEXT: [[D:%.*]] = xor i64 [[A]], [[C]] ; CHECK-NEXT: [[R:%.*]] = sub i64 [[D]], [[C]] ; CHECK-NEXT: ret i64 [[R]] ; @@ -135,9 +135,9 @@ define i64 @xor_multi_uses(i64 %a, i1 %b, i64 %x) { ; CHECK-LABEL: define i64 @xor_multi_uses( ; CHECK-SAME: i64 [[A:%.*]], i1 [[B:%.*]], i64 [[X:%.*]]) { ; CHECK-NEXT: [[C:%.*]] = sext i1 [[B]] to i64 -; CHECK-NEXT: [[D:%.*]] = xor i64 [[C]], [[A]] +; CHECK-NEXT: [[D:%.*]] = xor i64 [[A]], [[C]] ; CHECK-NEXT: [[E:%.*]] = sub i64 [[D]], [[C]] -; CHECK-NEXT: [[F:%.*]] = mul i64 [[D]], [[X]] +; CHECK-NEXT: [[F:%.*]] = mul i64 [[X]], [[D]] ; CHECK-NEXT: [[R:%.*]] = add i64 [[F]], [[E]] ; CHECK-NEXT: ret i64 [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll index cb308ab66b0935..ec88984c49cca6 100644 --- a/llvm/test/Transforms/InstCombine/sub.ll +++ b/llvm/test/Transforms/InstCombine/sub.ll @@ -230,7 +230,7 @@ define i32 @test5(i32 %A, i32 %B, i32 %C) { define i32 @test6(i32 %A, i32 %B) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: [[B_NOT:%.*]] = xor i32 [[B:%.*]], -1 -; CHECK-NEXT: [[D:%.*]] = and 
i32 [[B_NOT]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = and i32 [[A:%.*]], [[B_NOT]] ; CHECK-NEXT: ret i32 [[D]] ; %C = and i32 %A, %B @@ -241,7 +241,7 @@ define i32 @test6(i32 %A, i32 %B) { define i32 @test6commuted(i32 %A, i32 %B) { ; CHECK-LABEL: @test6commuted( ; CHECK-NEXT: [[B_NOT:%.*]] = xor i32 [[B:%.*]], -1 -; CHECK-NEXT: [[D:%.*]] = and i32 [[B_NOT]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = and i32 [[A:%.*]], [[B_NOT]] ; CHECK-NEXT: ret i32 [[D]] ; %C = and i32 %B, %A @@ -686,7 +686,7 @@ define <2 x i32> @test27commutedvecmixed(<2 x i32> %x, <2 x i32> %y) { define i32 @test28(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @test28( ; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[Z:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[SUB:%.*]] = add i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[SUB]] ; %neg = sub i32 0, %z @@ -698,7 +698,7 @@ define i32 @test28(i32 %x, i32 %y, i32 %z) { define i32 @test28commuted(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @test28commuted( ; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[Z:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[SUB:%.*]] = add i32 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i32 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[SUB]] ; %neg = sub i32 0, %z @@ -893,7 +893,7 @@ define i32 @test45commuted(i32 %x, i32 %y) { define i32 @test46(i32 %x, i32 %y) { ; CHECK-LABEL: @test46( ; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[SUB:%.*]] = and i32 [[X_NOT]], [[Y:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = and i32 [[Y:%.*]], [[X_NOT]] ; CHECK-NEXT: ret i32 [[SUB]] ; %or = or i32 %x, %y @@ -904,7 +904,7 @@ define i32 @test46(i32 %x, i32 %y) { define i32 @test46commuted(i32 %x, i32 %y) { ; CHECK-LABEL: @test46commuted( ; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[SUB:%.*]] = and i32 [[X_NOT]], [[Y:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = and i32 [[Y:%.*]], [[X_NOT]] ; CHECK-NEXT: ret i32 [[SUB]] ; %or = or i32 %y, %x @@ -1368,7 +1368,7 @@ define i32 @test71(i32 %A, i32 %B) { 
define <2 x i32> @test72(<2 x i32> %A, <2 x i32> %B) { ; CHECK-LABEL: @test72( ; CHECK-NEXT: [[B_NOT:%.*]] = xor <2 x i32> [[B:%.*]], -; CHECK-NEXT: [[D:%.*]] = and <2 x i32> [[B_NOT]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = and <2 x i32> [[A:%.*]], [[B_NOT]] ; CHECK-NEXT: ret <2 x i32> [[D]] ; %C = or <2 x i32> %A, %B @@ -1460,7 +1460,7 @@ define i8 @sub_add_sub_reassoc(i8 %w, i8 %x, i8 %y, i8 %z) { define <2 x i8> @sub_add_sub_reassoc_commute(<2 x i8> %w, <2 x i8> %x, <2 x i8> %y, <2 x i8> %z) { ; CHECK-LABEL: @sub_add_sub_reassoc_commute( ; CHECK-NEXT: [[D:%.*]] = sdiv <2 x i8> [[Y:%.*]], -; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i8> [[D]], [[W:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i8> [[W:%.*]], [[D]] ; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i8> [[X:%.*]], [[Z:%.*]] ; CHECK-NEXT: [[S2:%.*]] = sub <2 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <2 x i8> [[S2]] @@ -1478,7 +1478,7 @@ define i8 @sub_add_sub_reassoc_twice(i8 %v, i8 %w, i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @sub_add_sub_reassoc_twice( ; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[W:%.*]], [[Z:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X:%.*]], [[V:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = add i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = add i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: [[S3:%.*]] = sub i8 [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret i8 [[S3]] ; @@ -2044,7 +2044,7 @@ define i16 @urem_zext_noundef(i8 noundef %x, i8 %y) { define i8 @mul_sub_common_factor_commute1(i8 %x, i8 %y) { ; CHECK-LABEL: @mul_sub_common_factor_commute1( ; CHECK-NEXT: [[X1:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[A:%.*]] = mul i8 [[X1]], [[X:%.*]] +; CHECK-NEXT: [[A:%.*]] = mul i8 [[X:%.*]], [[X1]] ; CHECK-NEXT: ret i8 [[A]] ; %m = mul nsw i8 %x, %y @@ -2070,7 +2070,7 @@ define <2 x i8> @mul_sub_common_factor_commute2(<2 x i8> %x, <2 x i8> %y) { define i8 @mul_sub_common_factor_commute3(i8 %x, i8 %y) { ; CHECK-LABEL: @mul_sub_common_factor_commute3( ; CHECK-NEXT: [[M1:%.*]] = sub i8 1, [[Y:%.*]] -; CHECK-NEXT: [[A:%.*]] = mul i8 
[[M1]], [[X:%.*]] +; CHECK-NEXT: [[A:%.*]] = mul i8 [[X:%.*]], [[M1]] ; CHECK-NEXT: ret i8 [[A]] ; %m = mul nuw i8 %x, %y @@ -2081,7 +2081,7 @@ define i8 @mul_sub_common_factor_commute3(i8 %x, i8 %y) { define i8 @mul_sub_common_factor_commute4(i8 %x, i8 %y) { ; CHECK-LABEL: @mul_sub_common_factor_commute4( ; CHECK-NEXT: [[M1:%.*]] = sub i8 1, [[Y:%.*]] -; CHECK-NEXT: [[A:%.*]] = mul i8 [[M1]], [[X:%.*]] +; CHECK-NEXT: [[A:%.*]] = mul i8 [[X:%.*]], [[M1]] ; CHECK-NEXT: ret i8 [[A]] ; %m = mul nsw i8 %y, %x @@ -2734,7 +2734,7 @@ if.else: define i1 @sub_infer_nuw_from_domcond_fold3(i16 %xx, i32 range(i32 0, 12) %y) { ; CHECK-LABEL: @sub_infer_nuw_from_domcond_fold3( ; CHECK-NEXT: [[X:%.*]] = zext i16 [[XX:%.*]] to i32 -; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[COND:%.*]] = icmp ugt i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: ; CHECK-NEXT: ret i1 false diff --git a/llvm/test/Transforms/InstCombine/trunc-binop-ext.ll b/llvm/test/Transforms/InstCombine/trunc-binop-ext.ll index e3103906911af9..4593730b8809f4 100644 --- a/llvm/test/Transforms/InstCombine/trunc-binop-ext.ll +++ b/llvm/test/Transforms/InstCombine/trunc-binop-ext.ll @@ -5,7 +5,7 @@ define i16 @narrow_sext_and(i16 %x16, i32 %y32) { ; CHECK-LABEL: define i16 @narrow_sext_and( ; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[Y32]] to i16 -; CHECK-NEXT: [[R:%.*]] = and i16 [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = and i16 [[X16]], [[TMP1]] ; CHECK-NEXT: ret i16 [[R]] ; %x32 = sext i16 %x16 to i32 @@ -18,7 +18,7 @@ define i16 @narrow_zext_and(i16 %x16, i32 %y32) { ; CHECK-LABEL: define i16 @narrow_zext_and( ; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[Y32]] to i16 -; CHECK-NEXT: [[R:%.*]] = and i16 [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = and i16 [[X16]], [[TMP1]] ; CHECK-NEXT: ret i16 [[R]] ; %x32 = zext i16 %x16 to 
i32 @@ -31,7 +31,7 @@ define i16 @narrow_sext_or(i16 %x16, i32 %y32) { ; CHECK-LABEL: define i16 @narrow_sext_or( ; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[Y32]] to i16 -; CHECK-NEXT: [[R:%.*]] = or i16 [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = or i16 [[X16]], [[TMP1]] ; CHECK-NEXT: ret i16 [[R]] ; %x32 = sext i16 %x16 to i32 @@ -44,7 +44,7 @@ define i16 @narrow_zext_or(i16 %x16, i32 %y32) { ; CHECK-LABEL: define i16 @narrow_zext_or( ; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[Y32]] to i16 -; CHECK-NEXT: [[R:%.*]] = or i16 [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = or i16 [[X16]], [[TMP1]] ; CHECK-NEXT: ret i16 [[R]] ; %x32 = zext i16 %x16 to i32 @@ -57,7 +57,7 @@ define i16 @narrow_sext_xor(i16 %x16, i32 %y32) { ; CHECK-LABEL: define i16 @narrow_sext_xor( ; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[Y32]] to i16 -; CHECK-NEXT: [[R:%.*]] = xor i16 [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = xor i16 [[X16]], [[TMP1]] ; CHECK-NEXT: ret i16 [[R]] ; %x32 = sext i16 %x16 to i32 @@ -70,7 +70,7 @@ define i16 @narrow_zext_xor(i16 %x16, i32 %y32) { ; CHECK-LABEL: define i16 @narrow_zext_xor( ; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[Y32]] to i16 -; CHECK-NEXT: [[R:%.*]] = xor i16 [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = xor i16 [[X16]], [[TMP1]] ; CHECK-NEXT: ret i16 [[R]] ; %x32 = zext i16 %x16 to i32 @@ -83,7 +83,7 @@ define i16 @narrow_sext_add(i16 %x16, i32 %y32) { ; CHECK-LABEL: define i16 @narrow_sext_add( ; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[Y32]] to i16 -; CHECK-NEXT: [[R:%.*]] = add i16 [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = add i16 [[X16]], [[TMP1]] ; CHECK-NEXT: ret i16 [[R]] ; %x32 = sext i16 %x16 to i32 @@ -96,7 +96,7 @@ define i16 @narrow_zext_add(i16 %x16, i32 %y32) { ; CHECK-LABEL: define i16 
@narrow_zext_add( ; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[Y32]] to i16 -; CHECK-NEXT: [[R:%.*]] = add i16 [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = add i16 [[X16]], [[TMP1]] ; CHECK-NEXT: ret i16 [[R]] ; %x32 = zext i16 %x16 to i32 @@ -135,7 +135,7 @@ define i16 @narrow_sext_mul(i16 %x16, i32 %y32) { ; CHECK-LABEL: define i16 @narrow_sext_mul( ; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[Y32]] to i16 -; CHECK-NEXT: [[R:%.*]] = mul i16 [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = mul i16 [[X16]], [[TMP1]] ; CHECK-NEXT: ret i16 [[R]] ; %x32 = sext i16 %x16 to i32 @@ -148,7 +148,7 @@ define i16 @narrow_zext_mul(i16 %x16, i32 %y32) { ; CHECK-LABEL: define i16 @narrow_zext_mul( ; CHECK-SAME: i16 [[X16:%.*]], i32 [[Y32:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[Y32]] to i16 -; CHECK-NEXT: [[R:%.*]] = mul i16 [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = mul i16 [[X16]], [[TMP1]] ; CHECK-NEXT: ret i16 [[R]] ; %x32 = zext i16 %x16 to i32 @@ -165,7 +165,7 @@ define <2 x i16> @narrow_sext_and_commute(<2 x i16> %x16, <2 x i32> %y32) { ; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) { ; CHECK-NEXT: [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], ; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16> -; CHECK-NEXT: [[R:%.*]] = and <2 x i16> [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = and <2 x i16> [[X16]], [[TMP1]] ; CHECK-NEXT: ret <2 x i16> [[R]] ; %y32op0 = sdiv <2 x i32> %y32, @@ -180,7 +180,7 @@ define <2 x i16> @narrow_zext_and_commute(<2 x i16> %x16, <2 x i32> %y32) { ; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) { ; CHECK-NEXT: [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], ; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16> -; CHECK-NEXT: [[R:%.*]] = and <2 x i16> [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = and <2 x i16> [[X16]], [[TMP1]] ; CHECK-NEXT: ret <2 x i16> [[R]] ; %y32op0 = sdiv <2 x i32> %y32, @@ -195,7 
+195,7 @@ define <2 x i16> @narrow_sext_or_commute(<2 x i16> %x16, <2 x i32> %y32) { ; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) { ; CHECK-NEXT: [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], ; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16> -; CHECK-NEXT: [[R:%.*]] = or <2 x i16> [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = or <2 x i16> [[X16]], [[TMP1]] ; CHECK-NEXT: ret <2 x i16> [[R]] ; %y32op0 = sdiv <2 x i32> %y32, @@ -210,7 +210,7 @@ define <2 x i16> @narrow_zext_or_commute(<2 x i16> %x16, <2 x i32> %y32) { ; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) { ; CHECK-NEXT: [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], ; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16> -; CHECK-NEXT: [[R:%.*]] = or <2 x i16> [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = or <2 x i16> [[X16]], [[TMP1]] ; CHECK-NEXT: ret <2 x i16> [[R]] ; %y32op0 = sdiv <2 x i32> %y32, @@ -225,7 +225,7 @@ define <2 x i16> @narrow_sext_xor_commute(<2 x i16> %x16, <2 x i32> %y32) { ; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) { ; CHECK-NEXT: [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], ; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16> -; CHECK-NEXT: [[R:%.*]] = xor <2 x i16> [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = xor <2 x i16> [[X16]], [[TMP1]] ; CHECK-NEXT: ret <2 x i16> [[R]] ; %y32op0 = sdiv <2 x i32> %y32, @@ -240,7 +240,7 @@ define <2 x i16> @narrow_zext_xor_commute(<2 x i16> %x16, <2 x i32> %y32) { ; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) { ; CHECK-NEXT: [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], ; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16> -; CHECK-NEXT: [[R:%.*]] = xor <2 x i16> [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = xor <2 x i16> [[X16]], [[TMP1]] ; CHECK-NEXT: ret <2 x i16> [[R]] ; %y32op0 = sdiv <2 x i32> %y32, @@ -255,7 +255,7 @@ define <2 x i16> @narrow_sext_add_commute(<2 x i16> %x16, <2 x i32> %y32) { ; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> 
[[Y32:%.*]]) { ; CHECK-NEXT: [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], ; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16> -; CHECK-NEXT: [[R:%.*]] = add <2 x i16> [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = add <2 x i16> [[X16]], [[TMP1]] ; CHECK-NEXT: ret <2 x i16> [[R]] ; %y32op0 = sdiv <2 x i32> %y32, @@ -270,7 +270,7 @@ define <2 x i16> @narrow_zext_add_commute(<2 x i16> %x16, <2 x i32> %y32) { ; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) { ; CHECK-NEXT: [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], ; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16> -; CHECK-NEXT: [[R:%.*]] = add <2 x i16> [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = add <2 x i16> [[X16]], [[TMP1]] ; CHECK-NEXT: ret <2 x i16> [[R]] ; %y32op0 = sdiv <2 x i32> %y32, @@ -315,7 +315,7 @@ define <2 x i16> @narrow_sext_mul_commute(<2 x i16> %x16, <2 x i32> %y32) { ; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) { ; CHECK-NEXT: [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], ; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16> -; CHECK-NEXT: [[R:%.*]] = mul <2 x i16> [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = mul <2 x i16> [[X16]], [[TMP1]] ; CHECK-NEXT: ret <2 x i16> [[R]] ; %y32op0 = sdiv <2 x i32> %y32, @@ -330,7 +330,7 @@ define <2 x i16> @narrow_zext_mul_commute(<2 x i16> %x16, <2 x i32> %y32) { ; CHECK-SAME: <2 x i16> [[X16:%.*]], <2 x i32> [[Y32:%.*]]) { ; CHECK-NEXT: [[Y32OP0:%.*]] = sdiv <2 x i32> [[Y32]], ; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[Y32OP0]] to <2 x i16> -; CHECK-NEXT: [[R:%.*]] = mul <2 x i16> [[TMP1]], [[X16]] +; CHECK-NEXT: [[R:%.*]] = mul <2 x i16> [[X16]], [[TMP1]] ; CHECK-NEXT: ret <2 x i16> [[R]] ; %y32op0 = sdiv <2 x i32> %y32, diff --git a/llvm/test/Transforms/InstCombine/uaddo.ll b/llvm/test/Transforms/InstCombine/uaddo.ll index c638c0adef055b..9b56dce8b45856 100644 --- a/llvm/test/Transforms/InstCombine/uaddo.ll +++ b/llvm/test/Transforms/InstCombine/uaddo.ll @@ -5,7 +5,7 @@ define i32 
@uaddo_commute1(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @uaddo_commute1( ; CHECK-NEXT: [[NOTY:%.*]] = xor i32 [[Y:%.*]], -1 ; CHECK-NEXT: [[A:%.*]] = add i32 [[X:%.*]], [[Y]] -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[NOTY]], [[X]] +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[X]], [[NOTY]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[C]], i32 [[Z:%.*]], i32 [[A]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -20,7 +20,7 @@ define <2 x i32> @uaddo_commute2(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; CHECK-LABEL: @uaddo_commute2( ; CHECK-NEXT: [[NOTY:%.*]] = xor <2 x i32> [[Y:%.*]], ; CHECK-NEXT: [[A:%.*]] = add <2 x i32> [[Y]], [[X:%.*]] -; CHECK-NEXT: [[C:%.*]] = icmp ult <2 x i32> [[NOTY]], [[X]] +; CHECK-NEXT: [[C:%.*]] = icmp ugt <2 x i32> [[X]], [[NOTY]] ; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[C]], <2 x i32> [[Z:%.*]], <2 x i32> [[A]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; @@ -35,7 +35,7 @@ define i32 @uaddo_commute3(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @uaddo_commute3( ; CHECK-NEXT: [[NOTY:%.*]] = xor i32 [[Y:%.*]], -1 ; CHECK-NEXT: [[A:%.*]] = add i32 [[X:%.*]], [[Y]] -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[NOTY]], [[X]] +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[X]], [[NOTY]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[C]], i32 [[Z:%.*]], i32 [[A]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -50,7 +50,7 @@ define i32 @uaddo_commute4(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @uaddo_commute4( ; CHECK-NEXT: [[NOTY:%.*]] = xor i32 [[Y:%.*]], -1 ; CHECK-NEXT: [[A:%.*]] = add i32 [[Y]], [[X:%.*]] -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[NOTY]], [[X]] +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[X]], [[NOTY]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[C]], i32 [[Z:%.*]], i32 [[A]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -65,7 +65,7 @@ define i32 @uaddo_commute5(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @uaddo_commute5( ; CHECK-NEXT: [[NOTY:%.*]] = xor i32 [[Y:%.*]], -1 ; CHECK-NEXT: [[A:%.*]] = add i32 [[X:%.*]], [[Y]] -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[NOTY]], [[X]] +; CHECK-NEXT: [[C:%.*]] = icmp ugt 
i32 [[X]], [[NOTY]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[C]], i32 [[A]], i32 [[Z:%.*]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -80,7 +80,7 @@ define i32 @uaddo_commute6(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @uaddo_commute6( ; CHECK-NEXT: [[NOTY:%.*]] = xor i32 [[Y:%.*]], -1 ; CHECK-NEXT: [[A:%.*]] = add i32 [[Y]], [[X:%.*]] -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[NOTY]], [[X]] +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[X]], [[NOTY]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[C]], i32 [[A]], i32 [[Z:%.*]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -95,7 +95,7 @@ define i32 @uaddo_commute7(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @uaddo_commute7( ; CHECK-NEXT: [[NOTY:%.*]] = xor i32 [[Y:%.*]], -1 ; CHECK-NEXT: [[A:%.*]] = add i32 [[X:%.*]], [[Y]] -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[NOTY]], [[X]] +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[X]], [[NOTY]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[C]], i32 [[A]], i32 [[Z:%.*]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -110,7 +110,7 @@ define i32 @uaddo_commute8(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @uaddo_commute8( ; CHECK-NEXT: [[NOTY:%.*]] = xor i32 [[Y:%.*]], -1 ; CHECK-NEXT: [[A:%.*]] = add i32 [[Y]], [[X:%.*]] -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[NOTY]], [[X]] +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[X]], [[NOTY]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[C]], i32 [[A]], i32 [[Z:%.*]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -125,7 +125,7 @@ define i32 @uaddo_wrong_pred1(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @uaddo_wrong_pred1( ; CHECK-NEXT: [[NOTY:%.*]] = xor i32 [[Y:%.*]], -1 ; CHECK-NEXT: [[A:%.*]] = add i32 [[X:%.*]], [[Y]] -; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[NOTY]], [[X]] +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[X]], [[NOTY]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[C]], i32 [[Z:%.*]], i32 [[A]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -140,7 +140,7 @@ define i32 @uaddo_wrong_pred2(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @uaddo_wrong_pred2( ; CHECK-NEXT: [[NOTY:%.*]] = xor i32 [[Y:%.*]], -1 ; CHECK-NEXT: [[A:%.*]] = add i32 [[X:%.*]], 
[[Y]] -; CHECK-NEXT: [[C_NOT:%.*]] = icmp ugt i32 [[NOTY]], [[X]] +; CHECK-NEXT: [[C_NOT:%.*]] = icmp ult i32 [[X]], [[NOTY]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[C_NOT]], i32 [[A]], i32 [[Z:%.*]] ; CHECK-NEXT: ret i32 [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/umax-icmp.ll b/llvm/test/Transforms/InstCombine/umax-icmp.ll index 9946f3c390f0f3..b4eea30bfc6af5 100644 --- a/llvm/test/Transforms/InstCombine/umax-icmp.ll +++ b/llvm/test/Transforms/InstCombine/umax-icmp.ll @@ -95,7 +95,7 @@ define i1 @ule_umax2(i32 %x, i32 %y) { define i1 @ule_umax3(i32 %a, i32 %y) { ; CHECK-LABEL: @ule_umax3( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp uge i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp ule i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %x = add i32 %a, 3 ; thwart complexity-based canonicalization @@ -110,7 +110,7 @@ define i1 @ule_umax3(i32 %a, i32 %y) { define i1 @ule_umax4(i32 %a, i32 %y) { ; CHECK-LABEL: @ule_umax4( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp uge i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp ule i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %x = add i32 %a, 3 ; thwart complexity-based canonicalization @@ -207,7 +207,7 @@ define i1 @ugt_umax2(i32 %x, i32 %y) { define i1 @ugt_umax3(i32 %a, i32 %y) { ; CHECK-LABEL: @ugt_umax3( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %x = add i32 %a, 3 ; thwart complexity-based canonicalization @@ -222,7 +222,7 @@ define i1 @ugt_umax3(i32 %a, i32 %y) { define i1 @ugt_umax4(i32 %a, i32 %y) { ; CHECK-LABEL: @ugt_umax4( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %x = add i32 %a, 3 ; thwart 
complexity-based canonicalization diff --git a/llvm/test/Transforms/InstCombine/umin-icmp.ll b/llvm/test/Transforms/InstCombine/umin-icmp.ll index da901c6c5e4847..cb23b2f00d2921 100644 --- a/llvm/test/Transforms/InstCombine/umin-icmp.ll +++ b/llvm/test/Transforms/InstCombine/umin-icmp.ll @@ -95,7 +95,7 @@ define i1 @uge_umin2(i32 %x, i32 %y) { define i1 @uge_umin3(i32 %a, i32 %y) { ; CHECK-LABEL: @uge_umin3( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp ule i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp uge i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %x = add i32 %a, 3 ; thwart complexity-based canonicalization @@ -110,7 +110,7 @@ define i1 @uge_umin3(i32 %a, i32 %y) { define i1 @uge_umin4(i32 %a, i32 %y) { ; CHECK-LABEL: @uge_umin4( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp ule i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp uge i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %x = add i32 %a, 3 ; thwart complexity-based canonicalization @@ -207,7 +207,7 @@ define i1 @ult_umin2(i32 %x, i32 %y) { define i1 @ult_umin3(i32 %a, i32 %y) { ; CHECK-LABEL: @ult_umin3( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %x = add i32 %a, 3 ; thwart complexity-based canonicalization @@ -222,7 +222,7 @@ define i1 @ult_umin3(i32 %a, i32 %y) { define i1 @ult_umin4(i32 %a, i32 %y) { ; CHECK-LABEL: @ult_umin4( ; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 3 -; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i32 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[Y:%.*]], [[X]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %x = add i32 %a, 3 ; thwart complexity-based canonicalization diff --git a/llvm/test/Transforms/InstCombine/unordered-compare-and-ordered.ll b/llvm/test/Transforms/InstCombine/unordered-compare-and-ordered.ll index 
8ab1f130f1cda6..ec015e8ad2aaa0 100644 --- a/llvm/test/Transforms/InstCombine/unordered-compare-and-ordered.ll +++ b/llvm/test/Transforms/InstCombine/unordered-compare-and-ordered.ll @@ -360,7 +360,7 @@ define i1 @fcmp_ord_and_fneg_ueq(half %x, half %y) { ; CHECK-LABEL: @fcmp_ord_and_fneg_ueq( ; CHECK-NEXT: [[FNEG_X:%.*]] = fneg half [[X:%.*]] ; CHECK-NEXT: [[ORD:%.*]] = fcmp ord half [[X]], 0xH0000 -; CHECK-NEXT: [[UEQ:%.*]] = fcmp ueq half [[FNEG_X]], [[Y:%.*]] +; CHECK-NEXT: [[UEQ:%.*]] = fcmp ueq half [[Y:%.*]], [[FNEG_X]] ; CHECK-NEXT: [[AND:%.*]] = and i1 [[ORD]], [[UEQ]] ; CHECK-NEXT: ret i1 [[AND]] ; @@ -389,7 +389,7 @@ define i1 @fcmp_ord_fneg_and_fneg_ueq(half %x, half %y) { ; CHECK-LABEL: @fcmp_ord_fneg_and_fneg_ueq( ; CHECK-NEXT: [[FNEG_X:%.*]] = fneg half [[X:%.*]] ; CHECK-NEXT: [[ORD:%.*]] = fcmp ord half [[X]], 0xH0000 -; CHECK-NEXT: [[UEQ:%.*]] = fcmp ueq half [[FNEG_X]], [[Y:%.*]] +; CHECK-NEXT: [[UEQ:%.*]] = fcmp ueq half [[Y:%.*]], [[FNEG_X]] ; CHECK-NEXT: [[AND:%.*]] = and i1 [[ORD]], [[UEQ]] ; CHECK-NEXT: ret i1 [[AND]] ; @@ -405,7 +405,7 @@ define i1 @fcmp_ord_and_fneg_fabs_ueq(half %x, half %y) { ; CHECK-NEXT: [[FABS_X:%.*]] = call half @llvm.fabs.f16(half [[X:%.*]]) ; CHECK-NEXT: [[FNEG_FABS_X:%.*]] = fneg half [[FABS_X]] ; CHECK-NEXT: [[ORD:%.*]] = fcmp ord half [[X]], 0xH0000 -; CHECK-NEXT: [[UEQ:%.*]] = fcmp ueq half [[FNEG_FABS_X]], [[Y:%.*]] +; CHECK-NEXT: [[UEQ:%.*]] = fcmp ueq half [[Y:%.*]], [[FNEG_FABS_X]] ; CHECK-NEXT: [[AND:%.*]] = and i1 [[ORD]], [[UEQ]] ; CHECK-NEXT: ret i1 [[AND]] ; @@ -451,7 +451,7 @@ define i1 @fcmp_ord_and_copysign_ueq_commute(half %x, half %y, half %z) { ; CHECK-LABEL: @fcmp_ord_and_copysign_ueq_commute( ; CHECK-NEXT: [[COPYSIGN_X_Y:%.*]] = call half @llvm.copysign.f16(half [[X:%.*]], half [[Z:%.*]]) ; CHECK-NEXT: [[ORD:%.*]] = fcmp ord half [[X]], 0xH0000 -; CHECK-NEXT: [[UEQ:%.*]] = fcmp ueq half [[COPYSIGN_X_Y]], [[Y:%.*]] +; CHECK-NEXT: [[UEQ:%.*]] = fcmp ueq half [[Y:%.*]], [[COPYSIGN_X_Y]] ; 
CHECK-NEXT: [[AND:%.*]] = and i1 [[ORD]], [[UEQ]] ; CHECK-NEXT: ret i1 [[AND]] ; diff --git a/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check-via-add.ll b/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check-via-add.ll index c5be9a7b769ce4..5a0d283ff8bb66 100644 --- a/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check-via-add.ll +++ b/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check-via-add.ll @@ -108,7 +108,7 @@ define i1 @t5_commutative(i8 %x) { define i1 @t6_no_extrause(i8 %x, i8 %y) { ; CHECK-LABEL: @t6_no_extrause( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = add i8 %x, %y diff --git a/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check-via-xor.ll b/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check-via-xor.ll index 1b41f609705ef9..17b32670ae9d7b 100644 --- a/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check-via-xor.ll +++ b/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check-via-xor.ll @@ -15,7 +15,7 @@ define i1 @t0_basic(i8 %x, i8 %y) { ; CHECK-LABEL: @t0_basic( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -28,7 +28,7 @@ define <2 x i1> @t1_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @t1_vec( ; CHECK-NEXT: [[T0:%.*]] = xor <2 x i8> [[Y:%.*]], ; CHECK-NEXT: call void @use2x8(<2 x i8> [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp uge <2 x i8> [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule <2 x i8> [[X:%.*]], [[T0]] ; CHECK-NEXT: ret <2 x i1> [[R]] ; %t0 = xor <2 x i8> %y, @@ -61,7 +61,7 @@ define i1 @t2_commutative(i8 %y) { define i1 
@t3_no_extrause(i8 %x, i8 %y) { ; CHECK-LABEL: @t3_no_extrause( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -75,7 +75,7 @@ define i1 @n4_wrong_pred0(i8 %x, i8 %y) { ; CHECK-LABEL: @n4_wrong_pred0( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -88,7 +88,7 @@ define i1 @n5_wrong_pred1(i8 %x, i8 %y) { ; CHECK-LABEL: @n5_wrong_pred1( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -101,7 +101,7 @@ define i1 @n6_wrong_pred2(i8 %x, i8 %y) { ; CHECK-LABEL: @n6_wrong_pred2( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -114,7 +114,7 @@ define i1 @n7_wrong_pred3(i8 %x, i8 %y) { ; CHECK-LABEL: @n7_wrong_pred3( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -127,7 +127,7 @@ define i1 @n8_wrong_pred4(i8 %x, i8 %y) { ; CHECK-LABEL: @n8_wrong_pred4( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp sgt i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ 
-140,7 +140,7 @@ define i1 @n9_wrong_pred5(i8 %x, i8 %y) { ; CHECK-LABEL: @n9_wrong_pred5( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp sge i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -153,7 +153,7 @@ define i1 @n10_wrong_pred6(i8 %x, i8 %y) { ; CHECK-LABEL: @n10_wrong_pred6( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp sgt i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -166,7 +166,7 @@ define i1 @n11_wrong_pred7(i8 %x, i8 %y) { ; CHECK-LABEL: @n11_wrong_pred7( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp sge i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 diff --git a/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check.ll b/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check.ll index e7120a7d01cfaa..677ef47456c013 100644 --- a/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check.ll +++ b/llvm/test/Transforms/InstCombine/unsigned-add-lack-of-overflow-check.ll @@ -11,7 +11,7 @@ define i1 @t0_basic(i8 %x, i8 %y) { ; CHECK-LABEL: @t0_basic( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = add i8 %x, %y @@ -22,7 +22,7 @@ define i1 @t0_basic(i8 %x, i8 %y) { define <2 x i1> @t1_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @t1_vec( ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> [[Y:%.*]], -; CHECK-NEXT: [[R:%.*]] = icmp uge <2 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule <2 x i8> [[X:%.*]], 
[[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[R]] ; %t0 = add <2 x i8> %x, %y @@ -35,7 +35,7 @@ define <2 x i1> @t1_vec(<2 x i8> %x, <2 x i8> %y) { define i1 @t2_symmetry(i8 %x, i8 %y) { ; CHECK-LABEL: @t2_symmetry( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = add i8 %x, %y @@ -49,7 +49,7 @@ define i1 @t3_commutative(i8 %x) { ; CHECK-LABEL: @t3_commutative( ; CHECK-NEXT: [[Y:%.*]] = call i8 @gen8() ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %y = call i8 @gen8() @@ -61,7 +61,7 @@ define i1 @t3_commutative(i8 %x) { define i1 @t4_commutative(i8 %x, i8 %y) { ; CHECK-LABEL: @t4_commutative( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = add i8 %x, %y @@ -73,7 +73,7 @@ define i1 @t5_commutative(i8 %x) { ; CHECK-LABEL: @t5_commutative( ; CHECK-NEXT: [[Y:%.*]] = call i8 @gen8() ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %y = call i8 @gen8() diff --git a/llvm/test/Transforms/InstCombine/unsigned-add-overflow-check-via-add.ll b/llvm/test/Transforms/InstCombine/unsigned-add-overflow-check-via-add.ll index 23b89b7c1e65f6..bfdcb8343f2d97 100644 --- a/llvm/test/Transforms/InstCombine/unsigned-add-overflow-check-via-add.ll +++ b/llvm/test/Transforms/InstCombine/unsigned-add-overflow-check-via-add.ll @@ -75,7 +75,7 @@ define i1 @t4_commutative(i8 %x, i8 %y) { ; CHECK-LABEL: @t4_commutative( ; CHECK-NEXT: [[T0:%.*]] = add i8 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: call void @use8(i8 [[T0]]) 
-; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[T0]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[Y]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = add i8 %x, %y @@ -104,7 +104,7 @@ define i1 @t5_commutative(i8 %x) { define i1 @t6_no_extrause(i8 %x, i8 %y) { ; CHECK-LABEL: @t6_no_extrause( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = add i8 %x, %y diff --git a/llvm/test/Transforms/InstCombine/unsigned-add-overflow-check-via-xor.ll b/llvm/test/Transforms/InstCombine/unsigned-add-overflow-check-via-xor.ll index 646bd635807a76..457a0e594b6303 100644 --- a/llvm/test/Transforms/InstCombine/unsigned-add-overflow-check-via-xor.ll +++ b/llvm/test/Transforms/InstCombine/unsigned-add-overflow-check-via-xor.ll @@ -15,7 +15,7 @@ define i1 @t0_basic(i8 %x, i8 %y) { ; CHECK-LABEL: @t0_basic( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -28,7 +28,7 @@ define <2 x i1> @t1_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @t1_vec( ; CHECK-NEXT: [[T0:%.*]] = xor <2 x i8> [[Y:%.*]], ; CHECK-NEXT: call void @use2x8(<2 x i8> [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp ult <2 x i8> [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt <2 x i8> [[X:%.*]], [[T0]] ; CHECK-NEXT: ret <2 x i1> [[R]] ; %t0 = xor <2 x i8> %y, @@ -61,7 +61,7 @@ define i1 @t2_commutative(i8 %y) { define i1 @t3_no_extrause(i8 %x, i8 %y) { ; CHECK-LABEL: @t3_no_extrause( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -75,7 +75,7 @@ define i1 @n4_wrong_pred0(i8 %x, i8 %y) { ; CHECK-LABEL: @n4_wrong_pred0( 
; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -88,7 +88,7 @@ define i1 @n5_wrong_pred1(i8 %x, i8 %y) { ; CHECK-LABEL: @n5_wrong_pred1( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -101,7 +101,7 @@ define i1 @n6_wrong_pred2(i8 %x, i8 %y) { ; CHECK-LABEL: @n6_wrong_pred2( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -114,7 +114,7 @@ define i1 @n7_wrong_pred3(i8 %x, i8 %y) { ; CHECK-LABEL: @n7_wrong_pred3( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -127,7 +127,7 @@ define i1 @n8_wrong_pred4(i8 %x, i8 %y) { ; CHECK-LABEL: @n8_wrong_pred4( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp sgt i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -140,7 +140,7 @@ define i1 @n9_wrong_pred5(i8 %x, i8 %y) { ; CHECK-LABEL: @n9_wrong_pred5( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp sge i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -153,7 +153,7 @@ 
define i1 @n10_wrong_pred6(i8 %x, i8 %y) { ; CHECK-LABEL: @n10_wrong_pred6( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp sgt i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 @@ -166,7 +166,7 @@ define i1 @n11_wrong_pred7(i8 %x, i8 %y) { ; CHECK-LABEL: @n11_wrong_pred7( ; CHECK-NEXT: [[T0:%.*]] = xor i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @use8(i8 [[T0]]) -; CHECK-NEXT: [[R:%.*]] = icmp sge i8 [[T0]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X:%.*]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = xor i8 %y, -1 diff --git a/llvm/test/Transforms/InstCombine/unsigned-add-overflow-check.ll b/llvm/test/Transforms/InstCombine/unsigned-add-overflow-check.ll index 3533c6a54a22ab..94966a1eba3289 100644 --- a/llvm/test/Transforms/InstCombine/unsigned-add-overflow-check.ll +++ b/llvm/test/Transforms/InstCombine/unsigned-add-overflow-check.ll @@ -11,7 +11,7 @@ define i1 @t0_basic(i8 %x, i8 %y) { ; CHECK-LABEL: @t0_basic( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = add i8 %x, %y @@ -22,7 +22,7 @@ define i1 @t0_basic(i8 %x, i8 %y) { define <2 x i1> @t1_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @t1_vec( ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> [[Y:%.*]], -; CHECK-NEXT: [[R:%.*]] = icmp ult <2 x i8> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt <2 x i8> [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret <2 x i1> [[R]] ; %t0 = add <2 x i8> %x, %y @@ -35,7 +35,7 @@ define <2 x i1> @t1_vec(<2 x i8> %x, <2 x i8> %y) { define i1 @t2_symmetry(i8 %x, i8 %y) { ; CHECK-LABEL: @t2_symmetry( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret 
i1 [[R]] ; %t0 = add i8 %x, %y @@ -49,7 +49,7 @@ define i1 @t3_commutative(i8 %x) { ; CHECK-LABEL: @t3_commutative( ; CHECK-NEXT: [[Y:%.*]] = call i8 @gen8() ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %y = call i8 @gen8() @@ -61,7 +61,7 @@ define i1 @t3_commutative(i8 %x) { define i1 @t4_commutative(i8 %x, i8 %y) { ; CHECK-LABEL: @t4_commutative( ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = add i8 %x, %y @@ -73,7 +73,7 @@ define i1 @t5_commutative(i8 %x) { ; CHECK-LABEL: @t5_commutative( ; CHECK-NEXT: [[Y:%.*]] = call i8 @gen8() ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[Y]], -1 -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X:%.*]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %y = call i8 @gen8() diff --git a/llvm/test/Transforms/InstCombine/unsigned-sub-lack-of-overflow-check.ll b/llvm/test/Transforms/InstCombine/unsigned-sub-lack-of-overflow-check.ll index 500d61ac1b111e..e844b321830a16 100644 --- a/llvm/test/Transforms/InstCombine/unsigned-sub-lack-of-overflow-check.ll +++ b/llvm/test/Transforms/InstCombine/unsigned-sub-lack-of-overflow-check.ll @@ -30,7 +30,7 @@ define <2 x i1> @t1_vec(<2 x i8> %x, <2 x i8> %y) { define i1 @t2_commutative(i8 %x, i8 %y) { ; CHECK-LABEL: @t2_commutative( -; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = sub i8 %x, %y diff --git a/llvm/test/Transforms/InstCombine/unsigned-sub-overflow-check.ll b/llvm/test/Transforms/InstCombine/unsigned-sub-overflow-check.ll index 5b273026dafe72..5f37b1d962345e 100644 --- a/llvm/test/Transforms/InstCombine/unsigned-sub-overflow-check.ll +++ 
b/llvm/test/Transforms/InstCombine/unsigned-sub-overflow-check.ll @@ -30,7 +30,7 @@ define <2 x i1> @t1_vec(<2 x i8> %x, <2 x i8> %y) { define i1 @t2_commutative(i8 %x, i8 %y) { ; CHECK-LABEL: @t2_commutative( -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = sub i8 %x, %y diff --git a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll index a6d083276cbb55..1fd7903307cef4 100644 --- a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll +++ b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll @@ -1163,7 +1163,7 @@ define i4 @common_binop_demand_via_extelt_op0_mismatch_elt1(<2 x i4> %x, <2 x i4 define <2 x i8> @common_binop_demand_via_splat_mask_poison(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @common_binop_demand_via_splat_mask_poison( ; CHECK-NEXT: [[YSPLAT:%.*]] = shufflevector <2 x i8> [[Y:%.*]], <2 x i8> poison, <2 x i32> -; CHECK-NEXT: [[VV:%.*]] = add <2 x i8> [[YSPLAT]], [[X:%.*]] +; CHECK-NEXT: [[VV:%.*]] = add <2 x i8> [[X:%.*]], [[YSPLAT]] ; CHECK-NEXT: [[MSPLAT:%.*]] = shufflevector <2 x i8> [[VV]], <2 x i8> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[RES:%.*]] = add <2 x i8> [[VV]], [[MSPLAT]] ; CHECK-NEXT: ret <2 x i8> [[RES]] @@ -1179,7 +1179,7 @@ define <2 x i8> @common_binop_demand_via_splat_mask_poison(<2 x i8> %x, <2 x i8> define <2 x i8> @common_binop_demand_via_splat_mask_poison_2(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @common_binop_demand_via_splat_mask_poison_2( ; CHECK-NEXT: [[YSPLAT:%.*]] = shufflevector <2 x i8> [[Y:%.*]], <2 x i8> poison, <2 x i32> -; CHECK-NEXT: [[VV:%.*]] = add <2 x i8> [[YSPLAT]], [[X:%.*]] +; CHECK-NEXT: [[VV:%.*]] = add <2 x i8> [[X:%.*]], [[YSPLAT]] ; CHECK-NEXT: [[M:%.*]] = add <2 x i8> [[X]], [[Y]] ; CHECK-NEXT: [[MSPLAT:%.*]] = shufflevector <2 x i8> [[M]], <2 x i8> [[Y]], <2 x i32> ; CHECK-NEXT: [[RES:%.*]] = add <2 x i8> [[VV]], 
[[MSPLAT]] @@ -1196,7 +1196,7 @@ define <2 x i8> @common_binop_demand_via_splat_mask_poison_2(<2 x i8> %x, <2 x i define <2 x i8> @common_binop_demand_via_splat_mask_poison_3(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @common_binop_demand_via_splat_mask_poison_3( ; CHECK-NEXT: [[YSPLAT:%.*]] = shufflevector <2 x i8> [[Y:%.*]], <2 x i8> poison, <2 x i32> -; CHECK-NEXT: [[VV:%.*]] = add <2 x i8> [[YSPLAT]], [[X:%.*]] +; CHECK-NEXT: [[VV:%.*]] = add <2 x i8> [[X:%.*]], [[YSPLAT]] ; CHECK-NEXT: [[M:%.*]] = add <2 x i8> [[X]], [[Y]] ; CHECK-NEXT: [[MSPLAT:%.*]] = shufflevector <2 x i8> [[M]], <2 x i8> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[RES:%.*]] = add <2 x i8> [[VV]], [[MSPLAT]] diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll index a9cdc8bd202476..0f233fbb4729e6 100644 --- a/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll @@ -1611,7 +1611,7 @@ define <2 x float> @splat_assoc_fmul(<2 x float> %x, <2 x float> %y) { define <3 x i8> @splat_assoc_mul(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) { ; CHECK-LABEL: @splat_assoc_mul( -; CHECK-NEXT: [[TMP1:%.*]] = mul <3 x i8> [[Z:%.*]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = mul <3 x i8> [[X:%.*]], [[Z:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i8> [[TMP1]], <3 x i8> poison, <3 x i32> ; CHECK-NEXT: [[R:%.*]] = mul <3 x i8> [[TMP2]], [[Y:%.*]] ; CHECK-NEXT: ret <3 x i8> [[R]] @@ -1625,7 +1625,7 @@ define <3 x i8> @splat_assoc_mul(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) { define <3 x i8> @splat_assoc_mul_undef_elt1(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) { ; CHECK-LABEL: @splat_assoc_mul_undef_elt1( -; CHECK-NEXT: [[TMP1:%.*]] = mul <3 x i8> [[Z:%.*]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = mul <3 x i8> [[X:%.*]], [[Z:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i8> [[TMP1]], <3 x i8> poison, <3 x i32> ; CHECK-NEXT: [[R:%.*]] = mul <3 x 
i8> [[TMP2]], [[Y:%.*]] ; CHECK-NEXT: ret <3 x i8> [[R]] @@ -1641,7 +1641,7 @@ define <3 x i8> @splat_assoc_mul_undef_elt2(<3 x i8> %x, <3 x i8> %y, <3 x i8> % ; CHECK-LABEL: @splat_assoc_mul_undef_elt2( ; CHECK-NEXT: [[SPLATX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> ; CHECK-NEXT: [[SPLATZ:%.*]] = shufflevector <3 x i8> [[Z:%.*]], <3 x i8> poison, <3 x i32> -; CHECK-NEXT: [[A:%.*]] = mul nsw <3 x i8> [[SPLATZ]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = mul nsw <3 x i8> [[Y:%.*]], [[SPLATZ]] ; CHECK-NEXT: [[R:%.*]] = mul nuw nsw <3 x i8> [[A]], [[SPLATX]] ; CHECK-NEXT: ret <3 x i8> [[R]] ; @@ -1654,7 +1654,7 @@ define <3 x i8> @splat_assoc_mul_undef_elt2(<3 x i8> %x, <3 x i8> %y, <3 x i8> % define <3 x i8> @splat_assoc_mul_undef_elt_at_splat_index1(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) { ; CHECK-LABEL: @splat_assoc_mul_undef_elt_at_splat_index1( -; CHECK-NEXT: [[TMP1:%.*]] = mul <3 x i8> [[Z:%.*]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = mul <3 x i8> [[X:%.*]], [[Z:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i8> [[TMP1]], <3 x i8> poison, <3 x i32> ; CHECK-NEXT: [[R:%.*]] = mul <3 x i8> [[TMP2]], [[Y:%.*]] ; CHECK-NEXT: ret <3 x i8> [[R]] @@ -1670,7 +1670,7 @@ define <3 x i8> @splat_assoc_mul_undef_elt_at_splat_index2(<3 x i8> %x, <3 x i8> ; CHECK-LABEL: @splat_assoc_mul_undef_elt_at_splat_index2( ; CHECK-NEXT: [[SPLATX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> ; CHECK-NEXT: [[SPLATZ:%.*]] = shufflevector <3 x i8> [[Z:%.*]], <3 x i8> poison, <3 x i32> -; CHECK-NEXT: [[A:%.*]] = mul nsw <3 x i8> [[SPLATZ]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = mul nsw <3 x i8> [[Y:%.*]], [[SPLATZ]] ; CHECK-NEXT: [[R:%.*]] = mul nuw nsw <3 x i8> [[A]], [[SPLATX]] ; CHECK-NEXT: ret <3 x i8> [[R]] ; @@ -1687,7 +1687,7 @@ define <3 x i8> @splat_assoc_or(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) { ; CHECK-LABEL: @splat_assoc_or( ; CHECK-NEXT: [[SPLATX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> ; 
CHECK-NEXT: [[SPLATZ:%.*]] = shufflevector <3 x i8> [[Z:%.*]], <3 x i8> poison, <3 x i32> -; CHECK-NEXT: [[A:%.*]] = or <3 x i8> [[SPLATZ]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = or <3 x i8> [[Y:%.*]], [[SPLATZ]] ; CHECK-NEXT: [[R:%.*]] = or <3 x i8> [[A]], [[SPLATX]] ; CHECK-NEXT: ret <3 x i8> [[R]] ; @@ -1750,7 +1750,7 @@ define <3 x i32> @splat_assoc_and(<4 x i32> %x, <3 x i32> %y) { define <5 x i32> @splat_assoc_xor(<4 x i32> %x, <5 x i32> %y) { ; CHECK-LABEL: @splat_assoc_xor( ; CHECK-NEXT: [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <5 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = xor <5 x i32> [[SPLATX]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor <5 x i32> [[Y:%.*]], [[SPLATX]] ; CHECK-NEXT: [[R:%.*]] = xor <5 x i32> [[TMP1]], ; CHECK-NEXT: ret <5 x i32> [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle.ll b/llvm/test/Transforms/InstCombine/vec_shuffle.ll index 8c91efb473faec..75a84e51279b80 100644 --- a/llvm/test/Transforms/InstCombine/vec_shuffle.ll +++ b/llvm/test/Transforms/InstCombine/vec_shuffle.ll @@ -1616,7 +1616,7 @@ define <2 x float> @splat_assoc_fmul(<2 x float> %x, <2 x float> %y) { define <3 x i8> @splat_assoc_mul(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) { ; CHECK-LABEL: @splat_assoc_mul( -; CHECK-NEXT: [[TMP1:%.*]] = mul <3 x i8> [[Z:%.*]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = mul <3 x i8> [[X:%.*]], [[Z:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i8> [[TMP1]], <3 x i8> poison, <3 x i32> ; CHECK-NEXT: [[R:%.*]] = mul <3 x i8> [[TMP2]], [[Y:%.*]] ; CHECK-NEXT: ret <3 x i8> [[R]] @@ -1630,7 +1630,7 @@ define <3 x i8> @splat_assoc_mul(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) { define <3 x i8> @splat_assoc_mul_undef_elt1(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) { ; CHECK-LABEL: @splat_assoc_mul_undef_elt1( -; CHECK-NEXT: [[TMP1:%.*]] = mul <3 x i8> [[Z:%.*]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = mul <3 x i8> [[X:%.*]], [[Z:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i8> 
[[TMP1]], <3 x i8> poison, <3 x i32> ; CHECK-NEXT: [[R:%.*]] = mul <3 x i8> [[TMP2]], [[Y:%.*]] ; CHECK-NEXT: ret <3 x i8> [[R]] @@ -1646,7 +1646,7 @@ define <3 x i8> @splat_assoc_mul_undef_elt2(<3 x i8> %x, <3 x i8> %y, <3 x i8> % ; CHECK-LABEL: @splat_assoc_mul_undef_elt2( ; CHECK-NEXT: [[SPLATX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> ; CHECK-NEXT: [[SPLATZ:%.*]] = shufflevector <3 x i8> [[Z:%.*]], <3 x i8> poison, <3 x i32> -; CHECK-NEXT: [[A:%.*]] = mul nsw <3 x i8> [[SPLATZ]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = mul nsw <3 x i8> [[Y:%.*]], [[SPLATZ]] ; CHECK-NEXT: [[R:%.*]] = mul nuw nsw <3 x i8> [[A]], [[SPLATX]] ; CHECK-NEXT: ret <3 x i8> [[R]] ; @@ -1659,7 +1659,7 @@ define <3 x i8> @splat_assoc_mul_undef_elt2(<3 x i8> %x, <3 x i8> %y, <3 x i8> % define <3 x i8> @splat_assoc_mul_undef_elt_at_splat_index1(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) { ; CHECK-LABEL: @splat_assoc_mul_undef_elt_at_splat_index1( -; CHECK-NEXT: [[TMP1:%.*]] = mul <3 x i8> [[Z:%.*]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = mul <3 x i8> [[X:%.*]], [[Z:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i8> [[TMP1]], <3 x i8> poison, <3 x i32> ; CHECK-NEXT: [[R:%.*]] = mul <3 x i8> [[TMP2]], [[Y:%.*]] ; CHECK-NEXT: ret <3 x i8> [[R]] @@ -1675,7 +1675,7 @@ define <3 x i8> @splat_assoc_mul_undef_elt_at_splat_index2(<3 x i8> %x, <3 x i8> ; CHECK-LABEL: @splat_assoc_mul_undef_elt_at_splat_index2( ; CHECK-NEXT: [[SPLATX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> ; CHECK-NEXT: [[SPLATZ:%.*]] = shufflevector <3 x i8> [[Z:%.*]], <3 x i8> poison, <3 x i32> -; CHECK-NEXT: [[A:%.*]] = mul nsw <3 x i8> [[SPLATZ]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = mul nsw <3 x i8> [[Y:%.*]], [[SPLATZ]] ; CHECK-NEXT: [[R:%.*]] = mul nuw nsw <3 x i8> [[A]], [[SPLATX]] ; CHECK-NEXT: ret <3 x i8> [[R]] ; @@ -1692,7 +1692,7 @@ define <3 x i8> @splat_assoc_or(<3 x i8> %x, <3 x i8> %y, <3 x i8> %z) { ; CHECK-LABEL: @splat_assoc_or( ; CHECK-NEXT: [[SPLATX:%.*]] = 
shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> ; CHECK-NEXT: [[SPLATZ:%.*]] = shufflevector <3 x i8> [[Z:%.*]], <3 x i8> poison, <3 x i32> -; CHECK-NEXT: [[A:%.*]] = or <3 x i8> [[SPLATZ]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = or <3 x i8> [[Y:%.*]], [[SPLATZ]] ; CHECK-NEXT: [[R:%.*]] = or <3 x i8> [[A]], [[SPLATX]] ; CHECK-NEXT: ret <3 x i8> [[R]] ; @@ -1755,7 +1755,7 @@ define <3 x i32> @splat_assoc_and(<4 x i32> %x, <3 x i32> %y) { define <5 x i32> @splat_assoc_xor(<4 x i32> %x, <5 x i32> %y) { ; CHECK-LABEL: @splat_assoc_xor( ; CHECK-NEXT: [[SPLATX:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <5 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = xor <5 x i32> [[SPLATX]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = xor <5 x i32> [[Y:%.*]], [[SPLATX]] ; CHECK-NEXT: [[R:%.*]] = xor <5 x i32> [[TMP1]], ; CHECK-NEXT: ret <5 x i32> [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/vector-reverse.ll b/llvm/test/Transforms/InstCombine/vector-reverse.ll index a1a6ee949a1389..c9c68d2241b345 100644 --- a/llvm/test/Transforms/InstCombine/vector-reverse.ll +++ b/llvm/test/Transforms/InstCombine/vector-reverse.ll @@ -250,7 +250,7 @@ define @icmp_reverse_splat_RHS( %a, i32 %b) ; CHECK-LABEL: @icmp_reverse_splat_RHS( ; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement poison, i32 [[B:%.*]], i64 0 ; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector [[B_INSERT]], poison, zeroinitializer -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt [[B_SPLAT]], [[A:%.*]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt [[A:%.*]], [[B_SPLAT]] ; CHECK-NEXT: [[CMP:%.*]] = call @llvm.vector.reverse.nxv4i1( [[CMP1]]) ; CHECK-NEXT: ret [[CMP]] ; diff --git a/llvm/test/Transforms/InstCombine/vector-xor.ll b/llvm/test/Transforms/InstCombine/vector-xor.ll index 5c96f1a691ed03..13894ef85b5da8 100644 --- a/llvm/test/Transforms/InstCombine/vector-xor.ll +++ b/llvm/test/Transforms/InstCombine/vector-xor.ll @@ -6,7 +6,7 @@ define <4 x i32> @test_v4i32_xor_repeated_and_0(<4 x i32> %a, <4 x i32> %b, <4 x 
i32> %c) { ; CHECK-LABEL: @test_v4i32_xor_repeated_and_0( ; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i32> [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret <4 x i32> [[TMP2]] ; %1 = and <4 x i32> %a, %b @@ -18,7 +18,7 @@ define <4 x i32> @test_v4i32_xor_repeated_and_0(<4 x i32> %a, <4 x i32> %b, <4 x define <4 x i32> @test_v4i32_xor_repeated_and_1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: @test_v4i32_xor_repeated_and_1( ; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i32> [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret <4 x i32> [[TMP2]] ; %1 = and <4 x i32> %a, %b @@ -69,7 +69,7 @@ define <4 x i32> @test_v4i32_xor_bswap_const_poison(<4 x i32> %a0) { define <4 x i32> @test_v4i32_demorgan_and(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: @test_v4i32_demorgan_and( ; CHECK-NEXT: [[Y_NOT:%.*]] = xor <4 x i32> [[Y:%.*]], -; CHECK-NEXT: [[TMP1:%.*]] = or <4 x i32> [[Y_NOT]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = or <4 x i32> [[X:%.*]], [[Y_NOT]] ; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; %1 = xor <4 x i32> , %x @@ -83,7 +83,7 @@ define <4 x i32> @test_v4i32_demorgan_and(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @test_v4i32_demorgan_or(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: @test_v4i32_demorgan_or( ; CHECK-NEXT: [[Y_NOT:%.*]] = xor <4 x i32> [[Y:%.*]], -; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[Y_NOT]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[X:%.*]], [[Y_NOT]] ; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; %1 = xor <4 x i32> , %x diff --git a/llvm/test/Transforms/InstCombine/widenable-conditions.ll b/llvm/test/Transforms/InstCombine/widenable-conditions.ll index 0e377c9fa48628..46a93580e9c78c 100644 --- a/llvm/test/Transforms/InstCombine/widenable-conditions.ll +++ 
b/llvm/test/Transforms/InstCombine/widenable-conditions.ll @@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu" define i1 @test1(i1 %a, i1 %b) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: [[WC:%.*]] = call i1 @llvm.experimental.widenable.condition() -; CHECK-NEXT: [[LHS:%.*]] = and i1 [[WC]], [[B:%.*]] +; CHECK-NEXT: [[LHS:%.*]] = and i1 [[B:%.*]], [[WC]] ; CHECK-NEXT: [[AND:%.*]] = and i1 [[LHS]], [[A:%.*]] ; CHECK-NEXT: ret i1 [[AND]] ; @@ -20,7 +20,7 @@ define i1 @test1(i1 %a, i1 %b) { define i1 @test1_logical(i1 %a, i1 %b) { ; CHECK-LABEL: @test1_logical( ; CHECK-NEXT: [[WC:%.*]] = call i1 @llvm.experimental.widenable.condition() -; CHECK-NEXT: [[LHS:%.*]] = and i1 [[WC]], [[B:%.*]] +; CHECK-NEXT: [[LHS:%.*]] = and i1 [[B:%.*]], [[WC]] ; CHECK-NEXT: [[AND:%.*]] = select i1 [[LHS]], i1 [[A:%.*]], i1 false ; CHECK-NEXT: ret i1 [[AND]] ; @@ -34,7 +34,7 @@ define i1 @test1_logical(i1 %a, i1 %b) { define i1 @test1b(i1 %a, i1 %b) { ; CHECK-LABEL: @test1b( ; CHECK-NEXT: [[WC:%.*]] = call i1 @llvm.experimental.widenable.condition() -; CHECK-NEXT: [[LHS:%.*]] = and i1 [[WC]], [[B:%.*]] +; CHECK-NEXT: [[LHS:%.*]] = and i1 [[B:%.*]], [[WC]] ; CHECK-NEXT: call void @use(i1 [[LHS]]) ; CHECK-NEXT: [[AND:%.*]] = and i1 [[LHS]], [[A:%.*]] ; CHECK-NEXT: ret i1 [[AND]] @@ -49,7 +49,7 @@ define i1 @test1b(i1 %a, i1 %b) { define i1 @test1b_logical(i1 %a, i1 %b) { ; CHECK-LABEL: @test1b_logical( ; CHECK-NEXT: [[WC:%.*]] = call i1 @llvm.experimental.widenable.condition() -; CHECK-NEXT: [[LHS:%.*]] = and i1 [[WC]], [[B:%.*]] +; CHECK-NEXT: [[LHS:%.*]] = and i1 [[B:%.*]], [[WC]] ; CHECK-NEXT: call void @use(i1 [[LHS]]) ; CHECK-NEXT: [[AND:%.*]] = select i1 [[LHS]], i1 [[A:%.*]], i1 false ; CHECK-NEXT: ret i1 [[AND]] @@ -68,7 +68,7 @@ define i1 @test1c(i1 %a, i1 %b) { ; CHECK-NEXT: call void @use(i1 [[B:%.*]]) ; CHECK-NEXT: [[WC:%.*]] = call i1 @llvm.experimental.widenable.condition() ; CHECK-NEXT: call void @use(i1 [[WC]]) -; CHECK-NEXT: [[LHS:%.*]] = and i1 [[WC]], [[B]] +; 
CHECK-NEXT: [[LHS:%.*]] = and i1 [[B]], [[WC]] ; CHECK-NEXT: [[AND:%.*]] = and i1 [[LHS]], [[A]] ; CHECK-NEXT: ret i1 [[AND]] ; @@ -87,7 +87,7 @@ define i1 @test1c_logical(i1 %a, i1 %b) { ; CHECK-NEXT: call void @use(i1 [[B:%.*]]) ; CHECK-NEXT: [[WC:%.*]] = call i1 @llvm.experimental.widenable.condition() ; CHECK-NEXT: call void @use(i1 [[WC]]) -; CHECK-NEXT: [[LHS:%.*]] = and i1 [[WC]], [[B]] +; CHECK-NEXT: [[LHS:%.*]] = and i1 [[B]], [[WC]] ; CHECK-NEXT: [[AND:%.*]] = select i1 [[LHS]], i1 [[A]], i1 false ; CHECK-NEXT: ret i1 [[AND]] ; @@ -132,7 +132,7 @@ define i1 @test3(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: [[WC:%.*]] = call i1 @llvm.experimental.widenable.condition() ; CHECK-NEXT: [[LHS:%.*]] = and i1 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[RHS:%.*]] = and i1 [[WC]], [[C:%.*]] +; CHECK-NEXT: [[RHS:%.*]] = and i1 [[C:%.*]], [[WC]] ; CHECK-NEXT: [[AND:%.*]] = and i1 [[LHS]], [[RHS]] ; CHECK-NEXT: ret i1 [[AND]] ; @@ -147,7 +147,7 @@ define i1 @test3_logical(i1 %a, i1 %b, i1 %c) { ; CHECK-LABEL: @test3_logical( ; CHECK-NEXT: [[WC:%.*]] = call i1 @llvm.experimental.widenable.condition() ; CHECK-NEXT: [[LHS:%.*]] = select i1 [[A:%.*]], i1 [[B:%.*]], i1 false -; CHECK-NEXT: [[RHS:%.*]] = and i1 [[WC]], [[C:%.*]] +; CHECK-NEXT: [[RHS:%.*]] = and i1 [[C:%.*]], [[WC]] ; CHECK-NEXT: [[AND:%.*]] = select i1 [[LHS]], i1 [[RHS]], i1 false ; CHECK-NEXT: ret i1 [[AND]] ; diff --git a/llvm/test/Transforms/InstCombine/xor.ll b/llvm/test/Transforms/InstCombine/xor.ll index 2ff95821f4e000..ea7f7382ee7c8e 100644 --- a/llvm/test/Transforms/InstCombine/xor.ll +++ b/llvm/test/Transforms/InstCombine/xor.ll @@ -72,8 +72,8 @@ define i32 @test7(i32 %A, i32 %B) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[A1:%.*]] = and i32 [[A:%.*]], 7 ; CHECK-NEXT: [[B1:%.*]] = and i32 [[B:%.*]], 128 -; CHECK-NEXT: [[C11:%.*]] = or disjoint i32 [[A1]], [[B1]] -; CHECK-NEXT: ret i32 [[C11]] +; CHECK-NEXT: [[C1:%.*]] = or disjoint i32 [[A1]], [[B1]] +; CHECK-NEXT: ret i32 [[C1]] ; %A1 = 
and i32 %A, 7 %B1 = and i32 %B, 128 @@ -122,8 +122,8 @@ define <2 x i1> @test9vec(<2 x i8> %a) { define i8 @test10(i8 %A) { ; CHECK-LABEL: @test10( ; CHECK-NEXT: [[B:%.*]] = and i8 [[A:%.*]], 3 -; CHECK-NEXT: [[C1:%.*]] = or disjoint i8 [[B]], 4 -; CHECK-NEXT: ret i8 [[C1]] +; CHECK-NEXT: [[C:%.*]] = or disjoint i8 [[B]], 4 +; CHECK-NEXT: ret i8 [[C]] ; %B = and i8 %A, 3 %C = xor i8 %B, 4 @@ -253,7 +253,7 @@ define i1 @test24(i32 %c, i32 %d) { define i32 @test25(i32 %g, i32 %h) { ; CHECK-LABEL: @test25( -; CHECK-NEXT: [[T4:%.*]] = and i32 [[H:%.*]], [[G:%.*]] +; CHECK-NEXT: [[T4:%.*]] = and i32 [[G:%.*]], [[H:%.*]] ; CHECK-NEXT: ret i32 [[T4]] ; %h2 = xor i32 %h, -1 @@ -487,7 +487,7 @@ define i32 @or_xor_extra_use(i32 %a, i32 %b, ptr %p) { ; CHECK-LABEL: @or_xor_extra_use( ; CHECK-NEXT: [[O:%.*]] = or i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: store i32 [[O]], ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = xor i32 [[O]], [[B]] +; CHECK-NEXT: [[R:%.*]] = xor i32 [[B]], [[O]] ; CHECK-NEXT: ret i32 [[R]] ; %o = or i32 %a, %b @@ -572,7 +572,7 @@ define i32 @and_xor_extra_use(i32 %a, i32 %b, ptr %p) { ; CHECK-LABEL: @and_xor_extra_use( ; CHECK-NEXT: [[O:%.*]] = and i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: store i32 [[O]], ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = xor i32 [[O]], [[B]] +; CHECK-NEXT: [[R:%.*]] = xor i32 [[B]], [[O]] ; CHECK-NEXT: ret i32 [[R]] ; %o = and i32 %a, %b @@ -773,7 +773,7 @@ define <4 x i32> @test46(<4 x i32> %x) { define i32 @test47(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @test47( ; CHECK-NEXT: [[NOTX:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[NOTX]], i32 [[Y:%.*]]) +; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[Y:%.*]], i32 [[NOTX]]) ; CHECK-NEXT: [[UMIN:%.*]] = xor i32 [[UMAX]], -1 ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[UMAX]], [[Z:%.*]] ; CHECK-NEXT: [[RES:%.*]] = mul i32 [[ADD]], [[UMIN]] @@ -988,7 +988,7 @@ define i4 @or_or_xor_use2(i4 %x, i4 %y, i4 %z, ptr %p) { define i32 
@not_is_canonical(i32 %x, i32 %y) { ; CHECK-LABEL: @not_is_canonical( ; CHECK-NEXT: [[SUB:%.*]] = xor i32 [[X:%.*]], -1 -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SUB]], [[Y:%.*]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[Y:%.*]], [[SUB]] ; CHECK-NEXT: [[MUL:%.*]] = shl i32 [[ADD]], 2 ; CHECK-NEXT: ret i32 [[MUL]] ; @@ -1175,7 +1175,7 @@ define <2 x i32> @xor_andn_commute1(<2 x i32> %a, <2 x i32> %b) { define i33 @xor_andn_commute2(i33 %a, i33 %pb) { ; CHECK-LABEL: @xor_andn_commute2( ; CHECK-NEXT: [[B:%.*]] = udiv i33 42, [[PB:%.*]] -; CHECK-NEXT: [[Z:%.*]] = or i33 [[B]], [[A:%.*]] +; CHECK-NEXT: [[Z:%.*]] = or i33 [[A:%.*]], [[B]] ; CHECK-NEXT: ret i33 [[Z]] ; %b = udiv i33 42, %pb ; thwart complexity-based canonicalization @@ -1252,7 +1252,7 @@ define i8 @xor_orn_commute1(i8 %pa, i8 %b) { define i32 @xor_orn_commute2(i32 %a, i32 %pb,ptr %s) { ; CHECK-LABEL: @xor_orn_commute2( ; CHECK-NEXT: [[B:%.*]] = udiv i32 42, [[PB:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[B]], [[A:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], [[B]] ; CHECK-NEXT: [[Z:%.*]] = xor i32 [[TMP1]], -1 ; CHECK-NEXT: ret i32 [[Z]] ; @@ -1268,7 +1268,7 @@ define i32 @xor_orn_commute2_1use(i32 %a, i32 %pb,ptr %s) { ; CHECK-NEXT: [[B:%.*]] = udiv i32 42, [[PB:%.*]] ; CHECK-NEXT: [[NOTA:%.*]] = xor i32 [[A:%.*]], -1 ; CHECK-NEXT: store i32 [[NOTA]], ptr [[S:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[B]], [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A]], [[B]] ; CHECK-NEXT: [[Z:%.*]] = xor i32 [[TMP1]], -1 ; CHECK-NEXT: ret i32 [[Z]] ; @@ -1321,7 +1321,7 @@ define i32 @xor_orn_2use(i32 %a, i32 %b, ptr %s1, ptr %s2) { ; CHECK-LABEL: @xor_orn_2use( ; CHECK-NEXT: [[NOTA:%.*]] = xor i32 [[A:%.*]], -1 ; CHECK-NEXT: store i32 [[NOTA]], ptr [[S1:%.*]], align 4 -; CHECK-NEXT: [[L:%.*]] = or i32 [[NOTA]], [[B:%.*]] +; CHECK-NEXT: [[L:%.*]] = or i32 [[B:%.*]], [[NOTA]] ; CHECK-NEXT: store i32 [[L]], ptr [[S2:%.*]], align 4 ; CHECK-NEXT: [[Z:%.*]] = xor i32 [[L]], [[A]] ; CHECK-NEXT: ret i32 
[[Z]] @@ -1367,7 +1367,7 @@ define <2 x i8> @cttz_pow2(<2 x i8> %x, <2 x i8> %y) { define i32 @ctlz_pow2_or_zero(i32 %x) { ; CHECK-LABEL: @ctlz_pow2_or_zero( ; CHECK-NEXT: [[N:%.*]] = sub i32 0, [[X:%.*]] -; CHECK-NEXT: [[A:%.*]] = and i32 [[N]], [[X]] +; CHECK-NEXT: [[A:%.*]] = and i32 [[X]], [[N]] ; CHECK-NEXT: [[Z:%.*]] = call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[A]], i1 false) ; CHECK-NEXT: [[R:%.*]] = xor i32 [[Z]], 31 ; CHECK-NEXT: ret i32 [[R]] @@ -1384,7 +1384,7 @@ define i32 @ctlz_pow2_or_zero(i32 %x) { define i32 @ctlz_pow2_wrong_const(i32 %x) { ; CHECK-LABEL: @ctlz_pow2_wrong_const( ; CHECK-NEXT: [[N:%.*]] = sub i32 0, [[X:%.*]] -; CHECK-NEXT: [[A:%.*]] = and i32 [[N]], [[X]] +; CHECK-NEXT: [[A:%.*]] = and i32 [[X]], [[N]] ; CHECK-NEXT: [[Z:%.*]] = call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[A]], i1 true) ; CHECK-NEXT: [[R:%.*]] = xor i32 [[Z]], 30 ; CHECK-NEXT: ret i32 [[R]] @@ -1459,7 +1459,7 @@ define i4 @PR96857_xor_with_noundef(i4 %val0, i4 %val1, i4 noundef %val2) { ; CHECK-LABEL: @PR96857_xor_with_noundef( ; CHECK-NEXT: [[VAL4:%.*]] = and i4 [[VAL2:%.*]], [[VAL0:%.*]] ; CHECK-NEXT: [[VAL5:%.*]] = xor i4 [[VAL2]], -1 -; CHECK-NEXT: [[VAL6:%.*]] = and i4 [[VAL5]], [[VAL1:%.*]] +; CHECK-NEXT: [[VAL6:%.*]] = and i4 [[VAL1:%.*]], [[VAL5]] ; CHECK-NEXT: [[VAL7:%.*]] = or disjoint i4 [[VAL4]], [[VAL6]] ; CHECK-NEXT: ret i4 [[VAL7]] ; @@ -1475,7 +1475,7 @@ define i4 @PR96857_xor_without_noundef(i4 %val0, i4 %val1, i4 %val2) { ; CHECK-LABEL: @PR96857_xor_without_noundef( ; CHECK-NEXT: [[VAL4:%.*]] = and i4 [[VAL2:%.*]], [[VAL0:%.*]] ; CHECK-NEXT: [[VAL5:%.*]] = xor i4 [[VAL2]], -1 -; CHECK-NEXT: [[VAL6:%.*]] = and i4 [[VAL5]], [[VAL1:%.*]] +; CHECK-NEXT: [[VAL6:%.*]] = and i4 [[VAL1:%.*]], [[VAL5]] ; CHECK-NEXT: [[VAL7:%.*]] = or i4 [[VAL4]], [[VAL6]] ; CHECK-NEXT: ret i4 [[VAL7]] ; diff --git a/llvm/test/Transforms/InstCombine/xor2.ll b/llvm/test/Transforms/InstCombine/xor2.ll index 7d12a00a8bd515..0b4fca76ed0a7f 100644 --- 
a/llvm/test/Transforms/InstCombine/xor2.ll +++ b/llvm/test/Transforms/InstCombine/xor2.ll @@ -36,8 +36,8 @@ define i1 @test1(i32 %A) { define i32 @test2(i32 %t1) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: [[OVM:%.*]] = and i32 [[T1:%.*]], 32 -; CHECK-NEXT: [[OV1101:%.*]] = or disjoint i32 [[OVM]], 8 -; CHECK-NEXT: ret i32 [[OV1101]] +; CHECK-NEXT: [[OV110:%.*]] = or disjoint i32 [[OVM]], 8 +; CHECK-NEXT: ret i32 [[OV110]] ; %ovm = and i32 %t1, 32 %ov3 = add i32 %ovm, 145 @@ -48,8 +48,8 @@ define i32 @test2(i32 %t1) { define i32 @test3(i32 %t1) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: [[OVM:%.*]] = and i32 [[T1:%.*]], 32 -; CHECK-NEXT: [[OV1101:%.*]] = or disjoint i32 [[OVM]], 8 -; CHECK-NEXT: ret i32 [[OV1101]] +; CHECK-NEXT: [[OV110:%.*]] = or disjoint i32 [[OVM]], 8 +; CHECK-NEXT: ret i32 [[OV110]] ; %ovm = or i32 %t1, 145 %ov31 = and i32 %ovm, 177 @@ -99,7 +99,7 @@ define i32 @test6(i32 %x) { define i32 @test7(i32 %a, i32 %b) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[B_NOT:%.*]] = xor i32 [[B:%.*]], -1 -; CHECK-NEXT: [[XOR:%.*]] = or i32 [[B_NOT]], [[A:%.*]] +; CHECK-NEXT: [[XOR:%.*]] = or i32 [[A:%.*]], [[B_NOT]] ; CHECK-NEXT: ret i32 [[XOR]] ; %or = or i32 %a, %b @@ -112,7 +112,7 @@ define i32 @test7(i32 %a, i32 %b) { define i32 @test8(i32 %a, i32 %b) { ; CHECK-LABEL: @test8( ; CHECK-NEXT: [[B_NOT:%.*]] = xor i32 [[B:%.*]], -1 -; CHECK-NEXT: [[XOR:%.*]] = or i32 [[B_NOT]], [[A:%.*]] +; CHECK-NEXT: [[XOR:%.*]] = or i32 [[A:%.*]], [[B_NOT]] ; CHECK-NEXT: ret i32 [[XOR]] ; %neg = xor i32 %a, -1 @@ -233,7 +233,7 @@ define i32 @test11e(i32 %A, i32 %B, i32 %C) { ; CHECK-LABEL: @test11e( ; CHECK-NEXT: [[FORCE:%.*]] = mul i32 [[B:%.*]], [[C:%.*]] ; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[FORCE]], [[A:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[FORCE]], [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A]], [[FORCE]] ; CHECK-NEXT: [[XOR2:%.*]] = xor i32 [[TMP1]], -1 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[XOR1]], [[XOR2]] ; CHECK-NEXT: ret i32 [[AND]] @@ -250,7 +250,7 @@ define i32 
@test11f(i32 %A, i32 %B, i32 %C) { ; CHECK-LABEL: @test11f( ; CHECK-NEXT: [[FORCE:%.*]] = mul i32 [[B:%.*]], [[C:%.*]] ; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[FORCE]], [[A:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[FORCE]], [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A]], [[FORCE]] ; CHECK-NEXT: [[XOR2:%.*]] = xor i32 [[TMP1]], -1 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[XOR1]], [[XOR2]] ; CHECK-NEXT: ret i32 [[AND]] @@ -324,7 +324,7 @@ define i32 @test13commuted(i32 %a, i32 %b) { define i32 @xor_or_xor_common_op_commute1(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: @xor_or_xor_common_op_commute1( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = xor i32 [[TMP2]], [[C:%.*]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -339,7 +339,7 @@ define i32 @xor_or_xor_common_op_commute1(i32 %a, i32 %b, i32 %c) { define i32 @xor_or_xor_common_op_commute2(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: @xor_or_xor_common_op_commute2( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = xor i32 [[TMP2]], [[C:%.*]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -354,7 +354,7 @@ define i32 @xor_or_xor_common_op_commute2(i32 %a, i32 %b, i32 %c) { define i32 @xor_or_xor_common_op_commute3(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: @xor_or_xor_common_op_commute3( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = xor i32 [[TMP2]], [[C:%.*]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -369,7 +369,7 @@ define i32 @xor_or_xor_common_op_commute3(i32 %a, i32 %b, i32 %c) { define i32 @xor_or_xor_common_op_commute4(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: @xor_or_xor_common_op_commute4( ; CHECK-NEXT: [[TMP1:%.*]] = xor 
i32 [[A:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = xor i32 [[TMP2]], [[C:%.*]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -384,7 +384,7 @@ define i32 @xor_or_xor_common_op_commute4(i32 %a, i32 %b, i32 %c) { define i32 @xor_or_xor_common_op_commute5(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: @xor_or_xor_common_op_commute5( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = xor i32 [[TMP2]], [[C:%.*]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -399,7 +399,7 @@ define i32 @xor_or_xor_common_op_commute5(i32 %a, i32 %b, i32 %c) { define i32 @xor_or_xor_common_op_commute6(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: @xor_or_xor_common_op_commute6( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = xor i32 [[TMP2]], [[C:%.*]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -414,7 +414,7 @@ define i32 @xor_or_xor_common_op_commute6(i32 %a, i32 %b, i32 %c) { define i32 @xor_or_xor_common_op_commute7(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: @xor_or_xor_common_op_commute7( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = xor i32 [[TMP2]], [[C:%.*]] ; CHECK-NEXT: ret i32 [[R]] ; @@ -429,7 +429,7 @@ define i32 @xor_or_xor_common_op_commute7(i32 %a, i32 %b, i32 %c) { define i32 @xor_or_xor_common_op_commute8(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: @xor_or_xor_common_op_commute8( ; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[R:%.*]] = xor i32 [[TMP2]], 
[[C:%.*]] ; CHECK-NEXT: ret i32 [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/zext-bool-add-sub.ll b/llvm/test/Transforms/InstCombine/zext-bool-add-sub.ll index 12739b5686a0ad..c9da18d3d88bdb 100644 --- a/llvm/test/Transforms/InstCombine/zext-bool-add-sub.ll +++ b/llvm/test/Transforms/InstCombine/zext-bool-add-sub.ll @@ -268,7 +268,7 @@ define <2 x i64> @sext_sub_const_vec_poison_elt(<2 x i1> %A) { define i8 @sext_sub(i8 %x, i1 %y) { ; CHECK-LABEL: @sext_sub( ; CHECK-NEXT: [[SEXT_NEG:%.*]] = zext i1 [[Y:%.*]] to i8 -; CHECK-NEXT: [[SUB:%.*]] = add i8 [[SEXT_NEG]], [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i8 [[X:%.*]], [[SEXT_NEG]] ; CHECK-NEXT: ret i8 [[SUB]] ; %sext = sext i1 %y to i8 @@ -281,7 +281,7 @@ define i8 @sext_sub(i8 %x, i1 %y) { define <2 x i8> @sext_sub_vec(<2 x i8> %x, <2 x i1> %y) { ; CHECK-LABEL: @sext_sub_vec( ; CHECK-NEXT: [[SEXT_NEG:%.*]] = zext <2 x i1> [[Y:%.*]] to <2 x i8> -; CHECK-NEXT: [[SUB:%.*]] = add <2 x i8> [[SEXT_NEG]], [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add <2 x i8> [[X:%.*]], [[SEXT_NEG]] ; CHECK-NEXT: ret <2 x i8> [[SUB]] ; %sext = sext <2 x i1> %y to <2 x i8> @@ -294,7 +294,7 @@ define <2 x i8> @sext_sub_vec(<2 x i8> %x, <2 x i1> %y) { define <2 x i8> @sext_sub_vec_nsw(<2 x i8> %x, <2 x i1> %y) { ; CHECK-LABEL: @sext_sub_vec_nsw( ; CHECK-NEXT: [[SEXT_NEG:%.*]] = zext <2 x i1> [[Y:%.*]] to <2 x i8> -; CHECK-NEXT: [[SUB:%.*]] = add <2 x i8> [[SEXT_NEG]], [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add <2 x i8> [[X:%.*]], [[SEXT_NEG]] ; CHECK-NEXT: ret <2 x i8> [[SUB]] ; %sext = sext <2 x i1> %y to <2 x i8> @@ -307,7 +307,7 @@ define <2 x i8> @sext_sub_vec_nsw(<2 x i8> %x, <2 x i1> %y) { define i8 @sext_sub_nuw(i8 %x, i1 %y) { ; CHECK-LABEL: @sext_sub_nuw( ; CHECK-NEXT: [[SEXT_NEG:%.*]] = zext i1 [[Y:%.*]] to i8 -; CHECK-NEXT: [[SUB:%.*]] = add i8 [[SEXT_NEG]], [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i8 [[X:%.*]], [[SEXT_NEG]] ; CHECK-NEXT: ret i8 [[SUB]] ; %sext = sext i1 %y to i8 @@ -318,7 +318,7 @@ define i8 
@sext_sub_nuw(i8 %x, i1 %y) { define i32 @sextbool_add(i1 %c, i32 %x) { ; CHECK-LABEL: @sextbool_add( ; CHECK-NEXT: [[B:%.*]] = sext i1 [[C:%.*]] to i32 -; CHECK-NEXT: [[S:%.*]] = add i32 [[B]], [[X:%.*]] +; CHECK-NEXT: [[S:%.*]] = add i32 [[X:%.*]], [[B]] ; CHECK-NEXT: ret i32 [[S]] ; %b = sext i1 %c to i32 @@ -347,7 +347,7 @@ define i32 @sextbool_add_uses(i1 %c, i32 %x) { ; CHECK-LABEL: @sextbool_add_uses( ; CHECK-NEXT: [[B:%.*]] = sext i1 [[C:%.*]] to i32 ; CHECK-NEXT: call void @use32(i32 [[B]]) -; CHECK-NEXT: [[S:%.*]] = add i32 [[B]], [[X:%.*]] +; CHECK-NEXT: [[S:%.*]] = add i32 [[X:%.*]], [[B]] ; CHECK-NEXT: ret i32 [[S]] ; %b = sext i1 %c to i32 @@ -359,7 +359,7 @@ define i32 @sextbool_add_uses(i1 %c, i32 %x) { define <4 x i32> @sextbool_add_vector(<4 x i1> %c, <4 x i32> %x) { ; CHECK-LABEL: @sextbool_add_vector( ; CHECK-NEXT: [[B:%.*]] = sext <4 x i1> [[C:%.*]] to <4 x i32> -; CHECK-NEXT: [[S:%.*]] = add <4 x i32> [[B]], [[X:%.*]] +; CHECK-NEXT: [[S:%.*]] = add <4 x i32> [[X:%.*]], [[B]] ; CHECK-NEXT: ret <4 x i32> [[S]] ; %b = sext <4 x i1> %c to <4 x i32> @@ -394,7 +394,7 @@ define i32 @zextbool_sub_uses(i1 %c, i32 %x) { define <4 x i32> @zextbool_sub_vector(<4 x i1> %c, <4 x i32> %x) { ; CHECK-LABEL: @zextbool_sub_vector( ; CHECK-NEXT: [[B_NEG:%.*]] = sext <4 x i1> [[C:%.*]] to <4 x i32> -; CHECK-NEXT: [[S:%.*]] = add <4 x i32> [[B_NEG]], [[X:%.*]] +; CHECK-NEXT: [[S:%.*]] = add <4 x i32> [[X:%.*]], [[B_NEG]] ; CHECK-NEXT: ret <4 x i32> [[S]] ; %b = zext <4 x i1> %c to <4 x i32> diff --git a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll index a4b74aa8cc7dc3..acf547b55722fc 100644 --- a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll +++ b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll @@ -181,7 +181,7 @@ define i8 @PR49475_infloop(i32 %t0, i16 %insert, i64 %e, i8 %i162) { ; CHECK-NEXT: [[CONV18:%.*]] = ashr exact i64 [[SEXT]], 32 ; CHECK-NEXT: [[CMP:%.*]] = icmp sge i64 [[XOR]], [[CONV18]] ; 
CHECK-NEXT: [[TRUNC44:%.*]] = zext i1 [[CMP]] to i8 -; CHECK-NEXT: [[INC:%.*]] = add i8 [[TRUNC44]], [[I162]] +; CHECK-NEXT: [[INC:%.*]] = add i8 [[I162]], [[TRUNC44]] ; CHECK-NEXT: [[TOBOOL23_NOT:%.*]] = xor i1 [[CMP]], true ; CHECK-NEXT: call void @llvm.assume(i1 [[TOBOOL23_NOT]]) ; CHECK-NEXT: ret i8 [[INC]] diff --git a/llvm/test/Transforms/InstCombine/zext.ll b/llvm/test/Transforms/InstCombine/zext.ll index 88cd9c70af40d8..7b2cf131c396ab 100644 --- a/llvm/test/Transforms/InstCombine/zext.ll +++ b/llvm/test/Transforms/InstCombine/zext.ll @@ -546,7 +546,7 @@ define i64 @and_trunc_extra_use1(i64 %x, i32 %y) { ; CHECK-LABEL: @and_trunc_extra_use1( ; CHECK-NEXT: [[T:%.*]] = trunc i64 [[X:%.*]] to i32 ; CHECK-NEXT: call void @use32(i32 [[T]]) -; CHECK-NEXT: [[A:%.*]] = and i32 [[T]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = and i32 [[Y:%.*]], [[T]] ; CHECK-NEXT: [[Z:%.*]] = zext i32 [[A]] to i64 ; CHECK-NEXT: ret i64 [[Z]] ; @@ -581,7 +581,7 @@ define i64 @and_trunc_extra_use1_commute(i64 %x, i32 %p) { define i64 @and_trunc_extra_use2(i64 %x, i32 %y) { ; CHECK-LABEL: @and_trunc_extra_use2( ; CHECK-NEXT: [[T:%.*]] = trunc i64 [[X:%.*]] to i32 -; CHECK-NEXT: [[A:%.*]] = and i32 [[T]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = and i32 [[Y:%.*]], [[T]] ; CHECK-NEXT: call void @use32(i32 [[A]]) ; CHECK-NEXT: [[Z:%.*]] = zext i32 [[A]] to i64 ; CHECK-NEXT: ret i64 [[Z]] @@ -635,7 +635,7 @@ define i64 @and_trunc_extra_use1_wider_src(i65 %x, i32 %y) { ; CHECK-LABEL: @and_trunc_extra_use1_wider_src( ; CHECK-NEXT: [[T:%.*]] = trunc i65 [[X:%.*]] to i32 ; CHECK-NEXT: call void @use32(i32 [[T]]) -; CHECK-NEXT: [[A:%.*]] = and i32 [[T]], [[Y:%.*]] +; CHECK-NEXT: [[A:%.*]] = and i32 [[Y:%.*]], [[T]] ; CHECK-NEXT: [[Z:%.*]] = zext i32 [[A]] to i64 ; CHECK-NEXT: ret i64 [[Z]] ; @@ -782,7 +782,7 @@ define i64 @evaluate_zexted_const_expr(i1 %c) { define i16 @zext_nneg_flag_drop(i8 %x, i16 %y) { ; CHECK-LABEL: @zext_nneg_flag_drop( ; CHECK-NEXT: [[EXT:%.*]] = zext i8 [[X:%.*]] to i16 -; 
CHECK-NEXT: [[OR1:%.*]] = or i16 [[EXT]], [[Y:%.*]] +; CHECK-NEXT: [[OR1:%.*]] = or i16 [[Y:%.*]], [[EXT]] ; CHECK-NEXT: [[OR2:%.*]] = or i16 [[OR1]], 128 ; CHECK-NEXT: ret i16 [[OR2]] ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll index ed8d8e15282d57..6953d6c48694c2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll @@ -110,7 +110,7 @@ define void @test_pr25490(i32 %n, ptr noalias nocapture %a, ptr noalias nocaptur ; CHECK-NEXT: store i8 [[CONV12]], ptr [[ARRAYIDX8]], align 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll index 6f62f2f2096f17..4768167a9c69f1 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll @@ -7,12 +7,12 @@ define void @cond_inv_load_i32i32i16(ptr noalias nocapture %a, ptr noalias nocap ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 
@llvm.vscale.i64() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4 -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[INV:%.*]], i64 0 @@ -31,7 +31,7 @@ define void @cond_inv_load_i32i32i16(ptr noalias nocapture %a, ptr noalias nocap ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -86,12 +86,12 @@ define void @cond_inv_load_f64f64f64(ptr noalias nocapture %a, ptr noalias nocap ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4 -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[INV:%.*]], i64 0 @@ -109,7 +109,7 @@ define void @cond_inv_load_f64f64f64(ptr noalias 
nocapture %a, ptr noalias nocap ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -162,12 +162,12 @@ define void @invariant_load_cond(ptr noalias nocapture %a, ptr nocapture readonl ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4 -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -189,7 +189,7 @@ define void @invariant_load_cond(ptr noalias nocapture %a, ptr nocapture readonl ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], 
[[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll index 2a80a7affa4f8c..dac64c3d0f58d0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll @@ -7,12 +7,12 @@ define void @gather_nxv4i32_ind64(ptr noalias nocapture readonly %a, ptr noalias ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4 -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -28,7 +28,7 @@ define void @gather_nxv4i32_ind64(ptr noalias nocapture readonly %a, ptr noalias ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -74,12 +74,12 @@ define void @scatter_nxv4i32_ind32(ptr noalias nocapture %a, ptr noalias nocaptu ; CHECK-NEXT: entry: ; 
CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4 -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -96,7 +96,7 @@ define void @scatter_nxv4i32_ind32(ptr noalias nocapture %a, ptr noalias nocaptu ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -141,12 +141,12 @@ define void @scatter_inv_nxv4i32(ptr noalias nocapture %inv, ptr noalias nocaptu ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4 -; CHECK-NEXT: 
[[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[INV:%.*]], i64 0 @@ -162,7 +162,7 @@ define void @scatter_inv_nxv4i32(ptr noalias nocapture %inv, ptr noalias nocaptu ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -211,12 +211,12 @@ define void @gather_inv_nxv4i32(ptr noalias nocapture %a, ptr noalias nocapture ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4 -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[INV:%.*]], i64 0 @@ -233,7 +233,7 @@ define void @gather_inv_nxv4i32(ptr noalias nocapture %a, ptr noalias nocapture ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -286,12 +286,12 @@ define void @gather_nxv4i32_ind64_stride2(ptr noalias nocapture %a, ptr noalias ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -8 -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 3 ; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv4i64() @@ -321,7 +321,7 @@ define void @gather_nxv4i32_ind64_stride2(ptr noalias nocapture %a, ptr noalias ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 
0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll index 965c71c008aa19..34fb5bb640471f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll @@ -16,7 +16,7 @@ define void @cond_ind64(ptr noalias nocapture %a, ptr noalias nocapture readonly ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index d6794420c403f9..ba8f69b63f0607 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -1464,7 +1464,7 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 { ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 2 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]] ; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[TMP7]], i64 6 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP2]], [[B]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll index e3bba1338e1df3..81121019efe767 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll @@ -16,49 +16,49 @@ define void @vector_reverse_f64(i64 %N, ptr noalias %a, ptr noalias %b) #0{ ; CHECK: for.body.preheader: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 4 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP31:%.*]] = shl i64 [[TMP30]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[INDEX]], -1 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], [[N]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 3 -; CHECK-NEXT: [[TMP9:%.*]] = sub i64 1, [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 3 -; CHECK-NEXT: [[TMP13:%.*]] = sub i64 0, [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = sub i64 1, 
[[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP14]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 8 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP16]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = fadd [[WIDE_LOAD]], shufflevector ( insertelement ( poison, double 1.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP18:%.*]] = fadd [[WIDE_LOAD1]], shufflevector ( insertelement ( poison, double 1.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = shl i64 [[TMP20]], 3 -; CHECK-NEXT: [[TMP22:%.*]] = sub i64 1, [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i64 [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP25:%.*]] = shl i64 [[TMP24]], 3 -; CHECK-NEXT: [[TMP26:%.*]] = sub i64 0, [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP25]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i64 [[TMP26]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, ptr [[TMP28]], i64 [[TMP27]] -; CHECK-NEXT: store [[TMP17]], ptr [[TMP23]], align 8 -; CHECK-NEXT: store [[TMP18]], ptr [[TMP29]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP31]] +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[INDEX]], -1 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[N]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 3 +; CHECK-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP8]], i64 [[TMP11]] +; CHECK-NEXT: 
[[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = shl i64 [[TMP13]], 3 +; CHECK-NEXT: [[TMP15:%.*]] = sub i64 0, [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[TMP8]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP17]], i64 [[TMP16]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 8 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP18]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = fadd [[WIDE_LOAD]], shufflevector ( insertelement ( poison, double 1.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP20:%.*]] = fadd [[WIDE_LOAD1]], shufflevector ( insertelement ( poison, double 1.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP23:%.*]] = shl i64 [[TMP22]], 3 +; CHECK-NEXT: [[TMP24:%.*]] = sub i64 1, [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i64 [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = shl i64 [[TMP26]], 3 +; CHECK-NEXT: [[TMP28:%.*]] = sub i64 0, [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = sub i64 1, [[TMP27]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i64 [[TMP28]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds double, ptr [[TMP30]], i64 [[TMP29]] +; CHECK-NEXT: store [[TMP19]], ptr [[TMP25]], align 8 +; CHECK-NEXT: store [[TMP20]], ptr [[TMP31]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -112,7 +112,7 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 
{ ; CHECK: for.body.preheader: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -125,42 +125,42 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 4 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP34:%.*]] = shl i64 [[TMP33]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[INDEX]], -1 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], [[N]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[TMP10]], 3 -; CHECK-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl i64 [[TMP14]], 3 -; CHECK-NEXT: [[TMP16:%.*]] = sub i64 0, [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[TMP18]], i64 [[TMP17]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 8 -; CHECK-NEXT: 
[[WIDE_LOAD3:%.*]] = load , ptr [[TMP19]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP21:%.*]] = add [[WIDE_LOAD3]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP24:%.*]] = shl i64 [[TMP23]], 3 -; CHECK-NEXT: [[TMP25:%.*]] = sub i64 1, [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[TMP22]], i64 [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP28:%.*]] = shl i64 [[TMP27]], 3 -; CHECK-NEXT: [[TMP29:%.*]] = sub i64 0, [[TMP28]] -; CHECK-NEXT: [[TMP30:%.*]] = sub i64 1, [[TMP28]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[TMP22]], i64 [[TMP29]] -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[TMP31]], i64 [[TMP30]] -; CHECK-NEXT: store [[TMP20]], ptr [[TMP26]], align 8 -; CHECK-NEXT: store [[TMP21]], ptr [[TMP32]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP34]] +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[INDEX]], -1 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[N]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[TMP12]], 3 +; CHECK-NEXT: [[TMP14:%.*]] = sub i64 1, [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = shl i64 [[TMP16]], 3 +; CHECK-NEXT: [[TMP18:%.*]] = sub i64 0, [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = sub i64 1, [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = 
getelementptr inbounds i64, ptr [[TMP20]], i64 [[TMP19]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP15]], align 8 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP21]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP23:%.*]] = add [[WIDE_LOAD3]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP26:%.*]] = shl i64 [[TMP25]], 3 +; CHECK-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i64 [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP30:%.*]] = shl i64 [[TMP29]], 3 +; CHECK-NEXT: [[TMP31:%.*]] = sub i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = sub i64 1, [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i64, ptr [[TMP33]], i64 [[TMP32]] +; CHECK-NEXT: store [[TMP22]], ptr [[TMP28]], align 8 +; CHECK-NEXT: store [[TMP23]], ptr [[TMP34]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll index 76084776b2b765..626bb55cf2a77e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -19,12 +19,12 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt ; CHECK-NEXT: entry: ; CHECK-NEXT: 
[[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -8 -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[N_VEC]], 3 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() @@ -66,7 +66,7 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt ; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[C]], [[ENTRY:%.*]] ] @@ -132,12 +132,12 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], 
-8 -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[N_VEC]], 2 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N_VEC]], 2 @@ -149,25 +149,25 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[OFFSET_IDX5:%.*]] = shl i64 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX5]] +; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX4]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[DOTIDX:%.*]] = shl nuw nsw i64 [[TMP7]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[DOTIDX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[NEXT_GEP]], align 4 -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = shl nsw [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP10:%.*]] = shl nsw [[WIDE_LOAD8]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP10:%.*]] = shl nsw [[WIDE_LOAD6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[DOTIDX9:%.*]] = shl nuw nsw i64 [[TMP11]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP6]], i64 [[DOTIDX9]] -; CHECK-NEXT: store [[TMP9]], ptr [[NEXT_GEP6]], align 4 +; CHECK-NEXT: 
[[DOTIDX7:%.*]] = shl nuw nsw i64 [[TMP11]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 [[DOTIDX7]] +; CHECK-NEXT: store [[TMP9]], ptr [[NEXT_GEP5]], align 4 ; CHECK-NEXT: store [[TMP10]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll index c22613509be4fe..57807604b37a87 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll @@ -32,7 +32,7 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N) ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[INDEX]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[N]], [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[COND:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -24 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -56 @@ -47,17 +47,17 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i64 -56 ; CHECK-NEXT: [[REVERSE3:%.*]] = 
shufflevector <4 x i1> [[TMP5]], <4 x i1> poison, <4 x i32> ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP8]], i32 8, <4 x i1> [[REVERSE3]], <4 x double> poison) -; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP9]], i32 8, <4 x i1> [[REVERSE4]], <4 x double> poison) +; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP9]], i32 8, <4 x i1> [[REVERSE5]], <4 x double> poison) ; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD6]], ; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP10]], ptr [[TMP8]], i32 8, <4 x i1> [[REVERSE3]]) -; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP11]], ptr [[TMP9]], i32 8, <4 x i1> [[REVERSE4]]) +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP11]], ptr [[TMP9]], i32 8, <4 x i1> [[REVERSE5]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll index 45b84a0b5e856c..fec5921720fed9 100644 --- 
a/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll @@ -38,7 +38,7 @@ define void @arm_abs_q7(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 % ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[BLOCKSIZE]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ] @@ -118,22 +118,22 @@ define void @arm_abs_q15(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[TMP4]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX7:%.*]] = shl i32 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[OFFSET_IDX7]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <8 x i16> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <8 x i16> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP7:%.*]] = sub <8 x i16> zeroinitializer, [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP6]], <8 x i16> , <8 x i16> [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> 
[[TMP5]], <8 x i16> [[WIDE_LOAD]], <8 x i16> [[TMP8]] -; CHECK-NEXT: store <8 x i16> [[TMP9]], ptr [[NEXT_GEP7]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <8 x i16> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <8 x i16> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP5:%.*]] = sub <8 x i16> zeroinitializer, [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP4]], <8 x i16> , <8 x i16> [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP3]], <8 x i16> [[WIDE_LOAD]], <8 x i16> [[TMP6]] +; CHECK-NEXT: store <8 x i16> [[TMP7]], ptr [[NEXT_GEP8]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[BLOCKSIZE]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ] @@ -145,12 +145,12 @@ define void @arm_abs_q15(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 ; CHECK-NEXT: [[BLKCNT_022:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[PDST_ADDR_021:%.*]] = phi ptr [ [[INCDEC_PTR13:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL6]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_023]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[PSRC_ADDR_023]], align 2 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i16 [[TMP11]], 0 -; CHECK-NEXT: [[CMP5:%.*]] = icmp eq i16 [[TMP11]], -32768 
-; CHECK-NEXT: [[SUB:%.*]] = sub i16 0, [[TMP11]] +; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[PSRC_ADDR_023]], align 2 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i16 [[TMP9]], 0 +; CHECK-NEXT: [[CMP5:%.*]] = icmp eq i16 [[TMP9]], -32768 +; CHECK-NEXT: [[SUB:%.*]] = sub i16 0, [[TMP9]] ; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP5]], i16 32767, i16 [[SUB]] -; CHECK-NEXT: [[COND11:%.*]] = select i1 [[CMP1]], i16 [[TMP11]], i16 [[COND]] +; CHECK-NEXT: [[COND11:%.*]] = select i1 [[CMP1]], i16 [[TMP9]], i16 [[COND]] ; CHECK-NEXT: [[INCDEC_PTR13]] = getelementptr inbounds i8, ptr [[PDST_ADDR_021]], i32 2 ; CHECK-NEXT: store i16 [[COND11]], ptr [[PDST_ADDR_021]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_022]], -1 @@ -213,22 +213,22 @@ define void @arm_abs_q31(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[TMP4]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX7:%.*]] = shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[OFFSET_IDX7]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[NEXT_GEP]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <4 x i32> zeroinitializer, [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> , <4 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> 
[[WIDE_LOAD]], <4 x i32> [[TMP8]] -; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[NEXT_GEP7]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <4 x i32> zeroinitializer, [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> , <4 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[TMP6]] +; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[NEXT_GEP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[BLOCKSIZE]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ] @@ -240,12 +240,12 @@ define void @arm_abs_q31(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 ; CHECK-NEXT: [[BLKCNT_016:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[PDST_ADDR_015:%.*]] = phi ptr [ [[INCDEC_PTR7:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL6]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_017]], i32 4 -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[PSRC_ADDR_017]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP11]], 0 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP11]], -2147483648 -; 
CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[TMP11]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[PSRC_ADDR_017]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP9]], 0 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP9]], -2147483648 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[TMP9]] ; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP2]], i32 2147483647, i32 [[SUB]] -; CHECK-NEXT: [[COND6:%.*]] = select i1 [[CMP1]], i32 [[TMP11]], i32 [[COND]] +; CHECK-NEXT: [[COND6:%.*]] = select i1 [[CMP1]], i32 [[TMP9]], i32 [[COND]] ; CHECK-NEXT: [[INCDEC_PTR7]] = getelementptr inbounds i8, ptr [[PDST_ADDR_015]], i32 4 ; CHECK-NEXT: store i32 [[COND6]], ptr [[PDST_ADDR_015]], align 4 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_016]], -1 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll index 18caa9cc16f350..a7cb5c61ca5502 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll @@ -67,7 +67,7 @@ define i64 @add_i32_i64(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -132,7 +132,7 @@ define i64 @add_i16_i64(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; 
CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -197,7 +197,7 @@ define i64 @add_i8_i64(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -582,7 +582,7 @@ define i64 @mla_i32_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i3 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -658,7 +658,7 @@ define i64 @mla_i16_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i3 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label 
[[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -738,7 +738,7 @@ define i64 @mla_i8_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -1197,7 +1197,7 @@ define i64 @red_mla_ext_s16_u16_s64(ptr noalias nocapture readonly %A, ptr noali ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll index 69538343356693..d904c50f3bf9cd 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll @@ -30,35 +30,35 @@ define float @test(ptr nocapture readonly %pA, ptr nocapture readonly %pB, i32 % ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ 
zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PA]], i32 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PB]], i32 [[TMP3]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PA]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX5:%.*]] = shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[PB]], i32 [[OFFSET_IDX5]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[NEXT_GEP]], align 4 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[NEXT_GEP5]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD6]], zeroinitializer -; CHECK-NEXT: [[DOTNOT8:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD]]) -; CHECK-NEXT: [[TMP7:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD6]]) -; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD6]] -; CHECK-NEXT: [[TMP10:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP9]]) -; CHECK-NEXT: [[TMP11:%.*]] = fdiv fast <4 x float> [[TMP10]], [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[TMP11]], [[VEC_PHI]] -; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[DOTNOT8]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP12]] +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[NEXT_GEP6]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD7]], zeroinitializer +; 
CHECK-NEXT: [[DOTNOT9:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP5:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD7]]) +; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD7]] +; CHECK-NEXT: [[TMP8:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = fdiv fast <4 x float> [[TMP8]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[TMP9]], [[VEC_PHI]] +; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[DOTNOT9]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP10]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PREDPHI]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]] +; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PREDPHI]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[BLOCKSIZE]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PA]], [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[PB]], [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ 
[[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[PA_ADDR_020:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -66,20 +66,20 @@ define float @test(ptr nocapture readonly %pA, ptr nocapture readonly %pB, i32 % ; CHECK-NEXT: [[BLOCKSIZE_ADDR_018:%.*]] = phi i32 [ [[DEC:%.*]], [[IF_END]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ACCUM_017:%.*]] = phi float [ [[ACCUM_1:%.*]], [[IF_END]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PA_ADDR_020]], i32 4 -; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[PA_ADDR_020]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[PA_ADDR_020]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i8, ptr [[PB_ADDR_019]], i32 4 -; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[PB_ADDR_019]], align 4 -; CHECK-NEXT: [[CMP2:%.*]] = fcmp fast une float [[TMP15]], 0.000000e+00 -; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast une float [[TMP16]], 0.000000e+00 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[PB_ADDR_019]], align 4 +; CHECK-NEXT: [[CMP2:%.*]] = fcmp fast une float [[TMP13]], 0.000000e+00 +; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast une float [[TMP14]], 0.000000e+00 ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP2]], i1 true, i1 [[CMP3]] ; CHECK-NEXT: br i1 [[OR_COND]], label [[IF_THEN:%.*]], label [[IF_END]] ; CHECK: if.then: -; CHECK-NEXT: [[TMP17:%.*]] = tail call fast float @llvm.fabs.f32(float [[TMP15]]) -; CHECK-NEXT: [[TMP18:%.*]] = tail call fast float @llvm.fabs.f32(float [[TMP16]]) -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP18]], [[TMP17]] -; 
CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = tail call fast float @llvm.fabs.f32(float [[SUB]]) -; CHECK-NEXT: [[DIV:%.*]] = fdiv fast float [[TMP19]], [[ADD]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call fast float @llvm.fabs.f32(float [[TMP13]]) +; CHECK-NEXT: [[TMP16:%.*]] = tail call fast float @llvm.fabs.f32(float [[TMP14]]) +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = tail call fast float @llvm.fabs.f32(float [[SUB]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv fast float [[TMP17]], [[ADD]] ; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float [[DIV]], [[ACCUM_017]] ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: @@ -88,7 +88,7 @@ define float @test(ptr nocapture readonly %pA, ptr nocapture readonly %pB, i32 % ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: while.end: -; CHECK-NEXT: [[ACCUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ACCUM_1]], [[IF_END]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ACCUM_1]], [[IF_END]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[ACCUM_0_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll index 2269b774d9f31d..3432773b4e1b3d 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll @@ -12,16 +12,16 @@ define hidden void @pointer_phi_v4i32_add1(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[INDEX]], 2 -; 
CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[TMP1]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[OFFSET_IDX4]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[NEXT_GEP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[NEXT_GEP4]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store <4 x i32> [[TMP0]], ptr [[NEXT_GEP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP3]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: end: ; CHECK-NEXT: ret void ; @@ -53,24 +53,24 @@ define hidden void @pointer_phi_v4i32_add2(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[INDEX]], 3 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP1]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = 
shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX4]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[NEXT_GEP4]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store <4 x i32> [[TMP0]], ptr [[NEXT_GEP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 -; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 +; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR_09]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_09]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 8 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[Y]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[B_ADDR_07]], align 4 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 @@ -109,22 +109,22 @@ define hidden void @pointer_phi_v4i32_add3(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], 
[[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP1]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> , <4 x i32> poison) -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[NEXT_GEP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 48 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 -; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR_09]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR_09]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 12 -; CHECK-NEXT: 
[[ADD:%.*]] = add nsw i32 [[TMP4]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], [[Y]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[B_ADDR_07]], align 4 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 @@ -160,16 +160,16 @@ define hidden void @pointer_phi_v8i16_add1(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[TMP2]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[OFFSET_IDX4]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[NEXT_GEP4]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store <8 x i16> [[TMP1]], ptr [[NEXT_GEP5]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP4]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP2]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: end: ; CHECK-NEXT: ret void ; @@ -203,17 +203,17 @@ define hidden void @pointer_phi_v8i16_add2(ptr noalias nocapture 
readonly %A, pt ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP2]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX4]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[NEXT_GEP]], align 2 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[NEXT_GEP4]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store <8 x i16> [[TMP1]], ptr [[NEXT_GEP5]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 -; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_011:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ] @@ -352,23 +352,23 @@ define hidden void @pointer_phi_v16i8_add2(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: 
[[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[TMP1]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[OFFSET_IDX]] ; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[NEXT_GEP]], align 1 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = add <16 x i8> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[NEXT_GEP4]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[NEXT_GEP4]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 -; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_010:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[A_ADDR_010]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[A_ADDR_010]], align 1 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_010]], i32 2 -; CHECK-NEXT: [[CONV1:%.*]] = add i8 [[TMP4]], [[TMP0]] +; CHECK-NEXT: [[CONV1:%.*]] = add i8 [[TMP3]], [[TMP0]] ; CHECK-NEXT: store i8 
[[CONV1]], ptr [[B_ADDR_08]], align 1 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_08]], i32 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_09]], 1 @@ -445,16 +445,16 @@ define hidden void @pointer_phi_v4f32_add1(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[TMP1]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[OFFSET_IDX4]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[NEXT_GEP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[NEXT_GEP4]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[NEXT_GEP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP3]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: end: ; CHECK-NEXT: ret void ; @@ -486,24 +486,24 @@ define hidden void @pointer_phi_v4f32_add2(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: 
[[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[INDEX]], 3 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP1]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX4]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[NEXT_GEP4]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = fadd fast <4 x float> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[NEXT_GEP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 -; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 +; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[A_ADDR_09]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[A_ADDR_09]], align 4 ; 
CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 8 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP2]], [[Y]] ; CHECK-NEXT: store float [[ADD]], ptr [[B_ADDR_07]], align 4 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 @@ -542,22 +542,22 @@ define hidden void @pointer_phi_v4f32_add3(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP1]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> , <4 x float> poison) -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[NEXT_GEP]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 48 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 -; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; 
CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[A_ADDR_09]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[A_ADDR_09]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 12 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP3]], [[Y]] ; CHECK-NEXT: store float [[ADD]], ptr [[B_ADDR_07]], align 4 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 @@ -592,16 +592,16 @@ define hidden void @pointer_phi_v4half_add1(ptr noalias nocapture readonly %A, p ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[TMP1]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[OFFSET_IDX4]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[NEXT_GEP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <8 x half> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <8 x half> [[TMP2]], ptr [[NEXT_GEP4]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = fadd fast <8 x half> [[WIDE_LOAD]], 
[[BROADCAST_SPLAT]] +; CHECK-NEXT: store <8 x half> [[TMP0]], ptr [[NEXT_GEP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP3]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: end: ; CHECK-NEXT: ret void ; @@ -633,24 +633,24 @@ define hidden void @pointer_phi_v4half_add2(ptr noalias nocapture readonly %A, p ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP1]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX4]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x half>, ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x half> [[WIDE_VEC]], <16 x half> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <8 x half> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <8 x half> [[TMP2]], ptr [[NEXT_GEP4]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = fadd fast <8 x half> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store <8 x half> [[TMP0]], ptr [[NEXT_GEP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 -; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 +; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr [[A_ADDR_09]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[A_ADDR_09]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP4]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP2]], [[Y]] ; CHECK-NEXT: store half [[ADD]], ptr [[B_ADDR_07]], align 4 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 2 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 @@ -687,24 +687,24 @@ define hidden void @pointer_phi_v4half_add3(ptr noalias nocapture readonly %A, p ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[INDEX]], 6 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP1]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], 6 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = shl i32 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX4]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x half>, ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] 
= shufflevector <24 x half> [[WIDE_VEC]], <24 x half> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <8 x half> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <8 x half> [[TMP2]], ptr [[NEXT_GEP4]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = fadd fast <8 x half> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store <8 x half> [[TMP0]], ptr [[NEXT_GEP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 -; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 +; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr [[A_ADDR_09]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[A_ADDR_09]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 6 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP4]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP2]], [[Y]] ; CHECK-NEXT: store half [[ADD]], ptr [[B_ADDR_07]], align 4 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 2 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 @@ -747,28 +747,28 @@ define hidden void @pointer_phi_v4i32_uf2(ptr noalias nocapture readonly %A, ptr ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x 
i32> -; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP2]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> , <4 x i32> poison) -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER5]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16 -; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[NEXT_GEP]], align 4 -; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP5]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER5]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16 +; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[NEXT_GEP]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP4]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 192 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9992 -; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9992 +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_08:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; 
CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 9992, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_06:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[A_ADDR_08]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR_08]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_08]], i32 24 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], [[Y]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[B_ADDR_06]], align 4 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_06]], i32 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 @@ -814,36 +814,36 @@ define hidden void @pointer_phi_v4i32_uf4(ptr noalias nocapture readonly %A, ptr ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[TMP4]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> , <4 x i32> poison) -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], 
[[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER8]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER9]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 32 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 48 -; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[NEXT_GEP]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER8]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER9]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 48 +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[NEXT_GEP]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP8]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP9]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP10]], align 4 -; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 384 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9984 -; CHECK-NEXT: br i1 [[TMP12]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9984 +; CHECK-NEXT: br i1 [[TMP11]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: 
for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_08:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 9984, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_06:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[A_ADDR_08]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[A_ADDR_08]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_08]], i32 24 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[Y]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[Y]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[B_ADDR_06]], align 4 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_06]], i32 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 @@ -875,8 +875,8 @@ define hidden void @mult_ptr_iv(ptr noalias nocapture readonly %x, ptr noalias n ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[Z:%.*]], i32 3000 ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[X:%.*]], i32 3000 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[Z]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[X]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[Z]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[X]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll index f58d864e1e1477..7db5bccd896b27 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll @@ -16,41 +16,41 @@ define arm_aapcs_vfpcc 
i32 @minmaxval4(ptr nocapture readonly %x, ptr nocapture ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI1]]) -; CHECK-NEXT: [[TMP3]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]]) +; CHECK-NEXT: [[TMP1]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI1]]) +; CHECK-NEXT: [[TMP2]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP2]]) +; CHECK-NEXT: 
[[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP1]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 2147483647, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 2147483647, [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ -2147483648, [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[MAX_0_LCSSA:%.*]] = phi i32 [ -2147483648, [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[MIN_0_LCSSA:%.*]] = phi i32 [ 2147483647, [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[MAX_0_LCSSA:%.*]] = phi i32 [ -2147483648, [[ENTRY:%.*]] ], [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[MIN_0_LCSSA:%.*]] = phi i32 [ 2147483647, [[ENTRY]] ], [ [[COND9:%.*]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: store i32 [[MIN_0_LCSSA]], ptr [[MINP:%.*]], align 4 ; CHECK-NEXT: ret i32 [[MAX_0_LCSSA]] ; CHECK: for.body: ; CHECK-NEXT: [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[MIN_028:%.*]] = phi i32 [ [[TMP9]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[MAX_027:%.*]] = phi i32 [ [[TMP8]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[MIN_028:%.*]] = phi i32 [ [[COND9]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[MAX_027:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds i32, ptr [[X]], i32 [[I_029]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP8]] = call i32 @llvm.smax.i32(i32 [[TMP7]], i32 [[MAX_027]]) -; CHECK-NEXT: [[TMP9]] = call i32 @llvm.smin.i32(i32 [[TMP7]], i32 [[MIN_028]]) +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[COND]] = call i32 @llvm.smax.i32(i32 [[TMP6]], i32 [[MAX_027]]) +; CHECK-NEXT: [[COND9]] = call i32 @llvm.smin.i32(i32 [[TMP6]], i32 [[MIN_028]]) ; CHECK-NEXT: [[INC]] = add nuw i32 [[I_029]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; entry: %cmp26.not = icmp eq i32 %N, 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll index 8783326b1ef1af..9f9db3ad859911 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -15,8 +15,8 @@ define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[B]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll index 2b61a1cc3d78b3..2fb4a68f4b5860 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -18,8 +18,8 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[B]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: @@ -132,8 +132,8 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[B]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: @@ -245,15 +245,15 @@ define void @variant_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b 
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[B]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[BOUND03:%.*]] = icmp ugt ptr [[SCEVGEP2]], [[B]] -; CHECK-NEXT: [[BOUND14:%.*]] = icmp ugt ptr [[SCEVGEP]], [[C]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] -; CHECK-NEXT: [[BOUND06:%.*]] = icmp ugt ptr [[SCEVGEP2]], [[A]] -; CHECK-NEXT: [[BOUND17:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[C]] +; CHECK-NEXT: [[BOUND06:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND17:%.*]] = icmp ult ptr [[C]], [[SCEVGEP1]] ; CHECK-NEXT: [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]] ; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT8]] ; CHECK-NEXT: br i1 [[CONFLICT_RDX9]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll index 8800fa26f067c6..6bd70cefcaf746 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll @@ -48,7 +48,7 @@ define void @foo(ptr addrspace(1) align 8 dereferenceable_or_null(16), ptr addrs ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP2]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll b/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll index fe6d9b3ec690ec..47636b2c66d296 100644 --- a/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll +++ b/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll @@ -21,7 +21,7 @@ define void @inv_store_last_lane(ptr noalias nocapture %a, ptr noalias nocapture ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i64 3 -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -82,7 +82,7 @@ define float @ret_last_lane(ptr noalias nocapture %a, ptr noalias nocapture read ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i64 3 -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git 
a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index bd658c31768a84..bf1905bf334877 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -66,7 +66,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC4_INTERL1-NEXT: [[ADD]] = fsub fast float [[X_05]], [[FPINC]] ; VEC4_INTERL1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC4_INTERL1-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC4_INTERL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC4_INTERL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC4_INTERL1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VEC4_INTERL1: for.end.loopexit: ; VEC4_INTERL1-NEXT: br label [[FOR_END]] @@ -124,7 +124,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC4_INTERL2-NEXT: [[ADD]] = fsub fast float [[X_05]], [[FPINC]] ; VEC4_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC4_INTERL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC4_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC4_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC4_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VEC4_INTERL2: for.end.loopexit: ; VEC4_INTERL2-NEXT: br label [[FOR_END]] @@ -175,7 +175,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC1_INTERL2-NEXT: [[ADD]] = fsub fast float [[X_05]], [[FPINC]] ; VEC1_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC1_INTERL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC1_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 
[[LFTR_WIDEIV]], [[N]] +; VEC1_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC1_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VEC1_INTERL2: for.end.loopexit: ; VEC1_INTERL2-NEXT: br label [[FOR_END]] @@ -226,7 +226,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC2_INTERL1_PRED_STORE-NEXT: [[ADD]] = fsub fast float [[X_05]], [[FPINC]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC2_INTERL1_PRED_STORE-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC2_INTERL1_PRED_STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VEC2_INTERL1_PRED_STORE: for.end: ; VEC2_INTERL1_PRED_STORE-NEXT: ret void @@ -313,7 +313,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC4_INTERL1-NEXT: [[ADD]] = fsub reassoc float [[X_05]], [[FPINC]] ; VEC4_INTERL1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC4_INTERL1-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC4_INTERL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC4_INTERL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC4_INTERL1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VEC4_INTERL1: for.end.loopexit: ; VEC4_INTERL1-NEXT: br label [[FOR_END]] @@ -371,7 +371,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC4_INTERL2-NEXT: [[ADD]] = fsub reassoc float [[X_05]], [[FPINC]] ; VEC4_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC4_INTERL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 
[[INDVARS_IV_NEXT]] to i32 -; VEC4_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC4_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC4_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VEC4_INTERL2: for.end.loopexit: ; VEC4_INTERL2-NEXT: br label [[FOR_END]] @@ -424,7 +424,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC1_INTERL2-NEXT: [[ADD]] = fsub reassoc float [[X_05]], [[FPINC]] ; VEC1_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC1_INTERL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC1_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC1_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC1_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VEC1_INTERL2: for.end.loopexit: ; VEC1_INTERL2-NEXT: br label [[FOR_END]] @@ -475,7 +475,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC2_INTERL1_PRED_STORE-NEXT: [[ADD]] = fsub reassoc float [[X_05]], [[FPINC]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC2_INTERL1_PRED_STORE-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC2_INTERL1_PRED_STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VEC2_INTERL1_PRED_STORE: for.end: ; VEC2_INTERL1_PRED_STORE-NEXT: ret void @@ -528,7 +528,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483644 ; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 
[[N_VEC]] to float ; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 -; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] +; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fadd fast float [[INIT:%.*]], [[TMP1]] ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], @@ -557,7 +557,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC4_INTERL1-NEXT: [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01 ; VEC4_INTERL1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC4_INTERL1-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC4_INTERL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC4_INTERL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC4_INTERL1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VEC4_INTERL1: for.end.loopexit: ; VEC4_INTERL1-NEXT: br label [[FOR_END]] @@ -576,7 +576,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 ; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 -; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] +; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[INIT:%.*]], [[TMP1]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fadd fast <4 
x float> [[DOTSPLAT]], @@ -608,7 +608,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC4_INTERL2-NEXT: [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01 ; VEC4_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC4_INTERL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC4_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC4_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC4_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VEC4_INTERL2: for.end.loopexit: ; VEC4_INTERL2-NEXT: br label [[FOR_END]] @@ -627,14 +627,14 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 ; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 -; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] +; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[INIT:%.*]], [[TMP1]] ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC1_INTERL2: vector.body: ; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = or disjoint i64 [[INDEX]], 1 ; VEC1_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[INDEX]] to float ; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[DOTCAST2]], 5.000000e-01 -; VEC1_INTERL2-NEXT: [[OFFSET_IDX:%.*]] = fadd fast float [[TMP3]], [[INIT]] +; VEC1_INTERL2-NEXT: [[OFFSET_IDX:%.*]] = fadd fast float [[INIT]], [[TMP3]] ; VEC1_INTERL2-NEXT: [[TMP4:%.*]] = fadd fast float [[OFFSET_IDX]], 5.000000e-01 ; VEC1_INTERL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] ; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] @@ 
-658,7 +658,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC1_INTERL2-NEXT: [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01 ; VEC1_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC1_INTERL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC1_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC1_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC1_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VEC1_INTERL2: for.end.loopexit: ; VEC1_INTERL2-NEXT: br label [[FOR_END]] @@ -677,7 +677,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 -; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fadd fast float [[INIT:%.*]], [[TMP1]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDUCTION:%.*]] = fadd fast <2 x float> [[DOTSPLAT]], @@ -702,7 +702,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC2_INTERL1_PRED_STORE-NEXT: [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01 ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC2_INTERL1_PRED_STORE-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC2_INTERL1_PRED_STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; 
VEC2_INTERL1_PRED_STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VEC2_INTERL1_PRED_STORE: for.end: ; VEC2_INTERL1_PRED_STORE-NEXT: ret void @@ -763,7 +763,7 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 ; VEC4_INTERL1-NEXT: [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] -; VEC4_INTERL1-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] +; VEC4_INTERL1-NEXT: [[IND_END3:%.*]] = fadd fast float [[INIT:%.*]], [[TMP3]] ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 @@ -817,7 +817,7 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL1-NEXT: store float [[CONV1]], ptr [[ARRAYIDX6]], align 4 ; VEC4_INTERL1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC4_INTERL1-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC4_INTERL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC4_INTERL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC4_INTERL1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VEC4_INTERL1: for.end.loopexit: ; VEC4_INTERL1-NEXT: br label [[FOR_END]] @@ -840,7 +840,7 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 ; VEC4_INTERL2-NEXT: 
[[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] -; VEC4_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] +; VEC4_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[INIT:%.*]], [[TMP3]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 @@ -904,7 +904,7 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2-NEXT: store float [[CONV1]], ptr [[ARRAYIDX6]], align 4 ; VEC4_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC4_INTERL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC4_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC4_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC4_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VEC4_INTERL2: for.end.loopexit: ; VEC4_INTERL2-NEXT: br label [[FOR_END]] @@ -927,7 +927,7 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 ; VEC1_INTERL2-NEXT: [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] -; VEC1_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] +; VEC1_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[INIT:%.*]], [[TMP3]] ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC1_INTERL2: vector.body: ; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -936,7 +936,7 
@@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC1_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast float [[DOTCAST5]], -5.000000e-01 ; VEC1_INTERL2-NEXT: [[DOTCAST6:%.*]] = sitofp i64 [[INDEX]] to float ; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP0]], [[DOTCAST6]] -; VEC1_INTERL2-NEXT: [[OFFSET_IDX7:%.*]] = fadd fast float [[TMP6]], [[INIT]] +; VEC1_INTERL2-NEXT: [[OFFSET_IDX7:%.*]] = fadd fast float [[INIT]], [[TMP6]] ; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = fadd fast float [[OFFSET_IDX7]], [[TMP0]] ; VEC1_INTERL2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] ; VEC1_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] @@ -982,7 +982,7 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC1_INTERL2-NEXT: store float [[CONV1]], ptr [[ARRAYIDX6]], align 4 ; VEC1_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC1_INTERL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC1_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC1_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC1_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VEC1_INTERL2: for.end.loopexit: ; VEC1_INTERL2-NEXT: br label [[FOR_END]] @@ -1005,7 +1005,7 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] -; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END3:%.*]] = fadd fast float [[INIT:%.*]], [[TMP3]] ; VEC2_INTERL1_PRED_STORE-NEXT: 
[[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0 @@ -1054,7 +1054,7 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC2_INTERL1_PRED_STORE-NEXT: store float [[CONV1]], ptr [[ARRAYIDX6]], align 4 ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC2_INTERL1_PRED_STORE-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC2_INTERL1_PRED_STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VEC2_INTERL1_PRED_STORE: for.end: ; VEC2_INTERL1_PRED_STORE-NEXT: ret void @@ -1141,7 +1141,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) { ; VEC4_INTERL1-NEXT: [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01 ; VEC4_INTERL1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC4_INTERL1-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC4_INTERL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC4_INTERL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC4_INTERL1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; VEC4_INTERL1: for.end.loopexit: ; VEC4_INTERL1-NEXT: br label [[FOR_END]] @@ -1189,7 +1189,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) { ; VEC4_INTERL2-NEXT: [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01 ; VEC4_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC4_INTERL2-NEXT: 
[[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC4_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC4_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC4_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; VEC4_INTERL2: for.end.loopexit: ; VEC4_INTERL2-NEXT: br label [[FOR_END]] @@ -1239,7 +1239,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) { ; VEC1_INTERL2-NEXT: [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01 ; VEC1_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC1_INTERL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC1_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC1_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC1_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; VEC1_INTERL2: for.end.loopexit: ; VEC1_INTERL2-NEXT: br label [[FOR_END]] @@ -1280,7 +1280,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) { ; VEC2_INTERL1_PRED_STORE-NEXT: [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01 ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC2_INTERL1_PRED_STORE-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; VEC2_INTERL1_PRED_STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; VEC2_INTERL1_PRED_STORE: for.end: ; VEC2_INTERL1_PRED_STORE-NEXT: ret void diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll index 1d7ead0a8e49b2..d19ca172a8c0a8 100644 --- 
a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll +++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll @@ -19,8 +19,8 @@ define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 4 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[B]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -71,7 +71,7 @@ define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK-NEXT: store i32 [[X_0]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: br label [[FOR_END]] diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 45674acaae5387..08d05a1e2db69f 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -90,7 +90,7 @@ define void @multi_int_induction(ptr %A, i32 %N) { ; IND-NEXT: [[INC]] = add nsw i32 [[COUNT_09]], 1 ; IND-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; IND-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; IND-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 
[[LFTR_WIDEIV]], [[N]] +; IND-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; IND-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IND: for.end: ; IND-NEXT: ret void @@ -134,7 +134,7 @@ define void @multi_int_induction(ptr %A, i32 %N) { ; UNROLL-NEXT: [[INC]] = add nsw i32 [[COUNT_09]], 1 ; UNROLL-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; UNROLL-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; UNROLL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; UNROLL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; UNROLL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; UNROLL: for.end: ; UNROLL-NEXT: ret void @@ -227,7 +227,7 @@ define void @multi_int_induction(ptr %A, i32 %N) { ; INTERLEAVE-NEXT: [[INC]] = add nsw i32 [[COUNT_09]], 1 ; INTERLEAVE-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; INTERLEAVE-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; INTERLEAVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; INTERLEAVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; INTERLEAVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; INTERLEAVE: for.end: ; INTERLEAVE-NEXT: ret void @@ -361,7 +361,7 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; IND-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IND-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; IND: middle.block: -; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: ; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, 
[[VECTOR_MEMCHECK]] ] @@ -374,7 +374,7 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; IND-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[A]], i64 [[IV]] ; IND-NEXT: [[ARR_IDX2:%.*]] = getelementptr float, ptr [[TMP13]], i64 [[OFFSET2]] ; IND-NEXT: [[L2:%.*]] = load float, ptr [[ARR_IDX2]], align 4 -; IND-NEXT: [[M:%.*]] = fmul fast float [[L2]], [[B]] +; IND-NEXT: [[M:%.*]] = fmul fast float [[B]], [[L2]] ; IND-NEXT: [[AD:%.*]] = fadd fast float [[L1]], [[M]] ; IND-NEXT: store float [[AD]], ptr [[ARR_IDX]], align 4 ; IND-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 @@ -428,7 +428,7 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; UNROLL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; UNROLL: middle.block: -; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: ; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] @@ -441,7 +441,7 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; UNROLL-NEXT: [[TMP17:%.*]] = getelementptr float, ptr [[A]], i64 [[IV]] ; UNROLL-NEXT: [[ARR_IDX2:%.*]] = getelementptr float, ptr [[TMP17]], i64 [[OFFSET2]] ; UNROLL-NEXT: [[L2:%.*]] = load float, ptr [[ARR_IDX2]], align 4 -; UNROLL-NEXT: [[M:%.*]] = fmul fast float [[L2]], [[B]] +; UNROLL-NEXT: [[M:%.*]] = fmul fast float [[B]], [[L2]] ; UNROLL-NEXT: [[AD:%.*]] = fadd fast float [[L1]], [[M]] ; UNROLL-NEXT: store float [[AD]], ptr [[ARR_IDX]], align 4 ; UNROLL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 @@ -571,7 +571,7 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; INTERLEAVE-NEXT: 
[[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; INTERLEAVE: middle.block: -; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: ; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] @@ -584,7 +584,7 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; INTERLEAVE-NEXT: [[TMP17:%.*]] = getelementptr float, ptr [[A]], i64 [[IV]] ; INTERLEAVE-NEXT: [[ARR_IDX2:%.*]] = getelementptr float, ptr [[TMP17]], i64 [[OFFSET2]] ; INTERLEAVE-NEXT: [[L2:%.*]] = load float, ptr [[ARR_IDX2]], align 4 -; INTERLEAVE-NEXT: [[M:%.*]] = fmul fast float [[L2]], [[B]] +; INTERLEAVE-NEXT: [[M:%.*]] = fmul fast float [[B]], [[L2]] ; INTERLEAVE-NEXT: [[AD:%.*]] = fadd fast float [[L1]], [[M]] ; INTERLEAVE-NEXT: store float [[AD]], ptr [[ARR_IDX]], align 4 ; INTERLEAVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 @@ -1636,7 +1636,7 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; IND-NEXT: [[TMP8:%.*]] = or disjoint i64 [[TMP7]], 4 ; IND-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP8]] ; IND-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP2]] -; IND-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[A]] +; IND-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] ; IND-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; IND-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: @@ -1676,7 +1676,7 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; IND-NEXT: store i32 [[TMP21]], ptr [[TMP22]], align 1 ; IND-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; 
IND-NEXT: [[TMP23:%.*]] = trunc i64 [[I_NEXT]] to i32 -; IND-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP23]], [[N]] +; IND-NEXT: [[COND:%.*]] = icmp eq i32 [[N]], [[TMP23]] ; IND-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; IND: for.end: ; IND-NEXT: ret void @@ -1699,7 +1699,7 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; UNROLL-NEXT: [[TMP8:%.*]] = or disjoint i64 [[TMP7]], 4 ; UNROLL-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP8]] ; UNROLL-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP2]] -; UNROLL-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[A]] +; UNROLL-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] ; UNROLL-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; UNROLL-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: @@ -1753,7 +1753,7 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; UNROLL-NEXT: store i32 [[TMP32]], ptr [[TMP33]], align 1 ; UNROLL-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; UNROLL-NEXT: [[TMP34:%.*]] = trunc i64 [[I_NEXT]] to i32 -; UNROLL-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP34]], [[N]] +; UNROLL-NEXT: [[COND:%.*]] = icmp eq i32 [[N]], [[TMP34]] ; UNROLL-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; UNROLL: for.end: ; UNROLL-NEXT: ret void @@ -1855,7 +1855,7 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; INTERLEAVE-NEXT: [[TMP8:%.*]] = or disjoint i64 [[TMP7]], 4 ; INTERLEAVE-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP8]] ; INTERLEAVE-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP2]] -; INTERLEAVE-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[A]] +; INTERLEAVE-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] ; INTERLEAVE-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; INTERLEAVE-NEXT: br i1 
[[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: @@ -1920,7 +1920,7 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; INTERLEAVE-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 1 ; INTERLEAVE-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; INTERLEAVE-NEXT: [[TMP40:%.*]] = trunc i64 [[I_NEXT]] to i32 -; INTERLEAVE-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP40]], [[N]] +; INTERLEAVE-NEXT: [[COND:%.*]] = icmp eq i32 [[N]], [[TMP40]] ; INTERLEAVE-NEXT: br i1 [[COND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; INTERLEAVE: for.end: ; INTERLEAVE-NEXT: ret void @@ -2535,13 +2535,13 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; IND: for.body: ; IND-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; IND-NEXT: [[TMP11:%.*]] = trunc i64 [[I]] to i32 -; IND-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], [[A]] +; IND-NEXT: [[TMP12:%.*]] = add i32 [[A]], [[TMP11]] ; IND-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 ; IND-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1 ; IND-NEXT: store i16 [[TMP13]], ptr [[TMP14]], align 2 ; IND-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; IND-NEXT: [[TMP15:%.*]] = trunc i64 [[I_NEXT]] to i32 -; IND-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP15]], [[N]] +; IND-NEXT: [[COND:%.*]] = icmp eq i32 [[N]], [[TMP15]] ; IND-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; IND: for.end: ; IND-NEXT: ret void @@ -2594,13 +2594,13 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; UNROLL: for.body: ; UNROLL-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NEXT: [[TMP19:%.*]] = trunc i64 [[I]] to i32 -; UNROLL-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], [[A]] +; UNROLL-NEXT: [[TMP20:%.*]] = add i32 [[A]], [[TMP19]] ; UNROLL-NEXT: [[TMP21:%.*]] = 
trunc i32 [[TMP20]] to i16 ; UNROLL-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1 ; UNROLL-NEXT: store i16 [[TMP21]], ptr [[TMP22]], align 2 ; UNROLL-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; UNROLL-NEXT: [[TMP23:%.*]] = trunc i64 [[I_NEXT]] to i32 -; UNROLL-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP23]], [[N]] +; UNROLL-NEXT: [[COND:%.*]] = icmp eq i32 [[N]], [[TMP23]] ; UNROLL-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; UNROLL: for.end: ; UNROLL-NEXT: ret void @@ -2730,13 +2730,13 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; INTERLEAVE-NEXT: [[TMP31:%.*]] = trunc i64 [[I]] to i32 -; INTERLEAVE-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], [[A]] +; INTERLEAVE-NEXT: [[TMP32:%.*]] = add i32 [[A]], [[TMP31]] ; INTERLEAVE-NEXT: [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16 ; INTERLEAVE-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[I]], i32 1 ; INTERLEAVE-NEXT: store i16 [[TMP33]], ptr [[TMP34]], align 2 ; INTERLEAVE-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; INTERLEAVE-NEXT: [[TMP35:%.*]] = trunc i64 [[I_NEXT]] to i32 -; INTERLEAVE-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP35]], [[N]] +; INTERLEAVE-NEXT: [[COND:%.*]] = icmp eq i32 [[N]], [[TMP35]] ; INTERLEAVE-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; INTERLEAVE: for.end: ; INTERLEAVE-NEXT: ret void @@ -3516,7 +3516,7 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; IND-NEXT: [[TMP2:%.*]] = xor i8 [[T]], -1 ; IND-NEXT: [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[TMP1]] ; IND-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; IND-NEXT: [[TMP5:%.*]] = add i8 [[TMP4]], [[T]] +; IND-NEXT: [[TMP5:%.*]] = add i8 [[T]], [[TMP4]] ; IND-NEXT: [[TMP6:%.*]] = icmp slt i8 [[TMP5]], [[T]] ; IND-NEXT: 
[[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 ; IND-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] @@ -3525,7 +3525,7 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 510 ; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 -; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] +; IND-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]] ; IND-NEXT: [[IND_END2:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] ; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 ; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer @@ -3535,7 +3535,7 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8 -; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST4]], [[T]] +; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST4]] ; IND-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 ; IND-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] ; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 @@ -3582,7 +3582,7 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; UNROLL-NEXT: [[TMP2:%.*]] = xor i8 [[T]], -1 ; UNROLL-NEXT: [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[TMP1]] ; UNROLL-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; UNROLL-NEXT: [[TMP5:%.*]] = add i8 [[TMP4]], [[T]] +; UNROLL-NEXT: [[TMP5:%.*]] = add i8 [[T]], [[TMP4]] ; UNROLL-NEXT: [[TMP6:%.*]] = icmp slt i8 [[TMP5]], [[T]] ; UNROLL-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 ; UNROLL-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] @@ -3591,7 +3591,7 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; UNROLL: vector.ph: ; UNROLL-NEXT: 
[[N_VEC:%.*]] = and i32 [[TMP0]], 508 ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 -; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] +; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]] ; UNROLL-NEXT: [[IND_END2:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] ; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 ; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer @@ -3602,7 +3602,7 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], ; UNROLL-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] +; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST5]] ; UNROLL-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 ; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] ; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 8 @@ -3726,7 +3726,7 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE-NEXT: [[TMP2:%.*]] = xor i8 [[T]], -1 ; INTERLEAVE-NEXT: [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[TMP1]] ; INTERLEAVE-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; INTERLEAVE-NEXT: [[TMP5:%.*]] = add i8 [[TMP4]], [[T]] +; INTERLEAVE-NEXT: [[TMP5:%.*]] = add i8 [[T]], [[TMP4]] ; INTERLEAVE-NEXT: [[TMP6:%.*]] = icmp slt i8 [[TMP5]], [[T]] ; INTERLEAVE-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 ; INTERLEAVE-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] @@ -3735,7 +3735,7 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 504 ; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 -; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 
[[DOTCAST]], [[T]] +; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]] ; INTERLEAVE-NEXT: [[IND_END2:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] ; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT]], i64 0 ; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer @@ -3746,7 +3746,7 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], ; INTERLEAVE-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] +; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST5]] ; INTERLEAVE-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 ; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] ; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 16 @@ -3900,7 +3900,7 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; IND-NEXT: [[TMP2:%.*]] = xor i8 [[T]], -1 ; IND-NEXT: [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[TMP1]] ; IND-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; IND-NEXT: [[TMP5:%.*]] = add i8 [[TMP4]], [[T]] +; IND-NEXT: [[TMP5:%.*]] = add i8 [[T]], [[TMP4]] ; IND-NEXT: [[TMP6:%.*]] = icmp slt i8 [[TMP5]], [[T]] ; IND-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 ; IND-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] @@ -3909,7 +3909,7 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 510 ; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 -; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] +; IND-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]] ; IND-NEXT: [[EXT_MUL5:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] ; IND-NEXT: [[IND_END1:%.*]] = 
shl nuw nsw i32 [[EXT_MUL5]], 2 ; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0 @@ -3920,7 +3920,7 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8 -; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST4]], [[T]] +; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST4]] ; IND-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 ; IND-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] ; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 @@ -3969,7 +3969,7 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; UNROLL-NEXT: [[TMP2:%.*]] = xor i8 [[T]], -1 ; UNROLL-NEXT: [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[TMP1]] ; UNROLL-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; UNROLL-NEXT: [[TMP5:%.*]] = add i8 [[TMP4]], [[T]] +; UNROLL-NEXT: [[TMP5:%.*]] = add i8 [[T]], [[TMP4]] ; UNROLL-NEXT: [[TMP6:%.*]] = icmp slt i8 [[TMP5]], [[T]] ; UNROLL-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 ; UNROLL-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] @@ -3978,7 +3978,7 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 508 ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 -; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] +; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]] ; UNROLL-NEXT: [[EXT_MUL6:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] ; UNROLL-NEXT: [[IND_END1:%.*]] = shl nuw nsw i32 [[EXT_MUL6]], 2 ; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0 @@ -3990,7 +3990,7 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; UNROLL-NEXT: [[VEC_IND:%.*]] = 
phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], ; UNROLL-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] +; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST5]] ; UNROLL-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 ; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] ; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 8 @@ -4119,7 +4119,7 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE-NEXT: [[TMP2:%.*]] = xor i8 [[T]], -1 ; INTERLEAVE-NEXT: [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[TMP1]] ; INTERLEAVE-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; INTERLEAVE-NEXT: [[TMP5:%.*]] = add i8 [[TMP4]], [[T]] +; INTERLEAVE-NEXT: [[TMP5:%.*]] = add i8 [[T]], [[TMP4]] ; INTERLEAVE-NEXT: [[TMP6:%.*]] = icmp slt i8 [[TMP5]], [[T]] ; INTERLEAVE-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 ; INTERLEAVE-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] @@ -4128,7 +4128,7 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], 504 ; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 -; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] +; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]] ; INTERLEAVE-NEXT: [[EXT_MUL6:%.*]] = add nuw nsw i32 [[N_VEC]], [[EXT]] ; INTERLEAVE-NEXT: [[IND_END1:%.*]] = shl nuw nsw i32 [[EXT_MUL6]], 2 ; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 0 @@ -4140,7 +4140,7 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], ; 
INTERLEAVE-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] +; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST5]] ; INTERLEAVE-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 ; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] ; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 16 @@ -4262,7 +4262,7 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) { ; IND-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; IND-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] ; IND: middle.block: -; IND-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[K]] +; IND-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: ; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] @@ -4299,7 +4299,7 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) { ; UNROLL-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] ; UNROLL: middle.block: -; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[K]] +; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: ; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] @@ -4376,7 +4376,7 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) { ; INTERLEAVE-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] ; INTERLEAVE: middle.block: -; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], 
[[K]] +; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: ; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] @@ -4474,7 +4474,7 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) { ; IND-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IND-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] ; IND: middle.block: -; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[K]] +; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: ; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] @@ -4517,7 +4517,7 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) { ; UNROLL-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] ; UNROLL: middle.block: -; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[K]] +; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: ; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] @@ -4609,7 +4609,7 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) { ; INTERLEAVE-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] ; INTERLEAVE: middle.block: -; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[K]] +; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]] ; 
INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: ; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] @@ -4694,7 +4694,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) { ; IND-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -2 -; IND-NEXT: [[IND_END:%.*]] = add i32 [[N_VEC]], [[I]] +; IND-NEXT: [[IND_END:%.*]] = add i32 [[I]], [[N_VEC]] ; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[I]], i64 0 ; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; IND-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], @@ -4702,7 +4702,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) { ; IND: vector.body: ; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; IND-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], [[I]] +; IND-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[I]], [[INDEX]] ; IND-NEXT: [[TMP1:%.*]] = sext i32 [[OFFSET_IDX]] to i64 ; IND-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] ; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4 @@ -4734,7 +4734,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) { ; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -4 -; UNROLL-NEXT: [[IND_END:%.*]] = add i32 [[N_VEC]], [[I]] +; UNROLL-NEXT: [[IND_END:%.*]] = add i32 [[I]], [[N_VEC]] ; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[I]], i64 0 ; 
UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], @@ -4743,7 +4743,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) { ; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], [[I]] +; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[I]], [[INDEX]] ; UNROLL-NEXT: [[TMP1:%.*]] = sext i32 [[OFFSET_IDX]] to i64 ; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] ; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 8 @@ -4823,7 +4823,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) { ; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -8 -; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i32 [[N_VEC]], [[I]] +; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i32 [[I]], [[N_VEC]] ; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I]], i64 0 ; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], @@ -4832,7 +4832,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) { ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], -; 
INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], [[I]] +; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[I]], [[INDEX]] ; INTERLEAVE-NEXT: [[TMP1:%.*]] = sext i32 [[OFFSET_IDX]] to i64 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] ; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 16 @@ -6307,7 +6307,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -2 ; IND-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; IND-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]] +; IND-NEXT: [[IND_END:%.*]] = mul i32 [[STEP]], [[DOTCAST]] ; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 ; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; IND-NEXT: [[TMP15:%.*]] = mul nuw <2 x i32> [[DOTSPLAT]], @@ -6328,7 +6328,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; IND-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] ; IND: middle.block: ; IND-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND]], i64 1 -; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: ; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] @@ -6378,7 +6378,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; UNROLL-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]] +; UNROLL-NEXT: [[IND_END:%.*]] = mul i32 [[STEP]], [[DOTCAST]] ; 
UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 ; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: [[TMP15:%.*]] = mul nuw <2 x i32> [[DOTSPLAT]], @@ -6403,7 +6403,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD]], i64 1 -; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: ; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] @@ -6536,7 +6536,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -8 ; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; INTERLEAVE-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]] +; INTERLEAVE-NEXT: [[IND_END:%.*]] = mul i32 [[STEP]], [[DOTCAST]] ; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0 ; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: [[TMP15:%.*]] = mul <4 x i32> [[DOTSPLAT]], @@ -6561,7 +6561,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; INTERLEAVE-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i64 3 -; INTERLEAVE-NEXT: [[CMP_N:%.*]] = 
icmp eq i64 [[N_VEC]], [[N]] +; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: ; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll index 6fc52ab3f26e03..29ce8457e8d65f 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -1481,7 +1481,7 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 2 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]] ; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[TMP5]], i64 6 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP2]], [[B]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll index 50c67040cfb2a9..45de11141235e6 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll @@ -28,8 +28,8 @@ define void @inv_val_store_to_inv_address_conditional_diff_values_ic(ptr %a, i64 ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[B]] -; CHECK-NEXT: 
[[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -120,7 +120,7 @@ define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b ; CHECK-LABEL: @inv_val_store_to_inv_address_conditional_inv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[NTRUNC]], [[K:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[K:%.*]], [[NTRUNC]] ; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] @@ -128,8 +128,8 @@ define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[B]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -217,8 +217,8 @@ define i32 @variant_val_store_to_inv_address(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: 
[[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[B]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll index 20d612a548b153..63381454cc5900 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll @@ -27,8 +27,8 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[B]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -101,8 +101,8 @@ define void @inv_val_store_to_inv_address(ptr %a, i64 %n, ptr %b) { ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[B]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = 
icmp ult ptr [[B]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -176,8 +176,8 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP1]], [[B]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -360,7 +360,7 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly ; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[J_022]] to i64 ; CHECK-NEXT: [[ARRAYIDX5_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[J_022]], -1 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[ITR]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[ITR]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP6]], 3 @@ -369,12 +369,12 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly ; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP4]], 2 ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[VAR2]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[J_022]], -1 -; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[ITR]] +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[ITR]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64 ; CHECK-NEXT: [[TMP13:%.*]] = 
add nuw nsw i64 [[TMP4]], [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 ; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SCEVGEP2]], i64 [[TMP14]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP3]], [[VAR1]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[VAR1]], [[SCEVGEP3]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] @@ -414,7 +414,7 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly ; CHECK-NEXT: store i32 [[TMP22]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[ITR]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ITR]], [[LFTR_WIDEIV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_INC8_LOOPEXIT_LOOPEXIT:%.*]], label [[FOR_BODY3]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: for.inc8.loopexit.loopexit: ; CHECK-NEXT: br label [[FOR_INC8_LOOPEXIT]] @@ -424,7 +424,7 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly ; CHECK-NEXT: [[J_1_LCSSA]] = phi i32 [ [[J_022]], [[FOR_COND1_PREHEADER]] ], [ [[ITR]], [[FOR_INC8_LOOPEXIT]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT24]] = add nuw nsw i64 [[INDVARS_IV23]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV25:%.*]] = trunc i64 [[INDVARS_IV_NEXT24]] to i32 -; CHECK-NEXT: [[EXITCOND26:%.*]] = icmp eq i32 [[LFTR_WIDEIV25]], [[ITR]] +; CHECK-NEXT: [[EXITCOND26:%.*]] = icmp eq i32 [[ITR]], [[LFTR_WIDEIV25]] ; CHECK-NEXT: br i1 [[EXITCOND26]], label [[FOR_END10_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]] ; CHECK: for.end10.loopexit: ; CHECK-NEXT: br label [[FOR_END10]] @@ -507,7 +507,7 @@ define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocaptu ; CHECK-NEXT: store i32 
[[TMP5]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[ITR]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ITR]], [[LFTR_WIDEIV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_INC8_LOOPEXIT:%.*]], label [[FOR_BODY3]] ; CHECK: for.inc8.loopexit: ; CHECK-NEXT: br label [[FOR_INC8]] @@ -515,7 +515,7 @@ define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocaptu ; CHECK-NEXT: [[J_1_LCSSA]] = phi i32 [ [[J_022]], [[FOR_COND1_PREHEADER]] ], [ [[ITR]], [[FOR_INC8_LOOPEXIT]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT24]] = add nuw nsw i64 [[INDVARS_IV23]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV25:%.*]] = trunc i64 [[INDVARS_IV_NEXT24]] to i32 -; CHECK-NEXT: [[EXITCOND26:%.*]] = icmp eq i32 [[LFTR_WIDEIV25]], [[ITR]] +; CHECK-NEXT: [[EXITCOND26:%.*]] = icmp eq i32 [[ITR]], [[LFTR_WIDEIV25]] ; CHECK-NEXT: br i1 [[EXITCOND26]], label [[FOR_END10_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]] ; CHECK: for.end10.loopexit: ; CHECK-NEXT: br label [[FOR_END10]] @@ -589,7 +589,7 @@ define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, ptr % ; CHECK-NEXT: [[I13:%.*]] = add nsw i32 [[I12]], [[I9]] ; CHECK-NEXT: [[I14:%.*]] = trunc i32 [[I13]] to i16 ; CHECK-NEXT: [[I15:%.*]] = trunc i64 [[I8]] to i32 -; CHECK-NEXT: [[I16:%.*]] = add i32 [[I15]], [[ARG:%.*]] +; CHECK-NEXT: [[I16:%.*]] = add i32 [[ARG:%.*]], [[I15]] ; CHECK-NEXT: [[I17:%.*]] = zext i32 [[I16]] to i64 ; CHECK-NEXT: [[I18:%.*]] = getelementptr inbounds i16, ptr [[I6]], i64 [[I17]] ; CHECK-NEXT: store i16 [[I14]], ptr [[I18]], align 2 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll index 873f6364f82811..c50bcf8ae88f5c 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll +++ 
b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll @@ -61,7 +61,7 @@ define float @cond_fadd(ptr noalias nocapture readonly %a, ptr noalias nocapture ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -176,7 +176,7 @@ define float @cond_cmp_sel(ptr noalias %a, ptr noalias %cond, i64 %N) { ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -294,7 +294,7 @@ define i32 @conditional_and(ptr noalias %A, ptr noalias %B, i32 %cond, i64 nound ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -428,7 +428,7 @@ define i32 @simple_chained_rdx(ptr noalias %a, ptr noalias %b, ptr noalias 
%cond ; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP47]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -597,7 +597,7 @@ define i64 @nested_cond_and(ptr noalias nocapture readonly %a, ptr noalias nocap ; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP50:%.*]] = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> [[PREDPHI15]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -732,7 +732,7 @@ define i32 @cond-uncond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond, ; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP27]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -896,7 +896,7 @@ define float @cond_cond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond, ; CHECK-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PREDPHI15]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -1038,7 +1038,7 @@ define i32 @uncond_cond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond, ; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PREDPHI]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -1167,7 +1167,7 @@ define i32 @uncond_cond_uncond(ptr noalias %src1, ptr noalias %src2, ptr noalias ; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP28]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll index e6936b19415d0c..a226a5a36d63b4 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll 
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -989,7 +989,7 @@ define float @reduction_fmuladd(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -1132,7 +1132,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[I]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END7:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -1221,7 +1221,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[I]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END7:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -1292,7 +1292,7 @@ define i32 @predicated_or_dominates_reduction(ptr %b) { ; CHECK-NEXT: br label 
[[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[TMP0:%.*]] = or disjoint i32 [[INDEX]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i32 [[INDEX]], 3 @@ -1354,21 +1354,21 @@ define i32 @predicated_or_dominates_reduction(ptr %b) { ; CHECK: pred.load.continue6: ; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP37]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP42]], [[PRED_LOAD_IF5]] ] ; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <4 x i32> [[TMP43]], zeroinitializer -; CHECK-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP19]], -; CHECK-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> , <4 x i1> [[TMP44]] -; CHECK-NEXT: [[TMP48:%.*]] = bitcast <4 x i1> [[TMP47]] to i4 -; CHECK-NEXT: [[TMP49:%.*]] = call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 [[TMP48]]) -; CHECK-NEXT: [[TMP50:%.*]] = zext nneg i4 [[TMP49]] to i32 -; CHECK-NEXT: [[TMP51]] = add i32 [[VEC_PHI]], [[TMP50]] +; CHECK-NEXT: [[NOT_:%.*]] = xor <4 x i1> [[TMP19]], +; CHECK-NEXT: [[DOTNOT7:%.*]] = select <4 x i1> [[NOT_]], <4 x i1> , <4 x i1> [[TMP44]] +; CHECK-NEXT: [[TMP45:%.*]] = bitcast <4 x i1> [[DOTNOT7]] to i4 +; CHECK-NEXT: [[TMP46:%.*]] = call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 [[TMP45]]) +; CHECK-NEXT: [[TMP47:%.*]] = zext nneg i4 [[TMP46]] to i32 +; CHECK-NEXT: [[TMP48]] = add i32 [[VEC_PHI]], [[TMP47]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP49]], 
label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ poison, [[FOR_INC:%.*]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ poison, [[FOR_INC:%.*]] ], [ [[TMP48]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[A_1_LCSSA]] ; CHECK: for.body: ; CHECK-NEXT: br i1 poison, label [[LOR_LHS_FALSE:%.*]], label [[IF_THEN:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll index b66ce4047ad95e..89fd1a9a73f2f0 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction.ll @@ -49,7 +49,7 @@ define i32 @reduction_sum(i32 %n, ptr %A, ptr %B) { ; CHECK-NEXT: [[TMP17]] = add i32 [[TMP16]], [[TMP13]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: ._crit_edge.loopexit: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], [[DOTLR_PH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] @@ -130,7 +130,7 @@ define i32 @reduction_prod(i32 %n, ptr %A, ptr %B) { ; CHECK-NEXT: [[TMP17]] = mul i32 [[TMP16]], [[TMP13]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label 
[[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: ._crit_edge.loopexit: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], [[DOTLR_PH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] @@ -211,7 +211,7 @@ define i32 @reduction_mix(i32 %n, ptr %A, ptr %B) { ; CHECK-NEXT: [[TMP17]] = add i32 [[TMP16]], [[TMP14]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: ._crit_edge.loopexit: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], [[DOTLR_PH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] @@ -292,7 +292,7 @@ define i32 @reduction_mul(i32 %n, ptr %A, ptr %B) { ; CHECK-NEXT: [[TMP17]] = mul i32 [[TMP16]], [[SUM_02]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: ._crit_edge.loopexit: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], [[DOTLR_PH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] @@ -369,7 +369,7 @@ define i32 @start_at_non_zero(ptr %in, ptr %coeff, ptr %out, i32 %n) { ; CHECK-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[SUM_09]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop 
[[LOOP11:![0-9]+]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] @@ -445,7 +445,7 @@ define i32 @reduction_and(i32 %n, ptr %A, ptr %B) { ; CHECK-NEXT: [[AND]] = and i32 [[ADD]], [[RESULT_08]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] @@ -521,7 +521,7 @@ define i32 @reduction_or(i32 %n, ptr %A, ptr %B) { ; CHECK-NEXT: [[OR]] = or i32 [[ADD]], [[RESULT_08]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] @@ -597,7 +597,7 @@ define i32 @reduction_xor(i32 %n, ptr %A, ptr %B) { ; CHECK-NEXT: [[XOR]] = xor i32 [[ADD]], [[RESULT_08]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 
[ [[XOR]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] @@ -646,7 +646,7 @@ define i32 @reduction_sub_rhs(i32 %n, ptr %A) { ; CHECK-NEXT: [[SUB]] = sub nsw i32 [[TMP0]], [[X_05]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: br label [[FOR_END]] @@ -714,7 +714,7 @@ define i32 @reduction_sub_lhs(i32 %n, ptr %A) { ; CHECK-NEXT: [[SUB]] = sub nsw i32 [[X_05]], [[TMP5]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] @@ -1083,7 +1083,7 @@ define i32 @reduction_sum_multiuse(i32 %n, ptr %A, ptr %B) { ; CHECK-NEXT: [[TMP17]] = add i32 [[TMP16]], [[TMP13]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: ._crit_edge: ; CHECK-NEXT: [[SUM_COPY:%.*]] = phi i32 [ [[TMP17]], [[DOTLR_PH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll 
index d5df8afc80a792..9521c0933fe876 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll @@ -53,7 +53,7 @@ define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp ; CHECK-NEXT: store float [[MUL]], ptr [[ARRAYIDX2]], align 4, !dbg [[DBG9]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1, !dbg [[DBG9]] ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG9]] -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG9]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]], !dbg [[DBG9]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !dbg [[DBG9]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: br label [[FOR_END]], !dbg [[DBG14:![0-9]+]] @@ -144,7 +144,7 @@ define void @test_runtime_check(ptr %a, float %b, i64 %offset, i64 %offset2, i64 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] @@ -157,7 +157,7 @@ define void @test_runtime_check(ptr %a, float %b, i64 %offset, i64 %offset2, i64 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[ARR_IDX2:%.*]] = getelementptr float, ptr [[TMP13]], i64 [[OFFSET2]] ; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARR_IDX2]], align 4 -; CHECK-NEXT: [[M:%.*]] = fmul fast float [[L2]], [[B]] +; CHECK-NEXT: [[M:%.*]] = fmul fast float [[B]], [[L2]] ; CHECK-NEXT: [[AD:%.*]] = fadd 
fast float [[L1]], [[M]] ; CHECK-NEXT: store float [[AD]], ptr [[ARR_IDX]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 @@ -231,7 +231,7 @@ define void @test_runtime_check2(ptr %a, float %b, i64 %offset, i64 %offset2, i6 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[ARR_IDX2:%.*]] = getelementptr float, ptr [[TMP1]], i64 [[OFFSET2:%.*]] ; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[ARR_IDX2]], align 4 -; CHECK-NEXT: [[M:%.*]] = fmul fast float [[L2]], [[B:%.*]] +; CHECK-NEXT: [[M:%.*]] = fmul fast float [[B:%.*]], [[L2]] ; CHECK-NEXT: [[AD:%.*]] = fadd fast float [[L1]], [[M]] ; CHECK-NEXT: store float [[AD]], ptr [[ARR_IDX]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[C:%.*]], i64 [[IV]] diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll index 030eb9e76b51a5..b97ceba8b0116e 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll @@ -12,7 +12,7 @@ define void @add_ind64_unrolled(ptr noalias nocapture %a, ptr noalias nocapture ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -94,7 +94,7 @@ define void @add_ind64_unrolled_nxv1i64(ptr noalias nocapture %a, ptr noalias no ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; 
CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -180,7 +180,7 @@ define void @add_unique_ind32(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() @@ -257,7 +257,7 @@ define void @add_unique_indf32(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll index 629b15c824f673..63ca45495335f4 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll @@ -23,7 +23,7 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 
[[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -87,7 +87,7 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/PGOProfile/chr.ll b/llvm/test/Transforms/PGOProfile/chr.ll index 38e8f8536a19c0..34e39fe37979ac 100644 --- a/llvm/test/Transforms/PGOProfile/chr.ll +++ b/llvm/test/Transforms/PGOProfile/chr.ll @@ -1931,15 +1931,15 @@ bb4: define i32 @test_chr_21(i64 %i, i64 %k, i64 %j) !prof !14 { ; CHECK-LABEL: @test_chr_21( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[J_FR:%.*]] = freeze i64 [[J:%.*]] ; CHECK-NEXT: [[I_FR:%.*]] = freeze i64 [[I:%.*]] -; CHECK-NEXT: [[CMP0:%.*]] = icmp ne i64 [[J_FR]], [[K:%.*]] +; CHECK-NEXT: [[CMP0:%.*]] = icmp ne i64 [[J:%.*]], [[K:%.*]] ; CHECK-NEXT: [[TMP0:%.*]] = freeze i1 [[CMP0]] -; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i64 [[I_FR]], [[J_FR]] +; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i64 [[J]], [[I_FR]] ; CHECK-NEXT: [[CMP_I:%.*]] = icmp ne i64 [[I_FR]], 86 -; CHECK-NEXT: [[TMP1:%.*]] = and i1 [[TMP0]], [[CMP3]] -; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[CMP_I]] -; CHECK-NEXT: br i1 [[TMP2]], label [[BB1:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF15]] +; CHECK-NEXT: [[TMP1:%.*]] = freeze i1 [[CMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = and i1 [[TMP2]], [[CMP_I]] +; 
CHECK-NEXT: br i1 [[TMP3]], label [[BB1:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF15]] ; CHECK: bb1: ; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i64 [[I_FR]], 2 ; CHECK-NEXT: switch i64 [[I_FR]], label [[BB2:%.*]] [ @@ -1971,7 +1971,7 @@ define i32 @test_chr_21(i64 %i, i64 %k, i64 %j) !prof !14 { ; CHECK-NEXT: [[CMP_I_NONCHR:%.*]] = icmp eq i64 [[I_FR]], 86 ; CHECK-NEXT: br i1 [[CMP_I_NONCHR]], label [[BB6_NONCHR:%.*]], label [[BB4_NONCHR:%.*]], !prof [[PROF16]] ; CHECK: bb6.nonchr: -; CHECK-NEXT: [[CMP3_NONCHR:%.*]] = icmp eq i64 [[J_FR]], [[I_FR]] +; CHECK-NEXT: [[CMP3_NONCHR:%.*]] = icmp eq i64 [[J]], [[I_FR]] ; CHECK-NEXT: br i1 [[CMP3_NONCHR]], label [[BB8_NONCHR:%.*]], label [[BB7_NONCHR:%.*]], !prof [[PROF16]] ; CHECK: bb8.nonchr: ; CHECK-NEXT: br i1 [[CMP_I_NONCHR]], label [[BB10]], label [[BB9_NONCHR:%.*]], !prof [[PROF16]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll index 11b1d54227681c..f44e39e82d6063 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll @@ -13,7 +13,7 @@ define i32 @read_only_loop_with_runtime_check(ptr noundef %array, i32 noundef %c ; CHECK: for.body.preheader: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i32 [[TMP1]], [[COUNT]] +; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ugt i32 [[COUNT]], [[TMP1]] ; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[FOR_BODY_PREHEADER10:%.*]], label [[IF_THEN:%.*]] ; CHECK: for.body.preheader10: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 @@ -128,7 +128,7 @@ define dso_local noundef i32 @sum_prefix_with_sum(ptr %s.coerce0, i64 %s.coerce1 ; CHECK-NEXT: br i1 [[CMP5_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] ; CHECK: for.body.preheader: ; CHECK-NEXT: [[TMP0:%.*]] = 
add i64 [[N]], -1 -; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i64 [[TMP0]], [[S_COERCE1]] +; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ugt i64 [[S_COERCE1]], [[TMP0]] ; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[FOR_BODY_PREHEADER8:%.*]], label [[COND_FALSE_I:%.*]], !prof [[PROF4:![0-9]+]] ; CHECK: for.body.preheader8: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 @@ -156,7 +156,7 @@ define dso_local noundef i32 @sum_prefix_with_sum(ptr %s.coerce0, i64 %s.coerce1 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[ADD]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER11]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[RET_0_LCSSA1:%.*]] = phi i32 [ 0, [[ENTRY1:%.*]] ], [ [[ADD]], [[SPAN_CHECKED_ACCESS_EXIT]] ], [ [[ADD1:%.*]], [[FOR_BODY1]] ] @@ -227,7 +227,7 @@ define hidden noundef nonnull align 4 dereferenceable(4) ptr @span_checked_acces ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SIZE__I:%.*]] = getelementptr inbounds i8, ptr [[THIS]], i64 8 ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[__SIZE__I]], align 8 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[TMP0]], [[__IDX]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[__IDX]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]], !prof [[PROF4]] ; CHECK: cond.false: ; CHECK-NEXT: tail call void @llvm.trap() diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll index ad100c399c08ed..33bcab679ba91a 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll +++ 
b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll @@ -134,11 +134,11 @@ define void @loop2(ptr %A, ptr %B, ptr %C, float %x) { ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 40000 ; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 40000 ; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 40000 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP2]], [[B]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[C]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[BOUND04:%.*]] = icmp ugt ptr [[SCEVGEP3]], [[B]] -; CHECK-NEXT: [[BOUND15:%.*]] = icmp ugt ptr [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[B]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] ; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] ; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[LOOP_BODY:%.*]], label [[VECTOR_PH:%.*]] @@ -158,8 +158,8 @@ define void @loop2(ptr %A, ptr %B, ptr %C, float %x) { ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP4]], align 4, !alias.scope [[META7:![0-9]+]] ; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP5]], align 4, !alias.scope [[META7]] -; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[WIDE_LOAD9]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD8]] +; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD9]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX]] ; 
CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP8]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META11:![0-9]+]] @@ -181,7 +181,7 @@ define void @loop2(ptr %A, ptr %B, ptr %C, float %x) { ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[C_LV]], 20 ; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]] ; CHECK-NEXT: [[A_LV_0:%.*]] = load float, ptr [[A_GEP_0]], align 4 -; CHECK-NEXT: [[MUL2_I81_I:%.*]] = fmul float [[A_LV_0]], [[X]] +; CHECK-NEXT: [[MUL2_I81_I:%.*]] = fmul float [[X]], [[A_LV_0]] ; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV1]] ; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_LATCH]], label [[ELSE:%.*]] ; CHECK: else: diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll index db16413cdc94af..db0656da579f4b 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -97,8 +97,8 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_US_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP20]], [[B]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP20]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY4_US_PREHEADER]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -172,8 +172,8 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: 
[[MIN_ITERS_CHECK_1:%.*]] = icmp ult i32 [[I]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_1]], label [[FOR_BODY4_US_PREHEADER_1:%.*]], label [[VECTOR_MEMCHECK_1:%.*]] ; CHECK: vector.memcheck.1: -; CHECK-NEXT: [[BOUND0_1:%.*]] = icmp ugt ptr [[SCEVGEP20]], [[B]] -; CHECK-NEXT: [[BOUND1_1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[BOUND0_1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP20]] +; CHECK-NEXT: [[BOUND1_1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT_1:%.*]] = and i1 [[BOUND0_1]], [[BOUND1_1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT_1]], label [[FOR_BODY4_US_PREHEADER_1]], label [[VECTOR_PH_1:%.*]] ; CHECK: vector.ph.1: @@ -249,8 +249,8 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: [[MIN_ITERS_CHECK_2:%.*]] = icmp ult i32 [[I]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_2]], label [[FOR_BODY4_US_PREHEADER_2:%.*]], label [[VECTOR_MEMCHECK_2:%.*]] ; CHECK: vector.memcheck.2: -; CHECK-NEXT: [[BOUND0_2:%.*]] = icmp ugt ptr [[SCEVGEP20]], [[B]] -; CHECK-NEXT: [[BOUND1_2:%.*]] = icmp ugt ptr [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[BOUND0_2:%.*]] = icmp ult ptr [[B]], [[SCEVGEP20]] +; CHECK-NEXT: [[BOUND1_2:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT_2:%.*]] = and i1 [[BOUND0_2]], [[BOUND1_2]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT_2]], label [[FOR_BODY4_US_PREHEADER_2]], label [[VECTOR_PH_2:%.*]] ; CHECK: vector.ph.2: @@ -326,8 +326,8 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: [[MIN_ITERS_CHECK_3:%.*]] = icmp ult i32 [[I]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_3]], label [[FOR_BODY4_US_PREHEADER_3:%.*]], label [[VECTOR_MEMCHECK_3:%.*]] ; CHECK: vector.memcheck.3: -; CHECK-NEXT: [[BOUND0_3:%.*]] = icmp ugt ptr [[SCEVGEP20]], [[B]] -; CHECK-NEXT: [[BOUND1_3:%.*]] = icmp ugt ptr [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[BOUND0_3:%.*]] = icmp ult ptr [[B]], [[SCEVGEP20]] +; CHECK-NEXT: [[BOUND1_3:%.*]] = icmp ult ptr 
[[A]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT_3:%.*]] = and i1 [[BOUND0_3]], [[BOUND1_3]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT_3]], label [[FOR_BODY4_US_PREHEADER_3]], label [[VECTOR_PH_3:%.*]] ; CHECK: vector.ph.3: diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll index cc4890e27f2bda..2fe49a31b7722d 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll @@ -248,7 +248,7 @@ define i64 @at_with_int_conversion(ptr %ptr, i64 %idx) { ; CHECK-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64 ; CHECK-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64 ; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[END_INT]], [[START_INT]] -; CHECK-NEXT: [[INRANGE:%.*]] = icmp ult i64 [[SUB]], [[IDX:%.*]] +; CHECK-NEXT: [[INRANGE:%.*]] = icmp ugt i64 [[IDX:%.*]], [[SUB]] ; CHECK-NEXT: br i1 [[INRANGE]], label [[ERROR:%.*]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: [[GEP_IDX:%.*]] = getelementptr i64, ptr [[START]], i64 [[IDX]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll index c133852f66937d..b53d0c211919b8 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll @@ -13,11 +13,11 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DCT]], i64 32 ; CHECK-NEXT: [[SCEVGEP23:%.*]] = getelementptr i8, ptr [[BIAS]], i64 32 ; CHECK-NEXT: [[SCEVGEP24:%.*]] = getelementptr i8, ptr [[MF]], i64 32 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[SCEVGEP23]], [[DCT]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[SCEVGEP]], [[BIAS]] +; CHECK-NEXT: 
[[BOUND0:%.*]] = icmp ult ptr [[DCT]], [[SCEVGEP23]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[BIAS]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[BOUND025:%.*]] = icmp ugt ptr [[SCEVGEP24]], [[DCT]] -; CHECK-NEXT: [[BOUND126:%.*]] = icmp ugt ptr [[SCEVGEP]], [[MF]] +; CHECK-NEXT: [[BOUND025:%.*]] = icmp ult ptr [[DCT]], [[SCEVGEP24]] +; CHECK-NEXT: [[BOUND126:%.*]] = icmp ult ptr [[MF]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT27:%.*]] = and i1 [[BOUND025]], [[BOUND126]] ; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT27]] ; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll index 8b82f21e38c931..6e9abb3813aa1d 100644 --- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll +++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll @@ -47,7 +47,7 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[BLOCKSIZE]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER18]] ; CHECK: while.body.preheader18: ; CHECK-NEXT: [[BLKCNT_06_PH:%.*]] = phi i32 [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll index 791ef7cbeb3617..33a0eb43b70856 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll +++ 
b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll @@ -46,7 +46,7 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(ptr noundef nonnull align 8 ; O2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DATA]], align 8 ; O2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 ; O2-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEMS]], -8 -; O2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEMS]] +; O2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUMELEMS]], [[N_VEC]] ; O2-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; O2: for.cond1.preheader: ; O2-NEXT: [[I_06:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC7:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] @@ -96,7 +96,7 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(ptr noundef nonnull align 8 ; O3: for.cond1.preheader.us.preheader: ; O3-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 ; O3-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEMS]], -8 -; O3-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEMS]] +; O3-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUMELEMS]], [[N_VEC]] ; O3-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] ; O3: for.cond1.preheader.us: ; O3-NEXT: [[I_06_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll b/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll index c5deb716d80302..5bf7be4362a8e4 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll @@ -47,7 +47,7 @@ define void @licm(ptr align 8 dereferenceable(8) %_M_start.i, i64 %numElem) { ; O23-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; O23-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; O23: middle.block: -; O23-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEM]] +; O23-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUMELEM]], [[N_VEC]] 
; O23-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER]] ; O23: for.body.preheader: ; O23-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll index c97d8da58be441..d2850f36a80dc2 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll @@ -94,7 +94,7 @@ define float @test_merge_anyof_v4sf(<4 x float> %t) { ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 ; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[T]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[SHIFT]], [[T]] +; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[T]], [[SHIFT]] ; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x float> [[TMP6]], i64 0 ; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[ADD]], float 0.000000e+00 ; CHECK-NEXT: ret float [[RETVAL_0]] @@ -409,7 +409,7 @@ define float @test_merge_anyof_v4si(<4 x i32> %t) { ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 ; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T]] +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[T]], [[SHIFT]] ; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP6]], i64 0 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[ADD]] to float ; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[DOTNOT]], float [[CONV]], float 0.000000e+00 diff --git a/llvm/test/Transforms/PhaseOrdering/fast-basictest.ll b/llvm/test/Transforms/PhaseOrdering/fast-basictest.ll index 0127f05022d713..ec217a9cd31c6d 100644 --- 
a/llvm/test/Transforms/PhaseOrdering/fast-basictest.ll +++ b/llvm/test/Transforms/PhaseOrdering/fast-basictest.ll @@ -139,7 +139,7 @@ define float @test15_reassoc_nsz(float %b, float %a) { define float @test15_reassoc(float %b, float %a) { ; CHECK-LABEL: @test15_reassoc( ; CHECK-NEXT: [[TMP1:%.*]] = fadd reassoc float [[A:%.*]], 1.234000e+03 -; CHECK-NEXT: [[TMP2:%.*]] = fadd reassoc float [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd reassoc float [[B:%.*]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = fsub reassoc float 0.000000e+00, [[A]] ; CHECK-NEXT: [[TMP4:%.*]] = fadd reassoc float [[TMP3]], [[TMP2]] ; CHECK-NEXT: ret float [[TMP4]] diff --git a/llvm/test/Transforms/PhaseOrdering/reassociate-instcombine.ll b/llvm/test/Transforms/PhaseOrdering/reassociate-instcombine.ll index 7e958e8906c9a6..13aeb9e64fc3f5 100644 --- a/llvm/test/Transforms/PhaseOrdering/reassociate-instcombine.ll +++ b/llvm/test/Transforms/PhaseOrdering/reassociate-instcombine.ll @@ -8,7 +8,7 @@ define i4 @not_reassociate_and_and_not(i4 %a, i4 %b, i4 %c, i4 %d) { ; CHECK-LABEL: @not_reassociate_and_and_not( ; CHECK-NEXT: [[TMP1:%.*]] = or i4 [[B:%.*]], [[C:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = xor i4 [[TMP1]], -1 -; CHECK-NEXT: [[AND2:%.*]] = and i4 [[TMP2]], [[A:%.*]] +; CHECK-NEXT: [[AND2:%.*]] = and i4 [[A:%.*]], [[TMP2]] ; CHECK-NEXT: [[AND3:%.*]] = and i4 [[AND2]], [[D:%.*]] ; CHECK-NEXT: ret i4 [[AND3]] ; @@ -25,7 +25,7 @@ define i32 @not_reassociate_or_or_not(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: @not_reassociate_or_or_not( ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[B:%.*]], [[C:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], -1 -; CHECK-NEXT: [[B2:%.*]] = or i32 [[TMP2]], [[A:%.*]] +; CHECK-NEXT: [[B2:%.*]] = or i32 [[A:%.*]], [[TMP2]] ; CHECK-NEXT: [[B3:%.*]] = or i32 [[B2]], [[D:%.*]] ; CHECK-NEXT: ret i32 [[B3]] ; diff --git a/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll b/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll index 
89095048f22493..2933249782f444 100644 --- a/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll +++ b/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll @@ -10,7 +10,7 @@ define void @test_remove_check_with_incrementing_integer_induction(i16 %start, i ; CHECK-LABEL: @test_remove_check_with_incrementing_integer_induction( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LEN:%.*]] = zext i8 [[LEN_N:%.*]] to i16 -; CHECK-NEXT: [[LEN_NEG_NOT:%.*]] = icmp ult i16 [[LEN]], [[A:%.*]] +; CHECK-NEXT: [[LEN_NEG_NOT:%.*]] = icmp ugt i16 [[A:%.*]], [[LEN]] ; CHECK-NEXT: [[C1:%.*]] = icmp ne i8 [[LEN_N]], 0 ; CHECK-NEXT: [[OR_COND3:%.*]] = and i1 [[LEN_NEG_NOT]], [[C1]] ; CHECK-NEXT: br i1 [[OR_COND3]], label [[LOOP_LATCH_PREHEADER:%.*]], label [[EXIT:%.*]] diff --git a/llvm/test/Transforms/Reassociate/fast-ArrayOutOfBounds.ll b/llvm/test/Transforms/Reassociate/fast-ArrayOutOfBounds.ll index 6dc7b89a9b1862..d629ce15c1c92b 100644 --- a/llvm/test/Transforms/Reassociate/fast-ArrayOutOfBounds.ll +++ b/llvm/test/Transforms/Reassociate/fast-ArrayOutOfBounds.ll @@ -6,14 +6,14 @@ define float @test1(float %a0, float %a1, float %a2, float %a3, float %a4) { ; CHECK-LABEL: define float @test1( ; CHECK-SAME: float [[A0:%.*]], float [[A1:%.*]], float [[A2:%.*]], float [[A3:%.*]], float [[A4:%.*]]) { ; CHECK-NEXT: [[TMP_2:%.*]] = fadd float [[A3]], [[A4]] -; CHECK-NEXT: [[TMP_4:%.*]] = fadd float [[TMP_2]], [[A2]] -; CHECK-NEXT: [[TMP_6:%.*]] = fadd float [[TMP_4]], [[A1]] -; CHECK-NEXT: [[TMP_8:%.*]] = fadd float [[TMP_6]], [[A0]] +; CHECK-NEXT: [[TMP_4:%.*]] = fadd float [[A2]], [[TMP_2]] +; CHECK-NEXT: [[TMP_6:%.*]] = fadd float [[A1]], [[TMP_4]] +; CHECK-NEXT: [[TMP_8:%.*]] = fadd float [[A0]], [[TMP_6]] ; CHECK-NEXT: [[TMP_11:%.*]] = fadd float [[A2]], [[A3]] -; CHECK-NEXT: [[TMP_13:%.*]] = fadd float [[TMP_11]], [[A1]] -; CHECK-NEXT: [[TMP_15:%.*]] = fadd float [[TMP_13]], [[A0]] +; CHECK-NEXT: [[TMP_13:%.*]] = fadd float [[A1]], [[TMP_11]] +; CHECK-NEXT: [[TMP_15:%.*]] = fadd 
float [[A0]], [[TMP_13]] ; CHECK-NEXT: [[TMP_18:%.*]] = fadd float [[A1]], [[A2]] -; CHECK-NEXT: [[TMP_20:%.*]] = fadd float [[TMP_18]], [[A0]] +; CHECK-NEXT: [[TMP_20:%.*]] = fadd float [[A0]], [[TMP_18]] ; CHECK-NEXT: [[TMP_23:%.*]] = fadd float [[A0]], [[A1]] ; CHECK-NEXT: [[TMP_26:%.*]] = fsub float [[TMP_8]], [[TMP_15]] ; CHECK-NEXT: [[TMP_28:%.*]] = fadd float [[TMP_20]], [[TMP_26]] diff --git a/llvm/test/Transforms/Reassociate/fast-SubReassociate.ll b/llvm/test/Transforms/Reassociate/fast-SubReassociate.ll index 5152201ea7c934..2d6f67bbaff6a9 100644 --- a/llvm/test/Transforms/Reassociate/fast-SubReassociate.ll +++ b/llvm/test/Transforms/Reassociate/fast-SubReassociate.ll @@ -33,8 +33,8 @@ define float @test2(float %A, float %B) { ; Both 'reassoc' and 'nsz' are required. define float @test2_minimal(float %A, float %B) { ; CHECK-LABEL: @test2_minimal( -; CHECK-NEXT: [[TMP1:%.*]] = fsub reassoc nsz float [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: ret float [[TMP1]] +; CHECK-NEXT: [[Z:%.*]] = fsub reassoc nsz float [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret float [[Z]] ; %W = fadd reassoc nsz float %B, 5.000000e+00 %X = fadd reassoc nsz float %A, -7.000000e+00 @@ -81,7 +81,7 @@ define float @test3(float %A, float %B, float %C, float %D) { define float @test4(float %A, float %B, float %C, float %D) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: [[TMP1:%.*]] = fadd fast float [[B:%.*]], [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast float [[TMP1]], [[C:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast float [[C:%.*]], [[TMP1]] ; CHECK-NEXT: [[Q:%.*]] = fsub fast float [[D:%.*]], [[TMP2]] ; CHECK-NEXT: ret float [[Q]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll index 997b8ac8add32d..fd5f09bf2adc04 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll @@ -8,9 +8,9 @@ define <4 x 
i32> @icmp_eq_v4i32(<4 x i32> %a, ptr %b) { ; CHECK-LABEL: @icmp_eq_v4i32( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x i32> %a, i32 0 @@ -38,9 +38,9 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %a, ptr %b) { define <4 x i32> @icmp_ne_v4i32(<4 x i32> %a, ptr %b) { ; CHECK-LABEL: @icmp_ne_v4i32( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x i32> %a, i32 0 @@ -68,9 +68,9 @@ define <4 x i32> @icmp_ne_v4i32(<4 x i32> %a, ptr %b) { define <4 x i32> @fcmp_oeq_v4i32(<4 x float> %a, ptr %b) { ; CHECK-LABEL: @fcmp_oeq_v4i32( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <4 x float> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp oeq <4 x float> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x float> %a, i32 0 @@ -98,9 +98,9 @@ define <4 x i32> @fcmp_oeq_v4i32(<4 x float> %a, ptr %b) { define <4 x i32> @fcmp_uno_v4i32(<4 x float> %a, ptr %b) { ; CHECK-LABEL: 
@fcmp_uno_v4i32( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x float> %a, i32 0 @@ -132,9 +132,9 @@ define <4 x i32> @fcmp_uno_v4i32(<4 x float> %a, ptr %b) { define <4 x i32> @icmp_sgt_slt_v4i32(<4 x i32> %a, ptr %b) { ; CHECK-LABEL: @icmp_sgt_slt_v4i32( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[A:%.*]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x i32> %a, i32 0 @@ -162,9 +162,9 @@ define <4 x i32> @icmp_sgt_slt_v4i32(<4 x i32> %a, ptr %b) { define <4 x i32> @icmp_uge_ule_v4i32(<4 x i32> %a, ptr %b) { ; CHECK-LABEL: @icmp_uge_ule_v4i32( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <4 x i32> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp uge <4 x i32> [[A:%.*]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x i32> %a, i32 0 @@ -192,9 +192,9 @@ define <4 x i32> @icmp_uge_ule_v4i32(<4 x i32> %a, ptr %b) { define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, ptr %b) { ; CHECK-LABEL: @fcmp_ogt_olt_v4i32( 
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <4 x float> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <4 x float> [[A:%.*]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x float> %a, i32 0 @@ -222,11 +222,11 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, ptr %b) { define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, ptr %b) { ; CHECK-LABEL: @fcmp_ord_uno_v4i32( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fcmp ord <4 x float> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[TMP2]], [[A]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ord <4 x float> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[TMP1]], [[A]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x float> %a, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll index 29cf66a1ea6562..35619d6d3ad1da 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll @@ -8,9 +8,9 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %a, ptr %b) { ; CHECK-LABEL: @icmp_eq_v4i32( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], 
[[A:%.*]] -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x i32> %a, i32 0 @@ -38,9 +38,9 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %a, ptr %b) { define <4 x i32> @icmp_ne_v4i32(<4 x i32> %a, ptr %b) { ; CHECK-LABEL: @icmp_ne_v4i32( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x i32> %a, i32 0 @@ -68,9 +68,9 @@ define <4 x i32> @icmp_ne_v4i32(<4 x i32> %a, ptr %b) { define <4 x i32> @fcmp_oeq_v4i32(<4 x float> %a, ptr %b) { ; CHECK-LABEL: @fcmp_oeq_v4i32( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <4 x float> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp oeq <4 x float> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x float> %a, i32 0 @@ -98,9 +98,9 @@ define <4 x i32> @fcmp_oeq_v4i32(<4 x float> %a, ptr %b) { define <4 x i32> @fcmp_uno_v4i32(<4 x float> %a, ptr %b) { ; CHECK-LABEL: @fcmp_uno_v4i32( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> 
[[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x float> %a, i32 0 @@ -132,9 +132,9 @@ define <4 x i32> @fcmp_uno_v4i32(<4 x float> %a, ptr %b) { define <4 x i32> @icmp_sgt_slt_v4i32(<4 x i32> %a, ptr %b) { ; CHECK-LABEL: @icmp_sgt_slt_v4i32( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[A:%.*]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x i32> %a, i32 0 @@ -162,9 +162,9 @@ define <4 x i32> @icmp_sgt_slt_v4i32(<4 x i32> %a, ptr %b) { define <4 x i32> @icmp_uge_ule_v4i32(<4 x i32> %a, ptr %b) { ; CHECK-LABEL: @icmp_uge_ule_v4i32( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <4 x i32> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp uge <4 x i32> [[A:%.*]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x i32> %a, i32 0 @@ -192,9 +192,9 @@ define <4 x i32> @icmp_uge_ule_v4i32(<4 x i32> %a, ptr %b) { define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, ptr %b) { ; CHECK-LABEL: @fcmp_ogt_olt_v4i32( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <4 x float> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x 
i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <4 x float> [[A:%.*]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x float> %a, i32 0 @@ -222,11 +222,11 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, ptr %b) { define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, ptr %b) { ; CHECK-LABEL: @fcmp_ord_uno_v4i32( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fcmp ord <4 x float> [[TMP2]], [[A:%.*]] -; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[TMP2]], [[A]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ord <4 x float> [[TMP1]], [[A:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[TMP1]], [[A]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x float> %a, i32 0 From 9d364286f3b63e99ed3838f179aa2223f930f1ab Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 21 Aug 2024 14:26:42 +0400 Subject: [PATCH 042/426] AMDGPU: Remove flat/global atomic fadd v2bf16 intrinsics (#97050) These are now fully covered by atomicrmw. 
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 4 - llvm/lib/IR/AutoUpgrade.cpp | 14 +- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 2 - .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 - .../Target/AMDGPU/AMDGPUSearchableTables.td | 2 - llvm/lib/Target/AMDGPU/FLATInstructions.td | 2 - llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +- llvm/test/Bitcode/amdgcn-atomic.ll | 22 ++ .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll | 106 --------- .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 218 ------------------ llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 193 ---------------- 11 files changed, 33 insertions(+), 538 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 7c93c19a410e43..539410f1ed05e6 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2996,10 +2996,6 @@ multiclass AMDGPUMFp8SmfmacIntrinsic { def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic; } -// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. -def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn; -def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn; - defset list AMDGPUMFMAIntrinsics940 = { def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index e24d119b781628..c6963edf5288ae 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1034,7 +1034,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, } if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") || - Name.starts_with("ds.fmax")) { + Name.starts_with("ds.fmax") || + Name.starts_with("global.atomic.fadd.v2bf16") || + Name.starts_with("flat.atomic.fadd.v2bf16")) { // Replaced with atomicrmw fadd/fmin/fmax, so there's no new // declaration. 
NewFn = nullptr; @@ -4042,7 +4044,9 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI, .StartsWith("ds.fmin", AtomicRMWInst::FMin) .StartsWith("ds.fmax", AtomicRMWInst::FMax) .StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap) - .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap); + .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap) + .StartsWith("global.atomic.fadd", AtomicRMWInst::FAdd) + .StartsWith("flat.atomic.fadd", AtomicRMWInst::FAdd); unsigned NumOperands = CI->getNumOperands(); if (NumOperands < 3) // Malformed bitcode. @@ -4097,8 +4101,10 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI, Builder.CreateAtomicRMW(RMWOp, Ptr, Val, std::nullopt, Order, SSID); if (PtrTy->getAddressSpace() != 3) { - RMW->setMetadata("amdgpu.no.fine.grained.memory", - MDNode::get(F->getContext(), {})); + MDNode *EmptyMD = MDNode::get(F->getContext(), {}); + RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD); + if (RMWOp == AtomicRMWInst::FAdd && RetTy->isFloatTy()) + RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD); } if (IsVolatile) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index c6dbc58395e48f..db8b44149cf47e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -620,12 +620,10 @@ multiclass local_addr_space_atomic_op { defm int_amdgcn_flat_atomic_fadd : noret_op; defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op; -defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op; defm int_amdgcn_flat_atomic_fmin : noret_op; defm int_amdgcn_flat_atomic_fmax : noret_op; defm int_amdgcn_global_atomic_fadd : global_addr_space_atomic_op; defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op; -defm int_amdgcn_global_atomic_fadd_v2bf16 : noret_op; defm int_amdgcn_global_atomic_fmin : noret_op; defm int_amdgcn_global_atomic_fmax : noret_op; defm int_amdgcn_global_atomic_csub : noret_op; diff 
--git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 5add368c05646a..12aa6ee2a2536a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4907,8 +4907,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: - case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: - case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_global_load_tr_b64: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 5c4d2b8d030e1d..48fb786ed97206 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -250,8 +250,6 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 351563657aeb55..8067090636a9aa 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1674,13 +1674,11 @@ defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in { defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>; -defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>; defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", 
"atomic_load_fadd_flat", v2bf16>; } let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>; } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ae55b56fbf43fb..d02d0bbb52e567 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1362,9 +1362,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: - case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: - case Intrinsic::amdgcn_atomic_cond_sub_u32: - case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: { + case Intrinsic::amdgcn_atomic_cond_sub_u32: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); @@ -1467,14 +1465,12 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_flat_atomic_fadd: - case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fadd: - case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fmin: diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll index a114c27bafd4a2..9563d178e64330 100644 --- a/llvm/test/Bitcode/amdgcn-atomic.ll +++ 
b/llvm/test/Bitcode/amdgcn-atomic.ll @@ -300,4 +300,26 @@ define float @upgrade_amdgcn_ds_fmax_f32_no_suffix(ptr addrspace(3) %ptr, float ret float %result0 } +declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr, <2 x i16>) + +define <2 x i16> @upgrade_amdgcn_flat_atomic_fadd_v2bf16_p0(ptr %ptr, <2 x i16> %data) { + ; CHECK: [[BC0:%.+]] = bitcast <2 x i16> %data to <2 x bfloat> + ; CHECK-NEXT: [[ATOMIC:%.+]] = atomicrmw fadd ptr %ptr, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + ; CHECK-NEXT: [[BC1:%.+]] = bitcast <2 x bfloat> [[ATOMIC]] to <2 x i16> + ; CHECK-NEXT: ret <2 x i16> [[BC1]] + %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) + ret <2 x i16> %result +} + +declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1), <2 x i16>) + +define <2 x i16> @upgrade_amdgcn_global_atomic_fadd_v2bf16_p1(ptr addrspace(1) %ptr, <2 x i16> %data) { + ; CHECK: [[BC0:%.+]] = bitcast <2 x i16> %data to <2 x bfloat> + ; CHECK-NEXT: [[ATOMIC:%.+]] = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + ; CHECK-NEXT: [[BC1:%.+]] = bitcast <2 x bfloat> [[ATOMIC]] to <2 x i16> + ; CHECK-NEXT: ret <2 x i16> [[BC1]] + %result = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) + ret <2 x i16> %result +} + attributes #0 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index 085da8bc4f8d99..031a3633bd3757 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -4,12 +4,6 @@ declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x 
half> %data) -; bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. -declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) -declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) -declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1) -declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret: ; GFX940: ; %bb.0: @@ -106,106 +100,6 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { ret <2 x half> %ret } -define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX940-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - -define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { -; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] -; GFX940-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) { -; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - -define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX940-LABEL: local_atomic_fadd_v2f16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: ds_pk_add_f16 v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) - ret void -} - -define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX940-LABEL: local_atomic_fadd_v2f16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) - ret <2 x half> %ret -} - -define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX940-LABEL: 
local_atomic_fadd_v2bf16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: ds_pk_add_f16 v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - define <2 x half> @local_atomic_fadd_ret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) { ; GFX940-LABEL: local_atomic_fadd_ret_v2f16_offset: ; GFX940: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll index 1914b74be1909b..05259b4f51310d 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -7,127 +7,7 @@ declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32) declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) -declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) -declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1) -declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x 
i16> %data) declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) -declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) - -define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE -; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1 -; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE -; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 -; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) - ret void -} - -define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE -; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1 -; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_wait_kmcnt 
0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE -; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 -; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - ret void -} - -define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_rtn: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) - ret <2 x half> %ret -} - -define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_rtn: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; 
GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 -; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret: @@ -177,104 +57,6 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { ret <2 x half> %ret } -define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { -; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: 
flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX12-GISEL-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { -; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_rtn: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_rtn: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - -define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { -; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 
v0, s2 -; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) { -; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_rtn: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_rtn: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: global_atomic_pk_add_v2f16: ; GFX12-SDAG: ; %bb.0: ; %main_body diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index 1be934d517ef71..5322a283d3de4d 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -4,10 +4,6 @@ declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) declare <2 x half> 
@llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - -; bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. -declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) -declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1) declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) @@ -186,97 +182,6 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { ret <2 x half> %ret } -define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX940-NEXT: s_endpgm -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; GFX12-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_rtn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 
0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - -define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { -; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] -; GFX940-NEXT: s_endpgm -; -; GFX12-LABEL: global_atomic_fadd_v2bf16_noret: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) { -; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: global_atomic_fadd_v2bf16_rtn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN 
-; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: @@ -573,104 +478,6 @@ define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, <2 x ha ret void } -define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023 - %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret <2 x i16> %result -} - -define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 
s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256 - %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret <2 x i16> %result -} - -define void @flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023 - %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret void -} - -define void @flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset(ptr %ptr, <2 x i16> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 -; 
GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-1024 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256 - %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) - ret void -} - attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } !0 = !{} From 681ae0972205e575ff1fd1d7ab0ef710ae364348 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Wed, 21 Aug 2024 12:59:44 +0200 Subject: [PATCH 043/426] [MLIR][mesh] moving shardinginterfaceimpl for tensor to tensor extension lib (#104913) Follow-up to #102598 : as discussed, move tensor sharding implementation into separate tensor extension lib. @sogartar @yaochengji, could you take a look at this PR? 
--- .../Func/Extensions/MeshShardingExtensions.h | 2 +- .../Dialect/Tensor/Extensions/AllExtensions.h | 30 +++++++++++++++++++ .../Extensions/MeshShardingExtensions.h | 23 ++++++++++++++ .../IR/ShardingInterfaceImpl.h} | 0 mlir/include/mlir/InitAllDialects.h | 2 -- mlir/include/mlir/InitAllExtensions.h | 2 ++ .../Dialect/Mesh/Interfaces/CMakeLists.txt | 1 - mlir/lib/Dialect/Tensor/CMakeLists.txt | 1 + .../Tensor/Extensions/AllExtensions.cpp | 16 ++++++++++ .../Dialect/Tensor/Extensions/CMakeLists.txt | 26 ++++++++++++++++ .../Extensions/MeshShardingExtensions.cpp} | 2 +- mlir/tools/mlir-lsp-server/CMakeLists.txt | 1 + 12 files changed, 101 insertions(+), 5 deletions(-) create mode 100644 mlir/include/mlir/Dialect/Tensor/Extensions/AllExtensions.h create mode 100644 mlir/include/mlir/Dialect/Tensor/Extensions/MeshShardingExtensions.h rename mlir/include/mlir/Dialect/{Mesh/IR/TensorShardingInterfaceImpl.h => Tensor/IR/ShardingInterfaceImpl.h} (100%) create mode 100644 mlir/lib/Dialect/Tensor/Extensions/AllExtensions.cpp create mode 100644 mlir/lib/Dialect/Tensor/Extensions/CMakeLists.txt rename mlir/lib/Dialect/{Mesh/Interfaces/TensorShardingInterfaceImpl.cpp => Tensor/Extensions/MeshShardingExtensions.cpp} (98%) diff --git a/mlir/include/mlir/Dialect/Func/Extensions/MeshShardingExtensions.h b/mlir/include/mlir/Dialect/Func/Extensions/MeshShardingExtensions.h index 9b7abbca5d7622..30d3033209d213 100644 --- a/mlir/include/mlir/Dialect/Func/Extensions/MeshShardingExtensions.h +++ b/mlir/include/mlir/Dialect/Func/Extensions/MeshShardingExtensions.h @@ -1,4 +1,4 @@ -//===- ShardingInterfaceImpl.h - ------------------------------------------===// +//===- MeshShardingExtensions.h - -----------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/mlir/include/mlir/Dialect/Tensor/Extensions/AllExtensions.h b/mlir/include/mlir/Dialect/Tensor/Extensions/AllExtensions.h new file mode 100644 index 00000000000000..db0afa858b1fa0 --- /dev/null +++ b/mlir/include/mlir/Dialect/Tensor/Extensions/AllExtensions.h @@ -0,0 +1,30 @@ +//===- AllExtensions.h - All Tensor Extensions ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines a common entry point for registering all extensions to the +// Tensor dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_TENSOR_EXTENSIONS_ALLEXTENSIONS_H +#define MLIR_DIALECT_TENSOR_EXTENSIONS_ALLEXTENSIONS_H + +namespace mlir { +class DialectRegistry; + +namespace tensor { +/// Register all extensions of the Tensor dialect. This should generally only be +/// used by tools, or other use cases that really do want *all* extensions of +/// the dialect. All other cases should prefer to instead register the specific +/// extensions they intend to take advantage of. +void registerAllExtensions(DialectRegistry ®istry); +} // namespace tensor + +} // namespace mlir + +#endif // MLIR_DIALECT_TENSOR_EXTENSIONS_ALLEXTENSIONS_H diff --git a/mlir/include/mlir/Dialect/Tensor/Extensions/MeshShardingExtensions.h b/mlir/include/mlir/Dialect/Tensor/Extensions/MeshShardingExtensions.h new file mode 100644 index 00000000000000..cfac485b807f2b --- /dev/null +++ b/mlir/include/mlir/Dialect/Tensor/Extensions/MeshShardingExtensions.h @@ -0,0 +1,23 @@ +//===- MeshShardingExtensions.h - -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_TENSOR_EXTENSIONS_SHARDINGEXTENSIONS_H_ +#define MLIR_DIALECT_TENSOR_EXTENSIONS_SHARDINGEXTENSIONS_H_ + +namespace mlir { + +class DialectRegistry; + +namespace tensor { + +void registerShardingInterfaceExternalModels(DialectRegistry ®istry); + +} // namespace tensor +} // namespace mlir + +#endif // MLIR_DIALECT_TENSOR_EXTENSIONS_SHARDINGEXTENSIONS_H_ diff --git a/mlir/include/mlir/Dialect/Mesh/IR/TensorShardingInterfaceImpl.h b/mlir/include/mlir/Dialect/Tensor/IR/ShardingInterfaceImpl.h similarity index 100% rename from mlir/include/mlir/Dialect/Mesh/IR/TensorShardingInterfaceImpl.h rename to mlir/include/mlir/Dialect/Tensor/IR/ShardingInterfaceImpl.h diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h index ab81832cdbee55..73dccdb017ee14 100644 --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -58,7 +58,6 @@ #include "mlir/Dialect/MemRef/Transforms/BufferViewFlowOpInterfaceImpl.h" #include "mlir/Dialect/MemRef/Transforms/RuntimeOpVerification.h" #include "mlir/Dialect/Mesh/IR/MeshDialect.h" -#include "mlir/Dialect/Mesh/IR/TensorShardingInterfaceImpl.h" #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" #include "mlir/Dialect/OpenACC/OpenACC.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" @@ -182,7 +181,6 @@ inline void registerAllDialects(DialectRegistry ®istry) { tensor::registerBufferizableOpInterfaceExternalModels(registry); tensor::registerFindPayloadReplacementOpInterfaceExternalModels(registry); tensor::registerInferTypeOpInterfaceExternalModels(registry); - tensor::registerShardingInterfaceExternalModels(registry); tensor::registerSubsetOpInterfaceExternalModels(registry); tensor::registerTilingInterfaceExternalModels(registry); 
tensor::registerValueBoundsOpInterfaceExternalModels(registry); diff --git a/mlir/include/mlir/InitAllExtensions.h b/mlir/include/mlir/InitAllExtensions.h index 0adc5e52f2a0e5..dc5d4fbea04f49 100644 --- a/mlir/include/mlir/InitAllExtensions.h +++ b/mlir/include/mlir/InitAllExtensions.h @@ -34,6 +34,7 @@ #include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h" #include "mlir/Dialect/SCF/TransformOps/SCFTransformOps.h" #include "mlir/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.h" +#include "mlir/Dialect/Tensor/Extensions/AllExtensions.h" #include "mlir/Dialect/Tensor/TransformOps/TensorTransformOps.h" #include "mlir/Dialect/Transform/DebugExtension/DebugExtension.h" #include "mlir/Dialect/Transform/IRDLExtension/IRDLExtension.h" @@ -60,6 +61,7 @@ inline void registerAllExtensions(DialectRegistry ®istry) { registerConvertComplexToLLVMInterface(registry); cf::registerConvertControlFlowToLLVMInterface(registry); func::registerAllExtensions(registry); + tensor::registerAllExtensions(registry); registerConvertFuncToLLVMInterface(registry); index::registerConvertIndexToLLVMInterface(registry); registerConvertMathToLLVMInterface(registry); diff --git a/mlir/lib/Dialect/Mesh/Interfaces/CMakeLists.txt b/mlir/lib/Dialect/Mesh/Interfaces/CMakeLists.txt index 266fa6fa54557c..afe76b539846a7 100644 --- a/mlir/lib/Dialect/Mesh/Interfaces/CMakeLists.txt +++ b/mlir/lib/Dialect/Mesh/Interfaces/CMakeLists.txt @@ -1,6 +1,5 @@ add_mlir_library(MLIRShardingInterface ShardingInterface.cpp - TensorShardingInterfaceImpl.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Mesh diff --git a/mlir/lib/Dialect/Tensor/CMakeLists.txt b/mlir/lib/Dialect/Tensor/CMakeLists.txt index 329a6c3e80254f..a834aae8fbf81e 100644 --- a/mlir/lib/Dialect/Tensor/CMakeLists.txt +++ b/mlir/lib/Dialect/Tensor/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(Extensions) add_subdirectory(IR) add_subdirectory(Transforms) add_subdirectory(TransformOps) diff --git 
a/mlir/lib/Dialect/Tensor/Extensions/AllExtensions.cpp b/mlir/lib/Dialect/Tensor/Extensions/AllExtensions.cpp new file mode 100644 index 00000000000000..93e1a2021857d3 --- /dev/null +++ b/mlir/lib/Dialect/Tensor/Extensions/AllExtensions.cpp @@ -0,0 +1,16 @@ +//===- AllExtensions.cpp - All Tensor Dialect Extensions ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Tensor/Extensions/AllExtensions.h" +#include "mlir/Dialect/Tensor/Extensions/MeshShardingExtensions.h" + +using namespace mlir; + +void mlir::tensor::registerAllExtensions(DialectRegistry ®istry) { + registerShardingInterfaceExternalModels(registry); +} \ No newline at end of file diff --git a/mlir/lib/Dialect/Tensor/Extensions/CMakeLists.txt b/mlir/lib/Dialect/Tensor/Extensions/CMakeLists.txt new file mode 100644 index 00000000000000..dba59333666f6b --- /dev/null +++ b/mlir/lib/Dialect/Tensor/Extensions/CMakeLists.txt @@ -0,0 +1,26 @@ +set(LLVM_OPTIONAL_SOURCES + AllExtensions.cpp + MeshShardingExtensions.cpp + ) + +add_mlir_extension_library(MLIRTensorMeshShardingExtensions + MeshShardingExtensions.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Tensor/Extensions + + LINK_LIBS PUBLIC + MLIRTensorDialect + MLIRIR + MLIRShardingInterface + ) + +add_mlir_extension_library(MLIRTensorAllExtensions + AllExtensions.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Tensor/Extensions + + LINK_LIBS PUBLIC + MLIRTensorMeshShardingExtensions + ) \ No newline at end of file diff --git a/mlir/lib/Dialect/Mesh/Interfaces/TensorShardingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Extensions/MeshShardingExtensions.cpp similarity index 98% rename from 
mlir/lib/Dialect/Mesh/Interfaces/TensorShardingInterfaceImpl.cpp rename to mlir/lib/Dialect/Tensor/Extensions/MeshShardingExtensions.cpp index 9422dd4a529fd4..f3e72abe7516ee 100644 --- a/mlir/lib/Dialect/Mesh/Interfaces/TensorShardingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/Extensions/MeshShardingExtensions.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/Mesh/IR/TensorShardingInterfaceImpl.h" #include "mlir/Dialect/Mesh/Interfaces/ShardingInterface.h" #include "mlir/Dialect/Mesh/Interfaces/ShardingInterfaceImpl.h" +#include "mlir/Dialect/Tensor/IR/ShardingInterfaceImpl.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/IR/DialectRegistry.h" #include "llvm/Support/Debug.h" diff --git a/mlir/tools/mlir-lsp-server/CMakeLists.txt b/mlir/tools/mlir-lsp-server/CMakeLists.txt index 0134b54eef1b07..8ff9cc2f07e8eb 100644 --- a/mlir/tools/mlir-lsp-server/CMakeLists.txt +++ b/mlir/tools/mlir-lsp-server/CMakeLists.txt @@ -47,6 +47,7 @@ set(LIBS MLIRLspServerLib MLIRParser MLIRPass + MLIRTensorAllExtensions MLIRTransforms MLIRTransformUtils MLIRSupport From 8109e5de57fbdfc0fd292f143da7dfa7543ebdab Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 21 Aug 2024 12:07:40 +0100 Subject: [PATCH 044/426] [DAG] Add select_cc -> abd folds (#102137) Fixes #100810 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 + llvm/test/CodeGen/AArch64/abds-neg.ll | 17 +- llvm/test/CodeGen/AArch64/abds.ll | 16 +- llvm/test/CodeGen/AArch64/abdu-neg.ll | 18 +- llvm/test/CodeGen/AArch64/abdu.ll | 16 +- llvm/test/CodeGen/AMDGPU/sad.ll | 21 +- llvm/test/CodeGen/RISCV/abds-neg.ll | 261 +++++++------ llvm/test/CodeGen/RISCV/abds.ll | 132 +++---- llvm/test/CodeGen/RISCV/abdu-neg.ll | 355 +++++++++--------- llvm/test/CodeGen/RISCV/abdu.ll | 123 +++--- llvm/test/CodeGen/X86/abds-neg.ll | 21 +- llvm/test/CodeGen/X86/abds.ll | 60 ++- llvm/test/CodeGen/X86/abdu.ll | 20 +- 13 files changed, 528 insertions(+), 
534 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 80fdedcf9c6259..4180dcc8a720d5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -27919,6 +27919,8 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, return S; if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N2, N3, CC, DAG)) return S; + if (SDValue ABD = foldSelectToABD(N0, N1, N2, N3, CC, DL)) + return ABD; return SDValue(); } diff --git a/llvm/test/CodeGen/AArch64/abds-neg.ll b/llvm/test/CodeGen/AArch64/abds-neg.ll index 8c9c9f7188d4dc..d4c6a09405e0c4 100644 --- a/llvm/test/CodeGen/AArch64/abds-neg.ll +++ b/llvm/test/CodeGen/AArch64/abds-neg.ll @@ -377,7 +377,7 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: sub x8, x1, x0 ; CHECK-NEXT: subs x9, x0, x1 -; CHECK-NEXT: csel x0, x8, x9, lt +; CHECK-NEXT: csel x0, x9, x8, gt ; CHECK-NEXT: ret %cmp = icmp slt i64 %a, %b %ab = sub i64 %a, %b @@ -389,14 +389,13 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_cmp_i128: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, x2 -; CHECK-NEXT: sbc x8, x1, x3 -; CHECK-NEXT: subs x9, x2, x0 -; CHECK-NEXT: sbc x10, x3, x1 -; CHECK-NEXT: subs x11, x0, x2 -; CHECK-NEXT: sbcs xzr, x1, x3 -; CHECK-NEXT: csel x0, x9, x11, lt -; CHECK-NEXT: csel x1, x10, x8, lt +; CHECK-NEXT: subs x8, x0, x2 +; CHECK-NEXT: sbc x9, x1, x3 +; CHECK-NEXT: subs x10, x2, x0 +; CHECK-NEXT: sbc x11, x3, x1 +; CHECK-NEXT: sbcs xzr, x3, x1 +; CHECK-NEXT: csel x0, x8, x10, lt +; CHECK-NEXT: csel x1, x9, x11, lt ; CHECK-NEXT: ret %cmp = icmp slt i128 %a, %b %ab = sub i128 %a, %b diff --git a/llvm/test/CodeGen/AArch64/abds.ll b/llvm/test/CodeGen/AArch64/abds.ll index 85b70ede2807bb..45bb8749b25ed9 100644 --- a/llvm/test/CodeGen/AArch64/abds.ll +++ b/llvm/test/CodeGen/AArch64/abds.ll 
@@ -298,10 +298,9 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { ; CHECK-LABEL: abd_cmp_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: sub w9, w0, w1 -; CHECK-NEXT: sub w10, w1, w0 -; CHECK-NEXT: cmp w8, w1, sxtb -; CHECK-NEXT: csel w0, w9, w10, gt +; CHECK-NEXT: sub w8, w8, w1, sxtb +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cneg w0, w8, mi ; CHECK-NEXT: ret %cmp = icmp sgt i8 %a, %b %ab = sub i8 %a, %b @@ -314,10 +313,9 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; CHECK-LABEL: abd_cmp_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: sub w9, w0, w1 -; CHECK-NEXT: sub w10, w1, w0 -; CHECK-NEXT: cmp w8, w1, sxth -; CHECK-NEXT: csel w0, w9, w10, ge +; CHECK-NEXT: sub w8, w8, w1, sxth +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cneg w0, w8, mi ; CHECK-NEXT: ret %cmp = icmp sge i16 %a, %b %ab = sub i16 %a, %b @@ -331,7 +329,7 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: sub w8, w1, w0 ; CHECK-NEXT: subs w9, w0, w1 -; CHECK-NEXT: csel w0, w8, w9, lt +; CHECK-NEXT: csel w0, w9, w8, gt ; CHECK-NEXT: ret %cmp = icmp slt i32 %a, %b %ab = sub i32 %a, %b diff --git a/llvm/test/CodeGen/AArch64/abdu-neg.ll b/llvm/test/CodeGen/AArch64/abdu-neg.ll index 1613cbce4b8c8a..b148a29a72976c 100644 --- a/llvm/test/CodeGen/AArch64/abdu-neg.ll +++ b/llvm/test/CodeGen/AArch64/abdu-neg.ll @@ -379,7 +379,7 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: sub x8, x1, x0 ; CHECK-NEXT: subs x9, x0, x1 -; CHECK-NEXT: csel x0, x8, x9, lo +; CHECK-NEXT: csel x0, x9, x8, hi ; CHECK-NEXT: ret %cmp = icmp ult i64 %a, %b %ab = sub i64 %a, %b @@ -391,14 +391,14 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_cmp_i128: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, x2 -; CHECK-NEXT: sbc x8, x1, x3 -; CHECK-NEXT: subs x9, x2, x0 -; CHECK-NEXT: sbc x10, x3, x1 -; CHECK-NEXT: subs x11, x0, x2 -; CHECK-NEXT: sbcs 
xzr, x1, x3 -; CHECK-NEXT: csel x0, x9, x11, lo -; CHECK-NEXT: csel x1, x10, x8, lo +; CHECK-NEXT: subs x8, x0, x2 +; CHECK-NEXT: sbcs x9, x1, x3 +; CHECK-NEXT: cset w10, lo +; CHECK-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NEXT: eor x8, x8, x10 +; CHECK-NEXT: eor x9, x9, x10 +; CHECK-NEXT: subs x0, x8, x10 +; CHECK-NEXT: sbc x1, x9, x10 ; CHECK-NEXT: ret %cmp = icmp ult i128 %a, %b %ab = sub i128 %a, %b diff --git a/llvm/test/CodeGen/AArch64/abdu.ll b/llvm/test/CodeGen/AArch64/abdu.ll index 2baa4f0ca43a7a..22d41dfb85a629 100644 --- a/llvm/test/CodeGen/AArch64/abdu.ll +++ b/llvm/test/CodeGen/AArch64/abdu.ll @@ -301,10 +301,9 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { ; CHECK-LABEL: abd_cmp_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: sub w9, w0, w1 -; CHECK-NEXT: sub w10, w1, w0 -; CHECK-NEXT: cmp w8, w1, uxtb -; CHECK-NEXT: csel w0, w9, w10, hi +; CHECK-NEXT: sub w8, w8, w1, uxtb +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cneg w0, w8, mi ; CHECK-NEXT: ret %cmp = icmp ugt i8 %a, %b %ab = sub i8 %a, %b @@ -317,10 +316,9 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; CHECK-LABEL: abd_cmp_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: sub w9, w0, w1 -; CHECK-NEXT: sub w10, w1, w0 -; CHECK-NEXT: cmp w8, w1, uxth -; CHECK-NEXT: csel w0, w9, w10, hs +; CHECK-NEXT: sub w8, w8, w1, uxth +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cneg w0, w8, mi ; CHECK-NEXT: ret %cmp = icmp uge i16 %a, %b %ab = sub i16 %a, %b @@ -334,7 +332,7 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: sub w8, w1, w0 ; CHECK-NEXT: subs w9, w0, w1 -; CHECK-NEXT: csel w0, w8, w9, lo +; CHECK-NEXT: csel w0, w9, w8, hi ; CHECK-NEXT: ret %cmp = icmp ult i32 %a, %b %ab = sub i32 %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index e4309a29193637..c2132cf907fdb2 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -258,10 +258,9 @@ define 
amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out ; GCN-NEXT: s_add_u32 s16, s16, s13 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s3, s0, s1 -; GCN-NEXT: s_sub_i32 s6, s1, s0 -; GCN-NEXT: s_cmp_gt_u32 s0, s1 -; GCN-NEXT: s_cselect_b32 s0, s3, s6 +; GCN-NEXT: s_min_u32 s3, s0, s1 +; GCN-NEXT: s_max_u32 s0, s0, s1 +; GCN-NEXT: s_sub_i32 s0, s0, s3 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_add_i32 s0, s0, s2 @@ -477,18 +476,14 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext % ; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s4, s2, 8 ; GCN-NEXT: s_and_b32 s3, s2, 0xff -; GCN-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GCN-NEXT: s_lshr_b32 s6, s2, 16 -; GCN-NEXT: s_sub_i32 s7, s2, s4 -; GCN-NEXT: s_sub_i32 s2, s4, s2 -; GCN-NEXT: s_cmp_gt_u32 s3, s5 -; GCN-NEXT: s_cselect_b32 s2, s7, s2 -; GCN-NEXT: s_add_i32 s2, s2, s6 +; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GCN-NEXT: s_lshr_b32 s2, s2, 16 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_sad_u32 v2, s3, v0, v1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: flat_store_byte v[0:1], v2 ; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i8 %a, %b diff --git a/llvm/test/CodeGen/RISCV/abds-neg.ll b/llvm/test/CodeGen/RISCV/abds-neg.ll index 9bd28b91dd4c95..058f105e8f7358 100644 --- a/llvm/test/CodeGen/RISCV/abds-neg.ll +++ b/llvm/test/CodeGen/RISCV/abds-neg.ll @@ -1791,64 +1791,61 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: abd_cmp_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: sltu a4, a2, a0 ; RV32I-NEXT: mv a5, a4 ; RV32I-NEXT: beq a1, a3, .LBB21_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt a5, a1, a3 +; 
RV32I-NEXT: slt a5, a3, a1 ; RV32I-NEXT: .LBB21_2: ; RV32I-NEXT: bnez a5, .LBB21_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: sub a0, a2, a0 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB21_4: -; RV32I-NEXT: sltu a4, a2, a0 -; RV32I-NEXT: sub a1, a3, a1 +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a0, a2, a0 +; RV32I-NEXT: sub a0, a0, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: blt a0, a1, .LBB21_2 +; RV64I-NEXT: blt a1, a0, .LBB21_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: sub a0, a1, a0 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB21_2: -; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_cmp_i64: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: sltu a4, a0, a2 +; RV32ZBB-NEXT: sltu a4, a2, a0 ; RV32ZBB-NEXT: mv a5, a4 ; RV32ZBB-NEXT: beq a1, a3, .LBB21_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt a5, a1, a3 +; RV32ZBB-NEXT: slt a5, a3, a1 ; RV32ZBB-NEXT: .LBB21_2: ; RV32ZBB-NEXT: bnez a5, .LBB21_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a0, a0, a2 +; RV32ZBB-NEXT: sub a0, a2, a0 ; RV32ZBB-NEXT: ret ; RV32ZBB-NEXT: .LBB21_4: -; RV32ZBB-NEXT: sltu a4, a2, a0 -; RV32ZBB-NEXT: sub a1, a3, a1 +; RV32ZBB-NEXT: sltu a4, a0, a2 +; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a0, a2, a0 +; RV32ZBB-NEXT: sub a0, a0, a2 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i64: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: blt a0, a1, .LBB21_2 -; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: sub a0, a0, a1 -; RV64ZBB-NEXT: ret -; RV64ZBB-NEXT: .LBB21_2: -; RV64ZBB-NEXT: sub a0, a1, a0 +; RV64ZBB-NEXT: min a2, a0, a1 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 ; RV64ZBB-NEXT: ret 
%cmp = icmp slt i64 %a, %b %ab = sub i64 %a, %b @@ -1860,176 +1857,194 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_cmp_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a3, 0(a2) -; RV32I-NEXT: lw a4, 0(a1) -; RV32I-NEXT: lw a5, 4(a2) -; RV32I-NEXT: lw a6, 8(a2) -; RV32I-NEXT: lw a7, 8(a1) -; RV32I-NEXT: lw a2, 12(a2) +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a5, 0(a2) +; RV32I-NEXT: lw a4, 4(a1) +; RV32I-NEXT: lw a6, 8(a1) +; RV32I-NEXT: lw a7, 8(a2) ; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a7, a6 -; RV32I-NEXT: mv t4, t1 -; RV32I-NEXT: beq t0, a2, .LBB22_2 +; RV32I-NEXT: lw t1, 12(a2) +; RV32I-NEXT: lw a1, 4(a2) +; RV32I-NEXT: sltu a2, a7, a6 +; RV32I-NEXT: mv t4, a2 +; RV32I-NEXT: beq t0, t1, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t0, a2 +; RV32I-NEXT: slt t4, t1, t0 ; RV32I-NEXT: .LBB22_2: -; RV32I-NEXT: sltu t2, a4, a3 +; RV32I-NEXT: sltu t2, a5, a3 +; RV32I-NEXT: sltu t5, a1, a4 ; RV32I-NEXT: mv t3, t2 -; RV32I-NEXT: beq a1, a5, .LBB22_4 +; RV32I-NEXT: beq a4, a1, .LBB22_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t3, a1, a5 +; RV32I-NEXT: mv t3, t5 ; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: xor t5, t0, a2 -; RV32I-NEXT: xor t6, a7, a6 -; RV32I-NEXT: or t5, t6, t5 -; RV32I-NEXT: mv t6, t3 -; RV32I-NEXT: beqz t5, .LBB22_6 +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: xor t6, t0, t1 +; RV32I-NEXT: xor s0, a6, a7 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: beqz t6, .LBB22_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv t6, t4 +; RV32I-NEXT: mv t3, t4 ; RV32I-NEXT: .LBB22_6: -; RV32I-NEXT: sltu t4, a3, a4 -; RV32I-NEXT: mv t5, t4 -; RV32I-NEXT: beq a1, a5, .LBB22_8 +; RV32I-NEXT: mv t4, t2 +; RV32I-NEXT: beq a1, a4, .LBB22_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: sltu t5, a5, a1 +; RV32I-NEXT: mv t4, t5 ; RV32I-NEXT: .LBB22_8: -; RV32I-NEXT: bnez t6, .LBB22_10 +; RV32I-NEXT: sltu t5, a3, 
a5 +; RV32I-NEXT: mv t6, t5 +; RV32I-NEXT: beq a4, a1, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sub a2, t0, a2 +; RV32I-NEXT: sltu t6, a4, a1 +; RV32I-NEXT: .LBB22_10: +; RV32I-NEXT: bnez t3, .LBB22_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: sub t0, t1, t0 ; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: sltu a7, a6, t3 -; RV32I-NEXT: sub a1, a1, a5 +; RV32I-NEXT: sub a2, t0, a2 +; RV32I-NEXT: sltu a7, a6, t4 ; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a6, a6, t3 +; RV32I-NEXT: sub a3, a5, a3 +; RV32I-NEXT: sub a1, a1, a4 ; RV32I-NEXT: sub a1, a1, t2 -; RV32I-NEXT: sub a3, a4, a3 -; RV32I-NEXT: j .LBB22_11 -; RV32I-NEXT: .LBB22_10: -; RV32I-NEXT: sltu t1, a6, a7 -; RV32I-NEXT: sub a2, a2, t0 -; RV32I-NEXT: sub a2, a2, t1 +; RV32I-NEXT: sub a4, a6, t4 +; RV32I-NEXT: j .LBB22_13 +; RV32I-NEXT: .LBB22_12: +; RV32I-NEXT: sltu a2, a6, a7 +; RV32I-NEXT: sub t0, t0, t1 +; RV32I-NEXT: sub a2, t0, a2 ; RV32I-NEXT: sub a6, a6, a7 -; RV32I-NEXT: sltu a7, a6, t5 +; RV32I-NEXT: sltu a7, a6, t6 ; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a6, a6, t5 -; RV32I-NEXT: sub a5, a5, a1 -; RV32I-NEXT: sub a1, a5, t4 -; RV32I-NEXT: sub a3, a3, a4 -; RV32I-NEXT: .LBB22_11: -; RV32I-NEXT: sw a6, 8(a0) +; RV32I-NEXT: sub a3, a3, a5 +; RV32I-NEXT: sub a4, a4, a1 +; RV32I-NEXT: sub a1, a4, t5 +; RV32I-NEXT: sub a4, a6, t6 +; RV32I-NEXT: .LBB22_13: +; RV32I-NEXT: sw a4, 8(a0) ; RV32I-NEXT: sw a1, 4(a0) ; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i128: ; RV64I: # %bb.0: -; RV64I-NEXT: sltu a4, a0, a2 +; RV64I-NEXT: sltu a4, a2, a0 ; RV64I-NEXT: mv a5, a4 ; RV64I-NEXT: beq a1, a3, .LBB22_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: slt a5, a1, a3 +; RV64I-NEXT: slt a5, a3, a1 ; RV64I-NEXT: .LBB22_2: ; RV64I-NEXT: bnez a5, .LBB22_4 ; RV64I-NEXT: # %bb.3: -; RV64I-NEXT: sub a1, a1, a3 +; RV64I-NEXT: sub a1, a3, a1 ; RV64I-NEXT: 
sub a1, a1, a4 -; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: sub a0, a2, a0 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB22_4: -; RV64I-NEXT: sltu a4, a2, a0 -; RV64I-NEXT: sub a1, a3, a1 +; RV64I-NEXT: sltu a4, a0, a2 +; RV64I-NEXT: sub a1, a1, a3 ; RV64I-NEXT: sub a1, a1, a4 -; RV64I-NEXT: sub a0, a2, a0 +; RV64I-NEXT: sub a0, a0, a2 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_cmp_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a3, 0(a2) -; RV32ZBB-NEXT: lw a4, 0(a1) -; RV32ZBB-NEXT: lw a5, 4(a2) -; RV32ZBB-NEXT: lw a6, 8(a2) -; RV32ZBB-NEXT: lw a7, 8(a1) -; RV32ZBB-NEXT: lw a2, 12(a2) +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a4, 4(a1) +; RV32ZBB-NEXT: lw a6, 8(a1) +; RV32ZBB-NEXT: lw a7, 8(a2) ; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a7, a6 -; RV32ZBB-NEXT: mv t4, t1 -; RV32ZBB-NEXT: beq t0, a2, .LBB22_2 +; RV32ZBB-NEXT: lw t1, 12(a2) +; RV32ZBB-NEXT: lw a1, 4(a2) +; RV32ZBB-NEXT: sltu a2, a7, a6 +; RV32ZBB-NEXT: mv t4, a2 +; RV32ZBB-NEXT: beq t0, t1, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t0, a2 +; RV32ZBB-NEXT: slt t4, t1, t0 ; RV32ZBB-NEXT: .LBB22_2: -; RV32ZBB-NEXT: sltu t2, a4, a3 +; RV32ZBB-NEXT: sltu t2, a5, a3 +; RV32ZBB-NEXT: sltu t5, a1, a4 ; RV32ZBB-NEXT: mv t3, t2 -; RV32ZBB-NEXT: beq a1, a5, .LBB22_4 +; RV32ZBB-NEXT: beq a4, a1, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t3, a1, a5 +; RV32ZBB-NEXT: mv t3, t5 ; RV32ZBB-NEXT: .LBB22_4: -; RV32ZBB-NEXT: xor t5, t0, a2 -; RV32ZBB-NEXT: xor t6, a7, a6 -; RV32ZBB-NEXT: or t5, t6, t5 -; RV32ZBB-NEXT: mv t6, t3 -; RV32ZBB-NEXT: beqz t5, .LBB22_6 +; RV32ZBB-NEXT: addi sp, sp, -16 +; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZBB-NEXT: xor t6, t0, t1 +; RV32ZBB-NEXT: xor s0, a6, a7 +; RV32ZBB-NEXT: or t6, s0, t6 +; RV32ZBB-NEXT: beqz t6, .LBB22_6 ; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: mv t6, t4 +; RV32ZBB-NEXT: mv t3, t4 ; RV32ZBB-NEXT: .LBB22_6: -; RV32ZBB-NEXT: sltu t4, a3, a4 -; 
RV32ZBB-NEXT: mv t5, t4 -; RV32ZBB-NEXT: beq a1, a5, .LBB22_8 +; RV32ZBB-NEXT: mv t4, t2 +; RV32ZBB-NEXT: beq a1, a4, .LBB22_8 ; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: sltu t5, a5, a1 +; RV32ZBB-NEXT: mv t4, t5 ; RV32ZBB-NEXT: .LBB22_8: -; RV32ZBB-NEXT: bnez t6, .LBB22_10 +; RV32ZBB-NEXT: sltu t5, a3, a5 +; RV32ZBB-NEXT: mv t6, t5 +; RV32ZBB-NEXT: beq a4, a1, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sub a2, t0, a2 +; RV32ZBB-NEXT: sltu t6, a4, a1 +; RV32ZBB-NEXT: .LBB22_10: +; RV32ZBB-NEXT: bnez t3, .LBB22_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sub t0, t1, t0 ; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: sltu a7, a6, t3 -; RV32ZBB-NEXT: sub a1, a1, a5 +; RV32ZBB-NEXT: sub a2, t0, a2 +; RV32ZBB-NEXT: sltu a7, a6, t4 ; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a6, a6, t3 +; RV32ZBB-NEXT: sub a3, a5, a3 +; RV32ZBB-NEXT: sub a1, a1, a4 ; RV32ZBB-NEXT: sub a1, a1, t2 -; RV32ZBB-NEXT: sub a3, a4, a3 -; RV32ZBB-NEXT: j .LBB22_11 -; RV32ZBB-NEXT: .LBB22_10: -; RV32ZBB-NEXT: sltu t1, a6, a7 -; RV32ZBB-NEXT: sub a2, a2, t0 -; RV32ZBB-NEXT: sub a2, a2, t1 +; RV32ZBB-NEXT: sub a4, a6, t4 +; RV32ZBB-NEXT: j .LBB22_13 +; RV32ZBB-NEXT: .LBB22_12: +; RV32ZBB-NEXT: sltu a2, a6, a7 +; RV32ZBB-NEXT: sub t0, t0, t1 +; RV32ZBB-NEXT: sub a2, t0, a2 ; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sltu a7, a6, t5 +; RV32ZBB-NEXT: sltu a7, a6, t6 ; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a6, a6, t5 -; RV32ZBB-NEXT: sub a5, a5, a1 -; RV32ZBB-NEXT: sub a1, a5, t4 -; RV32ZBB-NEXT: sub a3, a3, a4 -; RV32ZBB-NEXT: .LBB22_11: -; RV32ZBB-NEXT: sw a6, 8(a0) +; RV32ZBB-NEXT: sub a3, a3, a5 +; RV32ZBB-NEXT: sub a4, a4, a1 +; RV32ZBB-NEXT: sub a1, a4, t5 +; RV32ZBB-NEXT: sub a4, a6, t6 +; RV32ZBB-NEXT: .LBB22_13: +; RV32ZBB-NEXT: sw a4, 8(a0) ; RV32ZBB-NEXT: sw a1, 4(a0) ; RV32ZBB-NEXT: sw a3, 0(a0) ; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZBB-NEXT: addi sp, sp, 16 ; RV32ZBB-NEXT: 
ret ; ; RV64ZBB-LABEL: abd_cmp_i128: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: sltu a4, a0, a2 +; RV64ZBB-NEXT: sltu a4, a2, a0 ; RV64ZBB-NEXT: mv a5, a4 ; RV64ZBB-NEXT: beq a1, a3, .LBB22_2 ; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: slt a5, a1, a3 +; RV64ZBB-NEXT: slt a5, a3, a1 ; RV64ZBB-NEXT: .LBB22_2: ; RV64ZBB-NEXT: bnez a5, .LBB22_4 ; RV64ZBB-NEXT: # %bb.3: -; RV64ZBB-NEXT: sub a1, a1, a3 +; RV64ZBB-NEXT: sub a1, a3, a1 ; RV64ZBB-NEXT: sub a1, a1, a4 -; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: sub a0, a2, a0 ; RV64ZBB-NEXT: ret ; RV64ZBB-NEXT: .LBB22_4: -; RV64ZBB-NEXT: sltu a4, a2, a0 -; RV64ZBB-NEXT: sub a1, a3, a1 +; RV64ZBB-NEXT: sltu a4, a0, a2 +; RV64ZBB-NEXT: sub a1, a1, a3 ; RV64ZBB-NEXT: sub a1, a1, a4 -; RV64ZBB-NEXT: sub a0, a2, a0 +; RV64ZBB-NEXT: sub a0, a0, a2 ; RV64ZBB-NEXT: ret %cmp = icmp slt i128 %a, %b %ab = sub i128 %a, %b diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll index 3cebc1128ae850..b867a55445c95b 100644 --- a/llvm/test/CodeGen/RISCV/abds.ll +++ b/llvm/test/CodeGen/RISCV/abds.ll @@ -1325,42 +1325,35 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { ; RV32I-LABEL: abd_cmp_i8: ; RV32I: # %bb.0: -; RV32I-NEXT: slli a2, a0, 24 -; RV32I-NEXT: srai a2, a2, 24 -; RV32I-NEXT: slli a3, a1, 24 -; RV32I-NEXT: srai a3, a3, 24 -; RV32I-NEXT: blt a3, a2, .LBB18_2 -; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sub a0, a1, a0 -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB18_2: +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: srai a1, a1, 24 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a0, a0, 24 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: xor a0, a0, a1 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i8: ; RV64I: # %bb.0: -; RV64I-NEXT: slli a2, a0, 56 -; RV64I-NEXT: srai a2, a2, 56 -; RV64I-NEXT: slli a3, a1, 56 -; RV64I-NEXT: srai a3, a3, 56 -; RV64I-NEXT: blt a3, a2, .LBB18_2 -; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: 
sub a0, a1, a0 -; RV64I-NEXT: ret -; RV64I-NEXT: .LBB18_2: +; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai a1, a1, 56 +; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a0, a0, 56 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srai a1, a0, 63 +; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret ; ; ZBB-LABEL: abd_cmp_i8: ; ZBB: # %bb.0: -; ZBB-NEXT: sext.b a2, a0 -; ZBB-NEXT: sext.b a3, a1 -; ZBB-NEXT: blt a3, a2, .LBB18_2 -; ZBB-NEXT: # %bb.1: -; ZBB-NEXT: sub a0, a1, a0 -; ZBB-NEXT: ret -; ZBB-NEXT: .LBB18_2: -; ZBB-NEXT: sub a0, a0, a1 +; ZBB-NEXT: sext.b a1, a1 +; ZBB-NEXT: sext.b a0, a0 +; ZBB-NEXT: min a2, a0, a1 +; ZBB-NEXT: max a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 ; ZBB-NEXT: ret %cmp = icmp sgt i8 %a, %b %ab = sub i8 %a, %b @@ -1372,42 +1365,35 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; RV32I-LABEL: abd_cmp_i16: ; RV32I: # %bb.0: -; RV32I-NEXT: slli a2, a1, 16 -; RV32I-NEXT: srai a2, a2, 16 -; RV32I-NEXT: slli a3, a0, 16 -; RV32I-NEXT: srai a3, a3, 16 -; RV32I-NEXT: bge a3, a2, .LBB19_2 -; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sub a0, a1, a0 -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB19_2: +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: srai a1, a1, 16 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: xor a0, a0, a1 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i16: ; RV64I: # %bb.0: -; RV64I-NEXT: slli a2, a1, 48 -; RV64I-NEXT: srai a2, a2, 48 -; RV64I-NEXT: slli a3, a0, 48 -; RV64I-NEXT: srai a3, a3, 48 -; RV64I-NEXT: bge a3, a2, .LBB19_2 -; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sub a0, a1, a0 -; RV64I-NEXT: ret -; RV64I-NEXT: .LBB19_2: +; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai a1, a1, 48 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srai a1, a0, 63 +; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: sub 
a0, a0, a1 ; RV64I-NEXT: ret ; ; ZBB-LABEL: abd_cmp_i16: ; ZBB: # %bb.0: -; ZBB-NEXT: sext.h a2, a1 -; ZBB-NEXT: sext.h a3, a0 -; ZBB-NEXT: bge a3, a2, .LBB19_2 -; ZBB-NEXT: # %bb.1: -; ZBB-NEXT: sub a0, a1, a0 -; ZBB-NEXT: ret -; ZBB-NEXT: .LBB19_2: -; ZBB-NEXT: sub a0, a0, a1 +; ZBB-NEXT: sext.h a1, a1 +; ZBB-NEXT: sext.h a0, a0 +; ZBB-NEXT: min a2, a0, a1 +; ZBB-NEXT: max a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 ; ZBB-NEXT: ret %cmp = icmp sge i16 %a, %b %ab = sub i16 %a, %b @@ -1419,46 +1405,38 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; RV32I-LABEL: abd_cmp_i32: ; RV32I: # %bb.0: -; RV32I-NEXT: blt a0, a1, .LBB20_2 +; RV32I-NEXT: blt a1, a0, .LBB20_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: sub a0, a1, a0 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB20_2: -; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a2, a1 -; RV64I-NEXT: sext.w a3, a0 -; RV64I-NEXT: blt a3, a2, .LBB20_2 -; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: subw a0, a0, a1 -; RV64I-NEXT: ret -; RV64I-NEXT: .LBB20_2: -; RV64I-NEXT: subw a0, a1, a0 +; RV64I-NEXT: sext.w a1, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srai a1, a0, 63 +; RV64I-NEXT: xor a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_cmp_i32: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: blt a0, a1, .LBB20_2 -; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sub a0, a0, a1 -; RV32ZBB-NEXT: ret -; RV32ZBB-NEXT: .LBB20_2: -; RV32ZBB-NEXT: sub a0, a1, a0 +; RV32ZBB-NEXT: min a2, a0, a1 +; RV32ZBB-NEXT: max a0, a0, a1 +; RV32ZBB-NEXT: sub a0, a0, a2 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i32: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: sext.w a2, a1 -; RV64ZBB-NEXT: sext.w a3, a0 -; RV64ZBB-NEXT: blt a3, a2, .LBB20_2 -; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: subw a0, a0, a1 -; RV64ZBB-NEXT: ret -; RV64ZBB-NEXT: .LBB20_2: 
-; RV64ZBB-NEXT: subw a0, a1, a0 +; RV64ZBB-NEXT: sext.w a1, a1 +; RV64ZBB-NEXT: sext.w a0, a0 +; RV64ZBB-NEXT: min a2, a0, a1 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 ; RV64ZBB-NEXT: ret %cmp = icmp slt i32 %a, %b %ab = sub i32 %a, %b diff --git a/llvm/test/CodeGen/RISCV/abdu-neg.ll b/llvm/test/CodeGen/RISCV/abdu-neg.ll index 6b121af7e4e84f..bcacdf44ab1030 100644 --- a/llvm/test/CodeGen/RISCV/abdu-neg.ll +++ b/llvm/test/CodeGen/RISCV/abdu-neg.ll @@ -1740,63 +1740,62 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: abd_cmp_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: sltu a4, a0, a2 -; RV32I-NEXT: mv a5, a4 -; RV32I-NEXT: beq a1, a3, .LBB21_2 +; RV32I-NEXT: sub a3, a1, a3 +; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: sub a2, a0, a2 +; RV32I-NEXT: beq a3, a1, .LBB21_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu a5, a1, a3 +; RV32I-NEXT: sltu a0, a1, a3 +; RV32I-NEXT: j .LBB21_3 ; RV32I-NEXT: .LBB21_2: -; RV32I-NEXT: bnez a5, .LBB21_4 -; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sub a1, a1, a3 -; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a0, a0, a2 -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB21_4: -; RV32I-NEXT: sltu a4, a2, a0 -; RV32I-NEXT: sub a1, a3, a1 +; RV32I-NEXT: sltu a0, a0, a2 +; RV32I-NEXT: .LBB21_3: +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: xor a2, a2, a1 +; RV32I-NEXT: sltu a4, a2, a1 +; RV32I-NEXT: xor a1, a3, a1 +; RV32I-NEXT: add a1, a1, a0 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sub a0, a2, a0 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: bltu a0, a1, .LBB21_2 +; RV64I-NEXT: bltu a1, a0, .LBB21_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: sub a0, a1, a0 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB21_2: -; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_cmp_i64: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: sltu a4, a0, a2 -; RV32ZBB-NEXT: mv a5, a4 -; RV32ZBB-NEXT: beq a1, a3, .LBB21_2 +; RV32ZBB-NEXT: 
sub a3, a1, a3 +; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: sub a2, a0, a2 +; RV32ZBB-NEXT: beq a3, a1, .LBB21_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu a5, a1, a3 +; RV32ZBB-NEXT: sltu a0, a1, a3 +; RV32ZBB-NEXT: j .LBB21_3 ; RV32ZBB-NEXT: .LBB21_2: -; RV32ZBB-NEXT: bnez a5, .LBB21_4 -; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sub a1, a1, a3 -; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a0, a0, a2 -; RV32ZBB-NEXT: ret -; RV32ZBB-NEXT: .LBB21_4: -; RV32ZBB-NEXT: sltu a4, a2, a0 -; RV32ZBB-NEXT: sub a1, a3, a1 +; RV32ZBB-NEXT: sltu a0, a0, a2 +; RV32ZBB-NEXT: .LBB21_3: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: xor a2, a2, a1 +; RV32ZBB-NEXT: sltu a4, a2, a1 +; RV32ZBB-NEXT: xor a1, a3, a1 +; RV32ZBB-NEXT: add a1, a1, a0 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sub a0, a2, a0 +; RV32ZBB-NEXT: add a0, a2, a0 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i64: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: bltu a0, a1, .LBB21_2 -; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: sub a0, a0, a1 -; RV64ZBB-NEXT: ret -; RV64ZBB-NEXT: .LBB21_2: -; RV64ZBB-NEXT: sub a0, a1, a0 +; RV64ZBB-NEXT: minu a2, a0, a1 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 ; RV64ZBB-NEXT: ret %cmp = icmp ult i64 %a, %b %ab = sub i64 %a, %b @@ -1808,176 +1807,194 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_cmp_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a3, 0(a2) -; RV32I-NEXT: lw a4, 0(a1) -; RV32I-NEXT: lw a5, 4(a2) -; RV32I-NEXT: lw a6, 8(a2) -; RV32I-NEXT: lw a7, 8(a1) -; RV32I-NEXT: lw a2, 12(a2) -; RV32I-NEXT: lw t0, 12(a1) +; RV32I-NEXT: lw a5, 0(a2) +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw t1, 12(a2) +; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: lw a6, 12(a1) +; RV32I-NEXT: lw t0, 4(a2) ; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a7, a6 -; RV32I-NEXT: mv t4, t1 -; RV32I-NEXT: beq t0, a2, .LBB22_2 +; RV32I-NEXT: sltu a2, a4, a7 +; RV32I-NEXT: 
sub t1, a6, t1 +; RV32I-NEXT: sltu t2, a3, a5 +; RV32I-NEXT: sub a2, t1, a2 +; RV32I-NEXT: mv t1, t2 +; RV32I-NEXT: beq a1, t0, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t4, t0, a2 +; RV32I-NEXT: sltu t1, a1, t0 ; RV32I-NEXT: .LBB22_2: -; RV32I-NEXT: sltu t2, a4, a3 -; RV32I-NEXT: mv t3, t2 -; RV32I-NEXT: beq a1, a5, .LBB22_4 +; RV32I-NEXT: sub a7, a4, a7 +; RV32I-NEXT: sltu t3, a7, t1 +; RV32I-NEXT: sub a2, a2, t3 +; RV32I-NEXT: sub a7, a7, t1 +; RV32I-NEXT: beq a2, a6, .LBB22_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t3, a1, a5 +; RV32I-NEXT: sltu t1, a6, a2 +; RV32I-NEXT: j .LBB22_5 ; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: xor t5, t0, a2 -; RV32I-NEXT: xor t6, a7, a6 -; RV32I-NEXT: or t5, t6, t5 -; RV32I-NEXT: mv t6, t3 -; RV32I-NEXT: beqz t5, .LBB22_6 -; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv t6, t4 -; RV32I-NEXT: .LBB22_6: -; RV32I-NEXT: sltu t4, a3, a4 -; RV32I-NEXT: mv t5, t4 -; RV32I-NEXT: beq a1, a5, .LBB22_8 -; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: sltu t5, a5, a1 +; RV32I-NEXT: sltu t1, a4, a7 +; RV32I-NEXT: .LBB22_5: +; RV32I-NEXT: sub t0, a1, t0 +; RV32I-NEXT: sub t0, t0, t2 +; RV32I-NEXT: sub a5, a3, a5 +; RV32I-NEXT: beq t0, a1, .LBB22_7 +; RV32I-NEXT: # %bb.6: +; RV32I-NEXT: sltu a1, a1, t0 +; RV32I-NEXT: j .LBB22_8 +; RV32I-NEXT: .LBB22_7: +; RV32I-NEXT: sltu a1, a3, a5 ; RV32I-NEXT: .LBB22_8: -; RV32I-NEXT: bnez t6, .LBB22_10 +; RV32I-NEXT: xor a3, a2, a6 +; RV32I-NEXT: xor a4, a7, a4 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: beqz a3, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sub a2, t0, a2 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: sltu a7, a6, t3 -; RV32I-NEXT: sub a1, a1, a5 -; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a6, a6, t3 -; RV32I-NEXT: sub a1, a1, t2 -; RV32I-NEXT: sub a3, a4, a3 -; RV32I-NEXT: j .LBB22_11 +; RV32I-NEXT: mv a1, t1 ; RV32I-NEXT: .LBB22_10: -; RV32I-NEXT: sltu t1, a6, a7 -; RV32I-NEXT: sub a2, a2, t0 -; RV32I-NEXT: sub a2, a2, t1 -; RV32I-NEXT: sub a6, a6, a7 -; 
RV32I-NEXT: sltu a7, a6, t5 -; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a6, a6, t5 -; RV32I-NEXT: sub a5, a5, a1 -; RV32I-NEXT: sub a1, a5, t4 -; RV32I-NEXT: sub a3, a3, a4 -; RV32I-NEXT: .LBB22_11: -; RV32I-NEXT: sw a6, 8(a0) -; RV32I-NEXT: sw a1, 4(a0) -; RV32I-NEXT: sw a3, 0(a0) -; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: neg a6, a1 +; RV32I-NEXT: xor a3, a7, a6 +; RV32I-NEXT: sltu a4, a3, a6 +; RV32I-NEXT: xor a2, a2, a6 +; RV32I-NEXT: add a2, a2, a1 +; RV32I-NEXT: sub a4, a2, a4 +; RV32I-NEXT: xor a2, a5, a6 +; RV32I-NEXT: sltu a5, a2, a6 +; RV32I-NEXT: xor a7, t0, a6 +; RV32I-NEXT: mv t1, a5 +; RV32I-NEXT: beqz t0, .LBB22_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: sltu t1, a7, a6 +; RV32I-NEXT: .LBB22_12: +; RV32I-NEXT: add a3, a3, a1 +; RV32I-NEXT: sltu a6, a3, t1 +; RV32I-NEXT: sub a4, a4, a6 +; RV32I-NEXT: sub a3, a3, t1 +; RV32I-NEXT: add a7, a7, a1 +; RV32I-NEXT: sub a5, a7, a5 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a5, 4(a0) +; RV32I-NEXT: sw a3, 8(a0) +; RV32I-NEXT: sw a4, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i128: ; RV64I: # %bb.0: ; RV64I-NEXT: sltu a4, a0, a2 -; RV64I-NEXT: mv a5, a4 -; RV64I-NEXT: beq a1, a3, .LBB22_2 +; RV64I-NEXT: sub a3, a1, a3 +; RV64I-NEXT: sub a3, a3, a4 +; RV64I-NEXT: sub a2, a0, a2 +; RV64I-NEXT: beq a3, a1, .LBB22_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sltu a5, a1, a3 +; RV64I-NEXT: sltu a0, a1, a3 +; RV64I-NEXT: j .LBB22_3 ; RV64I-NEXT: .LBB22_2: -; RV64I-NEXT: bnez a5, .LBB22_4 -; RV64I-NEXT: # %bb.3: -; RV64I-NEXT: sub a1, a1, a3 -; RV64I-NEXT: sub a1, a1, a4 -; RV64I-NEXT: sub a0, a0, a2 -; RV64I-NEXT: ret -; RV64I-NEXT: .LBB22_4: -; RV64I-NEXT: sltu a4, a2, a0 -; RV64I-NEXT: sub a1, a3, a1 +; RV64I-NEXT: sltu a0, a0, a2 +; RV64I-NEXT: .LBB22_3: +; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: sltu a4, a2, a1 +; RV64I-NEXT: xor a1, a3, a1 +; RV64I-NEXT: add a1, a1, a0 ; RV64I-NEXT: sub a1, a1, a4 -; RV64I-NEXT: sub a0, a2, a0 +; RV64I-NEXT: 
add a0, a2, a0 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_cmp_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a3, 0(a2) -; RV32ZBB-NEXT: lw a4, 0(a1) -; RV32ZBB-NEXT: lw a5, 4(a2) -; RV32ZBB-NEXT: lw a6, 8(a2) -; RV32ZBB-NEXT: lw a7, 8(a1) -; RV32ZBB-NEXT: lw a2, 12(a2) -; RV32ZBB-NEXT: lw t0, 12(a1) +; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw t1, 12(a2) +; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw a4, 8(a1) +; RV32ZBB-NEXT: lw a6, 12(a1) +; RV32ZBB-NEXT: lw t0, 4(a2) ; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a7, a6 -; RV32ZBB-NEXT: mv t4, t1 -; RV32ZBB-NEXT: beq t0, a2, .LBB22_2 +; RV32ZBB-NEXT: sltu a2, a4, a7 +; RV32ZBB-NEXT: sub t1, a6, t1 +; RV32ZBB-NEXT: sltu t2, a3, a5 +; RV32ZBB-NEXT: sub a2, t1, a2 +; RV32ZBB-NEXT: mv t1, t2 +; RV32ZBB-NEXT: beq a1, t0, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t4, t0, a2 +; RV32ZBB-NEXT: sltu t1, a1, t0 ; RV32ZBB-NEXT: .LBB22_2: -; RV32ZBB-NEXT: sltu t2, a4, a3 -; RV32ZBB-NEXT: mv t3, t2 -; RV32ZBB-NEXT: beq a1, a5, .LBB22_4 +; RV32ZBB-NEXT: sub a7, a4, a7 +; RV32ZBB-NEXT: sltu t3, a7, t1 +; RV32ZBB-NEXT: sub a2, a2, t3 +; RV32ZBB-NEXT: sub a7, a7, t1 +; RV32ZBB-NEXT: beq a2, a6, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t3, a1, a5 +; RV32ZBB-NEXT: sltu t1, a6, a2 +; RV32ZBB-NEXT: j .LBB22_5 ; RV32ZBB-NEXT: .LBB22_4: -; RV32ZBB-NEXT: xor t5, t0, a2 -; RV32ZBB-NEXT: xor t6, a7, a6 -; RV32ZBB-NEXT: or t5, t6, t5 -; RV32ZBB-NEXT: mv t6, t3 -; RV32ZBB-NEXT: beqz t5, .LBB22_6 -; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: mv t6, t4 -; RV32ZBB-NEXT: .LBB22_6: -; RV32ZBB-NEXT: sltu t4, a3, a4 -; RV32ZBB-NEXT: mv t5, t4 -; RV32ZBB-NEXT: beq a1, a5, .LBB22_8 -; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: sltu t5, a5, a1 +; RV32ZBB-NEXT: sltu t1, a4, a7 +; RV32ZBB-NEXT: .LBB22_5: +; RV32ZBB-NEXT: sub t0, a1, t0 +; RV32ZBB-NEXT: sub t0, t0, t2 +; RV32ZBB-NEXT: sub a5, a3, a5 +; RV32ZBB-NEXT: beq t0, a1, .LBB22_7 +; RV32ZBB-NEXT: # %bb.6: +; RV32ZBB-NEXT: sltu a1, 
a1, t0 +; RV32ZBB-NEXT: j .LBB22_8 +; RV32ZBB-NEXT: .LBB22_7: +; RV32ZBB-NEXT: sltu a1, a3, a5 ; RV32ZBB-NEXT: .LBB22_8: -; RV32ZBB-NEXT: bnez t6, .LBB22_10 +; RV32ZBB-NEXT: xor a3, a2, a6 +; RV32ZBB-NEXT: xor a4, a7, a4 +; RV32ZBB-NEXT: or a3, a4, a3 +; RV32ZBB-NEXT: beqz a3, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sub a2, t0, a2 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: sltu a7, a6, t3 -; RV32ZBB-NEXT: sub a1, a1, a5 -; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a6, a6, t3 -; RV32ZBB-NEXT: sub a1, a1, t2 -; RV32ZBB-NEXT: sub a3, a4, a3 -; RV32ZBB-NEXT: j .LBB22_11 +; RV32ZBB-NEXT: mv a1, t1 ; RV32ZBB-NEXT: .LBB22_10: -; RV32ZBB-NEXT: sltu t1, a6, a7 -; RV32ZBB-NEXT: sub a2, a2, t0 -; RV32ZBB-NEXT: sub a2, a2, t1 -; RV32ZBB-NEXT: sub a6, a6, a7 -; RV32ZBB-NEXT: sltu a7, a6, t5 -; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a6, a6, t5 -; RV32ZBB-NEXT: sub a5, a5, a1 -; RV32ZBB-NEXT: sub a1, a5, t4 -; RV32ZBB-NEXT: sub a3, a3, a4 -; RV32ZBB-NEXT: .LBB22_11: -; RV32ZBB-NEXT: sw a6, 8(a0) -; RV32ZBB-NEXT: sw a1, 4(a0) -; RV32ZBB-NEXT: sw a3, 0(a0) -; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: neg a6, a1 +; RV32ZBB-NEXT: xor a3, a7, a6 +; RV32ZBB-NEXT: sltu a4, a3, a6 +; RV32ZBB-NEXT: xor a2, a2, a6 +; RV32ZBB-NEXT: add a2, a2, a1 +; RV32ZBB-NEXT: sub a4, a2, a4 +; RV32ZBB-NEXT: xor a2, a5, a6 +; RV32ZBB-NEXT: sltu a5, a2, a6 +; RV32ZBB-NEXT: xor a7, t0, a6 +; RV32ZBB-NEXT: mv t1, a5 +; RV32ZBB-NEXT: beqz t0, .LBB22_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sltu t1, a7, a6 +; RV32ZBB-NEXT: .LBB22_12: +; RV32ZBB-NEXT: add a3, a3, a1 +; RV32ZBB-NEXT: sltu a6, a3, t1 +; RV32ZBB-NEXT: sub a4, a4, a6 +; RV32ZBB-NEXT: sub a3, a3, t1 +; RV32ZBB-NEXT: add a7, a7, a1 +; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: sw a1, 0(a0) +; RV32ZBB-NEXT: sw a5, 4(a0) +; RV32ZBB-NEXT: sw a3, 8(a0) +; RV32ZBB-NEXT: sw a4, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i128: ; 
RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: sltu a4, a0, a2 -; RV64ZBB-NEXT: mv a5, a4 -; RV64ZBB-NEXT: beq a1, a3, .LBB22_2 +; RV64ZBB-NEXT: sub a3, a1, a3 +; RV64ZBB-NEXT: sub a3, a3, a4 +; RV64ZBB-NEXT: sub a2, a0, a2 +; RV64ZBB-NEXT: beq a3, a1, .LBB22_2 ; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: sltu a5, a1, a3 +; RV64ZBB-NEXT: sltu a0, a1, a3 +; RV64ZBB-NEXT: j .LBB22_3 ; RV64ZBB-NEXT: .LBB22_2: -; RV64ZBB-NEXT: bnez a5, .LBB22_4 -; RV64ZBB-NEXT: # %bb.3: -; RV64ZBB-NEXT: sub a1, a1, a3 -; RV64ZBB-NEXT: sub a1, a1, a4 -; RV64ZBB-NEXT: sub a0, a0, a2 -; RV64ZBB-NEXT: ret -; RV64ZBB-NEXT: .LBB22_4: -; RV64ZBB-NEXT: sltu a4, a2, a0 -; RV64ZBB-NEXT: sub a1, a3, a1 +; RV64ZBB-NEXT: sltu a0, a0, a2 +; RV64ZBB-NEXT: .LBB22_3: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: xor a2, a2, a1 +; RV64ZBB-NEXT: sltu a4, a2, a1 +; RV64ZBB-NEXT: xor a1, a3, a1 +; RV64ZBB-NEXT: add a1, a1, a0 ; RV64ZBB-NEXT: sub a1, a1, a4 -; RV64ZBB-NEXT: sub a0, a2, a0 +; RV64ZBB-NEXT: add a0, a2, a0 ; RV64ZBB-NEXT: ret %cmp = icmp ult i128 %a, %b %ab = sub i128 %a, %b diff --git a/llvm/test/CodeGen/RISCV/abdu.ll b/llvm/test/CodeGen/RISCV/abdu.ll index 0730b9b350863e..39aef369a29672 100644 --- a/llvm/test/CodeGen/RISCV/abdu.ll +++ b/llvm/test/CodeGen/RISCV/abdu.ll @@ -1331,17 +1331,34 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { -; CHECK-LABEL: abd_cmp_i8: -; CHECK: # %bb.0: -; CHECK-NEXT: andi a2, a0, 255 -; CHECK-NEXT: andi a3, a1, 255 -; CHECK-NEXT: bltu a3, a2, .LBB18_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: sub a0, a1, a0 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB18_2: -; CHECK-NEXT: sub a0, a0, a1 -; CHECK-NEXT: ret +; RV32I-LABEL: abd_cmp_i8: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a1, a1, 255 +; RV32I-NEXT: andi a0, a0, 255 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: xor a0, a0, a1 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_cmp_i8: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a1, a1, 
255 +; RV64I-NEXT: andi a0, a0, 255 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srai a1, a0, 63 +; RV64I-NEXT: xor a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; +; ZBB-LABEL: abd_cmp_i8: +; ZBB: # %bb.0: +; ZBB-NEXT: andi a1, a1, 255 +; ZBB-NEXT: andi a0, a0, 255 +; ZBB-NEXT: minu a2, a0, a1 +; ZBB-NEXT: maxu a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 +; ZBB-NEXT: ret %cmp = icmp ugt i8 %a, %b %ab = sub i8 %a, %b %ba = sub i8 %b, %a @@ -1354,13 +1371,11 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 16 ; RV32I-NEXT: addi a2, a2, -1 -; RV32I-NEXT: and a3, a1, a2 -; RV32I-NEXT: and a2, a0, a2 -; RV32I-NEXT: bgeu a2, a3, .LBB19_2 -; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sub a0, a1, a0 -; RV32I-NEXT: ret -; RV32I-NEXT: .LBB19_2: +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: xor a0, a0, a1 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: ret ; @@ -1368,26 +1383,21 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: lui a2, 16 ; RV64I-NEXT: addiw a2, a2, -1 -; RV64I-NEXT: and a3, a1, a2 -; RV64I-NEXT: and a2, a0, a2 -; RV64I-NEXT: bgeu a2, a3, .LBB19_2 -; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sub a0, a1, a0 -; RV64I-NEXT: ret -; RV64I-NEXT: .LBB19_2: +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srai a1, a0, 63 +; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret ; ; ZBB-LABEL: abd_cmp_i16: ; ZBB: # %bb.0: -; ZBB-NEXT: zext.h a2, a1 -; ZBB-NEXT: zext.h a3, a0 -; ZBB-NEXT: bgeu a3, a2, .LBB19_2 -; ZBB-NEXT: # %bb.1: -; ZBB-NEXT: sub a0, a1, a0 -; ZBB-NEXT: ret -; ZBB-NEXT: .LBB19_2: -; ZBB-NEXT: sub a0, a0, a1 +; ZBB-NEXT: zext.h a1, a1 +; ZBB-NEXT: zext.h a0, a0 +; ZBB-NEXT: minu a2, a0, a1 +; ZBB-NEXT: maxu a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 ; ZBB-NEXT: ret %cmp = icmp uge i16 %a, %b %ab = sub i16 %a, %b @@ 
-1399,46 +1409,42 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; RV32I-LABEL: abd_cmp_i32: ; RV32I: # %bb.0: -; RV32I-NEXT: bltu a0, a1, .LBB20_2 +; RV32I-NEXT: bltu a1, a0, .LBB20_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: sub a0, a1, a0 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB20_2: -; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a2, a1 -; RV64I-NEXT: sext.w a3, a0 -; RV64I-NEXT: bltu a3, a2, .LBB20_2 -; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: subw a0, a0, a1 -; RV64I-NEXT: ret -; RV64I-NEXT: .LBB20_2: -; RV64I-NEXT: subw a0, a1, a0 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srai a1, a0, 63 +; RV64I-NEXT: xor a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: abd_cmp_i32: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: bltu a0, a1, .LBB20_2 -; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sub a0, a0, a1 -; RV32ZBB-NEXT: ret -; RV32ZBB-NEXT: .LBB20_2: -; RV32ZBB-NEXT: sub a0, a1, a0 +; RV32ZBB-NEXT: minu a2, a0, a1 +; RV32ZBB-NEXT: maxu a0, a0, a1 +; RV32ZBB-NEXT: sub a0, a0, a2 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i32: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: sext.w a2, a1 -; RV64ZBB-NEXT: sext.w a3, a0 -; RV64ZBB-NEXT: bltu a3, a2, .LBB20_2 -; RV64ZBB-NEXT: # %bb.1: -; RV64ZBB-NEXT: subw a0, a0, a1 -; RV64ZBB-NEXT: ret -; RV64ZBB-NEXT: .LBB20_2: -; RV64ZBB-NEXT: subw a0, a1, a0 +; RV64ZBB-NEXT: slli a1, a1, 32 +; RV64ZBB-NEXT: srli a1, a1, 32 +; RV64ZBB-NEXT: slli a0, a0, 32 +; RV64ZBB-NEXT: srli a0, a0, 32 +; RV64ZBB-NEXT: minu a2, a0, a1 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 ; RV64ZBB-NEXT: ret %cmp = icmp ult i32 %a, %b %ab = sub i32 %a, %b @@ -1713,4 +1719,5 @@ declare i16 @llvm.umin.i16(i16, i16) declare i32 
@llvm.umin.i32(i32, i32) declare i64 @llvm.umin.i64(i64, i64) ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} ; NOZBB: {{.*}} diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll index b72bb46209d2d6..246cd8e0e852d5 100644 --- a/llvm/test/CodeGen/X86/abds-neg.ll +++ b/llvm/test/CodeGen/X86/abds-neg.ll @@ -853,9 +853,8 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax -; X64-NEXT: negq %rax -; X64-NEXT: subq %rsi, %rdi -; X64-NEXT: cmovgeq %rdi, %rax +; X64-NEXT: subq %rdi, %rsi +; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: retq %cmp = icmp slt i64 %a, %b %ab = sub i64 %a, %b @@ -907,14 +906,14 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; ; X64-LABEL: abd_cmp_i128: ; X64: # %bb.0: -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: subq %rdi, %rax -; X64-NEXT: movq %rcx, %r8 -; X64-NEXT: sbbq %rsi, %r8 -; X64-NEXT: subq %rdx, %rdi -; X64-NEXT: sbbq %rcx, %rsi -; X64-NEXT: cmovgeq %rdi, %rax -; X64-NEXT: cmovgeq %rsi, %r8 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: subq %rdx, %rax +; X64-NEXT: movq %rsi, %r8 +; X64-NEXT: sbbq %rcx, %r8 +; X64-NEXT: subq %rdi, %rdx +; X64-NEXT: sbbq %rsi, %rcx +; X64-NEXT: cmovgeq %rdx, %rax +; X64-NEXT: cmovgeq %rcx, %r8 ; X64-NEXT: movq %r8, %rdx ; X64-NEXT: retq %cmp = icmp slt i128 %a, %b diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll index e972ef3787e4d5..9f3b99b349aeda 100644 --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -648,27 +648,23 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { ; X86-LABEL: abd_cmp_i8: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subb %cl, %dl -; X86-NEXT: negb %dl -; X86-NEXT: subb %cl, %al -; X86-NEXT: movzbl %al, %ecx 
-; X86-NEXT: movzbl %dl, %eax -; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovsl %ecx, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i8: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: subb %sil, %al -; X64-NEXT: negb %al -; X64-NEXT: subb %sil, %dil -; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: cmovgl %ecx, %eax +; X64-NEXT: movsbl %sil, %eax +; X64-NEXT: movsbl %dil, %ecx +; X64-NEXT: subl %eax, %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmovsl %ecx, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %cmp = icmp sgt i8 %a, %b @@ -681,27 +677,23 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; X86-LABEL: abd_cmp_i16: ; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: subw %dx, %si -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: negl %eax -; X86-NEXT: cmpw %dx, %cx -; X86-NEXT: cmovgel %esi, %eax +; X86-NEXT: cmovsl %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i16: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: subw %si, %cx +; X64-NEXT: movswl %si, %eax +; X64-NEXT: movswl %di, %ecx +; X64-NEXT: subl %eax, %ecx ; X64-NEXT: movl %ecx, %eax ; X64-NEXT: negl %eax -; X64-NEXT: cmpw %si, %di -; X64-NEXT: cmovgel %ecx, %eax +; X64-NEXT: cmovsl %ecx, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %cmp = icmp sge i16 %a, %b @@ -716,9 +708,8 @@ define i32 
@abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: negl %edx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: cmovll %edx, %eax ; X86-NEXT: retl @@ -727,9 +718,8 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax -; X64-NEXT: negl %eax -; X64-NEXT: subl %esi, %edi -; X64-NEXT: cmovgel %edi, %eax +; X64-NEXT: subl %edi, %esi +; X64-NEXT: cmovgel %esi, %eax ; X64-NEXT: retq %cmp = icmp slt i32 %a, %b %ab = sub i32 %a, %b diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll index e808e0f21babf2..c8fa19cb661b6b 100644 --- a/llvm/test/CodeGen/X86/abdu.ll +++ b/llvm/test/CodeGen/X86/abdu.ll @@ -639,27 +639,23 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind { define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; X86-LABEL: abd_cmp_i16: ; X86: # %bb.0: -; X86-NEXT: pushl %esi +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: subw %dx, %si -; X86-NEXT: movl %esi, %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: negl %eax -; X86-NEXT: cmpw %dx, %cx -; X86-NEXT: cmovael %esi, %eax +; X86-NEXT: cmovsl %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i16: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: subw %si, %cx +; X64-NEXT: movzwl %si, %eax +; X64-NEXT: movzwl %di, %ecx +; X64-NEXT: subl %eax, %ecx ; X64-NEXT: movl %ecx, %eax ; X64-NEXT: negl %eax -; X64-NEXT: cmpw %si, %di -; X64-NEXT: cmovael %ecx, %eax +; X64-NEXT: cmovsl %ecx, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %cmp = icmp uge i16 %a, %b From 
0c07e7c211bed5e14372aebc2fc6edc16ecef8cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= Date: Wed, 21 Aug 2024 13:08:25 +0200 Subject: [PATCH 045/426] [SPIR-V] Sort basic blocks to match the SPIR-V spec (#102929) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SPIR-V spec required basic blocks to respect some kind of ordering (A block dominating another cannot be after in the binary layout). --------- Signed-off-by: Nathan Gauër --- llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp | 50 +++++++++++++++++ llvm/test/CodeGen/SPIRV/block-ordering.ll | 30 ++++++++++ .../SPIRV/branching/OpSwitchBranches.ll | 24 +++++--- .../branching/Two_OpSwitch_same_register.ll | 31 ++++++---- .../CodeGen/SPIRV/branching/if-merging.ll | 13 +++-- .../CodeGen/SPIRV/branching/if-non-merging.ll | 4 +- .../SPIRV/branching/switch-range-check.ll | 6 +- .../CodeGen/SPIRV/phi-ptrcast-dominate.ll | 56 ++++++++++++------- .../CodeGen/SPIRV/scfg-add-pre-headers.ll | 37 +++++++----- .../SPIRV/structurizer/merge-exit-break.ll | 23 ++++---- .../merge-exit-convergence-in-break.ll | 35 ++++++------ .../structurizer/merge-exit-multiple-break.ll | 29 ++++++---- ...ll => merge-exit-simple-while-identity.ll} | 5 +- 13 files changed, 237 insertions(+), 106 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/block-ordering.ll rename llvm/test/CodeGen/SPIRV/structurizer/{merge-exit-simple-white-identity.ll => merge-exit-simple-while-identity.ll} (97%) diff --git a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp index 44685be3d68ad4..5ec228416a8886 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp @@ -19,11 +19,13 @@ #include "SPIRVUtils.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constants.h" 
#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/IntrinsicsSPIRV.h" #include "llvm/Target/TargetIntrinsicInfo.h" +#include #define DEBUG_TYPE "spirv-postlegalizer" @@ -150,6 +152,53 @@ static void processNewInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, } } +// Do a preorder traversal of the CFG starting from the BB |Start|. +// point. Calls |op| on each basic block encountered during the traversal. +void visit(MachineFunction &MF, MachineBasicBlock &Start, + std::function op) { + std::stack ToVisit; + SmallPtrSet Seen; + + ToVisit.push(&Start); + Seen.insert(ToVisit.top()); + while (ToVisit.size() != 0) { + MachineBasicBlock *MBB = ToVisit.top(); + ToVisit.pop(); + + op(MBB); + + for (auto Succ : MBB->successors()) { + if (Seen.contains(Succ)) + continue; + ToVisit.push(Succ); + Seen.insert(Succ); + } + } +} + +// Do a preorder traversal of the CFG starting from the given function's entry +// point. Calls |op| on each basic block encountered during the traversal. +void visit(MachineFunction &MF, std::function op) { + visit(MF, *MF.begin(), op); +} + +// Sorts basic blocks by dominance to respect the SPIR-V spec. +void sortBlocks(MachineFunction &MF) { + MachineDominatorTree MDT(MF); + + std::unordered_map Order; + Order.reserve(MF.size()); + + size_t Index = 0; + visit(MF, [&Order, &Index](MachineBasicBlock *MBB) { Order[MBB] = Index++; }); + + auto Comparator = [&Order](MachineBasicBlock &LHS, MachineBasicBlock &RHS) { + return Order[&LHS] < Order[&RHS]; + }; + + MF.sort(Comparator); +} + bool SPIRVPostLegalizer::runOnMachineFunction(MachineFunction &MF) { // Initialize the type registry. 
const SPIRVSubtarget &ST = MF.getSubtarget(); @@ -158,6 +207,7 @@ bool SPIRVPostLegalizer::runOnMachineFunction(MachineFunction &MF) { MachineIRBuilder MIB(MF); processNewInstrs(MF, GR, MIB); + sortBlocks(MF); return true; } diff --git a/llvm/test/CodeGen/SPIRV/block-ordering.ll b/llvm/test/CodeGen/SPIRV/block-ordering.ll new file mode 100644 index 00000000000000..eee61ce9f22da5 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/block-ordering.ll @@ -0,0 +1,30 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Checks SPIR-V blocks are correctly reordered so that dominators shows up +; before others in the binary layout. + +define void @main() { +; CHECK: OpLabel +; CHECK: OpBranch %[[#l1:]] + +; CHECK: %[[#l1]] = OpLabel +; CHECK: OpBranch %[[#l2:]] + +; CHECK: %[[#l2]] = OpLabel +; CHECK: OpBranch %[[#end:]] + +; CHECK: %[[#end]] = OpLabel +; CHECK: OpReturn +entry: + br label %l1 + +l2: + br label %end + +l1: + br label %l2 + +end: + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/branching/OpSwitchBranches.ll b/llvm/test/CodeGen/SPIRV/branching/OpSwitchBranches.ll index 145c43c6da32b6..454b51d952a365 100644 --- a/llvm/test/CodeGen/SPIRV/branching/OpSwitchBranches.ll +++ b/llvm/test/CodeGen/SPIRV/branching/OpSwitchBranches.ll @@ -10,32 +10,38 @@ entry: i32 3, label %case3 ] -; CHECK-SPIRV: %[[#CASE1]] = OpLabel case1: store i32 1, ptr %alloc -; CHECK-SPIRV: OpBranch %[[#END:]] br label %end -; CHECK-SPIRV: %[[#CASE2]] = OpLabel case2: store i32 2, ptr %alloc -; CHECK-SPIRV: OpBranch %[[#END]] br label %end -; CHECK-SPIRV: %[[#CASE3]] = OpLabel case3: store i32 3, ptr %alloc -; CHECK-SPIRV: OpBranch %[[#END]] br label %end -; CHECK-SPIRV: %[[#DEFAULT]] = OpLabel default: store i32 0, ptr %alloc -; CHECK-SPIRV: OpBranch %[[#END]] br label %end -; CHECK-SPIRV: %[[#END]] = OpLabel end: %result = load i32, ptr %alloc ret 
i32 %result + +; CHECK-SPIRV: %[[#CASE3]] = OpLabel +; CHECK-SPIRV: OpBranch %[[#END:]] + +; CHECK-SPIRV: %[[#END]] = OpLabel +; CHECK-SPIRV: OpReturnValue + +; CHECK-SPIRV: %[[#CASE2]] = OpLabel +; CHECK-SPIRV: OpBranch %[[#END]] + +; CHECK-SPIRV: %[[#CASE1]] = OpLabel +; CHECK-SPIRV: OpBranch %[[#END]] + +; CHECK-SPIRV: %[[#DEFAULT]] = OpLabel +; CHECK-SPIRV: OpBranch %[[#END]] } diff --git a/llvm/test/CodeGen/SPIRV/branching/Two_OpSwitch_same_register.ll b/llvm/test/CodeGen/SPIRV/branching/Two_OpSwitch_same_register.ll index 19c11ff64476b9..f5c12e1c0f5a3f 100644 --- a/llvm/test/CodeGen/SPIRV/branching/Two_OpSwitch_same_register.ll +++ b/llvm/test/CodeGen/SPIRV/branching/Two_OpSwitch_same_register.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} define spir_kernel void @test_two_switch_same_register(i32 %value) { ; CHECK-SPIRV: OpSwitch %[[#REGISTER:]] %[[#DEFAULT1:]] 1 %[[#CASE1:]] 0 %[[#CASE2:]] @@ -7,36 +8,42 @@ define spir_kernel void @test_two_switch_same_register(i32 %value) { i32 0, label %case2 ] -; CHECK-SPIRV: %[[#CASE1]] = OpLabel case1: -; CHECK-SPIRV-NEXT: OpBranch %[[#DEFAULT1]] br label %default1 -; CHECK-SPIRV: %[[#CASE2]] = OpLabel case2: -; CHECK-SPIRV-NEXT: OpBranch %[[#DEFAULT1]] br label %default1 -; CHECK-SPIRV: %[[#DEFAULT1]] = OpLabel default1: -; CHECK-SPIRV-NEXT: OpSwitch %[[#REGISTER]] %[[#DEFAULT2:]] 0 %[[#CASE3:]] 1 %[[#CASE4:]] switch i32 %value, label %default2 [ i32 0, label %case3 i32 1, label %case4 ] -; CHECK-SPIRV: %[[#CASE3]] = OpLabel case3: -; CHECK-SPIRV-NEXT: OpBranch %[[#DEFAULT2]] br label %default2 -; CHECK-SPIRV: %[[#CASE4]] = OpLabel case4: -; CHECK-SPIRV-NEXT: OpBranch %[[#DEFAULT2]] br label %default2 -; CHECK-SPIRV: %[[#DEFAULT2]] = OpLabel default2: -; CHECK-SPIRV-NEXT: OpReturn ret void + +; CHECK-SPIRV: %[[#CASE2]] = OpLabel +; 
CHECK-SPIRV-NEXT: OpBranch %[[#DEFAULT1]] + +; CHECK-SPIRV: %[[#CASE1]] = OpLabel +; CHECK-SPIRV-NEXT: OpBranch %[[#DEFAULT1]] + +; CHECK-SPIRV: %[[#DEFAULT1]] = OpLabel +; CHECK-SPIRV-NEXT: OpSwitch %[[#REGISTER]] %[[#DEFAULT2:]] 0 %[[#CASE3:]] 1 %[[#CASE4:]] + +; CHECK-SPIRV: %[[#CASE4:]] = OpLabel +; CHECK-SPIRV-NEXT: OpBranch %[[#DEFAULT2]] + +; CHECK-SPIRV: %[[#CASE3]] = OpLabel +; CHECK-SPIRV-NEXT: OpBranch %[[#DEFAULT2]] + +; CHECK-SPIRV: %[[#DEFAULT2]] = OpLabel +; CHECK-SPIRV-NEXT: OpReturn } diff --git a/llvm/test/CodeGen/SPIRV/branching/if-merging.ll b/llvm/test/CodeGen/SPIRV/branching/if-merging.ll index 43c0d65a5edc52..c45d06891e7e25 100644 --- a/llvm/test/CodeGen/SPIRV/branching/if-merging.ll +++ b/llvm/test/CodeGen/SPIRV/branching/if-merging.ll @@ -37,15 +37,16 @@ merge_label: ; CHECK: [[COND:%.+]] = OpIEqual [[BOOL]] [[A]] [[B]] ; CHECK: OpBranchConditional [[COND]] [[TRUE_LABEL:%.+]] [[FALSE_LABEL:%.+]] -; CHECK: [[TRUE_LABEL]] = OpLabel -; CHECK: [[V1:%.+]] = OpFunctionCall [[I32]] [[FOO]] -; CHECK: OpBranch [[MERGE_LABEL:%.+]] - ; CHECK: [[FALSE_LABEL]] = OpLabel ; CHECK: [[V2:%.+]] = OpFunctionCall [[I32]] [[BAR]] -; CHECK: OpBranch [[MERGE_LABEL]] +; CHECK: OpBranch [[MERGE_LABEL:%.+]] ; CHECK: [[MERGE_LABEL]] = OpLabel -; CHECK-NEXT: [[V:%.+]] = OpPhi [[I32]] [[V1]] [[TRUE_LABEL]] [[V2]] [[FALSE_LABEL]] +; CHECK-NEXT: [[V:%.+]] = OpPhi [[I32]] [[V1:%.+]] [[TRUE_LABEL]] [[V2]] [[FALSE_LABEL]] ; CHECK: OpReturnValue [[V]] + +; CHECK: [[TRUE_LABEL]] = OpLabel +; CHECK: [[V1]] = OpFunctionCall [[I32]] [[FOO]] +; CHECK: OpBranch [[MERGE_LABEL]] + ; CHECK-NEXT: OpFunctionEnd diff --git a/llvm/test/CodeGen/SPIRV/branching/if-non-merging.ll b/llvm/test/CodeGen/SPIRV/branching/if-non-merging.ll index 319abda86b046c..b9eb988cac1e4e 100644 --- a/llvm/test/CodeGen/SPIRV/branching/if-non-merging.ll +++ b/llvm/test/CodeGen/SPIRV/branching/if-non-merging.ll @@ -21,7 +21,7 @@ false_label: ; CHECK: [[ENTRY:%.+]] = OpLabel ; CHECK: [[COND:%.+]] = OpIEqual 
[[BOOL]] [[A]] [[B]] ; CHECK: OpBranchConditional [[COND]] [[TRUE_LABEL:%.+]] [[FALSE_LABEL:%.+]] -; CHECK: [[TRUE_LABEL]] = OpLabel -; CHECK: OpReturnValue [[TRUE]] ; CHECK: [[FALSE_LABEL]] = OpLabel ; CHECK: OpReturnValue [[FALSE]] +; CHECK: [[TRUE_LABEL]] = OpLabel +; CHECK: OpReturnValue [[TRUE]] diff --git a/llvm/test/CodeGen/SPIRV/branching/switch-range-check.ll b/llvm/test/CodeGen/SPIRV/branching/switch-range-check.ll index f8ce15323aacfb..a6967684f9147b 100644 --- a/llvm/test/CodeGen/SPIRV/branching/switch-range-check.ll +++ b/llvm/test/CodeGen/SPIRV/branching/switch-range-check.ll @@ -2,11 +2,15 @@ ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpFunction +; CHECK: OpBranchConditional %[[#]] %[[#if_then:]] %[[#if_end:]] +; CHECK: %[[#if_end]] = OpLabel ; CHECK: %[[#Var:]] = OpPhi ; CHECK: OpSwitch %[[#Var]] %[[#]] [[#]] %[[#]] [[#]] %[[#]] [[#]] %[[#]] [[#]] %[[#]] [[#]] %[[#]] [[#]] %[[#]] [[#]] %[[#]] [[#]] %[[#]] [[#]] %[[#]] [[#]] %[[#]] [[#]] %[[#]] [[#]] %[[#]] -; CHECK-COUNT-11: OpBranch +; CHECK-COUNT-11: OpLabel ; CHECK-NOT: OpBranch ; CHECK: OpReturn +; CHECK: %[[#if_then]] = OpLabel +; CHECK: OpBranch %[[#if_end]] ; CHECK-NEXT: OpFunctionEnd define spir_func void @foo(i64 noundef %addr, i64 noundef %as) { diff --git a/llvm/test/CodeGen/SPIRV/phi-ptrcast-dominate.ll b/llvm/test/CodeGen/SPIRV/phi-ptrcast-dominate.ll index 2cd321b05a4033..ff6db704ea426b 100644 --- a/llvm/test/CodeGen/SPIRV/phi-ptrcast-dominate.ll +++ b/llvm/test/CodeGen/SPIRV/phi-ptrcast-dominate.ll @@ -6,30 +6,10 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + ; CHECK-DAG: OpName %[[#Case1:]] "case1" ; CHECK-DAG: OpName %[[#Case2:]] "case2" ; CHECK-DAG: OpName %[[#Case3:]] "case3" -; CHECK: %[[#Case1]] = OpFunction -; CHECK: OpBranchConditional -; CHECK: OpPhi -; CHECK: OpBranch -; 
CHECK-COUNT-2: OpBranchConditional -; CHECK: OpFunctionEnd -; CHECK: %[[#Case2]] = OpFunction -; CHECK: OpBranchConditional -; CHECK: OpPhi -; CHECK: OpBranch -; CHECK-COUNT-2: OpBranchConditional -; CHECK: OpFunctionEnd -; CHECK: %[[#Case3]] = OpFunction -; CHECK: OpBranchConditional -; CHECK: OpPhi -; CHECK: OpBranch -; CHECK: OpInBoundsPtrAccessChain -; CHECK: OpBranchConditional -; CHECK: OpInBoundsPtrAccessChain -; CHECK: OpBranchConditional -; CHECK: OpFunctionEnd %struct1 = type { i64 } %struct2 = type { i64, i64 } @@ -37,58 +17,92 @@ @.str.1 = private unnamed_addr addrspace(1) constant [3 x i8] c"OK\00", align 1 @.str.2 = private unnamed_addr addrspace(1) constant [6 x i8] c"WRONG\00", align 1 +; CHECK: %[[#Case1]] = OpFunction define spir_func void @case1(i1 %b1, i1 %b2, i1 %b3) { entry: +; CHECK: OpBranchConditional %[[#]] %[[#l1:]] %[[#l2:]] br i1 %b1, label %l1, label %l2 l1: %str = phi ptr addrspace(1) [ @.str.1, %entry ], [ @.str.2, %l2 ], [ @.str.2, %l3 ] br label %exit +; CHECK: %[[#l2]] = OpLabel +; CHECK: OpBranchConditional %[[#]] %[[#l1:]] %[[#l3:]] l2: br i1 %b2, label %l1, label %l3 +; CHECK: %[[#l3]] = OpLabel +; CHECK: OpBranchConditional %[[#]] %[[#l1:]] %[[#exit:]] l3: br i1 %b3, label %l1, label %exit +; CHECK: %[[#exit]] = OpLabel +; CHECK: OpReturn exit: ret void + +; CHECK: %[[#l1]] = OpLabel +; CHECK-NEXT: OpPhi +; CHECK: OpBranch %[[#exit:]] } +; CHECK: %[[#Case2]] = OpFunction define spir_func void @case2(i1 %b1, i1 %b2, i1 %b3, ptr addrspace(1) byval(%struct1) %str1, ptr addrspace(1) byval(%struct2) %str2) { entry: +; CHECK: OpBranchConditional %[[#]] %[[#l1:]] %[[#l2:]] br i1 %b1, label %l1, label %l2 l1: %str = phi ptr addrspace(1) [ %str1, %entry ], [ %str2, %l2 ], [ %str2, %l3 ] br label %exit +; CHECK: %[[#l2]] = OpLabel +; CHECK: OpBranchConditional %[[#]] %[[#l1:]] %[[#l3:]] l2: br i1 %b2, label %l1, label %l3 +; CHECK: %[[#l3]] = OpLabel +; CHECK: OpBranchConditional %[[#]] %[[#l1:]] %[[#exit:]] l3: br i1 %b3, label %l1, 
label %exit +; CHECK: %[[#exit]] = OpLabel +; CHECK: OpReturn exit: ret void } +; CHECK: %[[#Case3]] = OpFunction define spir_func void @case3(i1 %b1, i1 %b2, i1 %b3, ptr addrspace(1) byval(%struct1) %_arg_str1, ptr addrspace(1) byval(%struct2) %_arg_str2) { entry: br i1 %b1, label %l1, label %l2 +; CHECK: OpBranchConditional %[[#]] %[[#l1:]] %[[#l2:]] l1: %str = phi ptr addrspace(1) [ %_arg_str1, %entry ], [ %str2, %l2 ], [ %str3, %l3 ] br label %exit +; CHECK: %[[#l2]] = OpLabel +; CHECK: OpInBoundsPtrAccessChain +; CHECK: OpBranchConditional %[[#]] %[[#l1:]] %[[#l3:]] l2: %str2 = getelementptr inbounds %struct2, ptr addrspace(1) %_arg_str2, i32 1 br i1 %b2, label %l1, label %l3 +; CHECK: %[[#l3]] = OpLabel +; CHECK: OpInBoundsPtrAccessChain +; CHECK: OpBranchConditional %[[#]] %[[#l1:]] %[[#exit:]] l3: %str3 = getelementptr inbounds %struct2, ptr addrspace(1) %_arg_str2, i32 2 br i1 %b3, label %l1, label %exit +; CHECK: %[[#exit]] = OpLabel +; CHECK: OpReturn exit: ret void + +; CHECK: %[[#l1]] = OpLabel +; CHECK-NEXT: OpPhi +; CHECK: OpBranch %[[#exit:]] } diff --git a/llvm/test/CodeGen/SPIRV/scfg-add-pre-headers.ll b/llvm/test/CodeGen/SPIRV/scfg-add-pre-headers.ll index 2ea5c767730e19..be482c675245ef 100644 --- a/llvm/test/CodeGen/SPIRV/scfg-add-pre-headers.ll +++ b/llvm/test/CodeGen/SPIRV/scfg-add-pre-headers.ll @@ -18,15 +18,35 @@ define void @main() #1 { ; CHECK-DAG: %[[#l2_pre]] = OpLabel ; CHECK-NEXT: OpBranch %[[#l2_header:]] -; CHECK-DAG: %[[#l1_pre]] = OpLabel -; CHECK-NEXT: OpBranch %[[#l1_header:]] +; CHECK-DAG: %[[#l2_header]] = OpLabel +; CHECK-NEXT: OpBranchConditional %[[#cond]] %[[#l2_body:]] %[[#l2_end:]] + +; CHECK-DAG: %[[#l2_end]] = OpLabel +; CHECK-NEXT: OpBranch %[[#end:]] + +; CHECK-DAG: %[[#end]] = OpLabel +; CHECK-NEXT: OpReturn + +; CHECK-DAG: %[[#l2_body]] = OpLabel +; CHECK-NEXT: OpBranch %[[#l2_continue:]] + +; CHECK-DAG: %[[#l2_continue]] = OpLabel +; CHECK-NEXT: OpBranch %[[#l2_header]] l1: %tl1 = call token 
@llvm.experimental.convergence.loop() [ "convergencectrl"(token %t1) ] br i1 %1, label %l1_body, label %l1_end + +; CHECK-DAG: %[[#l1_pre]] = OpLabel +; CHECK-NEXT: OpBranch %[[#l1_header:]] + ; CHECK-DAG: %[[#l1_header]] = OpLabel ; CHECK-NEXT: OpBranchConditional %[[#cond]] %[[#l1_body:]] %[[#l1_end:]] +; CHECK-DAG: %[[#l1_end]] = OpLabel +; CHECK-DAG: %[[#]] = OpLoad %[[#]] %[[#SubgroupLocalInvocationId]] +; CHECK-NEXT: OpBranch %[[#end]] + l1_body: br label %l1_continue ; CHECK-DAG: %[[#l1_body]] = OpLabel @@ -40,35 +60,22 @@ l1_continue: l1_end: %call = call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %tl1) ] br label %end -; CHECK-DAG: %[[#l1_end]] = OpLabel -; CHECK-DAG: %[[#]] = OpLoad %[[#]] %[[#SubgroupLocalInvocationId]] -; CHECK-NEXT: OpBranch %[[#end:]] l2: %tl2 = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %t1) ] br i1 %1, label %l2_body, label %l2_end -; CHECK-DAG: %[[#l2_header]] = OpLabel -; CHECK-NEXT: OpBranchConditional %[[#cond]] %[[#l2_body:]] %[[#l2_end:]] l2_body: br label %l2_continue -; CHECK-DAG: %[[#l2_body]] = OpLabel -; CHECK-NEXT: OpBranch %[[#l2_continue:]] l2_continue: br label %l2 -; CHECK-DAG: %[[#l2_continue]] = OpLabel -; CHECK-NEXT: OpBranch %[[#l2_header]] l2_end: br label %end -; CHECK-DAG: %[[#l2_end]] = OpLabel -; CHECK-NEXT: OpBranch %[[#end:]] end: ret void -; CHECK-DAG: %[[#end]] = OpLabel -; CHECK-NEXT: OpReturn } attributes #1 = { "hlsl.numthreads"="4,8,16" "hlsl.shader"="compute" convergent } diff --git a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-break.ll b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-break.ll index e7b1b441405f61..55f85726cbdd2b 100644 --- a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-break.ll +++ b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-break.ll @@ -33,6 +33,19 @@ while.cond: %cmp = icmp ne i32 %2, 10 br i1 %cmp, label %while.body, label %while.end +; CHECK: %[[#new_end]] = OpLabel +; CHECK: %[[#route:]] = OpPhi %[[#int_ty]] 
%[[#int_1]] %[[#while_cond]] %[[#int_0]] %[[#while_body]] +; CHECK: OpSwitch %[[#route]] %[[#if_then:]] 1 %[[#while_end_loopexit:]] + +; CHECK: %[[#while_end_loopexit]] = OpLabel +; CHECK: OpBranch %[[#while_end:]] + +; CHECK: %[[#while_end]] = OpLabel +; CHECK: OpReturn + +; CHECK: %[[#if_then]] = OpLabel +; CHECK: OpBranch %[[#while_end]] + ; CHECK: %[[#while_body]] = OpLabel ; CHECK-NEXT: %[[#tmp:]] = OpLoad %[[#int_ty]] %[[#builtin]] Aligned 1 ; CHECK-NEXT: OpStore %[[#idx]] %[[#tmp]] Aligned 4 @@ -46,8 +59,6 @@ while.body: %cmp1 = icmp eq i32 %4, 0 br i1 %cmp1, label %if.then, label %if.end -; CHECK: %[[#if_then:]] = OpLabel -; CHECK: OpBranch %[[#while_end:]] if.then: br label %while.end @@ -56,17 +67,9 @@ if.then: if.end: br label %while.cond -; CHECK: %[[#while_end_loopexit:]] = OpLabel -; CHECK: OpBranch %[[#while_end]] - -; CHECK: %[[#while_end]] = OpLabel -; CHECK: OpReturn while.end: ret void -; CHECK: %[[#new_end]] = OpLabel -; CHECK: %[[#route:]] = OpPhi %[[#int_ty]] %[[#int_1]] %[[#while_cond]] %[[#int_0]] %[[#while_body]] -; CHECK: OpSwitch %[[#route]] %[[#if_then]] 1 %[[#while_end_loopexit]] } declare token @llvm.experimental.convergence.entry() #2 diff --git a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-convergence-in-break.ll b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-convergence-in-break.ll index 593e3631c02b9d..72ce6bdcba5ffd 100644 --- a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-convergence-in-break.ll +++ b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-convergence-in-break.ll @@ -33,6 +33,16 @@ while.cond: %cmp = icmp ne i32 %2, 10 br i1 %cmp, label %while.body, label %while.end +; CHECK: %[[#new_end]] = OpLabel +; CHECK: %[[#route:]] = OpPhi %[[#int_ty]] %[[#int_0]] %[[#while_cond]] %[[#int_1]] %[[#tail:]] +; CHECK: OpSwitch %[[#route]] %[[#while_end_loopexit:]] 1 %[[#while_end:]] + +; CHECK: %[[#while_end]] = OpLabel +; CHECK: OpReturn + +; CHECK: %[[#while_end_loopexit]] = OpLabel +; CHECK: OpBranch %[[#while_end]] + ; 
CHECK: %[[#while_body]] = OpLabel ; CHECK-NEXT: %[[#tmp:]] = OpLoad %[[#int_ty]] %[[#builtin]] Aligned 1 ; CHECK-NEXT: OpStore %[[#idx]] %[[#tmp]] Aligned 4 @@ -46,36 +56,29 @@ while.body: %cmp1 = icmp eq i32 %4, 0 br i1 %cmp1, label %if.then, label %if.end -; CHECK: %[[#if_then:]] = OpLabel -; CHECK-NEXT: OpBranch %[[#tail:]] +; CHECK: %[[#if_end]] = OpLabel +; CHECK: OpBranch %[[#while_cond]] + +; CHECK: %[[#if_then]] = OpLabel +; CHECK-NEXT: OpBranch %[[#tail]] if.then: br label %tail -; CHECK: %[[#tail:]] = OpLabel -; CHECK-NEXT: %[[#tmp:]] = OpLoad %[[#int_ty]] %[[#builtin]] Aligned 1 -; CHECK-NEXT: OpStore %[[#idx]] %[[#tmp]] Aligned 4 -; CHECK: OpBranch %[[#new_end:]] +; CHECK: %[[#tail]] = OpLabel +; CHECK-NEXT: %[[#tmp:]] = OpLoad %[[#int_ty]] %[[#builtin]] Aligned 1 +; CHECK-NEXT: OpStore %[[#idx]] %[[#tmp]] Aligned 4 +; CHECK: OpBranch %[[#new_end:]] tail: %5 = call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %1) ] store i32 %5, ptr %idx, align 4 br label %while.end -; CHECK: %[[#if_end]] = OpLabel -; CHECK: OpBranch %[[#while_cond]] if.end: br label %while.cond -; CHECK: %[[#while_end_loopexit:]] = OpLabel -; CHECK: OpBranch %[[#while_end:]] - -; CHECK: %[[#while_end]] = OpLabel -; CHECK: OpReturn while.end: ret void -; CHECK: %[[#new_end]] = OpLabel -; CHECK: %[[#route:]] = OpPhi %[[#int_ty]] %[[#int_0]] %[[#while_cond]] %[[#int_1]] %[[#tail]] -; CHECK: OpSwitch %[[#route]] %[[#while_end_loopexit]] 1 %[[#while_end]] } declare token @llvm.experimental.convergence.entry() #2 diff --git a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-multiple-break.ll b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-multiple-break.ll index 9806dd7955468e..1768d6526f2ba6 100644 --- a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-multiple-break.ll +++ b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-multiple-break.ll @@ -34,12 +34,28 @@ while.cond: %cmp = icmp ne i32 %2, 10 br i1 %cmp, label %while.body, label %while.end +; CHECK: %[[#new_end]] = 
OpLabel +; CHECK: %[[#route:]] = OpPhi %[[#int_ty]] %[[#int_2]] %[[#while_cond]] %[[#int_0]] %[[#while_body]] %[[#int_1]] %[[#if_end:]] +; CHECK: OpSwitch %[[#route]] %[[#if_then:]] 1 %[[#if_then2:]] 2 %[[#while_end_loopexit:]] + +; CHECK: %[[#while_end_loopexit]] = OpLabel +; CHECK: OpBranch %[[#while_end:]] + +; CHECK: %[[#while_end]] = OpLabel +; CHECK: OpReturn + +; CHECK: %[[#if_then2]] = OpLabel +; CHECK: OpBranch %[[#while_end]] + +; CHECK: %[[#if_then]] = OpLabel +; CHECK: OpBranch %[[#while_end]] + ; CHECK: %[[#while_body]] = OpLabel ; CHECK-NEXT: %[[#tmp:]] = OpLoad %[[#int_ty]] %[[#builtin]] Aligned 1 ; CHECK-NEXT: OpStore %[[#idx]] %[[#tmp]] Aligned 4 ; CHECK-NEXT: %[[#tmp:]] = OpLoad %[[#int_ty]] %[[#idx]] Aligned 4 ; CHECK-NEXT: %[[#cmp1:]] = OpIEqual %[[#bool_ty]] %[[#tmp]] %[[#int_0]] -; CHECK: OpBranchConditional %[[#cmp1]] %[[#new_end]] %[[#if_end:]] +; CHECK: OpBranchConditional %[[#cmp1]] %[[#new_end]] %[[#if_end]] while.body: %3 = call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %1) ] store i32 %3, ptr %idx, align 4 @@ -47,8 +63,6 @@ while.body: %cmp1 = icmp eq i32 %4, 0 br i1 %cmp1, label %if.then, label %if.end -; CHECK: %[[#if_then:]] = OpLabel -; CHECK: OpBranch %[[#while_end:]] if.then: br label %while.end @@ -65,8 +79,6 @@ if.end: %cmp2 = icmp eq i32 %6, 0 br i1 %cmp2, label %if.then2, label %if.end2 -; CHECK: %[[#if_then2:]] = OpLabel -; CHECK: OpBranch %[[#while_end:]] if.then2: br label %while.end @@ -75,17 +87,10 @@ if.then2: if.end2: br label %while.cond -; CHECK: %[[#while_end_loopexit:]] = OpLabel -; CHECK: OpBranch %[[#while_end]] -; CHECK: %[[#while_end]] = OpLabel -; CHECK: OpReturn while.end: ret void -; CHECK: %[[#new_end]] = OpLabel -; CHECK: %[[#route:]] = OpPhi %[[#int_ty]] %[[#int_2]] %[[#while_cond]] %[[#int_0]] %[[#while_body]] %[[#int_1]] %[[#if_end]] -; CHECK: OpSwitch %[[#route]] %[[#if_then]] 1 %[[#if_then2]] 2 %[[#while_end_loopexit]] } declare token @llvm.experimental.convergence.entry() #2 diff 
--git a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-simple-white-identity.ll b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-simple-while-identity.ll similarity index 97% rename from llvm/test/CodeGen/SPIRV/structurizer/merge-exit-simple-white-identity.ll rename to llvm/test/CodeGen/SPIRV/structurizer/merge-exit-simple-while-identity.ll index a8bf4fb0db989d..755235b7012a3c 100644 --- a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-simple-white-identity.ll +++ b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-simple-while-identity.ll @@ -21,6 +21,9 @@ while.cond: %cmp = icmp ne i32 %2, 0 br i1 %cmp, label %while.body, label %while.end +; CHECK: %[[#while_end]] = OpLabel +; CHECK-NEXT: OpReturn + ; CHECK: %[[#while_body]] = OpLabel ; CHECK: OpBranch %[[#while_cond]] while.body: @@ -28,8 +31,6 @@ while.body: store i32 %3, ptr %idx, align 4 br label %while.cond - ; CHECK: %[[#while_end]] = OpLabel -; CHECK-NEXT: OpReturn while.end: ret void } From 4e04286d61edfb56338ca3a6d0735c5384508b00 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 21 Aug 2024 13:09:01 +0200 Subject: [PATCH 046/426] [VPlan] Only use selectVectorizationFactor for cross-check (NFCI). (#103033) Use getBestVF to select VF up-front and only use selectVectorizationFactor to get the VF legacy VF to check the vectorization decision matches the VPlan-based cost model. 
PR: https://github.com/llvm/llvm-project/pull/103033 --- .../Vectorize/LoopVectorizationPlanner.h | 11 ++- .../Transforms/Vectorize/LoopVectorize.cpp | 92 ++++++++----------- llvm/lib/Transforms/Vectorize/VPlan.cpp | 4 + .../RISCV/riscv-vector-reverse.ll | 2 - .../first-order-recurrence-chains-vplan.ll | 2 +- .../x86-loopvectorize-costmodel.ll.expected | 2 +- 6 files changed, 50 insertions(+), 63 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 7d242082172c62..3bb7a8e651a3f6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -354,9 +354,10 @@ class LoopVectorizationPlanner { : OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {} - /// Plan how to best vectorize, return the best VF and its cost, or - /// std::nullopt if vectorization and interleaving should be avoided up front. - std::optional plan(ElementCount UserVF, unsigned UserIC); + /// Build VPlans for the specified \p UserVF and \p UserIC if they are + /// non-zero or all applicable candidate VFs otherwise. If vectorization and + /// interleaving should be avoided up-front, no plans are generated. + void plan(ElementCount UserVF, unsigned UserIC); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. @@ -368,7 +369,7 @@ class LoopVectorizationPlanner { /// Compute and return the most profitable vectorization factor. Also collect /// all profitable VFs in ProfitableVFs. - ElementCount computeBestVF(); + VectorizationFactor computeBestVF(); /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan /// according to the best selected \p VF and \p UF. 
@@ -450,12 +451,14 @@ class LoopVectorizationPlanner { VPRecipeBuilder &RecipeBuilder, ElementCount MinVF); +#ifndef NDEBUG /// \return The most profitable vectorization factor for the available VPlans /// and the cost of that VF. /// This is now only used to verify the decisions by the new VPlan-based /// cost-model and will be retired once the VPlan-based cost-model is /// stabilized. VectorizationFactor selectVectorizationFactor(); +#endif /// Returns true if the per-lane cost of VectorizationFactor A is lower than /// that of B. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 364166b3ab5380..e3049da1b22188 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4546,6 +4546,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, return false; } +#ifndef NDEBUG VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1)); LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); @@ -4578,7 +4579,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { InstructionCost C = CM.expectedCost(VF); VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); -#ifndef NDEBUG unsigned AssumedMinimumVscale = getVScaleForTuning(OrigLoop, TTI).value_or(1); unsigned Width = @@ -4591,7 +4591,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " << AssumedMinimumVscale << ")"); LLVM_DEBUG(dbgs() << ".\n"); -#endif if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { LLVM_DEBUG( @@ -4621,6 +4620,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); return ChosenFactor; } +#endif bool 
LoopVectorizationPlanner::isCandidateForEpilogueVectorization( ElementCount VF) const { @@ -6985,15 +6985,14 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { return VectorizationFactor::Disabled(); } -std::optional -LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { +void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); CM.collectValuesToIgnore(); CM.collectElementTypesForWidening(); FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. - return std::nullopt; + return; // Invalidate interleave groups if all blocks of loop will be predicated. if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && @@ -7028,14 +7027,8 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { if (CM.selectUserVectorizationFactor(UserVF)) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); buildVPlansWithVPRecipes(UserVF, UserVF); - if (!hasPlanWithVF(UserVF)) { - LLVM_DEBUG(dbgs() - << "LV: No VPlan could be built for " << UserVF << ".\n"); - return std::nullopt; - } - LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0, 0}}; + return; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); @@ -7066,24 +7059,6 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); LLVM_DEBUG(printPlans(dbgs())); - if (VPlans.empty()) - return std::nullopt; - if (all_of(VPlans, - [](std::unique_ptr &P) { return P->hasScalarVFOnly(); })) - return VectorizationFactor::Disabled(); - - // Select the optimal vectorization factor according to the legacy cost-model. 
- // This is now only used to verify the decisions by the new VPlan-based - // cost-model and will be retired once the VPlan-based cost-model is - // stabilized. - VectorizationFactor VF = selectVectorizationFactor(); - assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); - if (!hasPlanWithVF(VF.Width)) { - LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width - << ".\n"); - return std::nullopt; - } - return VF; } InstructionCost VPCostContext::getLegacyCost(Instruction *UI, @@ -7255,11 +7230,13 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, return Cost; } -ElementCount LoopVectorizationPlanner::computeBestVF() { +VectorizationFactor LoopVectorizationPlanner::computeBestVF() { + if (VPlans.empty()) + return VectorizationFactor::Disabled(); // If there is a single VPlan with a single VF, return it directly. VPlan &FirstPlan = *VPlans[0]; if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1) - return *FirstPlan.vectorFactors().begin(); + return {*FirstPlan.vectorFactors().begin(), 0, 0}; ElementCount ScalarVF = ElementCount::getFixed(1); assert(hasPlanWithVF(ScalarVF) && @@ -7267,6 +7244,7 @@ ElementCount LoopVectorizationPlanner::computeBestVF() { // TODO: Compute scalar cost using VPlan-based cost model. InstructionCost ScalarCost = CM.expectedCost(ScalarVF); + LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n"); VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost); VectorizationFactor BestFactor = ScalarFactor; @@ -7300,7 +7278,20 @@ ElementCount LoopVectorizationPlanner::computeBestVF() { ProfitableVFs.push_back(CurrentFactor); } } - return BestFactor.Width; + +#ifndef NDEBUG + // Select the optimal vectorization factor according to the legacy cost-model. + // This is now only used to verify the decisions by the new VPlan-based + // cost-model and will be retired once the VPlan-based cost-model is + // stabilized. 
+ VectorizationFactor LegacyVF = selectVectorizationFactor(); + assert(BestFactor.Width == LegacyVF.Width && + " VPlan cost model and legacy cost model disagreed"); + assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) && + "when vectorizing, the scalar cost must be computed."); +#endif + + return BestFactor; } static void AddRuntimeUnrollDisableMetaData(Loop *L) { @@ -9971,21 +9962,19 @@ bool LoopVectorizePass::processLoop(Loop *L) { ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); - // Plan how to best vectorize, return the best VF and its cost. - std::optional MaybeVF = LVP.plan(UserVF, UserIC); + // Plan how to best vectorize. + LVP.plan(UserVF, UserIC); + VectorizationFactor VF = LVP.computeBestVF(); + unsigned IC = 1; if (ORE->allowExtraAnalysis(LV_NAME)) LVP.emitInvalidCostRemarks(ORE); - VectorizationFactor VF = VectorizationFactor::Disabled(); - unsigned IC = 1; - bool AddBranchWeights = hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getDataLayout(), AddBranchWeights); - if (MaybeVF) { - VF = *MaybeVF; + if (LVP.hasPlanWithVF(VF.Width)) { // Select the interleave count. IC = CM.selectInterleaveCount(VF.Width, VF.Cost); @@ -10025,7 +10014,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizeLoop = false; } - if (!MaybeVF && UserIC > 1) { + if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) { // Tell the user interleaving was avoided up-front, despite being explicitly // requested. 
LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " @@ -10107,11 +10096,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, BFI, PSI, Checks); - ElementCount BestVF = LVP.computeBestVF(); - assert(BestVF.isScalar() && - "VPlan cost model and legacy cost model disagreed"); - VPlan &BestPlan = LVP.getPlanFor(BestVF); - LVP.executePlan(BestVF, IC, BestPlan, Unroller, DT, false); + VPlan &BestPlan = LVP.getPlanFor(VF.Width); + LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); ORE->emit([&]() { return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), @@ -10122,20 +10108,16 @@ bool LoopVectorizePass::processLoop(Loop *L) { } else { // If we decided that it is *legal* to vectorize the loop, then do it. - ElementCount BestVF = LVP.computeBestVF(); - LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << BestVF << "\n"); - assert(VF.Width == BestVF && - "VPlan cost model and legacy cost model disagreed"); - VPlan &BestPlan = LVP.getPlanFor(BestVF); + VPlan &BestPlan = LVP.getPlanFor(VF.Width); // Consider vectorizing the epilogue too if it's profitable. VectorizationFactor EpilogueVF = - LVP.selectEpilogueVectorizationFactor(BestVF, IC); + LVP.selectEpilogueVectorizationFactor(VF.Width, IC); if (EpilogueVF.Width.isVector()) { // The first pass vectorizes the main loop and creates a scalar epilogue // to be vectorized by executing the plan (potentially with a different // factor) again shortly afterwards. 
- EpilogueLoopVectorizationInfo EPI(BestVF, IC, EpilogueVF.Width, 1); + EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks); @@ -10230,10 +10212,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (!MainILV.areSafetyChecksAdded()) DisableRuntimeUnroll = true; } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, BestVF, + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, PSI, Checks); - LVP.executePlan(BestVF, IC, BestPlan, LB, DT, false); + LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 15acfc00251e29..f487b1de1e5b37 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1695,6 +1695,10 @@ VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LoopVectorizationPlanner::printPlans(raw_ostream &O) { + if (VPlans.empty()) { + O << "LV: No VPlans built.\n"; + return; + } for (const auto &Plan : VPlans) if (PrintVPlansInDotFormat) Plan->printDOT(O); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 9f70a891efe76b..38af580e25c9cc 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -133,7 +133,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Not Interleaving. ; CHECK-NEXT: LV: Interleaving is not beneficial. 
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in -; CHECK-NEXT: VF picked by VPlan cost model: vscale x 4 ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 ; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF>=1' { @@ -336,7 +335,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Not Interleaving. ; CHECK-NEXT: LV: Interleaving is not beneficial. ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in -; CHECK-NEXT: VF picked by VPlan cost model: vscale x 4 ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 ; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF>=1' { diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll index 66a50f6e3f373f..6522ed2b52b4fb 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll @@ -154,7 +154,7 @@ exit: ; FOR (for.y) should be moved which is not currently supported. define i32 @test_chained_first_order_recurrences_4(ptr %base) { ; CHECK-LABEL: 'test_chained_first_order_recurrences_4' -; CHECK: No VPlan could be built for +; CHECK: No VPlans built. 
entry: br label %loop diff --git a/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/x86-loopvectorize-costmodel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/x86-loopvectorize-costmodel.ll.expected index 5aa270e76f4c80..4b146a5c4bc3c5 100644 --- a/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/x86-loopvectorize-costmodel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/x86-loopvectorize-costmodel.ll.expected @@ -11,13 +11,13 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; CHECK-LABEL: 'test' ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 ; CHECK: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 ; CHECK: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 ; CHECK: LV: Found an estimated cost of 3 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 ; CHECK: LV: Found an estimated cost of 5 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 ; CHECK: LV: Found an estimated cost of 22 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 ; CHECK: LV: Found an estimated cost of 92 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4 -; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 ; entry: br label %for.body From a80dd44b0d96fa3ba3fe0501c3ad4b1ee7edff00 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Wed, 21 Aug 2024 12:11:19 +0100 Subject: [PATCH 047/426] LAA: pre-commit tests for stride-versioning (#97570) Add tests for when the Stride is unknown and equal to TC, with different kinds of casts. In these cases, LAA should not speculate on Stride. 
--- .../LoopAccessAnalysis/symbolic-stride.ll | 205 ++++++++++++++++++ 1 file changed, 205 insertions(+) diff --git a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll index f0aed2421a96e5..1e12dbf3bbee31 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll @@ -310,3 +310,208 @@ loop: exit: ; preds = %loop ret void } + +; Check the scenario where we have an unknown Stride, which happens to also be +; the loop iteration count. If we speculate Stride==1, it implies that the loop +; will iterate no more than a single iteration. +define void @unknown_stride_equalto_tc(i32 %N, ptr %A, ptr %B, i32 %j) { +; CHECK-LABEL: 'unknown_stride_equalto_tc' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[GRP5:0x[0-9a-f]+]]): +; CHECK-NEXT: ptr %A +; CHECK-NEXT: Against group ([[GRP6:0x[0-9a-f]+]]): +; CHECK-NEXT: %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP5]]: +; CHECK-NEXT: (Low: %A High: (4 + %A)) +; CHECK-NEXT: Member: %A +; CHECK-NEXT: Group [[GRP6]]: +; CHECK-NEXT: (Low: (((2 * (sext i32 %j to i64)) + %B) umin ((2 * (sext i32 %j to i64)) + (2 * (zext i32 (-1 + %N) to i64) * (sext i32 %N to i64)) + %B)) High: (2 + (((2 * (sext i32 %j to i64)) + %B) umax ((2 * (sext i32 %j to i64)) + (2 * (zext i32 (-1 + %N) to i64) * (sext i32 %N to i64)) + %B)))) +; CHECK-NEXT: Member: {((2 * (sext i32 %j to i64)) + %B),+,(2 * (sext i32 %N to i64))}<%loop> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%j,+,%N}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: [PSE] %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add: +; CHECK-NEXT: ((2 * (sext i32 {%j,+,%N}<%loop> to i64)) + %B) +; CHECK-NEXT: --> {((2 * (sext i32 %j to i64)) + %B),+,(2 * (sext i32 %N to i64))}<%loop> +; +entry: + %cmp = icmp eq i32 %N, 0 + br i1 %cmp, label %exit, label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %mul = mul i32 %iv, %N + %add = add i32 %mul, %j + %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add + %load = load i16, ptr %arrayidx + %sext = sext i16 %load to i32 + store i32 %sext, ptr %A + %iv.next = add nuw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %N + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + + +; Check the scenario where we have an unknown Stride, which happens to also be +; the loop iteration count, but the TC is zero-extended from a narrower type. 
+define void @unknown_stride_equalto_zext_tc(i16 zeroext %N, ptr %A, ptr %B, i32 %j) { +; CHECK-LABEL: 'unknown_stride_equalto_zext_tc' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[GRP7:0x[0-9a-f]+]]): +; CHECK-NEXT: ptr %A +; CHECK-NEXT: Against group ([[GRP8:0x[0-9a-f]+]]): +; CHECK-NEXT: %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP7]]: +; CHECK-NEXT: (Low: %A High: (4 + %A)) +; CHECK-NEXT: Member: %A +; CHECK-NEXT: Group [[GRP8]]: +; CHECK-NEXT: (Low: (((2 * (sext i32 %j to i64)) + %B) umin ((2 * (sext i32 %j to i64)) + (2 * (zext i32 (-1 + (zext i16 %N to i32)) to i64) * (zext i16 %N to i64)) + %B)) High: (2 + (((2 * (sext i32 %j to i64)) + %B) umax ((2 * (sext i32 %j to i64)) + (2 * (zext i32 (-1 + (zext i16 %N to i32)) to i64) * (zext i16 %N to i64)) + %B)))) +; CHECK-NEXT: Member: {((2 * (sext i32 %j to i64)) + %B),+,(2 * (zext i16 %N to i64))}<%loop> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%j,+,(zext i16 %N to i32)}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: [PSE] %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add: +; CHECK-NEXT: ((2 * (sext i32 {%j,+,(zext i16 %N to i32)}<%loop> to i64)) + %B) +; CHECK-NEXT: --> {((2 * (sext i32 %j to i64)) + %B),+,(2 * (zext i16 %N to i64))}<%loop> +; +entry: + %N.ext = zext i16 %N to i32 + %cmp = icmp eq i16 %N, 0 + br i1 %cmp, label %exit, label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %mul = mul nuw i32 %iv, %N.ext + %add = add i32 %mul, %j + %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add + %load = load i16, ptr %arrayidx + %sext = sext i16 %load to i32 + store i32 %sext, ptr %A + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %N.ext + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +; Check the scenario where we have an unknown Stride, which happens to also be +; the loop iteration count, but the TC is sign-extended from a narrower type. 
+define void @unknown_stride_equalto_sext_tc(i16 %N, ptr %A, ptr %B, i32 %j) { +; CHECK-LABEL: 'unknown_stride_equalto_sext_tc' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[GRP9:0x[0-9a-f]+]]): +; CHECK-NEXT: ptr %A +; CHECK-NEXT: Against group ([[GRP10:0x[0-9a-f]+]]): +; CHECK-NEXT: %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP9]]: +; CHECK-NEXT: (Low: %A High: (4 + %A)) +; CHECK-NEXT: Member: %A +; CHECK-NEXT: Group [[GRP10]]: +; CHECK-NEXT: (Low: (((2 * (sext i32 %j to i64)) + %B) umin ((2 * (sext i32 %j to i64)) + (2 * (zext i32 (-1 + (sext i16 %N to i32)) to i64) * (sext i16 %N to i64)) + %B)) High: (2 + (((2 * (sext i32 %j to i64)) + %B) umax ((2 * (sext i32 %j to i64)) + (2 * (zext i32 (-1 + (sext i16 %N to i32)) to i64) * (sext i16 %N to i64)) + %B)))) +; CHECK-NEXT: Member: {((2 * (sext i32 %j to i64)) + %B),+,(2 * (sext i16 %N to i64))}<%loop> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%j,+,(sext i16 %N to i32)}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: [PSE] %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add: +; CHECK-NEXT: ((2 * (sext i32 {%j,+,(sext i16 %N to i32)}<%loop> to i64)) + %B) +; CHECK-NEXT: --> {((2 * (sext i32 %j to i64)) + %B),+,(2 * (sext i16 %N to i64))}<%loop> +; +entry: + %N.ext = sext i16 %N to i32 + %cmp = icmp eq i16 %N, 0 + br i1 %cmp, label %exit, label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %mul = mul nuw i32 %iv, %N.ext + %add = add i32 %mul, %j + %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add + %load = load i16, ptr %arrayidx + %sext = sext i16 %load to i32 + store i32 %sext, ptr %A + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %N.ext + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +; Check the scenario where we have an unknown Stride, which happens to also be +; the loop iteration count, but the TC is truncated from a wider type. 
+define void @unknown_stride_equalto_trunc_tc(i64 %N, ptr %A, ptr %B, i32 %j) { +; CHECK-LABEL: 'unknown_stride_equalto_trunc_tc' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[GRP11:0x[0-9a-f]+]]): +; CHECK-NEXT: ptr %A +; CHECK-NEXT: Against group ([[GRP12:0x[0-9a-f]+]]): +; CHECK-NEXT: %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP11]]: +; CHECK-NEXT: (Low: %A High: (4 + %A)) +; CHECK-NEXT: Member: %A +; CHECK-NEXT: Group [[GRP12]]: +; CHECK-NEXT: (Low: (((2 * (sext i32 %j to i64)) + %B) umin ((2 * (sext i32 %j to i64)) + (2 * (zext i32 (-1 + (trunc i64 %N to i32)) to i64) * (sext i32 (trunc i64 %N to i32) to i64)) + %B)) High: (2 + (((2 * (sext i32 %j to i64)) + %B) umax ((2 * (sext i32 %j to i64)) + (2 * (zext i32 (-1 + (trunc i64 %N to i32)) to i64) * (sext i32 (trunc i64 %N to i32) to i64)) + %B)))) +; CHECK-NEXT: Member: {((2 * (sext i32 %j to i64)) + %B),+,(2 * (sext i32 (trunc i64 %N to i32) to i64))}<%loop> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%j,+,(trunc i64 %N to i32)}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: [PSE] %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add: +; CHECK-NEXT: ((2 * (sext i32 {%j,+,(trunc i64 %N to i32)}<%loop> to i64)) + %B) +; CHECK-NEXT: --> {((2 * (sext i32 %j to i64)) + %B),+,(2 * (sext i32 (trunc i64 %N to i32) to i64))}<%loop> +; +entry: + %N.trunc = trunc i64 %N to i32 + %cmp = icmp eq i64 %N, 0 + br i1 %cmp, label %exit, label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %mul = mul nuw i32 %iv, %N.trunc + %add = add i32 %mul, %j + %arrayidx = getelementptr inbounds i16, ptr %B, i32 %add + %load = load i16, ptr %arrayidx + %sext = sext i16 %load to i32 + store i32 %sext, ptr %A + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %N.trunc + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} From 7a19194d0ac0110e5dae43538423293b67a27466 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 21 Aug 2024 04:25:26 -0700 Subject: [PATCH 048/426] [NFC][ADT] Add unit test for llvm::mismatch. (#105459) - Add basic unit test for llvm::mismatch. 
--- llvm/unittests/ADT/STLExtrasTest.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/llvm/unittests/ADT/STLExtrasTest.cpp b/llvm/unittests/ADT/STLExtrasTest.cpp index 3927bc59c031a3..b7dc15bf60af51 100644 --- a/llvm/unittests/ADT/STLExtrasTest.cpp +++ b/llvm/unittests/ADT/STLExtrasTest.cpp @@ -1349,6 +1349,33 @@ TEST(STLExtrasTest, LessSecond) { } } +TEST(STLExtrasTest, Mismatch) { + { + const int MMIndex = 5; + StringRef First = "FooBar"; + StringRef Second = "FooBaz"; + auto [MMFirst, MMSecond] = mismatch(First, Second); + EXPECT_EQ(MMFirst, First.begin() + MMIndex); + EXPECT_EQ(MMSecond, Second.begin() + MMIndex); + } + + { + SmallVector First = {0, 1, 2}; + SmallVector Second = {0, 1, 2, 3}; + auto [MMFirst, MMSecond] = mismatch(First, Second); + EXPECT_EQ(MMFirst, First.end()); + EXPECT_EQ(MMSecond, Second.begin() + 3); + } + + { + SmallVector First = {0, 1}; + SmallVector Empty; + auto [MMFirst, MMEmpty] = mismatch(First, Empty); + EXPECT_EQ(MMFirst, First.begin()); + EXPECT_EQ(MMEmpty, Empty.begin()); + } +} + struct Foo; struct Bar {}; From 0cff3e85db00b5f425cc4ed0d6921445afa891ca Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 21 Aug 2024 04:26:34 -0700 Subject: [PATCH 049/426] [NFC][Support] Move ModRef/MemoryEffects printers to their own file (#105367) - Move raw_ostream << operators for `ModRef` and `MemoryEffects` to a new ModRef.cpp file under llvm/Support (instead of AliasAnalysis.cpp) - This enables calling these operators from `Core` files like Instructions.cpp (for instance for debugging). Currently, they live in `LLVMAnalysis` which cannot be linked with `Core`. 
--- llvm/include/llvm/Support/ModRef.h | 2 +- llvm/lib/Analysis/AliasAnalysis.cpp | 36 -------------------- llvm/lib/Support/CMakeLists.txt | 1 + llvm/lib/Support/ModRef.cpp | 51 +++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 37 deletions(-) create mode 100644 llvm/lib/Support/ModRef.cpp diff --git a/llvm/include/llvm/Support/ModRef.h b/llvm/include/llvm/Support/ModRef.h index 7687280111a1f8..5a9d80c87ae27a 100644 --- a/llvm/include/llvm/Support/ModRef.h +++ b/llvm/include/llvm/Support/ModRef.h @@ -1,4 +1,4 @@ -//===--- ModRef.h - Memory effect modelling ---------------------*- C++ -*-===// +//===--- ModRef.h - Memory effect modeling ----------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index 6eaaad5f332eb9..9f529fde55c20f 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -423,42 +423,6 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, AliasResult AR) { return OS; } -raw_ostream &llvm::operator<<(raw_ostream &OS, ModRefInfo MR) { - switch (MR) { - case ModRefInfo::NoModRef: - OS << "NoModRef"; - break; - case ModRefInfo::Ref: - OS << "Ref"; - break; - case ModRefInfo::Mod: - OS << "Mod"; - break; - case ModRefInfo::ModRef: - OS << "ModRef"; - break; - } - return OS; -} - -raw_ostream &llvm::operator<<(raw_ostream &OS, MemoryEffects ME) { - for (IRMemLocation Loc : MemoryEffects::locations()) { - switch (Loc) { - case IRMemLocation::ArgMem: - OS << "ArgMem: "; - break; - case IRMemLocation::InaccessibleMem: - OS << "InaccessibleMem: "; - break; - case IRMemLocation::Other: - OS << "Other: "; - break; - } - OS << ME.getModRef(Loc) << ", "; - } - return OS; -} - //===----------------------------------------------------------------------===// // Helper method implementation 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index a73ac54a01c5a5..c55c17fae189f5 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -205,6 +205,7 @@ add_llvm_component_library(LLVMSupport MemAlloc.cpp MemoryBuffer.cpp MemoryBufferRef.cpp + ModRef.cpp MD5.cpp MSP430Attributes.cpp MSP430AttributeParser.cpp diff --git a/llvm/lib/Support/ModRef.cpp b/llvm/lib/Support/ModRef.cpp new file mode 100644 index 00000000000000..c5978296e97f0c --- /dev/null +++ b/llvm/lib/Support/ModRef.cpp @@ -0,0 +1,51 @@ +//===--- ModRef.cpp - Memory effect modeling --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements ModRef and MemoryEffects misc functions. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ModRef.h" + +using namespace llvm; + +raw_ostream &llvm::operator<<(raw_ostream &OS, ModRefInfo MR) { + switch (MR) { + case ModRefInfo::NoModRef: + OS << "NoModRef"; + break; + case ModRefInfo::Ref: + OS << "Ref"; + break; + case ModRefInfo::Mod: + OS << "Mod"; + break; + case ModRefInfo::ModRef: + OS << "ModRef"; + break; + } + return OS; +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, MemoryEffects ME) { + for (IRMemLocation Loc : MemoryEffects::locations()) { + switch (Loc) { + case IRMemLocation::ArgMem: + OS << "ArgMem: "; + break; + case IRMemLocation::InaccessibleMem: + OS << "InaccessibleMem: "; + break; + case IRMemLocation::Other: + OS << "Other: "; + break; + } + OS << ME.getModRef(Loc) << ", "; + } + return OS; +} From 2644fe4858a36ffa65c36645f362e79889a0ad21 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 21 Aug 2024 11:28:32 +0000 Subject: [PATCH 050/426] [gn build] Port 0cff3e85db00 --- llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn index d10a6e4be3770c..49bdd66843bdc9 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn @@ -111,6 +111,7 @@ static_library("Support") { "MemAlloc.cpp", "MemoryBuffer.cpp", "MemoryBufferRef.cpp", + "ModRef.cpp", "NativeFormatting.cpp", "OptimizedStructLayout.cpp", "Optional.cpp", From 4f075086e7b8d9108749117f53999cd4afdd6894 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Wed, 21 Aug 2024 12:51:40 +0100 Subject: [PATCH 051/426] [LLVM][VPlan] Keep all VPBlend masks until VPlan transformation. 
(#104015) It's not possible to pick the best mask to remove when optimising VPBlend at construction and so this patch refactors the code to move the decision (and thus transformation) to VPlanTransforms. NOTE: This patch does not change the decision of which mask to pick. That will be done in a following PR to keep this patch as NFC from an output point of view. --- .../Transforms/Vectorize/LoopVectorize.cpp | 4 -- llvm/lib/Transforms/Vectorize/VPlan.h | 26 ++++++----- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 + .../Transforms/Vectorize/VPlanTransforms.cpp | 44 ++++++++++++++++--- 4 files changed, 55 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e3049da1b22188..2145bb8c9ca872 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8149,8 +8149,6 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, // builder. At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. - // TODO: At the moment the first mask is always skipped, but it would be - // better to skip the most expensive mask. 
SmallVector OperandsWithMask; for (unsigned In = 0; In < NumIncoming; In++) { @@ -8163,8 +8161,6 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, "Distinct incoming values with one having a full mask"); break; } - if (In == 0) - continue; OperandsWithMask.push_back(EdgeMask); } return new VPBlendRecipe(Phi, OperandsWithMask); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index a99f3882092c2c..24da8f6700dfae 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2041,12 +2041,12 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe { class VPBlendRecipe : public VPSingleDefRecipe { public: /// The blend operation is a User of the incoming values and of their - /// respective masks, ordered [I0, I1, M1, I2, M2, ...]. Note that the first - /// incoming value does not have a mask associated. + /// respective masks, ordered [I0, M0, I1, M1, I2, M2, ...]. Note that M0 can + /// be omitted (implied by passing an odd number of operands) in which case + /// all other incoming values are merged into it. VPBlendRecipe(PHINode *Phi, ArrayRef Operands) : VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, Phi->getDebugLoc()) { - assert((Operands.size() + 1) % 2 == 0 && - "Expected an odd number of operands"); + assert(Operands.size() > 0 && "Expected at least one operand!"); } VPBlendRecipe *clone() override { @@ -2056,19 +2056,25 @@ class VPBlendRecipe : public VPSingleDefRecipe { VP_CLASSOF_IMPL(VPDef::VPBlendSC) - /// Return the number of incoming values, taking into account that the first - /// incoming value has no mask. - unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; } + /// A normalized blend is one that has an odd number of operands, whereby the + /// first operand does not have an associated mask. 
+ bool isNormalized() const { return getNumOperands() % 2; } + + /// Return the number of incoming values, taking into account when normalized + /// the first incoming value will have no mask. + unsigned getNumIncomingValues() const { + return (getNumOperands() + isNormalized()) / 2; + } /// Return incoming value number \p Idx. VPValue *getIncomingValue(unsigned Idx) const { - return Idx == 0 ? getOperand(0) : getOperand(Idx * 2 - 1); + return Idx == 0 ? getOperand(0) : getOperand(Idx * 2 - isNormalized()); } /// Return mask number \p Idx. VPValue *getMask(unsigned Idx) const { - assert(Idx > 0 && "First index has no mask associated."); - return getOperand(Idx * 2); + assert((Idx > 0 || !isNormalized()) && "First index has no mask!"); + return Idx == 0 ? getOperand(1) : getOperand(Idx * 2 + !isNormalized()); } /// Generate the phi/select nodes. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index aea5e681b081c6..63e0e8a8981373 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1703,6 +1703,7 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPBlendRecipe::execute(VPTransformState &State) { + assert(isNormalized() && "Expected blend to be normalized!"); State.setDebugLocFrom(getDebugLoc()); // We know that all PHIs in non-header blocks are converted into // selects, so we don't have to worry about the insertion order and we diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index a2496f067024cb..55e90298b36cda 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -895,15 +895,47 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { /// Try to simplify recipe \p R. 
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { using namespace llvm::VPlanPatternMatch; - // Try to remove redundant blend recipes. + if (auto *Blend = dyn_cast(&R)) { - VPValue *Inc0 = Blend->getIncomingValue(0); + // Try to remove redundant blend recipes. + SmallPtrSet UniqueValues; + if (Blend->isNormalized() || !match(Blend->getMask(0), m_False())) + UniqueValues.insert(Blend->getIncomingValue(0)); for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) - if (Inc0 != Blend->getIncomingValue(I) && - !match(Blend->getMask(I), m_False())) - return; - Blend->replaceAllUsesWith(Inc0); + if (!match(Blend->getMask(I), m_False())) + UniqueValues.insert(Blend->getIncomingValue(I)); + + if (UniqueValues.size() == 1) { + Blend->replaceAllUsesWith(*UniqueValues.begin()); + Blend->eraseFromParent(); + return; + } + + if (Blend->isNormalized()) + return; + + // Normalize the blend so its first incoming value is used as the initial + // value with the others blended into it. + + unsigned StartIndex = 0; + SmallVector OperandsWithMask; + OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex)); + + for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { + if (I == StartIndex) + continue; + OperandsWithMask.push_back(Blend->getIncomingValue(I)); + OperandsWithMask.push_back(Blend->getMask(I)); + } + + auto *NewBlend = new VPBlendRecipe( + cast(Blend->getUnderlyingValue()), OperandsWithMask); + NewBlend->insertBefore(&R); + + VPValue *DeadMask = Blend->getMask(StartIndex); + Blend->replaceAllUsesWith(NewBlend); Blend->eraseFromParent(); + recursivelyDeleteDeadRecipes(DeadMask); return; } From 170a21e7f00d0097d88cba3547967e500e0d8dfe Mon Sep 17 00:00:00 2001 From: Marius Kamp Date: Wed, 21 Aug 2024 14:09:02 +0200 Subject: [PATCH 052/426] [InstCombine] Extend Fold of Zero-extended Bit Test (#102100) Previously, (zext (icmp ne (and X, (1 << ShAmt)), 0)) has only been folded if the bit width of X and the result were equal. 
Use a trunc or zext instruction to also support other bit widths. This is a follow-up to commit 533190acdb9d2ed774f96a998b5c03be3df4f857, which introduced a regression: (zext (icmp ne (and (lshr X ShAmt) 1) 0)) is not folded any longer to (zext/trunc (and (lshr X ShAmt) 1)) since the commit introduced the fold of (icmp ne (and (lshr X ShAmt) 1) 0) to (icmp ne (and X (1 << ShAmt)) 0). The change introduced by this commit restores this fold. Alive proof: https://alive2.llvm.org/ce/z/MFkNXs Relates to issue #86813 and pull request #101838. --- .../InstCombine/InstCombineCasts.cpp | 19 ++- llvm/test/Transforms/InstCombine/zext.ll | 109 ++++++++++++++++++ 2 files changed, 122 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 97ee845548e28d..5c9faa9449f539 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -985,7 +985,7 @@ Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, } } - if (Cmp->isEquality() && Zext.getType() == Cmp->getOperand(0)->getType()) { + if (Cmp->isEquality()) { // Test if a bit is clear/set using a shifted-one mask: // zext (icmp eq (and X, (1 << ShAmt)), 0) --> and (lshr (not X), ShAmt), 1 // zext (icmp ne (and X, (1 << ShAmt)), 0) --> and (lshr X, ShAmt), 1 @@ -993,11 +993,18 @@ Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, if (Cmp->hasOneUse() && match(Cmp->getOperand(1), m_ZeroInt()) && match(Cmp->getOperand(0), m_OneUse(m_c_And(m_Shl(m_One(), m_Value(ShAmt)), m_Value(X))))) { - if (Cmp->getPredicate() == ICmpInst::ICMP_EQ) - X = Builder.CreateNot(X); - Value *Lshr = Builder.CreateLShr(X, ShAmt); - Value *And1 = Builder.CreateAnd(Lshr, ConstantInt::get(X->getType(), 1)); - return replaceInstUsesWith(Zext, And1); + auto *And = cast(Cmp->getOperand(0)); + Value *Shift = And->getOperand(X == And->getOperand(0) ? 
1 : 0); + if (Zext.getType() == And->getType() || + Cmp->getPredicate() != ICmpInst::ICMP_EQ || Shift->hasOneUse()) { + if (Cmp->getPredicate() == ICmpInst::ICMP_EQ) + X = Builder.CreateNot(X); + Value *Lshr = Builder.CreateLShr(X, ShAmt); + Value *And1 = + Builder.CreateAnd(Lshr, ConstantInt::get(X->getType(), 1)); + return replaceInstUsesWith( + Zext, Builder.CreateZExtOrTrunc(And1, Zext.getType())); + } } } diff --git a/llvm/test/Transforms/InstCombine/zext.ll b/llvm/test/Transforms/InstCombine/zext.ll index 7b2cf131c396ab..872871cf15b033 100644 --- a/llvm/test/Transforms/InstCombine/zext.ll +++ b/llvm/test/Transforms/InstCombine/zext.ll @@ -454,6 +454,115 @@ define i32 @zext_or_masked_bit_test_uses(i32 %a, i32 %b, i32 %x) { ret i32 %z } +define i16 @zext_masked_bit_zero_to_smaller_bitwidth(i32 %a, i32 %b) { +; CHECK-LABEL: @zext_masked_bit_zero_to_smaller_bitwidth( +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 +; CHECK-NEXT: [[Z:%.*]] = and i16 [[TMP3]], 1 +; CHECK-NEXT: ret i16 [[Z]] +; + %shl = shl i32 1, %b + %and = and i32 %shl, %a + %cmp = icmp eq i32 %and, 0 + %z = zext i1 %cmp to i16 + ret i16 %z +} + +define <4 x i16> @zext_masked_bit_zero_to_smaller_bitwidth_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @zext_masked_bit_zero_to_smaller_bitwidth_v4i32( +; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i32> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[Z:%.*]] = and <4 x i16> [[TMP3]], +; CHECK-NEXT: ret <4 x i16> [[Z]] +; + %shl = shl <4 x i32> , %b + %and = and <4 x i32> %shl, %a + %cmp = icmp eq <4 x i32> %and, + %z = zext <4 x i1> %cmp to <4 x i16> + ret <4 x i16> %z +} + +; Negative test +define i16 @zext_masked_bit_zero_to_smaller_bitwidth_multi_use_shl(i32 %a, i32 %b) { +; CHECK-LABEL: 
@zext_masked_bit_zero_to_smaller_bitwidth_multi_use_shl( +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[B:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[SHL]], [[A:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0 +; CHECK-NEXT: [[Z:%.*]] = zext i1 [[CMP]] to i16 +; CHECK-NEXT: call void @use32(i32 [[SHL]]) +; CHECK-NEXT: ret i16 [[Z]] +; + %shl = shl i32 1, %b + %and = and i32 %shl, %a + %cmp = icmp eq i32 %and, 0 + %z = zext i1 %cmp to i16 + call void @use32(i32 %shl) + ret i16 %z +} + +define i16 @zext_masked_bit_nonzero_to_smaller_bitwidth(i32 %a, i32 %b) { +; CHECK-LABEL: @zext_masked_bit_nonzero_to_smaller_bitwidth( +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[Z:%.*]] = and i16 [[TMP2]], 1 +; CHECK-NEXT: ret i16 [[Z]] +; + %shl = shl i32 1, %b + %and = and i32 %shl, %a + %cmp = icmp ne i32 %and, 0 + %z = zext i1 %cmp to i16 + ret i16 %z +} + +define i16 @zext_masked_bit_nonzero_to_smaller_bitwidth_multi_use_shl(i32 %a, i32 %b) { +; CHECK-LABEL: @zext_masked_bit_nonzero_to_smaller_bitwidth_multi_use_shl( +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[A:%.*]], [[B]] +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[Z:%.*]] = and i16 [[TMP2]], 1 +; CHECK-NEXT: call void @use32(i32 [[SHL]]) +; CHECK-NEXT: ret i16 [[Z]] +; + %shl = shl i32 1, %b + %and = and i32 %shl, %a + %cmp = icmp ne i32 %and, 0 + %z = zext i1 %cmp to i16 + call void @use32(i32 %shl) + ret i16 %z +} + +define i64 @zext_masked_bit_zero_to_larger_bitwidth(i32 %a, i32 %b) { +; CHECK-LABEL: @zext_masked_bit_zero_to_larger_bitwidth( +; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 1 +; CHECK-NEXT: [[Z:%.*]] = zext nneg i32 [[TMP3]] to i64 +; CHECK-NEXT: ret i64 [[Z]] +; + %shl = shl i32 1, %b + %and = and i32 %shl, %a + %cmp = 
icmp eq i32 %and, 0 + %z = zext i1 %cmp to i64 + ret i64 %z +} + +define <4 x i64> @zext_masked_bit_zero_to_larger_bitwidth_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @zext_masked_bit_zero_to_larger_bitwidth_v4i32( +; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i32> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i32> [[TMP2]], +; CHECK-NEXT: [[Z:%.*]] = zext nneg <4 x i32> [[TMP3]] to <4 x i64> +; CHECK-NEXT: ret <4 x i64> [[Z]] +; + %shl = shl <4 x i32> , %b + %and = and <4 x i32> %shl, %a + %cmp = icmp eq <4 x i32> %and, + %z = zext <4 x i1> %cmp to <4 x i64> + ret <4 x i64> %z +} + define i32 @notneg_zext_wider(i8 %x) { ; CHECK-LABEL: @notneg_zext_wider( ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[X:%.*]], -1 From ad435bcc14f42dc97286c717cd12446a0facb2ee Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 21 Aug 2024 13:16:59 +0100 Subject: [PATCH 053/426] [clang][CodeGen][SPIR-V][AMDGPU] Tweak AMDGCNSPIRV ABI to allow for the correct handling of aggregates passed to kernels / functions. (#102776) The AMDGPU kernel ABI is not directly representable in SPIR-V, since it relies on passing aggregates `byref`, and SPIR-V only encodes `byval` (which the AMDGPU BE disallows for kernel arguments). As a temporary solution to this mismatch, we add special handling for AMDGCN flavoured SPIR-V, whereby aggregates are passed as direct, both to kernels and to normal functions. This is not ideal (there are pathological cases where performance is heavily impacted), but empirically robust and guaranteed to work as the AMDGPU BE retains handling of `direct` passing for legacy reasons. We will revisit this in the future, but as it stands it is enough to pass a wide array of integration tests and generates correct SPIR-V and correct reverse translation into LLVM IR. The amdgpu-kernel-arg-pointer-type test is updated via the automated script, and thus becomes quite noisy. 
--- clang/lib/CodeGen/Targets/SPIR.cpp | 70 +- .../amdgpu-kernel-arg-pointer-type.cu | 723 ++++++++++++++++-- clang/test/CodeGenCUDA/kernel-args.cu | 6 + 3 files changed, 729 insertions(+), 70 deletions(-) diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index cf068cbc4fcd36..cc52925e2e523f 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -32,7 +32,9 @@ class SPIRVABIInfo : public CommonSPIRABIInfo { void computeInfo(CGFunctionInfo &FI) const override; private: + ABIArgInfo classifyReturnType(QualType RetTy) const; ABIArgInfo classifyKernelArgumentType(QualType Ty) const; + ABIArgInfo classifyArgumentType(QualType Ty) const; }; } // end anonymous namespace namespace { @@ -64,6 +66,27 @@ void CommonSPIRABIInfo::setCCs() { RuntimeCC = llvm::CallingConv::SPIR_FUNC; } +ABIArgInfo SPIRVABIInfo::classifyReturnType(QualType RetTy) const { + if (getTarget().getTriple().getVendor() != llvm::Triple::AMD) + return DefaultABIInfo::classifyReturnType(RetTy); + if (!isAggregateTypeForABI(RetTy) || getRecordArgABI(RetTy, getCXXABI())) + return DefaultABIInfo::classifyReturnType(RetTy); + + if (const RecordType *RT = RetTy->getAs()) { + const RecordDecl *RD = RT->getDecl(); + if (RD->hasFlexibleArrayMember()) + return DefaultABIInfo::classifyReturnType(RetTy); + } + + // TODO: The AMDGPU ABI is non-trivial to represent in SPIR-V; in order to + // avoid encoding various architecture specific bits here we return everything + // as direct to retain type info for things like aggregates, for later perusal + // when translating back to LLVM/lowering in the BE. This is also why we + // disable flattening as the outcomes can mismatch between SPIR-V and AMDGPU. + // This will be revisited / optimised in the future. 
+ return ABIArgInfo::getDirect(CGT.ConvertType(RetTy), 0u, nullptr, false); +} + ABIArgInfo SPIRVABIInfo::classifyKernelArgumentType(QualType Ty) const { if (getContext().getLangOpts().CUDAIsDevice) { // Coerce pointer arguments with default address space to CrossWorkGroup @@ -78,18 +101,51 @@ ABIArgInfo SPIRVABIInfo::classifyKernelArgumentType(QualType Ty) const { return ABIArgInfo::getDirect(LTy, 0, nullptr, false); } - // Force copying aggregate type in kernel arguments by value when - // compiling CUDA targeting SPIR-V. This is required for the object - // copied to be valid on the device. - // This behavior follows the CUDA spec - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-function-argument-processing, - // and matches the NVPTX implementation. - if (isAggregateTypeForABI(Ty)) + if (isAggregateTypeForABI(Ty)) { + if (getTarget().getTriple().getVendor() == llvm::Triple::AMD) + // TODO: The AMDGPU kernel ABI passes aggregates byref, which is not + // currently expressible in SPIR-V; SPIR-V passes aggregates byval, + // which the AMDGPU kernel ABI does not allow. Passing aggregates as + // direct works around this impedance mismatch, as it retains type info + // and can be correctly handled, post reverse-translation, by the AMDGPU + // BE, which has to support this CC for legacy OpenCL purposes. It can + // be brittle and does lead to performance degradation in certain + // pathological cases. This will be revisited / optimised in the future, + // once a way to deal with the byref/byval impedance mismatch is + // identified. + return ABIArgInfo::getDirect(LTy, 0, nullptr, false); + // Force copying aggregate type in kernel arguments by value when + // compiling CUDA targeting SPIR-V. This is required for the object + // copied to be valid on the device. 
+ // This behavior follows the CUDA spec + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-function-argument-processing, + // and matches the NVPTX implementation. return getNaturalAlignIndirect(Ty, /* byval */ true); + } } return classifyArgumentType(Ty); } +ABIArgInfo SPIRVABIInfo::classifyArgumentType(QualType Ty) const { + if (getTarget().getTriple().getVendor() != llvm::Triple::AMD) + return DefaultABIInfo::classifyArgumentType(Ty); + if (!isAggregateTypeForABI(Ty)) + return DefaultABIInfo::classifyArgumentType(Ty); + + // Records with non-trivial destructors/copy-constructors should not be + // passed by value. + if (auto RAA = getRecordArgABI(Ty, getCXXABI())) + return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory); + + if (const RecordType *RT = Ty->getAs()) { + const RecordDecl *RD = RT->getDecl(); + if (RD->hasFlexibleArrayMember()) + return DefaultABIInfo::classifyArgumentType(Ty); + } + + return ABIArgInfo::getDirect(CGT.ConvertType(Ty), 0u, nullptr, false); +} + void SPIRVABIInfo::computeInfo(CGFunctionInfo &FI) const { // The logic is same as in DefaultABIInfo with an exception on the kernel // arguments handling. 
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu index 70c86cbb8c3d40..b295bbbdaaf955 100644 --- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu +++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu @@ -1,8 +1,11 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // REQUIRES: x86-registered-target // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck --check-prefixes=COMMON,CHECK %s -// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -disable-O0-optnone -o - | opt -S -O2 | FileCheck %s --check-prefixes=COMMON,OPT +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck --check-prefixes=CHECK %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck --check-prefixes=CHECK-SPIRV %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -disable-O0-optnone -o - | opt -S -O2 | FileCheck %s --check-prefixes=OPT +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -disable-O0-optnone -o - | opt -S -O2 | FileCheck %s --check-prefixes=OPT-SPIRV // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -x hip %s -o - | FileCheck -check-prefix=HOST %s #include "Inputs/cuda.h" @@ -11,41 +14,260 @@ // global ones. // On the host-side compilation, generic pointer won't be coerced. 
-// HOST-NOT: %struct.S.coerce -// HOST-NOT: %struct.T.coerce - -// HOST: define{{.*}} void @_Z22__device_stub__kernel1Pi(ptr noundef %x) -// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(ptr addrspace(1){{.*}} %x.coerce) -// CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr -// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4{{$}} -// OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1 -// OPT: store i32 [[INC]], ptr addrspace(1) %x.coerce, align 4 -// OPT: ret void + +// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z7kernel1Pi( +// CHECK-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[X:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr +// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X_ASCAST]], align 8 +// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: ret void +// +// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi( +// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] { +// CHECK-SPIRV-NEXT: [[ENTRY:.*:]] +// CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-SPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: 
[[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(4) [[X_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 0 +// CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX]], align 4 +// CHECK-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[ARRAYIDX]], align 4 +// CHECK-SPIRV-NEXT: ret void +// +// OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel1Pi( +// OPT-SAME: ptr addrspace(1) nocapture noundef [[X_COERCE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// OPT-NEXT: [[ENTRY:.*:]] +// OPT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) [[X_COERCE]], align 4 +// OPT-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// OPT-NEXT: store i32 [[INC]], ptr addrspace(1) [[X_COERCE]], align 4 +// OPT-NEXT: ret void +// +// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi( +// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] { +// OPT-SPIRV-NEXT: [[ENTRY:.*:]] +// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64 +// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4 +// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[TMP1]], align 4 +// OPT-SPIRV-NEXT: ret void +// +// HOST-LABEL: define dso_local void @_Z22__device_stub__kernel1Pi( +// HOST-SAME: ptr noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] { 
+// HOST-NEXT: [[ENTRY:.*:]] +// HOST-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8 +// HOST-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 +// HOST-NEXT: [[TMP0:%.*]] = call i32 @hipSetupArgument(ptr [[X_ADDR]], i64 8, i64 0) +// HOST-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +// HOST-NEXT: br i1 [[TMP1]], label %[[SETUP_NEXT:.*]], label %[[SETUP_END:.*]] +// HOST: [[SETUP_NEXT]]: +// HOST-NEXT: [[TMP2:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z7kernel1Pi) +// HOST-NEXT: br label %[[SETUP_END]] +// HOST: [[SETUP_END]]: +// HOST-NEXT: ret void +// __global__ void kernel1(int *x) { x[0]++; } -// HOST: define{{.*}} void @_Z22__device_stub__kernel2Ri(ptr noundef nonnull align 4 dereferenceable(4) %x) -// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel2Ri(ptr addrspace(1){{.*}} nonnull align 4 dereferenceable(4) %x.coerce) -// CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr -// OPT: [[VAL:%.*]] = load i32, ptr addrspace(1) %x.coerce, align 4{{$}} -// OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1 -// OPT: store i32 [[INC]], ptr addrspace(1) %x.coerce, align 4 -// OPT: ret void +// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z7kernel2Ri( +// CHECK-SAME: ptr addrspace(1) noundef nonnull align 4 dereferenceable(4) [[X_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[X:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr +// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X_ASCAST]], align 8 +// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 
[[TMP1]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[TMP0]], align 4 +// CHECK-NEXT: ret void +// +// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri( +// CHECK-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] { +// CHECK-SPIRV-NEXT: [[ENTRY:.*:]] +// CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-SPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(4) [[X_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4 +// CHECK-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[TMP0]], align 4 +// CHECK-SPIRV-NEXT: ret void +// +// OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel2Ri( +// OPT-SAME: ptr addrspace(1) nocapture noundef nonnull align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// OPT-NEXT: [[ENTRY:.*:]] +// OPT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) [[X_COERCE]], align 4 +// OPT-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// OPT-NEXT: store i32 [[INC]], ptr addrspace(1) [[X_COERCE]], align 4 +// OPT-NEXT: ret void +// +// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri( +// OPT-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] { +// OPT-SPIRV-NEXT: [[ENTRY:.*:]] +// OPT-SPIRV-NEXT: 
[[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64 +// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4 +// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[TMP1]], align 4 +// OPT-SPIRV-NEXT: ret void +// +// HOST-LABEL: define dso_local void @_Z22__device_stub__kernel2Ri( +// HOST-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { +// HOST-NEXT: [[ENTRY:.*:]] +// HOST-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8 +// HOST-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 +// HOST-NEXT: [[TMP0:%.*]] = call i32 @hipSetupArgument(ptr [[X_ADDR]], i64 8, i64 0) +// HOST-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +// HOST-NEXT: br i1 [[TMP1]], label %[[SETUP_NEXT:.*]], label %[[SETUP_END:.*]] +// HOST: [[SETUP_NEXT]]: +// HOST-NEXT: [[TMP2:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z7kernel2Ri) +// HOST-NEXT: br label %[[SETUP_END]] +// HOST: [[SETUP_END]]: +// HOST-NEXT: ret void +// __global__ void kernel2(int &x) { x++; } -// HOST: define{{.*}} void @_Z22__device_stub__kernel3PU3AS2iPU3AS1i(ptr addrspace(2) noundef %x, ptr addrspace(1) noundef %y) -// CHECK-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(ptr addrspace(2){{.*}} %x, ptr addrspace(1){{.*}} %y) -// CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr +// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i( +// CHECK-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(2), align 8, addrspace(5) +// CHECK-NEXT: [[Y_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr +// CHECK-NEXT: [[Y_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[Y_ADDR]] 
to ptr +// CHECK-NEXT: store ptr addrspace(2) [[X]], ptr [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[Y]], ptr [[Y_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(2), ptr [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[TMP0]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(2) [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[Y_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP2]], i64 0 +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[ARRAYIDX1]], align 4 +// CHECK-NEXT: ret void +// +// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i( +// CHECK-SPIRV-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) addrspace(4) #[[ATTR0]] { +// CHECK-SPIRV-NEXT: [[ENTRY:.*:]] +// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(2), align 8 +// CHECK-SPIRV-NEXT: [[Y_ADDR:%.*]] = alloca ptr addrspace(1), align 8 +// CHECK-SPIRV-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: [[Y_ADDR_ASCAST:%.*]] = addrspacecast ptr [[Y_ADDR]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: store ptr addrspace(2) [[X]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[Y]], ptr addrspace(4) [[Y_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(2), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[TMP0]], i64 0 +// CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(2) [[ARRAYIDX]], align 4 +// CHECK-SPIRV-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[Y_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP2]], i64 0 +// CHECK-SPIRV-NEXT: store i32 
[[TMP1]], ptr addrspace(1) [[ARRAYIDX1]], align 4 +// CHECK-SPIRV-NEXT: ret void +// +// OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i( +// OPT-SAME: ptr addrspace(2) nocapture noundef readonly [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +// OPT-NEXT: [[ENTRY:.*:]] +// OPT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(2) [[X]], align 4 +// OPT-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[Y]], align 4 +// OPT-NEXT: ret void +// +// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i( +// OPT-SPIRV-SAME: ptr addrspace(2) nocapture noundef readonly [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1:[0-9]+]] { +// OPT-SPIRV-NEXT: [[ENTRY:.*:]] +// OPT-SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(2) [[X]], align 4 +// OPT-SPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[Y]], align 4 +// OPT-SPIRV-NEXT: ret void +// +// HOST-LABEL: define dso_local void @_Z22__device_stub__kernel3PU3AS2iPU3AS1i( +// HOST-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) #[[ATTR0]] { +// HOST-NEXT: [[ENTRY:.*:]] +// HOST-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(2), align 8 +// HOST-NEXT: [[Y_ADDR:%.*]] = alloca ptr addrspace(1), align 8 +// HOST-NEXT: store ptr addrspace(2) [[X]], ptr [[X_ADDR]], align 8 +// HOST-NEXT: store ptr addrspace(1) [[Y]], ptr [[Y_ADDR]], align 8 +// HOST-NEXT: [[TMP0:%.*]] = call i32 @hipSetupArgument(ptr [[X_ADDR]], i64 8, i64 0) +// HOST-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +// HOST-NEXT: br i1 [[TMP1]], label %[[SETUP_NEXT:.*]], label %[[SETUP_END:.*]] +// HOST: [[SETUP_NEXT]]: +// HOST-NEXT: [[TMP2:%.*]] = call i32 @hipSetupArgument(ptr [[Y_ADDR]], i64 8, i64 8) +// HOST-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 +// HOST-NEXT: br i1 [[TMP3]], label %[[SETUP_NEXT1:.*]], label %[[SETUP_END]] +// HOST: [[SETUP_NEXT1]]: +// HOST-NEXT: [[TMP4:%.*]] = call i32 
@hipLaunchByPtr(ptr @_Z7kernel3PU3AS2iPU3AS1i) +// HOST-NEXT: br label %[[SETUP_END]] +// HOST: [[SETUP_END]]: +// HOST-NEXT: ret void +// __global__ void kernel3(__attribute__((address_space(2))) int *x, __attribute__((address_space(1))) int *y) { y[0] = x[0]; } -// COMMON-LABEL: define{{.*}} void @_Z4funcPi(ptr{{.*}} %x) -// CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr +// CHECK-LABEL: define dso_local void @_Z4funcPi( +// CHECK-SAME: ptr noundef [[X:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr +// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: ret void +// +// CHECK-SPIRV-LABEL: define spir_func void @_Z4funcPi( +// CHECK-SPIRV-SAME: ptr addrspace(4) noundef [[X:%.*]]) addrspace(4) #[[ATTR1:[0-9]+]] { +// CHECK-SPIRV-NEXT: [[ENTRY:.*:]] +// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-SPIRV-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: store ptr addrspace(4) [[X]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 0 +// CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX]], align 4 +// CHECK-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[ARRAYIDX]], align 4 
+// CHECK-SPIRV-NEXT: ret void +// +// OPT-LABEL: define dso_local void @_Z4funcPi( +// OPT-SAME: ptr nocapture noundef [[X:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// OPT-NEXT: [[ENTRY:.*:]] +// OPT-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4 +// OPT-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// OPT-NEXT: store i32 [[INC]], ptr [[X]], align 4 +// OPT-NEXT: ret void +// +// OPT-SPIRV-LABEL: define spir_func void @_Z4funcPi( +// OPT-SPIRV-SAME: ptr addrspace(4) nocapture noundef [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2:[0-9]+]] { +// OPT-SPIRV-NEXT: [[ENTRY:.*:]] +// OPT-SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[X]], align 4 +// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[X]], align 4 +// OPT-SPIRV-NEXT: ret void +// __device__ void func(int *x) { x[0]++; } @@ -57,29 +279,202 @@ struct S { // `by-val` struct is passed by-indirect-alias (a mix of by-ref and indirect // by-val). However, the enhanced address inferring pass should be able to // assume they are global pointers. +// For SPIR-V, since byref is not supported at the moment, we pass it as direct. 
+ +// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z7kernel41S( +// CHECK-SAME: ptr addrspace(4) noundef byref([[STRUCT_S:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_S]], align 8, addrspace(5) +// CHECK-NEXT: [[S:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// CHECK-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[S]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false) +// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[S]], i32 0, i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[S]], i32 0, i32 1 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[Y]], align 8 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 0 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = fadd contract float [[TMP4]], 1.000000e+00 +// CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX1]], align 4 +// CHECK-NEXT: ret void +// +// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S( +// CHECK-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] { +// CHECK-SPIRV-NEXT: [[ENTRY:.*:]] +// CHECK-SPIRV-NEXT: [[S:%.*]] = alloca [[STRUCT_S]], align 8 +// CHECK-SPIRV-NEXT: [[S1:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr addrspace(4) [[S1]], i32 0, i32 0 +// CHECK-SPIRV-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 0 +// CHECK-SPIRV-NEXT: store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[TMP0]], align 8 +// CHECK-SPIRV-NEXT: 
[[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr addrspace(4) [[S1]], i32 0, i32 1 +// CHECK-SPIRV-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 1 +// CHECK-SPIRV-NEXT: store ptr addrspace(4) [[TMP3]], ptr addrspace(4) [[TMP2]], align 8 +// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr addrspace(4) [[S1]], i32 0, i32 0 +// CHECK-SPIRV-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X]], align 8 +// CHECK-SPIRV-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP4]], i64 0 +// CHECK-SPIRV-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX]], align 4 +// CHECK-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[ARRAYIDX]], align 4 +// CHECK-SPIRV-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr addrspace(4) [[S1]], i32 0, i32 1 +// CHECK-SPIRV-NEXT: [[TMP6:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[Y]], align 8 +// CHECK-SPIRV-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP6]], i64 0 +// CHECK-SPIRV-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(4) [[ARRAYIDX2]], align 4 +// CHECK-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP7]], 1.000000e+00 +// CHECK-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[ARRAYIDX2]], align 4 +// CHECK-SPIRV-NEXT: ret void +// +// OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel41S( +// OPT-SAME: ptr addrspace(4) nocapture noundef readonly byref([[STRUCT_S:%.*]]) align 8 [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// OPT-NEXT: [[ENTRY:.*:]] +// OPT-NEXT: [[COERCE_SROA_0_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[TMP0]], align 8, !amdgpu.noclobber [[META4:![0-9]+]] +// OPT-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[COERCE_SROA_0_0_COPYLOAD]] to ptr addrspace(1) +// OPT-NEXT: [[COERCE_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP0]], i64 8 +// OPT-NEXT: 
[[COERCE_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[COERCE_SROA_2_0__SROA_IDX]], align 8, !amdgpu.noclobber [[META4]] +// OPT-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[COERCE_SROA_2_0_COPYLOAD]] to ptr addrspace(1) +// OPT-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4, !amdgpu.noclobber [[META4]] +// OPT-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +// OPT-NEXT: store i32 [[INC]], ptr addrspace(1) [[TMP1]], align 4 +// OPT-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(1) [[TMP2]], align 4 +// OPT-NEXT: [[ADD:%.*]] = fadd contract float [[TMP4]], 1.000000e+00 +// OPT-NEXT: store float [[ADD]], ptr addrspace(1) [[TMP2]], align 4 +// OPT-NEXT: ret void +// +// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S( +// OPT-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] { +// OPT-SPIRV-NEXT: [[ENTRY:.*:]] +// OPT-SPIRV-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 0 +// OPT-SPIRV-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 1 +// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4 +// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[TMP0]], align 4 +// OPT-SPIRV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[TMP1]], align 4 +// OPT-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP3]], 1.000000e+00 +// OPT-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[TMP1]], align 4 +// OPT-SPIRV-NEXT: ret void +// +// HOST-LABEL: define dso_local void @_Z22__device_stub__kernel41S( +// HOST-SAME: ptr [[S_COERCE0:%.*]], ptr [[S_COERCE1:%.*]]) #[[ATTR0]] { +// HOST-NEXT: [[ENTRY:.*:]] +// HOST-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 8 +// HOST-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { ptr, ptr }, ptr [[S]], i32 0, i32 0 +// HOST-NEXT: store ptr [[S_COERCE0]], ptr [[TMP0]], align 8 +// HOST-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { ptr, ptr }, ptr [[S]], i32 0, 
i32 1 +// HOST-NEXT: store ptr [[S_COERCE1]], ptr [[TMP1]], align 8 +// HOST-NEXT: [[TMP2:%.*]] = call i32 @hipSetupArgument(ptr [[S]], i64 16, i64 0) +// HOST-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 +// HOST-NEXT: br i1 [[TMP3]], label %[[SETUP_NEXT:.*]], label %[[SETUP_END:.*]] +// HOST: [[SETUP_NEXT]]: +// HOST-NEXT: [[TMP4:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z7kernel41S) +// HOST-NEXT: br label %[[SETUP_END]] +// HOST: [[SETUP_END]]: +// HOST-NEXT: ret void // -// HOST: define{{.*}} void @_Z22__device_stub__kernel41S(ptr %s.coerce0, ptr %s.coerce1) -// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel41S(ptr addrspace(4){{.*}} byref(%struct.S) align 8 %0) -// OPT: [[P0:%.*]] = load ptr, ptr addrspace(4) %0, align 8 -// OPT: [[G0:%.*]] ={{.*}} addrspacecast ptr [[P0]] to ptr addrspace(1) -// OPT: [[R1:%.*]] = getelementptr inbounds i8, ptr addrspace(4) %0, i64 8 -// OPT: [[P1:%.*]] = load ptr, ptr addrspace(4) [[R1]], align 8 -// OPT: [[G1:%.*]] ={{.*}} addrspacecast ptr [[P1]] to ptr addrspace(1) -// OPT: [[V0:%.*]] = load i32, ptr addrspace(1) [[G0]], align 4, !amdgpu.noclobber ![[MD:[0-9]+]] -// OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1 -// OPT: store i32 [[INC]], ptr addrspace(1) [[G0]], align 4 -// OPT: [[V1:%.*]] = load float, ptr addrspace(1) [[G1]], align 4 -// OPT: [[ADD:%.*]] = fadd contract float [[V1]], 1.000000e+00 -// OPT: store float [[ADD]], ptr addrspace(1) [[G1]], align 4 -// OPT: ret void __global__ void kernel4(struct S s) { s.x[0]++; s.y[0] += 1.f; } // If a pointer to struct is passed, only the pointer itself is coerced into the global one. 
-// HOST: define{{.*}} void @_Z22__device_stub__kernel5P1S(ptr noundef %s) -// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel5P1S(ptr addrspace(1){{.*}} %s.coerce) + +// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z7kernel5P1S( +// CHECK-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[S:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[S_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S]] to ptr +// CHECK-NEXT: [[S_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[S_COERCE]], ptr [[S_ASCAST]], align 8 +// CHECK-NEXT: [[S1:%.*]] = load ptr, ptr [[S_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[S1]], ptr [[S_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[S_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP3]], i32 0, i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[Y]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 0 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = fadd contract float [[TMP5]], 1.000000e+00 +// CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4 +// CHECK-NEXT: ret void +// +// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S( +// CHECK-SPIRV-SAME: ptr 
addrspace(1) noundef [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] { +// CHECK-SPIRV-NEXT: [[ENTRY:.*:]] +// CHECK-SPIRV-NEXT: [[S:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-SPIRV-NEXT: [[S_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-SPIRV-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: [[S_ADDR_ASCAST:%.*]] = addrspacecast ptr [[S_ADDR]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[S_COERCE]], ptr addrspace(4) [[S_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[S1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[S_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: store ptr addrspace(4) [[S1]], ptr addrspace(4) [[S_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[S_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr addrspace(4) [[TMP0]], i32 0, i32 0 +// CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X]], align 8 +// CHECK-SPIRV-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 0 +// CHECK-SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX]], align 4 +// CHECK-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// CHECK-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[ARRAYIDX]], align 4 +// CHECK-SPIRV-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[S_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr addrspace(4) [[TMP3]], i32 0, i32 1 +// CHECK-SPIRV-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[Y]], align 8 +// CHECK-SPIRV-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP4]], i64 0 +// CHECK-SPIRV-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(4) [[ARRAYIDX2]], align 4 +// CHECK-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP5]], 1.000000e+00 +// CHECK-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) 
[[ARRAYIDX2]], align 4 +// CHECK-SPIRV-NEXT: ret void +// +// OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel5P1S( +// OPT-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// OPT-NEXT: [[ENTRY:.*:]] +// OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(1) [[S_COERCE]], align 8 +// OPT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 +// OPT-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// OPT-NEXT: store i32 [[INC]], ptr [[TMP0]], align 4 +// OPT-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[S_COERCE]], i64 8 +// OPT-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(1) [[Y]], align 8 +// OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4 +// OPT-NEXT: [[ADD:%.*]] = fadd contract float [[TMP3]], 1.000000e+00 +// OPT-NEXT: store float [[ADD]], ptr [[TMP2]], align 4 +// OPT-NEXT: ret void +// +// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S( +// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] { +// OPT-SPIRV-NEXT: [[ENTRY:.*:]] +// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[S_COERCE]] to i64 +// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[TMP1]], align 8 +// OPT-SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4 +// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[TMP2]], align 4 +// OPT-SPIRV-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP1]], i64 8 +// OPT-SPIRV-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[Y]], align 8 +// OPT-SPIRV-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(4) [[TMP4]], align 4 +// OPT-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP5]], 1.000000e+00 +// OPT-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[TMP4]], align 4 +// OPT-SPIRV-NEXT: ret 
void +// +// HOST-LABEL: define dso_local void @_Z22__device_stub__kernel5P1S( +// HOST-SAME: ptr noundef [[S:%.*]]) #[[ATTR0]] { +// HOST-NEXT: [[ENTRY:.*:]] +// HOST-NEXT: [[S_ADDR:%.*]] = alloca ptr, align 8 +// HOST-NEXT: store ptr [[S]], ptr [[S_ADDR]], align 8 +// HOST-NEXT: [[TMP0:%.*]] = call i32 @hipSetupArgument(ptr [[S_ADDR]], i64 8, i64 0) +// HOST-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +// HOST-NEXT: br i1 [[TMP1]], label %[[SETUP_NEXT:.*]], label %[[SETUP_END:.*]] +// HOST: [[SETUP_NEXT]]: +// HOST-NEXT: [[TMP2:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z7kernel5P1S) +// HOST-NEXT: br label %[[SETUP_END]] +// HOST: [[SETUP_END]]: +// HOST-NEXT: ret void +// __global__ void kernel5(struct S *s) { s->x[0]++; s->y[0] += 1.f; @@ -91,29 +486,174 @@ struct T { // `by-val` array is passed by-indirect-alias (a mix of by-ref and indirect // by-val). However, the enhanced address inferring pass should be able to // assume they are global pointers. +// For SPIR-V, since byref is not supported at the moment, we pass it as direct. 
+ +// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z7kernel61T( +// CHECK-SAME: ptr addrspace(4) noundef byref([[STRUCT_T:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_T]], align 8, addrspace(5) +// CHECK-NEXT: [[T:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// CHECK-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[T]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false) +// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr [[T]], i32 0, i32 0 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x ptr], ptr [[X]], i64 0, i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = fadd contract float [[TMP2]], 1.000000e+00 +// CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX1]], align 4 +// CHECK-NEXT: [[X2:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr [[T]], i32 0, i32 0 +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x ptr], ptr [[X2]], i64 0, i64 1 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 0 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 +// CHECK-NEXT: [[ADD5:%.*]] = fadd contract float [[TMP4]], 2.000000e+00 +// CHECK-NEXT: store float [[ADD5]], ptr [[ARRAYIDX4]], align 4 +// CHECK-NEXT: ret void +// +// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T( +// CHECK-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) addrspace(4) #[[ATTR0]] { +// CHECK-SPIRV-NEXT: [[ENTRY:.*:]] +// CHECK-SPIRV-NEXT: [[T:%.*]] = alloca [[STRUCT_T]], align 8 +// CHECK-SPIRV-NEXT: [[T1:%.*]] = addrspacecast ptr [[T]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_T]], ptr addrspace(4) [[T1]], i32 0, i32 0 +// CHECK-SPIRV-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_T]] [[T_COERCE]], 0 +// CHECK-SPIRV-NEXT: store [2 x ptr addrspace(4)] [[TMP1]], ptr addrspace(4) [[TMP0]], align 8 +// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr addrspace(4) [[T1]], i32 0, i32 0 +// CHECK-SPIRV-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x ptr addrspace(4)], ptr addrspace(4) [[X]], i64 0, i64 0 +// CHECK-SPIRV-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ARRAYIDX]], align 8 +// CHECK-SPIRV-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP2]], i64 0 +// CHECK-SPIRV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ARRAYIDX2]], align 4 +// CHECK-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP3]], 1.000000e+00 +// CHECK-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[ARRAYIDX2]], align 4 +// CHECK-SPIRV-NEXT: [[X3:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr addrspace(4) [[T1]], i32 0, i32 0 +// CHECK-SPIRV-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2 x ptr addrspace(4)], ptr addrspace(4) [[X3]], i64 0, i64 1 +// CHECK-SPIRV-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ARRAYIDX4]], align 8 +// CHECK-SPIRV-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP4]], i64 0 +// CHECK-SPIRV-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(4) [[ARRAYIDX5]], align 4 +// CHECK-SPIRV-NEXT: [[ADD6:%.*]] = fadd contract float [[TMP5]], 2.000000e+00 +// CHECK-SPIRV-NEXT: store float [[ADD6]], ptr addrspace(4) [[ARRAYIDX5]], align 4 +// CHECK-SPIRV-NEXT: ret void +// +// OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel61T( +// OPT-SAME: ptr addrspace(4) nocapture noundef readonly byref([[STRUCT_T:%.*]]) align 8 [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// OPT-NEXT: [[ENTRY:.*:]] +// OPT-NEXT: [[COERCE_SROA_0_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[TMP0]], align 8, !amdgpu.noclobber 
[[META4]] +// OPT-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[COERCE_SROA_0_0_COPYLOAD]] to ptr addrspace(1) +// OPT-NEXT: [[COERCE_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP0]], i64 8 +// OPT-NEXT: [[COERCE_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[COERCE_SROA_2_0__SROA_IDX]], align 8, !amdgpu.noclobber [[META4]] +// OPT-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[COERCE_SROA_2_0_COPYLOAD]] to ptr addrspace(1) +// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(1) [[TMP1]], align 4, !amdgpu.noclobber [[META4]] +// OPT-NEXT: [[ADD:%.*]] = fadd contract float [[TMP3]], 1.000000e+00 +// OPT-NEXT: store float [[ADD]], ptr addrspace(1) [[TMP1]], align 4 +// OPT-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(1) [[TMP2]], align 4 +// OPT-NEXT: [[ADD5:%.*]] = fadd contract float [[TMP4]], 2.000000e+00 +// OPT-NEXT: store float [[ADD5]], ptr addrspace(1) [[TMP2]], align 4 +// OPT-NEXT: ret void +// +// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T( +// OPT-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] { +// OPT-SPIRV-NEXT: [[ENTRY:.*:]] +// OPT-SPIRV-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_T]] [[T_COERCE]], 0 +// OPT-SPIRV-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x ptr addrspace(4)] [[TMP0]], 0 +// OPT-SPIRV-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x ptr addrspace(4)] [[TMP0]], 1 +// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTFCA_0_EXTRACT]], align 4 +// OPT-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP1]], 1.000000e+00 +// OPT-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[DOTFCA_0_EXTRACT]], align 4 +// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[DOTFCA_1_EXTRACT]], align 4 +// OPT-SPIRV-NEXT: [[ADD6:%.*]] = fadd contract float [[TMP2]], 2.000000e+00 +// OPT-SPIRV-NEXT: store float [[ADD6]], ptr addrspace(4) [[DOTFCA_1_EXTRACT]], align 4 +// OPT-SPIRV-NEXT: ret void +// +// HOST-LABEL: define dso_local void 
@_Z22__device_stub__kernel61T( +// HOST-SAME: ptr [[T_COERCE0:%.*]], ptr [[T_COERCE1:%.*]]) #[[ATTR0]] { +// HOST-NEXT: [[ENTRY:.*:]] +// HOST-NEXT: [[T:%.*]] = alloca [[STRUCT_T:%.*]], align 8 +// HOST-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { ptr, ptr }, ptr [[T]], i32 0, i32 0 +// HOST-NEXT: store ptr [[T_COERCE0]], ptr [[TMP0]], align 8 +// HOST-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { ptr, ptr }, ptr [[T]], i32 0, i32 1 +// HOST-NEXT: store ptr [[T_COERCE1]], ptr [[TMP1]], align 8 +// HOST-NEXT: [[TMP2:%.*]] = call i32 @hipSetupArgument(ptr [[T]], i64 16, i64 0) +// HOST-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 +// HOST-NEXT: br i1 [[TMP3]], label %[[SETUP_NEXT:.*]], label %[[SETUP_END:.*]] +// HOST: [[SETUP_NEXT]]: +// HOST-NEXT: [[TMP4:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z7kernel61T) +// HOST-NEXT: br label %[[SETUP_END]] +// HOST: [[SETUP_END]]: +// HOST-NEXT: ret void // -// HOST: define{{.*}} void @_Z22__device_stub__kernel61T(ptr %t.coerce0, ptr %t.coerce1) -// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel61T(ptr addrspace(4){{.*}} byref(%struct.T) align 8 %0) -// OPT: [[P0:%.*]] = load ptr, ptr addrspace(4) %0, align 8 -// OPT: [[G0:%.*]] ={{.*}} addrspacecast ptr [[P0]] to ptr addrspace(1) -// OPT: [[R1:%.*]] = getelementptr inbounds i8, ptr addrspace(4) %0, i64 8 -// OPT: [[P1:%.*]] = load ptr, ptr addrspace(4) [[R1]], align 8 -// OPT: [[G1:%.*]] ={{.*}} addrspacecast ptr [[P1]] to ptr addrspace(1) -// OPT: [[V0:%.*]] = load float, ptr addrspace(1) [[G0]], align 4, !amdgpu.noclobber ![[MD]] -// OPT: [[ADD0:%.*]] = fadd contract float [[V0]], 1.000000e+00 -// OPT: store float [[ADD0]], ptr addrspace(1) [[G0]], align 4 -// OPT: [[V1:%.*]] = load float, ptr addrspace(1) [[G1]], align 4 -// OPT: [[ADD1:%.*]] = fadd contract float [[V1]], 2.000000e+00 -// OPT: store float [[ADD1]], ptr addrspace(1) [[G1]], align 4 -// OPT: ret void __global__ void kernel6(struct T t) { t.x[0][0] += 1.f; t.x[1][0] += 2.f; } // Check that 
coerced pointers retain the noalias attribute when qualified with __restrict. -// HOST: define{{.*}} void @_Z22__device_stub__kernel7Pi(ptr noalias noundef %x) -// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel7Pi(ptr addrspace(1) noalias{{.*}} %x.coerce) + +// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z7kernel7Pi( +// CHECK-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[X:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr +// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X_ASCAST]], align 8 +// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: ret void +// +// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi( +// CHECK-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] { +// CHECK-SPIRV-NEXT: [[ENTRY:.*:]] +// CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-SPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(4) [[X_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr 
addrspace(4) [[X_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 0 +// CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX]], align 4 +// CHECK-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[ARRAYIDX]], align 4 +// CHECK-SPIRV-NEXT: ret void +// +// OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel7Pi( +// OPT-SAME: ptr addrspace(1) noalias nocapture noundef [[X_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// OPT-NEXT: [[ENTRY:.*:]] +// OPT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) [[X_COERCE]], align 4 +// OPT-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// OPT-NEXT: store i32 [[INC]], ptr addrspace(1) [[X_COERCE]], align 4 +// OPT-NEXT: ret void +// +// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi( +// OPT-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] { +// OPT-SPIRV-NEXT: [[ENTRY:.*:]] +// OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64 +// OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4 +// OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[TMP1]], align 4 +// OPT-SPIRV-NEXT: ret void +// +// HOST-LABEL: define dso_local void @_Z22__device_stub__kernel7Pi( +// HOST-SAME: ptr noalias noundef [[X:%.*]]) #[[ATTR0]] { +// HOST-NEXT: [[ENTRY:.*:]] +// HOST-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8 +// HOST-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 +// HOST-NEXT: [[TMP0:%.*]] = call i32 @hipSetupArgument(ptr [[X_ADDR]], i64 8, i64 0) +// 
HOST-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +// HOST-NEXT: br i1 [[TMP1]], label %[[SETUP_NEXT:.*]], label %[[SETUP_END:.*]] +// HOST: [[SETUP_NEXT]]: +// HOST-NEXT: [[TMP2:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z7kernel7Pi) +// HOST-NEXT: br label %[[SETUP_END]] +// HOST: [[SETUP_END]]: +// HOST-NEXT: ret void +// __global__ void kernel7(int *__restrict x) { x[0]++; } @@ -122,13 +662,70 @@ __global__ void kernel7(int *__restrict x) { struct SS { float *x; }; -// HOST: define{{.*}} void @_Z22__device_stub__kernel82SS(ptr %a.coerce) -// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel82SS(ptr addrspace(1){{.*}} %a.coerce) -// CHECK-NOT: ={{.*}} addrspacecast ptr addrspace(1) %{{.*}} to ptr -// OPT: [[VAL:%.*]] = load float, ptr addrspace(1) %a.coerce, align 4{{$}} -// OPT: [[INC:%.*]] = fadd contract float [[VAL]], 3.000000e+00 -// OPT: store float [[INC]], ptr addrspace(1) %a.coerce, align 4 -// OPT: ret void +// CHECK-LABEL: define dso_local amdgpu_kernel void @_Z7kernel82SS( +// CHECK-SAME: ptr addrspace(1) [[A_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_SS:%.*]], align 8, addrspace(5) +// CHECK-NEXT: [[A1:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[A1]], i32 0, i32 0 +// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[COERCE_DIVE]], align 8 +// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[A1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = fadd contract float [[TMP1]], 3.000000e+00 +// CHECK-NEXT: store float [[ADD]], ptr [[TMP0]], align 4 +// CHECK-NEXT: ret void +// +// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS( +// CHECK-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) addrspace(4) #[[ATTR0]] { +// CHECK-SPIRV-NEXT: [[ENTRY:.*:]] 
+// CHECK-SPIRV-NEXT: [[A:%.*]] = alloca [[STRUCT_SS]], align 8 +// CHECK-SPIRV-NEXT: [[A1:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr addrspace(4) [[A1]], i32 0, i32 0 +// CHECK-SPIRV-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_SS]] [[A_COERCE]], 0 +// CHECK-SPIRV-NEXT: store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[TMP0]], align 8 +// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr addrspace(4) [[A1]], i32 0, i32 0 +// CHECK-SPIRV-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X]], align 8 +// CHECK-SPIRV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[TMP2]], align 4 +// CHECK-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP3]], 3.000000e+00 +// CHECK-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[TMP2]], align 4 +// CHECK-SPIRV-NEXT: ret void +// +// OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel82SS( +// OPT-SAME: ptr addrspace(1) nocapture [[A_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// OPT-NEXT: [[ENTRY:.*:]] +// OPT-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(1) [[A_COERCE]], align 4 +// OPT-NEXT: [[ADD:%.*]] = fadd contract float [[TMP0]], 3.000000e+00 +// OPT-NEXT: store float [[ADD]], ptr addrspace(1) [[A_COERCE]], align 4 +// OPT-NEXT: ret void +// +// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS( +// OPT-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] { +// OPT-SPIRV-NEXT: [[ENTRY:.*:]] +// OPT-SPIRV-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_SS]] [[A_COERCE]], 0 +// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[TMP0]], align 4 +// OPT-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP1]], 3.000000e+00 +// OPT-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[TMP0]], align 4 +// OPT-SPIRV-NEXT: ret void +// +// HOST-LABEL: define dso_local void @_Z22__device_stub__kernel82SS( +// HOST-SAME: ptr [[A_COERCE:%.*]]) 
#[[ATTR0]] { +// HOST-NEXT: [[ENTRY:.*:]] +// HOST-NEXT: [[A:%.*]] = alloca [[STRUCT_SS:%.*]], align 8 +// HOST-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[A]], i32 0, i32 0 +// HOST-NEXT: store ptr [[A_COERCE]], ptr [[COERCE_DIVE]], align 8 +// HOST-NEXT: [[TMP0:%.*]] = call i32 @hipSetupArgument(ptr [[A]], i64 8, i64 0) +// HOST-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 +// HOST-NEXT: br i1 [[TMP1]], label %[[SETUP_NEXT:.*]], label %[[SETUP_END:.*]] +// HOST: [[SETUP_NEXT]]: +// HOST-NEXT: [[TMP2:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z7kernel82SS) +// HOST-NEXT: br label %[[SETUP_END]] +// HOST: [[SETUP_END]]: +// HOST-NEXT: ret void +// __global__ void kernel8(struct SS a) { *a.x += 3.f; } +//. +// OPT: [[META4]] = !{} +//. diff --git a/clang/test/CodeGenCUDA/kernel-args.cu b/clang/test/CodeGenCUDA/kernel-args.cu index bcce729f14481c..8d17d89b315dec 100644 --- a/clang/test/CodeGenCUDA/kernel-args.cu +++ b/clang/test/CodeGenCUDA/kernel-args.cu @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -fcuda-is-device \ // RUN: -emit-llvm %s -o - | FileCheck -check-prefix=AMDGCN %s +// RUN: %clang_cc1 -x hip -triple spirv64-amd-amdhsa -fcuda-is-device \ +// RUN: -emit-llvm %s -o - | FileCheck -check-prefix=AMDGCNSPIRV %s // RUN: %clang_cc1 -x cuda -triple nvptx64-nvidia-cuda- -fcuda-is-device \ // RUN: -emit-llvm %s -o - | FileCheck -check-prefix=NVPTX %s #include "Inputs/cuda.h" @@ -10,6 +12,7 @@ struct A { }; // AMDGCN: define{{.*}} amdgpu_kernel void @_Z6kernel1A(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}}) +// AMDGCNSPIRV: define{{.*}} spir_kernel void @_Z6kernel1A(%struct.A %{{.+}}) // NVPTX: define{{.*}} void @_Z6kernel1A(ptr noundef byval(%struct.A) align 8 %x) __global__ void kernel(A x) { } @@ -17,6 +20,7 @@ __global__ void kernel(A x) { class Kernel { public: // AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}}) + // 
AMDGCNSPIRV: define{{.*}} spir_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A %{{.+}}) // NVPTX: define{{.*}} void @_ZN6Kernel12memberKernelE1A(ptr noundef byval(%struct.A) align 8 %x) static __global__ void memberKernel(A x){} template static __global__ void templateMemberKernel(T x) {} @@ -31,10 +35,12 @@ void launch(void*); void test() { Kernel K; // AMDGCN: define{{.*}} amdgpu_kernel void @_Z14templateKernelI1AEvT_(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}} + // AMDGCNSPIRV: define{{.*}} spir_kernel void @_Z14templateKernelI1AEvT_(%struct.A %{{.+}}) // NVPTX: define{{.*}} void @_Z14templateKernelI1AEvT_(ptr noundef byval(%struct.A) align 8 %x) launch((void*)templateKernel); // AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}} + // AMDGCNSPIRV: define{{.*}} spir_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A %{{.+}} // NVPTX: define{{.*}} void @_ZN6Kernel20templateMemberKernelI1AEEvT_(ptr noundef byval(%struct.A) align 8 %x) launch((void*)Kernel::templateMemberKernel); } From 848658955a9d2d42ea3e319d191e2dcd5d76c837 Mon Sep 17 00:00:00 2001 From: Balazs Benics Date: Wed, 21 Aug 2024 14:24:56 +0200 Subject: [PATCH 054/426] [analyzer] Limit `isTainted()` by skipping complicated symbols (#105493) As discussed in https://discourse.llvm.org/t/rfc-make-istainted-and-complex-symbols-friends/79570/10 Some `isTainted()` queries can blow up the analysis times, and effectively halt the analysis under specific workloads. We don't really have the time now to do a caching re-implementation of `isTainted()`, so we need to workaround the case. The workaround with the smallest blast radius was to limit what symbols `isTainted()` does the query (by walking the SymExpr). So far, the threshold 10 worked for us, but this value can be overridden using the "max-tainted-symbol-complexity" config value. 
This new option is "deprecated" from the getgo, as I expect this issue to be fixed within the next few months and I don't want users to override this value anyways. If they do, this message will let them know that they are on their own, and the next release may break them (as we no longer recognize this option if we drop it). Mitigates #89720 CPP-5414 --- .../StaticAnalyzer/Core/AnalyzerOptions.def | 5 ++ clang/lib/StaticAnalyzer/Checkers/Taint.cpp | 7 +++ clang/test/Analysis/analyzer-config.c | 1 + clang/test/Analysis/taint-generic.c | 49 ++++++++++++++++++- 4 files changed, 61 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def index 29aa6a3b8a16e7..737bc8e86cfb6a 100644 --- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def +++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def @@ -407,6 +407,11 @@ ANALYZER_OPTION( ANALYZER_OPTION(unsigned, MaxSymbolComplexity, "max-symbol-complexity", "The maximum complexity of symbolic constraint.", 35) +// HACK:https://discourse.llvm.org/t/rfc-make-istainted-and-complex-symbols-friends/79570 +// Ideally, we should get rid of this option soon. 
+ANALYZER_OPTION(unsigned, MaxTaintedSymbolComplexity, "max-tainted-symbol-complexity", + "[DEPRECATED] The maximum complexity of a symbol to carry taint", 9) + ANALYZER_OPTION(unsigned, MaxTimesInlineLarge, "max-times-inline-large", "The maximum times a large function could be inlined.", 32) diff --git a/clang/lib/StaticAnalyzer/Checkers/Taint.cpp b/clang/lib/StaticAnalyzer/Checkers/Taint.cpp index 6362c82b009d72..0bb5739db4b756 100644 --- a/clang/lib/StaticAnalyzer/Checkers/Taint.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/Taint.cpp @@ -12,6 +12,7 @@ #include "clang/StaticAnalyzer/Checkers/Taint.h" #include "clang/StaticAnalyzer/Core/BugReporter/BugReporter.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h" #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h" #include @@ -256,6 +257,12 @@ std::vector taint::getTaintedSymbolsImpl(ProgramStateRef State, if (!Sym) return TaintedSymbols; + // HACK:https://discourse.llvm.org/t/rfc-make-istainted-and-complex-symbols-friends/79570 + if (const auto &Opts = State->getAnalysisManager().getAnalyzerOptions(); + Sym->computeComplexity() > Opts.MaxTaintedSymbolComplexity) { + return {}; + } + // Traverse all the symbols this symbol depends on to see if any are tainted. 
for (SymbolRef SubSym : Sym->symbols()) { if (!isa(SubSym)) diff --git a/clang/test/Analysis/analyzer-config.c b/clang/test/Analysis/analyzer-config.c index b8dbcdd7e55afe..8eb869bac46f8f 100644 --- a/clang/test/Analysis/analyzer-config.c +++ b/clang/test/Analysis/analyzer-config.c @@ -94,6 +94,7 @@ // CHECK-NEXT: max-inlinable-size = 100 // CHECK-NEXT: max-nodes = 225000 // CHECK-NEXT: max-symbol-complexity = 35 +// CHECK-NEXT: max-tainted-symbol-complexity = 9 // CHECK-NEXT: max-times-inline-large = 32 // CHECK-NEXT: min-cfg-size-treat-functions-as-large = 14 // CHECK-NEXT: mode = deep diff --git a/clang/test/Analysis/taint-generic.c b/clang/test/Analysis/taint-generic.c index b0df85f237298d..1c139312734bca 100644 --- a/clang/test/Analysis/taint-generic.c +++ b/clang/test/Analysis/taint-generic.c @@ -63,6 +63,7 @@ void clang_analyzer_isTainted_char(char); void clang_analyzer_isTainted_wchar(wchar_t); void clang_analyzer_isTainted_charp(char*); void clang_analyzer_isTainted_int(int); +void clang_analyzer_dump_int(int); int coin(); @@ -459,7 +460,53 @@ unsigned radar11369570_hanging(const unsigned char *arr, int l) { longcmp(a, t, c); l -= 12; } - return 5/a; // expected-warning {{Division by a tainted value, possibly zero}} + return 5/a; // FIXME: Should be a "div by tainted" warning here. +} + +// This computation used to take a very long time. +void complex_taint_queries(const int *p) { + int tainted = 0; + scanf("%d", &tainted); + + // Make "tmp" tainted. + int tmp = tainted + tainted; + clang_analyzer_isTainted_int(tmp); // expected-warning{{YES}} + + // Make "tmp" SymExpr a lot more complicated by applying computation. + // This should balloon the symbol complexity. 
+ tmp += p[0] + p[0]; + tmp += p[1] + p[1]; + tmp += p[2] + p[2]; + clang_analyzer_dump_int(tmp); // expected-warning{{((((conj_}} symbol complexity: 8 + clang_analyzer_isTainted_int(tmp); // expected-warning{{YES}} + + tmp += p[3] + p[3]; + clang_analyzer_dump_int(tmp); // expected-warning{{(((((conj_}} symbol complexity: 10 + clang_analyzer_isTainted_int(tmp); // expected-warning{{NO}} 10 is already too complex to be traversed + + tmp += p[4] + p[4]; + tmp += p[5] + p[5]; + tmp += p[6] + p[6]; + tmp += p[7] + p[7]; + tmp += p[8] + p[8]; + tmp += p[9] + p[9]; + tmp += p[10] + p[10]; + tmp += p[11] + p[11]; + tmp += p[12] + p[12]; + tmp += p[13] + p[13]; + tmp += p[14] + p[14]; + tmp += p[15] + p[15]; + + // The SymExpr still holds the full history of the computation, yet, "isTainted" doesn't traverse the tree as the complexity is over the threshold. + clang_analyzer_dump_int(tmp); + // expected-warning@-1{{(((((((((((((((((conj_}} symbol complexity: 34 + clang_analyzer_isTainted_int(tmp); // expected-warning{{NO}} FIXME: Ideally, this should still result in "tainted". + + // By making it even one step more complex, then it would hit the "max-symbol-complexity" + // threshold and the engine would cut the SymExpr and replace it by a new conjured symbol. + tmp += p[16]; + clang_analyzer_dump_int(tmp); // expected-warning{{conj_}} symbol complexity: 1 + clang_analyzer_isTainted_int(tmp); // expected-warning{{NO}} } // Check that we do not assert of the following code. From 2704b804bec50c2b016bf678bd534c330ec655b6 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Wed, 21 Aug 2024 10:36:41 +0200 Subject: [PATCH 055/426] Revert "[asan] Remove debug tracing from `report_globals` (#104404)" This caused SanitizerCommon-asan-x86_64-Darwin :: Darwin/print-stack-trace-in-code-loaded-after-fork.cpp to fail, see comment on the PR. > Printing globals registration is internal debug > tracing and should be controlled with verbosity. 
This reverts commit 68f6e7467651f38e0b97343bfbc49e0ce69eaedf and follow-up commit ef6760116fa2fa21f78e7a3b499f77e1a3eb7b92. --- compiler-rt/lib/asan/asan_flags.inc | 7 +++++-- compiler-rt/lib/asan/asan_globals.cpp | 19 +++++++++++-------- .../Linux/initialization-nobug-lld.cpp | 2 +- .../Linux/odr_indicator_unregister.cpp | 2 +- .../asan/TestCases/Linux/odr_indicators.cpp | 4 ++-- .../TestCases/Windows/dll_global_dead_strip.c | 4 ++-- ...eport_globals_symbolization_at_startup.cpp | 2 +- .../TestCases/Windows/global_dead_strip.c | 4 ++-- .../Windows/report_globals_vs_freelibrary.cpp | 2 +- .../asan/TestCases/initialization-nobug.cpp | 8 ++++---- 10 files changed, 30 insertions(+), 24 deletions(-) diff --git a/compiler-rt/lib/asan/asan_flags.inc b/compiler-rt/lib/asan/asan_flags.inc index 5e0ced9706e664..fad1577d912a5e 100644 --- a/compiler-rt/lib/asan/asan_flags.inc +++ b/compiler-rt/lib/asan/asan_flags.inc @@ -36,8 +36,11 @@ ASAN_FLAG(int, max_redzone, 2048, ASAN_FLAG( bool, debug, false, "If set, prints some debugging information and does additional checks.") -ASAN_FLAG(bool, report_globals, true, - "If set, detect and report errors on globals .") +ASAN_FLAG( + int, report_globals, 1, + "Controls the way to handle globals (0 - don't detect buffer overflow on " + "globals, 1 - detect buffer overflow, 2 - print data about registered " + "globals).") ASAN_FLAG(bool, check_initialization_order, false, "If set, attempts to catch initialization order issues.") ASAN_FLAG( diff --git a/compiler-rt/lib/asan/asan_globals.cpp b/compiler-rt/lib/asan/asan_globals.cpp index a1211430b1268a..c83b782cb85f89 100644 --- a/compiler-rt/lib/asan/asan_globals.cpp +++ b/compiler-rt/lib/asan/asan_globals.cpp @@ -22,7 +22,6 @@ #include "asan_thread.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_dense_map.h" -#include "sanitizer_common/sanitizer_internal_defs.h" #include "sanitizer_common/sanitizer_list.h" #include "sanitizer_common/sanitizer_mutex.h" 
#include "sanitizer_common/sanitizer_placement_new.h" @@ -180,7 +179,7 @@ int GetGlobalsForAddress(uptr addr, Global *globals, u32 *reg_sites, int res = 0; for (const auto &l : list_of_all_globals) { const Global &g = *l.g; - if (UNLIKELY(common_flags()->verbosity >= 3)) + if (flags()->report_globals >= 2) ReportGlobal(g, "Search"); if (IsAddressNearGlobal(addr, g)) { internal_memcpy(&globals[res], &g, sizeof(g)); @@ -271,7 +270,7 @@ static inline bool UseODRIndicator(const Global *g) { // so we store the globals in a map. static void RegisterGlobal(const Global *g) SANITIZER_REQUIRES(mu_for_globals) { CHECK(AsanInited()); - if (UNLIKELY(common_flags()->verbosity >= 3)) + if (flags()->report_globals >= 2) ReportGlobal(*g, "Added"); CHECK(flags()->report_globals); CHECK(AddrIsInMem(g->beg)); @@ -308,7 +307,7 @@ static void RegisterGlobal(const Global *g) SANITIZER_REQUIRES(mu_for_globals) { static void UnregisterGlobal(const Global *g) SANITIZER_REQUIRES(mu_for_globals) { CHECK(AsanInited()); - if (UNLIKELY(common_flags()->verbosity >= 3)) + if (flags()->report_globals >= 2) ReportGlobal(*g, "Removed"); CHECK(flags()->report_globals); CHECK(AddrIsInMem(g->beg)); @@ -439,7 +438,7 @@ void __asan_register_globals(__asan_global *globals, uptr n) { } GlobalRegistrationSite site = {stack_id, &globals[0], &globals[n - 1]}; global_registration_site_vector->push_back(site); - if (UNLIKELY(common_flags()->verbosity >= 3)) { + if (flags()->report_globals >= 2) { PRINT_CURRENT_STACK(); Printf("=== ID %d; %p %p\n", stack_id, (void *)&globals[0], (void *)&globals[n - 1]); @@ -498,7 +497,9 @@ void __asan_before_dynamic_init(const char *module_name) { Lock lock(&mu_for_globals); if (current_dynamic_init_module_name == module_name) return; - VPrintf(2, "DynInitPoison module: %s\n", module_name); + if (flags()->report_globals >= 3) + Printf("DynInitPoison module: %s\n", module_name); + if (current_dynamic_init_module_name == nullptr) { // First call, poison all globals from other 
modules. DynInitGlobals().forEach([&](auto &kv) { @@ -544,7 +545,8 @@ static void UnpoisonBeforeMain(void) { return; allow_after_dynamic_init = true; } - VPrintf(2, "UnpoisonBeforeMain\n"); + if (flags()->report_globals >= 3) + Printf("UnpoisonBeforeMain\n"); __asan_after_dynamic_init(); } @@ -568,7 +570,8 @@ void __asan_after_dynamic_init() { if (!current_dynamic_init_module_name) return; - VPrintf(2, "DynInitUnpoison\n"); + if (flags()->report_globals >= 3) + Printf("DynInitUnpoison\n"); DynInitGlobals().forEach([&](auto &kv) { UnpoisonDynamicGlobals(kv.second, /*mark_initialized=*/false); diff --git a/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp b/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp index ef82c7a29575eb..5cec029811cbc8 100644 --- a/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx_asan -O3 %S/../initialization-nobug.cpp %S/../Helpers/initialization-nobug-extra.cpp -fuse-ld=lld -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInit" +// RUN: %clangxx_asan -O3 %S/../initialization-nobug.cpp %S/../Helpers/initialization-nobug-extra.cpp -fuse-ld=lld -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInit" // Same as initialization-nobug.cpp, but with lld we expect just one // `DynInitUnpoison` executed after `AfterDynamicInit` at the end. 
diff --git a/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp b/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp index b75f5be101ef8a..0f2ed6597154bb 100644 --- a/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp @@ -4,7 +4,7 @@ // RUN: %clangxx_asan -g -O0 -DSHARED_LIB -DSIZE=1 %s -fPIC -shared -o %t-so-1.so // RUN: %clangxx_asan -g -O0 -DSHARED_LIB -DSIZE=2 %s -fPIC -shared -o %t-so-2.so // RUN: %clangxx_asan -g -O0 %s %libdl -Wl,--export-dynamic -o %t -// RUN: %env_asan_opts=report_globals=1:detect_odr_violation=1:verbosity=3 %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=report_globals=2:detect_odr_violation=1 %run %t 2>&1 | FileCheck %s // FIXME: Checks do not match on Android. // UNSUPPORTED: android diff --git a/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp b/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp index f28a9f6d07386d..8af3ec09be78c4 100644 --- a/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp @@ -1,8 +1,8 @@ // RUN: %clangxx_asan -fno-sanitize-address-use-odr-indicator -fPIC %s -o %t -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR0 +// RUN: %env_asan_opts=report_globals=2 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR0 // RUN: %clangxx_asan -fsanitize-address-use-odr-indicator -fPIC %s -o %t -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR1 +// RUN: %env_asan_opts=report_globals=2 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR1 #include diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c b/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c index e5bd27bdf65fdf..a0c96622efeea4 100644 --- 
a/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c +++ b/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c @@ -1,11 +1,11 @@ // RUN: %clang_cl_asan %Od %p/dll_host.cpp %Fe%t // // RUN: %clang_cl_nocxx_asan %Gw %LD %Od %s %Fe%t.dll -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=NOSTRIP +// RUN: %env_asan_opts=report_globals=2 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=NOSTRIP // RUN: %clang_cl_nocxx_asan %Gw %LD -O2 %s %Fe%t.dll \ // RUN: %if target={{.*-windows-gnu}} %{ -Wl,--gc-sections %} \ // RUN: %else %{ -link -opt:ref %} -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=STRIP +// RUN: %env_asan_opts=report_globals=2 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=STRIP #include diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp index c74b66f2b43b3e..06a632e6708b1e 100644 --- a/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp +++ b/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cl_asan %LD %Od -DDLL %s %Fe%t.dll \ // RUN: %if target={{.*-windows-gnu}} %{ -Wl,--out-implib,%t.lib %} // RUN: %clang_cl_asan %Od -DEXE %s %t.lib %Fe%te.exe -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %te.exe 2>&1 | FileCheck %s +// RUN: %env_asan_opts=report_globals=2 %run %te.exe 2>&1 | FileCheck %s // FIXME: Currently, the MT runtime build crashes on startup due to dbghelp.dll // initialization failure. 
diff --git a/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c b/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c index 7f2405fdfc8364..0e15120a46f776 100644 --- a/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c +++ b/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c @@ -1,9 +1,9 @@ // RUN: %clang_cl_nocxx_asan %Gw %Od %s %Fe%t.exe -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %t.exe 2>&1 | FileCheck %s --check-prefix=NOSTRIP +// RUN: %env_asan_opts=report_globals=2 %t.exe 2>&1 | FileCheck %s --check-prefix=NOSTRIP // RUN: %clang_cl_nocxx_asan %Gw -O2 %s %Fe%t.exe \ // RUN: %if target={{.*-windows-gnu}} %{ -Wl,--gc-sections %} \ // RUN: %else %{ -link -opt:ref %} -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %t.exe 2>&1 | FileCheck %s --check-prefix=STRIP +// RUN: %env_asan_opts=report_globals=2 %t.exe 2>&1 | FileCheck %s --check-prefix=STRIP #include int dead_global = 42; diff --git a/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp b/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp index 34ce18e146d677..7cad3f39be1ec2 100644 --- a/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp +++ b/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp @@ -1,6 +1,6 @@ // RUN: %clang_cl_asan %LD %Od -DDLL %s %Fe%t.dll // RUN: %clang_cl_asan %Od -DEXE %s %Fe%te.exe -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %te.exe %t.dll 2>&1 | FileCheck %s +// RUN: %env_asan_opts=report_globals=2 %run %te.exe %t.dll 2>&1 | FileCheck %s #include #include diff --git a/compiler-rt/test/asan/TestCases/initialization-nobug.cpp b/compiler-rt/test/asan/TestCases/initialization-nobug.cpp index 61328b9de28ae6..f66d501124bc48 100644 --- a/compiler-rt/test/asan/TestCases/initialization-nobug.cpp +++ b/compiler-rt/test/asan/TestCases/initialization-nobug.cpp @@ -1,10 +1,10 @@ // A collection of various initializers which shouldn't trip up 
initialization // order checking. If successful, this will just return 0. -// RUN: %clangxx_asan -O0 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" -// RUN: %clangxx_asan -O1 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" -// RUN: %clangxx_asan -O2 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" -// RUN: %clangxx_asan -O3 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O0 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O1 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O2 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O3 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" // Simple access: // Make sure that accessing a global in the same TU is safe From 40eca60c5a273e7b89851c7c0b73b5f1037b29ed Mon Sep 17 00:00:00 2001 From: Zhikai Zeng Date: Wed, 21 Aug 2024 
20:33:17 +0800 Subject: [PATCH 056/426] [Clang] fix generic lambda inside requires-clause of friend function template (#99813) fixes https://github.com/llvm/llvm-project/issues/98258 The cause is that the assertion "Nothing should reference a value below the actual template depth" is incorrect since we can have a generic lambda inside requires-clause of friend function template, and the generic lambda can reference to values with greater template depth. --------- Co-authored-by: cor3ntin --- clang/docs/ReleaseNotes.rst | 2 ++ clang/lib/Sema/SemaTemplate.cpp | 10 ++-------- clang/test/SemaTemplate/concepts-friends.cpp | 21 ++++++++++++++++++++ 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 127b9541d5c5d8..5aedfc654e8dbb 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -290,6 +290,8 @@ Bug Fixes to C++ Support - Clang now properly handles the order of attributes in `extern` blocks. (#GH101990). - Fixed an assertion failure by preventing null explicit object arguments from being deduced. (#GH102025). - Correctly check constraints of explicit instantiations of member functions. (#GH46029) +- Fixed an assertion failure about a constraint of a friend function template references to a value with greater + template depth than the friend function template. 
(#GH98258) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 0afe6064bab185..992565701d40ca 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -1667,10 +1667,7 @@ class ConstraintRefersToContainingTemplateChecker } void CheckNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) { - assert(D->getDepth() <= TemplateDepth && - "Nothing should reference a value below the actual template depth, " - "depth is likely wrong"); - if (D->getDepth() != TemplateDepth) + if (D->getDepth() < TemplateDepth) Result = true; // Necessary because the type of the NTTP might be what refers to the parent @@ -1694,10 +1691,7 @@ class ConstraintRefersToContainingTemplateChecker using inherited::TransformTemplateTypeParmType; QualType TransformTemplateTypeParmType(TypeLocBuilder &TLB, TemplateTypeParmTypeLoc TL, bool) { - assert(TL.getDecl()->getDepth() <= TemplateDepth && - "Nothing should reference a value below the actual template depth, " - "depth is likely wrong"); - if (TL.getDecl()->getDepth() != TemplateDepth) + if (TL.getDecl()->getDepth() < TemplateDepth) Result = true; return inherited::TransformTemplateTypeParmType( TLB, TL, diff --git a/clang/test/SemaTemplate/concepts-friends.cpp b/clang/test/SemaTemplate/concepts-friends.cpp index 91b797034ed6cf..14b37d78d951dc 100644 --- a/clang/test/SemaTemplate/concepts-friends.cpp +++ b/clang/test/SemaTemplate/concepts-friends.cpp @@ -504,3 +504,24 @@ template struct Z; Y y(1); } + +namespace GH98258 { + +struct S { + template + friend void f() requires requires { [](V){}; } { + return; + } + + template + friend void f2() requires requires { [](auto){}; } { + return; + } + + template + friend void f3() requires requires { [](){ return X; }; } { + return; + } +}; + +} From 1e5f275f36a3758c7c3b06d0b9e975c4eea3d0af Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 21 Aug 2024 08:47:22 -0400 Subject: [PATCH 057/426] 
[libc++] Refactor the tests for mutex, recursive mutex and their timed counterparts (#104852) This refactoring is done to remove flakyness as described in https://github.com/llvm/llvm-project/pull/89083. --- ...mpile.fail.cpp => assign.compile.pass.cpp} | 12 +- ...le.fail.cpp => ctor.copy.compile.pass.cpp} | 11 +- ...default.pass.cpp => ctor.default.pass.cpp} | 20 ++- .../thread.mutex.class/lock.pass.cpp | 77 ++++++--- .../thread.mutex.class/try_lock.pass.cpp | 56 +++---- ...mpile.fail.cpp => assign.compile.pass.cpp} | 12 +- ...le.fail.cpp => ctor.copy.compile.pass.cpp} | 11 +- ...default.pass.cpp => ctor.default.pass.cpp} | 14 +- .../thread.mutex.recursive/lock.pass.cpp | 102 +++++++++--- .../thread.mutex.recursive/try_lock.pass.cpp | 87 ++++++---- ...mpile.fail.cpp => assign.compile.pass.cpp} | 12 +- ...le.fail.cpp => ctor.copy.compile.pass.cpp} | 11 +- ...default.pass.cpp => ctor.default.pass.cpp} | 11 +- .../thread.timedmutex.class/lock.pass.cpp | 75 ++++++--- .../thread.timedmutex.class/try_lock.pass.cpp | 54 +++---- .../try_lock_for.pass.cpp | 117 +++++++++----- .../try_lock_until.pass.cpp | 117 +++++++++----- ...mpile.fail.cpp => assign.compile.pass.cpp} | 12 +- ...le.fail.cpp => ctor.copy.compile.pass.cpp} | 11 +- ...default.pass.cpp => ctor.default.pass.cpp} | 11 +- .../thread.timedmutex.recursive/lock.pass.cpp | 102 +++++++++--- .../try_lock.pass.cpp | 85 ++++++---- .../try_lock_for.pass.cpp | 148 ++++++++++++------ .../try_lock_until.pass.cpp | 148 ++++++++++++------ 24 files changed, 838 insertions(+), 478 deletions(-) rename libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/{assign.compile.fail.cpp => assign.compile.pass.cpp} (80%) rename libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/{copy.compile.fail.cpp => ctor.copy.compile.pass.cpp} (79%) rename 
libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/{default.pass.cpp => ctor.default.pass.cpp} (64%) rename libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/{assign.compile.fail.cpp => assign.compile.pass.cpp} (79%) rename libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/{copy.compile.fail.cpp => ctor.copy.compile.pass.cpp} (79%) rename libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/{default.pass.cpp => ctor.default.pass.cpp} (75%) rename libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/{assign.compile.fail.cpp => assign.compile.pass.cpp} (80%) rename libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/{copy.compile.fail.cpp => ctor.copy.compile.pass.cpp} (79%) rename libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/{default.pass.cpp => ctor.default.pass.cpp} (77%) rename libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/{assign.compile.fail.cpp => assign.compile.pass.cpp} (79%) rename libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/{copy.compile.fail.cpp => ctor.copy.compile.pass.cpp} (78%) rename libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/{default.pass.cpp => ctor.default.pass.cpp} (77%) diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/assign.compile.fail.cpp 
b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/assign.compile.pass.cpp similarity index 80% rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/assign.compile.fail.cpp rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/assign.compile.pass.cpp index ba09ed1a706ea7..5f5274a6c0027a 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/assign.compile.fail.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/assign.compile.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: no-threads + // // class mutex; @@ -13,12 +15,6 @@ // mutex& operator=(const mutex&) = delete; #include +#include -int main(int, char**) -{ - std::mutex m0; - std::mutex m1; - m1 = m0; - - return 0; -} +static_assert(!std::is_copy_assignable::value, ""); diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/copy.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/ctor.copy.compile.pass.cpp similarity index 79% rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/copy.compile.fail.cpp rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/ctor.copy.compile.pass.cpp index 9edfb7267dee6f..74d0dfda41ad1b 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/copy.compile.fail.cpp +++ 
b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/ctor.copy.compile.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: no-threads + // // class mutex; @@ -13,11 +15,6 @@ // mutex(const mutex&) = delete; #include +#include -int main(int, char**) -{ - std::mutex m0; - std::mutex m1(m0); - - return 0; -} +static_assert(!std::is_copy_constructible::value, ""); diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/default.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/ctor.default.pass.cpp similarity index 64% rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/default.pass.cpp rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/ctor.default.pass.cpp index d8115cd1bf3a08..7bda43087063af 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/default.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/ctor.default.pass.cpp @@ -5,24 +5,28 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// + // UNSUPPORTED: no-threads // // class mutex; -// mutex(); +// mutex() noexcept; #include +#include #include -#include "test_macros.h" +static_assert(std::is_nothrow_default_constructible::value, ""); + +int main(int, char**) { + // The mutex is unlocked after default construction + { + std::mutex m; + assert(m.try_lock()); + m.unlock(); + } -int main(int, char**) -{ - 
static_assert(std::is_nothrow_default_constructible::value, ""); - std::mutex m; - ((void)m); return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp index b3e76cf886c4d4..e2bd2de84c33ce 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/lock.pass.cpp @@ -5,9 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// + +// UNSUPPORTED: c++03 // UNSUPPORTED: no-threads -// ALLOW_RETRIES: 2 // @@ -15,40 +15,67 @@ // void lock(); -#include -#include -#include #include +#include +#include #include +#include #include "make_test_thread.h" -#include "test_macros.h" -std::mutex m; - -typedef std::chrono::system_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; - -void f() -{ - time_point t0 = Clock::now(); +int main(int, char**) { + // Lock a mutex that is not locked yet. This should succeed. + { + std::mutex m; m.lock(); - time_point t1 = Clock::now(); m.unlock(); - ns d = t1 - t0 - ms(250); - assert(d < ms(50)); // within 50ms -} + } -int main(int, char**) -{ + // Lock a mutex that is already locked. This should block until it is unlocked. 
+ { + std::atomic ready(false); + std::mutex m; m.lock(); - std::thread t = support::make_test_thread(f); - std::this_thread::sleep_for(ms(250)); + std::atomic is_locked_from_main(true); + + std::thread t = support::make_test_thread([&] { + ready = true; + m.lock(); + assert(!is_locked_from_main); + m.unlock(); + }); + + while (!ready) + /* spin */; + + // We would rather signal this after we unlock, but that would create a race condition. + // We instead signal it before we unlock, which means that it's technically possible for + // the thread to take the lock while main is still holding it yet for the test to still pass. + is_locked_from_main = false; m.unlock(); + t.join(); + } + + // Make sure that at most one thread can acquire the mutex concurrently. + { + std::atomic counter(0); + std::mutex mutex; + + std::vector threads; + for (int i = 0; i != 10; ++i) { + threads.push_back(support::make_test_thread([&] { + mutex.lock(); + counter++; + assert(counter == 1); + counter--; + mutex.unlock(); + })); + } + + for (auto& t : threads) + t.join(); + } return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp index bf3cb6530b3b94..db8b809c08d365 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.class/try_lock.pass.cpp @@ -5,9 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// + +// UNSUPPORTED: c++03 // UNSUPPORTED: no-threads -// ALLOW_RETRIES: 2 // @@ -15,44 +15,36 @@ // bool try_lock(); -#include -#include -#include #include 
+#include #include #include "make_test_thread.h" -#include "test_macros.h" - -std::mutex m; - -typedef std::chrono::system_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; - -void f() -{ - time_point t0 = Clock::now(); - assert(!m.try_lock()); - assert(!m.try_lock()); - assert(!m.try_lock()); - while(!m.try_lock()) - ; - time_point t1 = Clock::now(); + +int main(int, char**) { + // Try to lock a mutex that is not locked yet. This should succeed. + { + std::mutex m; + bool succeeded = m.try_lock(); + assert(succeeded); m.unlock(); - ns d = t1 - t0 - ms(250); - assert(d < ms(200)); // within 200ms -} + } -int main(int, char**) -{ + // Try to lock a mutex that is already locked. This should fail. + { + std::mutex m; m.lock(); - std::thread t = support::make_test_thread(f); - std::this_thread::sleep_for(ms(250)); - m.unlock(); + + std::thread t = support::make_test_thread([&] { + for (int i = 0; i != 10; ++i) { + bool succeeded = m.try_lock(); + assert(!succeeded); + } + }); t.join(); + m.unlock(); + } + return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/assign.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/assign.compile.pass.cpp similarity index 79% rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/assign.compile.fail.cpp rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/assign.compile.pass.cpp index 0cf3c5bca1e1b4..fadd9a7cae28cc 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/assign.compile.fail.cpp +++ 
b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/assign.compile.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: no-threads + // // class recursive_mutex; @@ -13,12 +15,6 @@ // recursive_mutex& operator=(const recursive_mutex&) = delete; #include +#include -int main(int, char**) -{ - std::recursive_mutex m0; - std::recursive_mutex m1; - m1 = m0; - - return 0; -} +static_assert(!std::is_copy_assignable::value, ""); diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/copy.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/ctor.copy.compile.pass.cpp similarity index 79% rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/copy.compile.fail.cpp rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/ctor.copy.compile.pass.cpp index 454d7797373cac..bd63224f35d70c 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/copy.compile.fail.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/ctor.copy.compile.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: no-threads + // // class recursive_mutex; @@ -13,11 +15,6 @@ // recursive_mutex(const recursive_mutex&) = delete; #include +#include -int main(int, char**) -{ - std::recursive_mutex m0; - std::recursive_mutex m1(m0); - - return 0; -} +static_assert(!std::is_copy_constructible::value, ""); diff --git 
a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/default.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/ctor.default.pass.cpp similarity index 75% rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/default.pass.cpp rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/ctor.default.pass.cpp index 43dc38d7cab517..cd2694e8c43c8b 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/default.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/ctor.default.pass.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// + // UNSUPPORTED: no-threads // @@ -15,12 +15,16 @@ // recursive_mutex(); #include +#include +#include -#include "test_macros.h" - -int main(int, char**) -{ +int main(int, char**) { + // The mutex is unlocked after default construction + { std::recursive_mutex m; + assert(m.try_lock()); + m.unlock(); + } return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/lock.pass.cpp index d9bff9b3cbda52..344667fa705d2c 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/lock.pass.cpp 
@@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03 // UNSUPPORTED: no-threads -// ALLOW_RETRIES: 2 // @@ -15,42 +15,96 @@ // void lock(); -#include -#include -#include #include +#include +#include #include +#include #include "make_test_thread.h" -#include "test_macros.h" -std::recursive_mutex m; +bool is_lockable(std::recursive_mutex& m) { + bool did_lock; + std::thread t = support::make_test_thread([&] { + did_lock = m.try_lock(); + if (did_lock) + m.unlock(); // undo side effects + }); + t.join(); -typedef std::chrono::system_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; + return did_lock; +} -void f() -{ - time_point t0 = Clock::now(); - m.lock(); - time_point t1 = Clock::now(); +int main(int, char**) { + // Lock a mutex that is not locked yet. This should succeed. + { + std::recursive_mutex m; m.lock(); m.unlock(); - m.unlock(); - ns d = t1 - t0 - ms(250); - assert(d < ms(200)); // within 200ms -} + } + + // Lock a mutex that is already locked by this thread. This should succeed and the mutex should only + // be unlocked after a matching number of calls to unlock() on the same thread. + { + std::recursive_mutex m; + int lock_count = 0; + for (int i = 0; i != 10; ++i) { + m.lock(); + ++lock_count; + } + while (lock_count != 0) { + assert(!is_lockable(m)); + m.unlock(); + --lock_count; + } + assert(is_lockable(m)); + } -int main(int, char**) -{ + // Lock a mutex that is already locked by another thread. This should block until it is unlocked. 
+ { + std::atomic ready(false); + std::recursive_mutex m; m.lock(); - std::thread t = support::make_test_thread(f); - std::this_thread::sleep_for(ms(250)); + std::atomic is_locked_from_main(true); + + std::thread t = support::make_test_thread([&] { + ready = true; + m.lock(); + assert(!is_locked_from_main); + m.unlock(); + }); + + while (!ready) + /* spin */; + + // We would rather signal this after we unlock, but that would create a race condition. + // We instead signal it before we unlock, which means that it's technically possible for + // the thread to take the lock while main is still holding it yet for the test to still pass. + is_locked_from_main = false; m.unlock(); + t.join(); + } + + // Make sure that at most one thread can acquire the mutex concurrently. + { + std::atomic counter(0); + std::recursive_mutex mutex; + + std::vector threads; + for (int i = 0; i != 10; ++i) { + threads.push_back(support::make_test_thread([&] { + mutex.lock(); + counter++; + assert(counter == 1); + counter--; + mutex.unlock(); + })); + } + + for (auto& t : threads) + t.join(); + } return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/try_lock.pass.cpp index 1247c1ce1ba5fd..96073eb345306d 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/try_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.mutex.requirements.mutex/thread.mutex.recursive/try_lock.pass.cpp @@ -5,9 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// + +// UNSUPPORTED: c++03 // UNSUPPORTED: no-threads -// ALLOW_RETRIES: 2 // @@ -15,46 +15,67 @@ // bool try_lock(); +#include 
+#include #include #include -#include -#include #include #include "make_test_thread.h" -#include "test_macros.h" - -std::recursive_mutex m; - -typedef std::chrono::system_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; - -void f() -{ - time_point t0 = Clock::now(); - assert(!m.try_lock()); - assert(!m.try_lock()); - assert(!m.try_lock()); - while(!m.try_lock()) - ; - time_point t1 = Clock::now(); - assert(m.try_lock()); - m.unlock(); - m.unlock(); - ns d = t1 - t0 - ms(250); - assert(d < ms(200)); // within 200ms + +bool is_lockable(std::recursive_mutex& m) { + bool did_lock; + std::thread t = support::make_test_thread([&] { + did_lock = m.try_lock(); + if (did_lock) + m.unlock(); // undo side effects + }); + t.join(); + + return did_lock; } -int main(int, char**) -{ - m.lock(); - std::thread t = support::make_test_thread(f); - std::this_thread::sleep_for(ms(250)); +int main(int, char**) { + // Try to lock a mutex that is not locked yet. This should succeed. + { + std::recursive_mutex m; + bool succeeded = m.try_lock(); + assert(succeeded); m.unlock(); + } + + // Try to lock a mutex that is already locked by this thread. This should succeed and the mutex should only + // be unlocked after a matching number of calls to unlock() on the same thread. + { + std::recursive_mutex m; + int lock_count = 0; + for (int i = 0; i != 10; ++i) { + assert(m.try_lock()); + ++lock_count; + } + while (lock_count != 0) { + assert(!is_lockable(m)); + m.unlock(); + --lock_count; + } + assert(is_lockable(m)); + } + + // Try to lock a mutex that is already locked by another thread. This should fail. 
+ { + std::recursive_mutex m; + m.lock(); + + std::thread t = support::make_test_thread([&] { + for (int i = 0; i != 10; ++i) { + bool succeeded = m.try_lock(); + assert(!succeeded); + } + }); t.join(); + m.unlock(); + } + return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/assign.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/assign.compile.pass.cpp similarity index 80% rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/assign.compile.fail.cpp rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/assign.compile.pass.cpp index d0fabc678f26c7..a046a875b3df5c 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/assign.compile.fail.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/assign.compile.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: no-threads + // // class timed_mutex; @@ -13,12 +15,6 @@ // timed_mutex& operator=(const timed_mutex&) = delete; #include +#include -int main(int, char**) -{ - std::timed_mutex m0; - std::timed_mutex m1; - m1 = m0; - - return 0; -} +static_assert(!std::is_copy_assignable::value, ""); diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/copy.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/ctor.copy.compile.pass.cpp similarity index 79% rename from 
libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/copy.compile.fail.cpp rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/ctor.copy.compile.pass.cpp index a3efb2feeeedae..3e3a01c6b5b197 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/copy.compile.fail.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/ctor.copy.compile.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: no-threads + // // class timed_mutex; @@ -13,11 +15,6 @@ // timed_mutex(const timed_mutex&) = delete; #include +#include -int main(int, char**) -{ - std::timed_mutex m0; - std::timed_mutex m1(m0); - - return 0; -} +static_assert(!std::is_copy_constructible::value, ""); diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/ctor.default.pass.cpp similarity index 77% rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/ctor.default.pass.cpp index c7f207372ac421..9bb2d93c5b2103 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/default.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/ctor.default.pass.cpp @@ -15,12 +15,15 @@ // timed_mutex(); #include 
- -#include "test_macros.h" +#include int main(int, char**) { - std::timed_mutex m; - (void)m; + // The mutex is unlocked after default construction + { + std::timed_mutex m; + assert(m.try_lock()); + m.unlock(); + } return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp index a71bd3d38b2c37..8893d389becef2 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/lock.pass.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03 // UNSUPPORTED: no-threads -// ALLOW_RETRIES: 2 // @@ -15,40 +15,67 @@ // void lock(); -#include -#include -#include #include +#include +#include #include +#include #include "make_test_thread.h" -#include "test_macros.h" - -std::timed_mutex m; -typedef std::chrono::system_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; - -void f() -{ - time_point t0 = Clock::now(); +int main(int, char**) { + // Lock a mutex that is not locked yet. This should succeed. + { + std::timed_mutex m; m.lock(); - time_point t1 = Clock::now(); m.unlock(); - ns d = t1 - t0 - ms(250); - assert(d < ms(50)); // within 50ms -} + } -int main(int, char**) -{ + // Lock a mutex that is already locked. This should block until it is unlocked. 
+ { + std::atomic ready(false); + std::timed_mutex m; m.lock(); - std::thread t = support::make_test_thread(f); - std::this_thread::sleep_for(ms(250)); + std::atomic is_locked_from_main(true); + + std::thread t = support::make_test_thread([&] { + ready = true; + m.lock(); + assert(!is_locked_from_main); + m.unlock(); + }); + + while (!ready) + /* spin */; + + // We would rather signal this after we unlock, but that would create a race condition. + // We instead signal it before we unlock, which means that it's technically possible for + // the thread to take the lock while main is still holding it yet for the test to still pass. + is_locked_from_main = false; m.unlock(); + t.join(); + } + + // Make sure that at most one thread can acquire the mutex concurrently. + { + std::atomic counter(0); + std::timed_mutex mutex; + + std::vector threads; + for (int i = 0; i != 10; ++i) { + threads.push_back(support::make_test_thread([&] { + mutex.lock(); + counter++; + assert(counter == 1); + counter--; + mutex.unlock(); + })); + } + + for (auto& t : threads) + t.join(); + } return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp index f3942ccb9d8603..9a4c68bc120933 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock.pass.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03 // UNSUPPORTED: no-threads -// ALLOW_RETRIES: 2 // @@ -15,44 +15,36 @@ // bool try_lock(); -#include -#include -#include #include +#include #include #include 
"make_test_thread.h" -#include "test_macros.h" - -std::timed_mutex m; - -typedef std::chrono::system_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; - -void f() -{ - time_point t0 = Clock::now(); - assert(!m.try_lock()); - assert(!m.try_lock()); - assert(!m.try_lock()); - while(!m.try_lock()) - ; - time_point t1 = Clock::now(); + +int main(int, char**) { + // Try to lock a mutex that is not locked yet. This should succeed. + { + std::timed_mutex m; + bool succeeded = m.try_lock(); + assert(succeeded); m.unlock(); - ns d = t1 - t0 - ms(250); - assert(d < ms(200)); // within 200ms -} + } -int main(int, char**) -{ + // Try to lock a mutex that is already locked. This should fail. + { + std::timed_mutex m; m.lock(); - std::thread t = support::make_test_thread(f); - std::this_thread::sleep_for(ms(250)); - m.unlock(); + + std::thread t = support::make_test_thread([&] { + for (int i = 0; i != 10; ++i) { + bool succeeded = m.try_lock(); + assert(!succeeded); + } + }); t.join(); + m.unlock(); + } + return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_for.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_for.pass.cpp index acfa5560962ae3..b1882ad61c6036 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_for.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_for.pass.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03 // UNSUPPORTED: no-threads -// ALLOW_RETRIES: 2 // @@ -17,56 +17,89 @@ // bool try_lock_for(const chrono::duration& 
rel_time); #include -#include -#include +#include #include +#include +#include #include "make_test_thread.h" -#include "test_macros.h" - -std::timed_mutex m; -typedef std::chrono::steady_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; +template +std::chrono::microseconds measure(Function f) { + std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); + f(); + std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast(end - start); +} -void f1() -{ - time_point t0 = Clock::now(); - assert(m.try_lock_for(ms(300)) == true); - time_point t1 = Clock::now(); +int main(int, char**) { + // Try to lock a mutex that is not locked yet. This should succeed immediately. + { + std::timed_mutex m; + bool succeeded = m.try_lock_for(std::chrono::milliseconds(1)); + assert(succeeded); m.unlock(); - ns d = t1 - t0 - ms(250); - assert(d < ms(50)); // within 50ms -} + } -void f2() -{ - time_point t0 = Clock::now(); - assert(m.try_lock_for(ms(250)) == false); - time_point t1 = Clock::now(); - ns d = t1 - t0 - ms(250); - assert(d < ms(50)); // within 50ms -} + // Try to lock an already-locked mutex for a long enough amount of time and succeed. + // This is technically flaky, but we use such long durations that it should pass even + // in slow or contended environments. 
+ { + std::chrono::milliseconds const wait_time(500); + std::chrono::milliseconds const tolerance = wait_time * 3; + std::atomic ready(false); -int main(int, char**) -{ - { - m.lock(); - std::thread t = support::make_test_thread(f1); - std::this_thread::sleep_for(ms(250)); - m.unlock(); - t.join(); - } - { - m.lock(); - std::thread t = support::make_test_thread(f2); - std::this_thread::sleep_for(ms(300)); + std::timed_mutex m; + m.lock(); + + std::thread t = support::make_test_thread([&] { + auto elapsed = measure([&] { + ready = true; + bool succeeded = m.try_lock_for(wait_time); + assert(succeeded); m.unlock(); - t.join(); - } + }); + + // Ensure we didn't wait significantly longer than our timeout. This is technically + // flaky and non-conforming because an implementation is free to block for arbitrarily + // long, but any decent quality implementation should pass this test. + assert(elapsed - wait_time < tolerance); + }); + + // Wait for the thread to be ready to take the lock before we unlock it from here, otherwise + // there's a high chance that we're not testing the "locking an already locked" mutex use case. + // There is still technically a race condition here. + while (!ready) + /* spin */; + std::this_thread::sleep_for(wait_time / 5); + + m.unlock(); // this should allow the thread to lock 'm' + t.join(); + } + + // Try to lock an already-locked mutex for a short amount of time and fail. + // Again, this is technically flaky but we use such long durations that it should work. + { + std::chrono::milliseconds const wait_time(10); + std::chrono::milliseconds const tolerance(750); // in case the thread we spawned goes to sleep or something + + std::timed_mutex m; + m.lock(); + + std::thread t = support::make_test_thread([&] { + auto elapsed = measure([&] { + bool succeeded = m.try_lock_for(wait_time); + assert(!succeeded); + }); + + // Ensure we failed within some bounded time. 
+ assert(elapsed - wait_time < tolerance); + }); + + t.join(); + + m.unlock(); + } return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_until.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_until.pass.cpp index 23385c100807d5..72471ed07dcffc 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_until.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.class/try_lock_until.pass.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03 // UNSUPPORTED: no-threads -// ALLOW_RETRIES: 2 // @@ -17,56 +17,89 @@ // bool try_lock_until(const chrono::time_point& abs_time); #include -#include -#include +#include #include +#include +#include #include "make_test_thread.h" -#include "test_macros.h" - -std::timed_mutex m; -typedef std::chrono::steady_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; +template +std::chrono::microseconds measure(Function f) { + std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); + f(); + std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast(end - start); +} -void f1() -{ - time_point t0 = Clock::now(); - assert(m.try_lock_until(Clock::now() + ms(300)) == true); - time_point t1 = Clock::now(); +int main(int, char**) { + // Try to lock a mutex that is not locked yet. This should succeed immediately. 
+ { + std::timed_mutex m; + bool succeeded = m.try_lock_until(std::chrono::steady_clock::now() + std::chrono::milliseconds(1)); + assert(succeeded); m.unlock(); - ns d = t1 - t0 - ms(250); - assert(d < ms(50)); // within 50ms -} + } -void f2() -{ - time_point t0 = Clock::now(); - assert(m.try_lock_until(Clock::now() + ms(250)) == false); - time_point t1 = Clock::now(); - ns d = t1 - t0 - ms(250); - assert(d < ms(50)); // within 50ms -} + // Try to lock an already-locked mutex for a long enough amount of time and succeed. + // This is technically flaky, but we use such long durations that it should pass even + // in slow or contended environments. + { + std::chrono::milliseconds const wait_time(500); + std::chrono::milliseconds const tolerance = wait_time * 3; + std::atomic ready(false); -int main(int, char**) -{ - { - m.lock(); - std::thread t = support::make_test_thread(f1); - std::this_thread::sleep_for(ms(250)); - m.unlock(); - t.join(); - } - { - m.lock(); - std::thread t = support::make_test_thread(f2); - std::this_thread::sleep_for(ms(300)); + std::timed_mutex m; + m.lock(); + + std::thread t = support::make_test_thread([&] { + auto elapsed = measure([&] { + ready = true; + bool succeeded = m.try_lock_until(std::chrono::steady_clock::now() + wait_time); + assert(succeeded); m.unlock(); - t.join(); - } + }); + + // Ensure we didn't wait significantly longer than our timeout. This is technically + // flaky and non-conforming because an implementation is free to block for arbitrarily + // long, but any decent quality implementation should pass this test. + assert(elapsed - wait_time < tolerance); + }); + + // Wait for the thread to be ready to take the lock before we unlock it from here, otherwise + // there's a high chance that we're not testing the "locking an already locked" mutex use case. + // There is still technically a race condition here. 
+ while (!ready) + /* spin */; + std::this_thread::sleep_for(wait_time / 5); + + m.unlock(); // this should allow the thread to lock 'm' + t.join(); + } + + // Try to lock an already-locked mutex for a short amount of time and fail. + // Again, this is technically flaky but we use such long durations that it should work. + { + std::chrono::milliseconds const wait_time(10); + std::chrono::milliseconds const tolerance(750); // in case the thread we spawned goes to sleep or something + + std::timed_mutex m; + m.lock(); + + std::thread t = support::make_test_thread([&] { + auto elapsed = measure([&] { + bool succeeded = m.try_lock_until(std::chrono::steady_clock::now() + wait_time); + assert(!succeeded); + }); + + // Ensure we failed within some bounded time. + assert(elapsed - wait_time < tolerance); + }); + + t.join(); + + m.unlock(); + } return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/assign.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/assign.compile.pass.cpp similarity index 79% rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/assign.compile.fail.cpp rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/assign.compile.pass.cpp index 44be06d6754133..681679e006235e 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/assign.compile.fail.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/assign.compile.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: no-threads + // // class 
recursive_timed_mutex; @@ -13,12 +15,6 @@ // recursive_timed_mutex& operator=(const recursive_timed_mutex&) = delete; #include +#include -int main(int, char**) -{ - std::recursive_timed_mutex m0; - std::recursive_timed_mutex m1; - m1 = m0; - - return 0; -} +static_assert(!std::is_copy_assignable::value, ""); diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/copy.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/ctor.copy.compile.pass.cpp similarity index 78% rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/copy.compile.fail.cpp rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/ctor.copy.compile.pass.cpp index 154a0192d14db1..1ac287e08d7f99 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/copy.compile.fail.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/ctor.copy.compile.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: no-threads + // // class recursive_timed_mutex; @@ -13,11 +15,6 @@ // recursive_timed_mutex(const recursive_timed_mutex&) = delete; #include +#include -int main(int, char**) -{ - std::recursive_timed_mutex m0; - std::recursive_timed_mutex m1(m0); - - return 0; -} +static_assert(!std::is_copy_constructible::value, ""); diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/default.pass.cpp 
b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/ctor.default.pass.cpp similarity index 77% rename from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/default.pass.cpp rename to libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/ctor.default.pass.cpp index 3096e031855a22..dede7f44bc1960 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/default.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/ctor.default.pass.cpp @@ -15,12 +15,15 @@ // recursive_timed_mutex(); #include - -#include "test_macros.h" +#include int main(int, char**) { - std::recursive_timed_mutex m; - (void)m; + // The mutex is unlocked after default construction + { + std::recursive_timed_mutex m; + assert(m.try_lock()); + m.unlock(); + } return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/lock.pass.cpp index bad5a4457e5160..695ce508cf7c62 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/lock.pass.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03 // UNSUPPORTED: no-threads -// ALLOW_RETRIES: 2 // @@ -15,42 +15,96 @@ // void lock(); -#include -#include -#include #include +#include +#include 
#include +#include #include "make_test_thread.h" -#include "test_macros.h" -std::recursive_timed_mutex m; +bool is_lockable(std::recursive_timed_mutex& m) { + bool did_lock; + std::thread t = support::make_test_thread([&] { + did_lock = m.try_lock(); + if (did_lock) + m.unlock(); // undo side effects + }); + t.join(); -typedef std::chrono::system_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; + return did_lock; +} -void f() -{ - time_point t0 = Clock::now(); - m.lock(); - time_point t1 = Clock::now(); +int main(int, char**) { + // Lock a mutex that is not locked yet. This should succeed. + { + std::recursive_timed_mutex m; m.lock(); m.unlock(); - m.unlock(); - ns d = t1 - t0 - ms(250); - assert(d < ms(50)); // within 50ms -} + } + + // Lock a mutex that is already locked by this thread. This should succeed and the mutex should only + // be unlocked after a matching number of calls to unlock() on the same thread. + { + std::recursive_timed_mutex m; + int lock_count = 0; + for (int i = 0; i != 10; ++i) { + m.lock(); + ++lock_count; + } + while (lock_count != 0) { + assert(!is_lockable(m)); + m.unlock(); + --lock_count; + } + assert(is_lockable(m)); + } -int main(int, char**) -{ + // Lock a mutex that is already locked by another thread. This should block until it is unlocked. + { + std::atomic ready(false); + std::recursive_timed_mutex m; m.lock(); - std::thread t = support::make_test_thread(f); - std::this_thread::sleep_for(ms(250)); + std::atomic is_locked_from_main(true); + + std::thread t = support::make_test_thread([&] { + ready = true; + m.lock(); + assert(!is_locked_from_main); + m.unlock(); + }); + + while (!ready) + /* spin */; + + // We would rather signal this after we unlock, but that would create a race condition. 
+ // We instead signal it before we unlock, which means that it's technically possible for + // the thread to take the lock while main is still holding it yet for the test to still pass. + is_locked_from_main = false; m.unlock(); + t.join(); + } + + // Make sure that at most one thread can acquire the mutex concurrently. + { + std::atomic counter(0); + std::recursive_timed_mutex mutex; + + std::vector threads; + for (int i = 0; i != 10; ++i) { + threads.push_back(support::make_test_thread([&] { + mutex.lock(); + counter++; + assert(counter == 1); + counter--; + mutex.unlock(); + })); + } + + for (auto& t : threads) + t.join(); + } return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock.pass.cpp index 63be0ac713f8ba..848db63a003cb9 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock.pass.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03 // UNSUPPORTED: no-threads -// ALLOW_RETRIES: 2 // @@ -15,46 +15,67 @@ // bool try_lock(); +#include +#include #include #include -#include -#include #include #include "make_test_thread.h" -#include "test_macros.h" - -std::recursive_timed_mutex m; - -typedef std::chrono::system_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; - -void f() -{ - time_point t0 = Clock::now(); - assert(!m.try_lock()); - assert(!m.try_lock()); - assert(!m.try_lock()); - while(!m.try_lock()) - ; - 
time_point t1 = Clock::now(); - assert(m.try_lock()); - m.unlock(); - m.unlock(); - ns d = t1 - t0 - ms(250); - assert(d < ms(200)); // within 200ms + +bool is_lockable(std::recursive_timed_mutex& m) { + bool did_lock; + std::thread t = support::make_test_thread([&] { + did_lock = m.try_lock(); + if (did_lock) + m.unlock(); // undo side effects + }); + t.join(); + + return did_lock; } -int main(int, char**) -{ - m.lock(); - std::thread t = support::make_test_thread(f); - std::this_thread::sleep_for(ms(250)); +int main(int, char**) { + // Try to lock a mutex that is not locked yet. This should succeed. + { + std::recursive_timed_mutex m; + bool succeeded = m.try_lock(); + assert(succeeded); m.unlock(); + } + + // Try to lock a mutex that is already locked by this thread. This should succeed and the mutex should only + // be unlocked after a matching number of calls to unlock() on the same thread. + { + std::recursive_timed_mutex m; + int lock_count = 0; + for (int i = 0; i != 10; ++i) { + assert(m.try_lock()); + ++lock_count; + } + while (lock_count != 0) { + assert(!is_lockable(m)); + m.unlock(); + --lock_count; + } + assert(is_lockable(m)); + } + + // Try to lock a mutex that is already locked by another thread. This should fail. 
+ { + std::recursive_timed_mutex m; + m.lock(); + + std::thread t = support::make_test_thread([&] { + for (int i = 0; i != 10; ++i) { + bool succeeded = m.try_lock(); + assert(!succeeded); + } + }); t.join(); + m.unlock(); + } + return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock_for.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock_for.pass.cpp index b0b27801c8c74d..c9192f6fad78c1 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock_for.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock_for.pass.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03 // UNSUPPORTED: no-threads -// ALLOW_RETRIES: 2 // @@ -17,58 +17,118 @@ // bool try_lock_for(const chrono::duration& rel_time); #include -#include -#include +#include #include +#include +#include #include "make_test_thread.h" -#include "test_macros.h" - -std::recursive_timed_mutex m; - -typedef std::chrono::steady_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; - -void f1() -{ - time_point t0 = Clock::now(); - assert(m.try_lock_for(ms(300)) == true); - time_point t1 = Clock::now(); - assert(m.try_lock()); - m.unlock(); - m.unlock(); - ns d = t1 - t0 - ms(250); - assert(d < ns(50000000)); // within 50ms + +bool is_lockable(std::recursive_timed_mutex& m) { + bool did_lock; + std::thread t = support::make_test_thread([&] { + did_lock = m.try_lock(); + if (did_lock) + m.unlock(); // undo side effects + }); + t.join(); + + return did_lock; } -void f2() -{ - time_point 
t0 = Clock::now(); - assert(m.try_lock_for(ms(250)) == false); - time_point t1 = Clock::now(); - ns d = t1 - t0 - ms(250); - assert(d < ns(50000000)); // within 50ms +template +std::chrono::microseconds measure(Function f) { + std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); + f(); + std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast(end - start); } -int main(int, char**) -{ - { - m.lock(); - std::thread t = support::make_test_thread(f1); - std::this_thread::sleep_for(ms(250)); - m.unlock(); - t.join(); +int main(int, char**) { + // Try to lock a mutex that is not locked yet. This should succeed immediately. + { + std::recursive_timed_mutex m; + bool succeeded = m.try_lock_for(std::chrono::milliseconds(1)); + assert(succeeded); + m.unlock(); + } + + // Lock a mutex that is already locked by this thread. This should succeed immediately and the mutex + // should only be unlocked after a matching number of calls to unlock() on the same thread. + { + std::recursive_timed_mutex m; + int lock_count = 0; + for (int i = 0; i != 10; ++i) { + assert(m.try_lock_for(std::chrono::milliseconds(1))); + ++lock_count; } - { - m.lock(); - std::thread t = support::make_test_thread(f2); - std::this_thread::sleep_for(ms(300)); - m.unlock(); - t.join(); + while (lock_count != 0) { + assert(!is_lockable(m)); + m.unlock(); + --lock_count; } + assert(is_lockable(m)); + } + + // Try to lock an already-locked mutex for a long enough amount of time and succeed. + // This is technically flaky, but we use such long durations that it should pass even + // in slow or contended environments. 
+ { + std::chrono::milliseconds const wait_time(500); + std::chrono::milliseconds const tolerance = wait_time * 3; + std::atomic ready(false); + + std::recursive_timed_mutex m; + m.lock(); + + std::thread t = support::make_test_thread([&] { + auto elapsed = measure([&] { + ready = true; + bool succeeded = m.try_lock_for(wait_time); + assert(succeeded); + m.unlock(); + }); + + // Ensure we didn't wait significantly longer than our timeout. This is technically + // flaky and non-conforming because an implementation is free to block for arbitrarily + // long, but any decent quality implementation should pass this test. + assert(elapsed - wait_time < tolerance); + }); + + // Wait for the thread to be ready to take the lock before we unlock it from here, otherwise + // there's a high chance that we're not testing the "locking an already locked" mutex use case. + // There is still technically a race condition here. + while (!ready) + /* spin */; + std::this_thread::sleep_for(wait_time / 5); + + m.unlock(); // this should allow the thread to lock 'm' + t.join(); + } + + // Try to lock an already-locked mutex for a short amount of time and fail. + // Again, this is technically flaky but we use such long durations that it should work. + { + std::chrono::milliseconds const wait_time(10); + std::chrono::milliseconds const tolerance(750); // in case the thread we spawned goes to sleep or something + + std::recursive_timed_mutex m; + m.lock(); + + std::thread t = support::make_test_thread([&] { + auto elapsed = measure([&] { + bool succeeded = m.try_lock_for(wait_time); + assert(!succeeded); + }); + + // Ensure we failed within some bounded time. 
+ assert(elapsed - wait_time < tolerance); + }); + + t.join(); + + m.unlock(); + } return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock_until.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock_until.pass.cpp index 5c5807d6736c92..6579b21a38a26b 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock_until.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.timedmutex.requirements/thread.timedmutex.recursive/try_lock_until.pass.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03 // UNSUPPORTED: no-threads -// ALLOW_RETRIES: 2 // @@ -17,58 +17,118 @@ // bool try_lock_until(const chrono::time_point& abs_time); #include -#include -#include +#include #include +#include +#include #include "make_test_thread.h" -#include "test_macros.h" - -std::recursive_timed_mutex m; - -typedef std::chrono::steady_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; - -void f1() -{ - time_point t0 = Clock::now(); - assert(m.try_lock_until(Clock::now() + ms(300)) == true); - time_point t1 = Clock::now(); - assert(m.try_lock()); - m.unlock(); - m.unlock(); - ns d = t1 - t0 - ms(250); - assert(d < ms(50)); // within 50ms + +bool is_lockable(std::recursive_timed_mutex& m) { + bool did_lock; + std::thread t = support::make_test_thread([&] { + did_lock = m.try_lock(); + if (did_lock) + m.unlock(); // undo side effects + }); + t.join(); + + return did_lock; } -void f2() -{ - time_point t0 = Clock::now(); - assert(m.try_lock_until(Clock::now() + ms(250)) == false); - time_point t1 = Clock::now(); - ns d = t1 
- t0 - ms(250); - assert(d < ms(50)); // within 50ms +template +std::chrono::microseconds measure(Function f) { + std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); + f(); + std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast(end - start); } -int main(int, char**) -{ - { - m.lock(); - std::thread t = support::make_test_thread(f1); - std::this_thread::sleep_for(ms(250)); - m.unlock(); - t.join(); +int main(int, char**) { + // Try to lock a mutex that is not locked yet. This should succeed immediately. + { + std::recursive_timed_mutex m; + bool succeeded = m.try_lock_until(std::chrono::steady_clock::now() + std::chrono::milliseconds(1)); + assert(succeeded); + m.unlock(); + } + + // Lock a mutex that is already locked by this thread. This should succeed immediately and the mutex + // should only be unlocked after a matching number of calls to unlock() on the same thread. + { + std::recursive_timed_mutex m; + int lock_count = 0; + for (int i = 0; i != 10; ++i) { + assert(m.try_lock_until(std::chrono::steady_clock::now() + std::chrono::milliseconds(1))); + ++lock_count; } - { - m.lock(); - std::thread t = support::make_test_thread(f2); - std::this_thread::sleep_for(ms(300)); - m.unlock(); - t.join(); + while (lock_count != 0) { + assert(!is_lockable(m)); + m.unlock(); + --lock_count; } + assert(is_lockable(m)); + } + + // Try to lock an already-locked mutex for a long enough amount of time and succeed. + // This is technically flaky, but we use such long durations that it should pass even + // in slow or contended environments. 
+ { + std::chrono::milliseconds const wait_time(500); + std::chrono::milliseconds const tolerance = wait_time * 3; + std::atomic ready(false); + + std::recursive_timed_mutex m; + m.lock(); + + std::thread t = support::make_test_thread([&] { + auto elapsed = measure([&] { + ready = true; + bool succeeded = m.try_lock_until(std::chrono::steady_clock::now() + wait_time); + assert(succeeded); + m.unlock(); + }); + + // Ensure we didn't wait significantly longer than our timeout. This is technically + // flaky and non-conforming because an implementation is free to block for arbitrarily + // long, but any decent quality implementation should pass this test. + assert(elapsed - wait_time < tolerance); + }); + + // Wait for the thread to be ready to take the lock before we unlock it from here, otherwise + // there's a high chance that we're not testing the "locking an already locked" mutex use case. + // There is still technically a race condition here. + while (!ready) + /* spin */; + std::this_thread::sleep_for(wait_time / 5); + + m.unlock(); // this should allow the thread to lock 'm' + t.join(); + } + + // Try to lock an already-locked mutex for a short amount of time and fail. + // Again, this is technically flaky but we use such long durations that it should work. + { + std::chrono::milliseconds const wait_time(10); + std::chrono::milliseconds const tolerance(750); // in case the thread we spawned goes to sleep or something + + std::recursive_timed_mutex m; + m.lock(); + + std::thread t = support::make_test_thread([&] { + auto elapsed = measure([&] { + bool succeeded = m.try_lock_until(std::chrono::steady_clock::now() + wait_time); + assert(!succeeded); + }); + + // Ensure we failed within some bounded time. 
+ assert(elapsed - wait_time < tolerance); + }); + + t.join(); + + m.unlock(); + } return 0; } From 4f14bfeddedcf21e0eaf0ff3ddf7b62938f66df5 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 21 Aug 2024 14:45:02 +0200 Subject: [PATCH 058/426] [llvm-reduce] Disable fixpoint verification in InstCombine We don't want to get fixpoint verification errors while reducing. --- llvm/tools/llvm-reduce/deltas/RunIRPasses.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/llvm/tools/llvm-reduce/deltas/RunIRPasses.cpp b/llvm/tools/llvm-reduce/deltas/RunIRPasses.cpp index acef29a6819415..e3af05616fe04b 100644 --- a/llvm/tools/llvm-reduce/deltas/RunIRPasses.cpp +++ b/llvm/tools/llvm-reduce/deltas/RunIRPasses.cpp @@ -16,12 +16,13 @@ using namespace llvm; extern cl::OptionCategory LLVMReduceOptions; -static cl::opt PassPipeline( - "ir-passes", - cl::desc("A textual description of the pass pipeline, same as " - "what's passed to `opt -passes`."), - cl::init("function(sroa,instcombine,gvn,simplifycfg,infer-address-spaces)"), - cl::cat(LLVMReduceOptions)); +static cl::opt + PassPipeline("ir-passes", + cl::desc("A textual description of the pass pipeline, same as " + "what's passed to `opt -passes`."), + cl::init("function(sroa,instcombine,gvn," + "simplifycfg,infer-address-spaces)"), + cl::cat(LLVMReduceOptions)); static void runPasses(Oracle &O, ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); From aa088438784dd76a859eee229ddaec17e0cb0651 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 21 Aug 2024 09:05:55 -0400 Subject: [PATCH 059/426] [libc++] Avoid -Wzero-as-null-pointer-constant in operator<=> (#79465) Issue #43670 describes a situation where the following comparison will issue a warning when -Wzero-as-null-pointer-constant is enabled: #include auto b = (1 <=> 2) < 0; This code uses operator<(strong_ordering, Unspecified), which is specified by the Standard to only work with a literal 0. 
In the library, this is achieved by constructing Unspecified from a pointer, which works but has the downside of triggering the warning. This patch uses an alternative implementation where we require that the operator is used exactly with an int of value 0 (known at compile-time), however that value can technically be an expression like `1 - 1`, which makes us a bit less strict than what's specified in the Standard. Fixes #43670 --- libcxx/include/__compare/ordering.h | 23 ++++-- .../reject-other-than-literal-zero.verify.cpp | 74 +++++++++++++++++++ .../cmp.categories.pre/zero_type.verify.cpp | 57 -------------- 3 files changed, 89 insertions(+), 65 deletions(-) create mode 100644 libcxx/test/std/language.support/cmp/cmp.categories.pre/reject-other-than-literal-zero.verify.cpp delete mode 100644 libcxx/test/std/language.support/cmp/cmp.categories.pre/zero_type.verify.cpp diff --git a/libcxx/include/__compare/ordering.h b/libcxx/include/__compare/ordering.h index 2995d381304f0e..379f3459c681dd 100644 --- a/libcxx/include/__compare/ordering.h +++ b/libcxx/include/__compare/ordering.h @@ -30,14 +30,20 @@ class partial_ordering; class weak_ordering; class strong_ordering; -template -inline constexpr bool __one_of_v = (is_same_v<_Tp, _Args> || ...); - struct _CmpUnspecifiedParam { - _LIBCPP_HIDE_FROM_ABI constexpr _CmpUnspecifiedParam(int _CmpUnspecifiedParam::*) noexcept {} - - template >> - _CmpUnspecifiedParam(_Tp) = delete; + // If anything other than a literal 0 is provided, the behavior is undefined by the Standard. + // + // The alternative to the `__enable_if__` attribute would be to use the fact that a pointer + // can be constructed from literal 0, but this conflicts with `-Wzero-as-null-pointer-constant`. 
+ template > > + _LIBCPP_HIDE_FROM_ABI consteval _CmpUnspecifiedParam(_Tp __zero) noexcept +# if __has_attribute(__enable_if__) + __attribute__((__enable_if__( + __zero == 0, "Only literal 0 is allowed as the operand of a comparison with one of the ordering types"))) +# endif + { + (void)__zero; + } }; class partial_ordering { @@ -269,7 +275,8 @@ inline constexpr strong_ordering strong_ordering::greater(_OrdResult::__greater) /// The types partial_ordering, weak_ordering, and strong_ordering are /// collectively termed the comparison category types. template -concept __comparison_category = __one_of_v<_Tp, partial_ordering, weak_ordering, strong_ordering>; +concept __comparison_category = + is_same_v<_Tp, partial_ordering> || is_same_v<_Tp, weak_ordering> || is_same_v<_Tp, strong_ordering>; #endif // _LIBCPP_STD_VER >= 20 diff --git a/libcxx/test/std/language.support/cmp/cmp.categories.pre/reject-other-than-literal-zero.verify.cpp b/libcxx/test/std/language.support/cmp/cmp.categories.pre/reject-other-than-literal-zero.verify.cpp new file mode 100644 index 00000000000000..b6bc4dd4f097a4 --- /dev/null +++ b/libcxx/test/std/language.support/cmp/cmp.categories.pre/reject-other-than-literal-zero.verify.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// + +// Ensure we reject all cases where an argument other than a literal 0 is used +// for a comparison against a comparison category type. 
+ +// Also ensure that we don't warn about providing a null pointer constant when +// comparing an ordering type against literal 0, since one of the common +// implementation strategies is to use a pointer as the "unspecified type". +// ADDITIONAL_COMPILE_FLAGS: -Wzero-as-null-pointer-constant + +#include <compare> + +#include "test_macros.h" + +#define TEST_FAIL(v, op) \ + do { \ + /* invalid types */ \ + void(v op 0L); \ + void(0L op v); \ + void(v op 0.0); \ + void(0.0 op v); \ + void(v op nullptr); \ + void(nullptr op v); \ + /* invalid value */ \ + void(v op 1); \ + void(1 op v); \ + /* value not known at compile-time */ \ + int i = 0; \ + void(v op i); \ + void(i op v); \ + } while (false) + +#define TEST_PASS(v, op) \ + do { \ + void(v op 0); \ + void(0 op v); \ + LIBCPP_ONLY(void(v op(1 - 1))); \ + LIBCPP_ONLY(void((1 - 1) op v)); \ + } while (false) + +template <class T> +void test_category(T v) { + TEST_FAIL(v, ==); // expected-error 30 {{invalid operands to binary expression}} + TEST_FAIL(v, !=); // expected-error 30 {{invalid operands to binary expression}} + TEST_FAIL(v, <); // expected-error 30 {{invalid operands to binary expression}} + TEST_FAIL(v, <=); // expected-error 30 {{invalid operands to binary expression}} + TEST_FAIL(v, >); // expected-error 30 {{invalid operands to binary expression}} + TEST_FAIL(v, >=); // expected-error 30 {{invalid operands to binary expression}} + TEST_FAIL(v, <=>); // expected-error 30 {{invalid operands to binary expression}} + + TEST_PASS(v, ==); + TEST_PASS(v, !=); + TEST_PASS(v, <); + TEST_PASS(v, >); + TEST_PASS(v, <=); + TEST_PASS(v, >=); + TEST_PASS(v, <=>); +} + +void f() { + test_category(std::strong_ordering::equivalent); + test_category(std::weak_ordering::equivalent); + test_category(std::partial_ordering::equivalent); +} diff --git a/libcxx/test/std/language.support/cmp/cmp.categories.pre/zero_type.verify.cpp b/libcxx/test/std/language.support/cmp/cmp.categories.pre/zero_type.verify.cpp deleted file mode 100644 index 
8e3c793de4b92a..00000000000000 --- a/libcxx/test/std/language.support/cmp/cmp.categories.pre/zero_type.verify.cpp +++ /dev/null @@ -1,57 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17 - -// In MSVC mode, there's a slightly different number of errors printed for -// each of these, so it doesn't add up to the exact expected count of 18. -// XFAIL: msvc - -// - -// Ensure we reject all cases where an argument other than a literal 0 is used -// for a comparison against a comparison category type. - -#include - -#define TEST_FAIL(v, op) \ - void(v op 0L); \ - void(0L op v); \ - void(v op nullptr); \ - void(nullptr op v); \ - void(v op(1 - 1)); \ - void((1 - 1) op v) - -#define TEST_PASS(v, op) \ - void(v op 0); \ - void(0 op v) - -template -void test_category(T v) { - TEST_FAIL(v, ==); // expected-error 18 {{}} - TEST_FAIL(v, !=); // expected-error 18 {{}} - TEST_FAIL(v, <); // expected-error 18 {{}} - TEST_FAIL(v, <=); // expected-error 18 {{}} - TEST_FAIL(v, >); // expected-error 18 {{}} - TEST_FAIL(v, >=); // expected-error 18 {{}} - TEST_FAIL(v, <=>); // expected-error 18 {{}} - - TEST_PASS(v, ==); - TEST_PASS(v, !=); - TEST_PASS(v, <); - TEST_PASS(v, >); - TEST_PASS(v, <=); - TEST_PASS(v, >=); - TEST_PASS(v, <=>); -} - -void f() { - test_category(std::strong_ordering::equivalent); - test_category(std::weak_ordering::equivalent); - test_category(std::partial_ordering::equivalent); -} From 65281570afd7e35e01533b07c6c2937de410fc52 Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Wed, 21 Aug 2024 15:04:04 +0200 Subject: [PATCH 060/426] [lldb][swig] Use the correct variable in the return 
statement The issue was introduced in https://github.com/llvm/llvm-project/pull/104523. The code introduces the `ret_val` variable but does not use it. Instead it returns a pointer, which gets implicitly converted to bool. --- lldb/bindings/python/python-wrapper.swig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index 2ce42e3e017d5b..360c392235a866 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -837,7 +837,7 @@ bool lldb_private::python::SWIGBridge::LLDBSwigPython_ShouldHide( bool ret_val = result ? PyObject_IsTrue(result) : false; Py_XDECREF(result); - return result; + return ret_val; } void *lldb_private::python::SWIGBridge::LLDBSWIGPython_GetDynamicSetting( From 76c07984257b49dcc4786fa9fb3918a2c1342e23 Mon Sep 17 00:00:00 2001 From: Aviad Cohen Date: Wed, 21 Aug 2024 16:35:26 +0300 Subject: [PATCH 061/426] [mlir][memref]: Allow collapse dummy strided unit dim (#103719) Dimensions of size 1 should be skipped, because their strides are meaningless and could have any arbitrary value. --- mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 5 +++++ mlir/test/Dialect/MemRef/ops.mlir | 17 ++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index 0ff25de7295f6e..150049e5c5effe 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -2448,6 +2448,11 @@ computeCollapsedLayoutMap(MemRefType srcType, if (strict && (stride.saturated || srcStride.saturated)) return failure(); + // Dimensions of size 1 should be skipped, because their strides are + // meaningless and could have any arbitrary value. 
+ if (srcShape[idx - 1] == 1) + continue; + if (!stride.saturated && !srcStride.saturated && stride != srcStride) return failure(); } diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir index b60894377f22fc..f616f6795bf9dc 100644 --- a/mlir/test/Dialect/MemRef/ops.mlir +++ b/mlir/test/Dialect/MemRef/ops.mlir @@ -99,7 +99,9 @@ func.func @expand_collapse_shape_static( %arg4: memref<1x5xf32, strided<[5, 1], offset: ?>>, %arg5: memref, %arg6: memref<3x4x5xf32, strided<[240, 60, 10], offset: 0>>, - %arg7: memref<1x2049xi64, strided<[?, ?], offset: ?>>) { + %arg7: memref<1x2049xi64, strided<[?, ?], offset: ?>>, + %arg8: memref<1x1x1024xi8, strided<[40960, 4096, 1], offset: 0>>, + %arg9: memref<24x1x1x1024xi8, strided<[40960, 40960, 4096, 1], offset: 0>>) { // Reshapes that collapse and expand back a contiguous buffer. // CHECK: memref.collapse_shape {{.*}} {{\[}}[0, 1], [2]] // CHECK-SAME: memref<3x4x5xf32> into memref<12x5xf32> @@ -163,6 +165,19 @@ func.func @expand_collapse_shape_static( memref<1x2049xi64, strided<[?, ?], offset: ?>> into memref<2049xi64, strided<[?], offset: ?>> + // %arg8: memref<1x1x1024xi8, strided<[40960, 4096, 1], offset: 0>>, + // %arg9: memref<24x1x1x1024xi8, strided<[40960, 40960, 4096, 1], offset: 0>>) { + +// CHECK: memref.collapse_shape {{.*}} {{\[}}[0, 1, 2]] + %r8 = memref.collapse_shape %arg8 [[0, 1, 2]] : + memref<1x1x1024xi8, strided<[40960, 4096, 1], offset: 0>> into + memref<1024xi8, strided<[1], offset: 0>> + +// CHECK: memref.collapse_shape {{.*}} {{\[}}[0], [1, 2, 3]] + %r9 = memref.collapse_shape %arg9 [[0], [1, 2, 3]] : + memref<24x1x1x1024xi8, strided<[40960, 40960, 4096, 1], offset: 0>> into + memref<24x1024xi8, strided<[40960, 1], offset: 0>> + // Reshapes that expand and collapse back a contiguous buffer with some 1's. 
// CHECK: memref.expand_shape {{.*}} {{\[}}[0, 1], [2], [3, 4]] output_shape [1, 3, 4, 1, 5] // CHECK-SAME: memref<3x4x5xf32> into memref<1x3x4x1x5xf32> From 281d17840c35a1d80303bb6170c253fe2411f95f Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Wed, 21 Aug 2024 17:38:24 +0400 Subject: [PATCH 062/426] [clang] Diagnose functions with too many parameters (#104833) This patch adds a parser check when a function declaration or function type declaration (in a function pointer declaration, for example) has too many parameters for `FunctionTypeBits::NumParams` to hold. At the moment of writing it's a 16-bit-wide bit-field, limiting the number of parameters at 65536. The check is added in the parser loop that goes over comma-separated list of function parameters. This is not the solution Aaron suggested in https://github.com/llvm/llvm-project/issues/35741#issuecomment-1638086571, because it was found out that it's quite hard to recover from this particular error in `GetFullTypeForDeclarator()`. Multiple options were tried, but all of them led to crashes down the line. I used LLVM Compile Time Tracker to ensure this does not introduce a performance regression. 
I believe changes are in the noise: https://llvm-compile-time-tracker.com/compare.php?from=de5ea2d122c31e1551654ff506c33df299f351b8&to=424818620766cedb2770e076ee359afeb0cc14ec&stat=instructions:u Fixes #35741 --- clang/docs/ReleaseNotes.rst | 3 ++ clang/include/clang/AST/Type.h | 7 ++++- .../clang/Basic/DiagnosticParseKinds.td | 3 ++ clang/lib/Parse/ParseDecl.cpp | 11 +++++++ .../test/Parser/function-parameter-limit.cpp | 29 +++++++++++++++++++ 5 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 clang/test/Parser/function-parameter-limit.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5aedfc654e8dbb..8f98167dff31ef 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -310,6 +310,9 @@ Miscellaneous Clang Crashes Fixed - Fixed a crash caused by long chains of ``sizeof`` and other similar operators that can be followed by a non-parenthesized expression. (#GH45061) +- Fixed a crash when function has more than 65536 parameters. + Now a diagnostic is emitted. (#GH35741) + OpenACC Specific Changes ------------------------ diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 27618604192c51..575f3c17a3f691 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -1929,6 +1929,11 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { unsigned Kind : NumOfBuiltinTypeBits; }; +public: + static constexpr int FunctionTypeNumParamsWidth = 16; + static constexpr int FunctionTypeNumParamsLimit = (1 << 16) - 1; + +protected: /// FunctionTypeBitfields store various bits belonging to FunctionProtoType. /// Only common bits are stored here. Additional uncommon bits are stored /// in a trailing object after FunctionProtoType. 
@@ -1966,7 +1971,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { /// According to [implimits] 8 bits should be enough here but this is /// somewhat easy to exceed with metaprogramming and so we would like to /// keep NumParams as wide as reasonably possible. - unsigned NumParams : 16; + unsigned NumParams : FunctionTypeNumParamsWidth; /// The type of exception specification this function has. LLVM_PREFERRED_TYPE(ExceptionSpecificationType) diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 62a97b36737e72..464f08637332d4 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -481,6 +481,9 @@ def ext_decomp_decl_empty : ExtWarn< "ISO C++17 does not allow a decomposition group to be empty">, InGroup>; +def err_function_parameter_limit_exceeded : Error< + "too many function parameters; subsequent parameters will be ignored">; + // C++26 structured bindings def ext_decl_attrs_on_binding : ExtWarn< "an attribute specifier sequence attached to a structured binding declaration " diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index a8a9d3f3f5b088..ed5d6ce90aa7d1 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -8025,6 +8025,17 @@ void Parser::ParseParameterDeclarationClause( // Consume the keyword. ConsumeToken(); } + + // We can only store so many parameters + // Skip until the the end of the parameter list, ignoring + // parameters that would overflow. + if (ParamInfo.size() == Type::FunctionTypeNumParamsLimit) { + Diag(ParmDeclarator.getBeginLoc(), + diag::err_function_parameter_limit_exceeded); + SkipUntil(tok::r_paren, SkipUntilFlags::StopBeforeMatch); + break; + } + // Inform the actions module about the parameter declarator, so it gets // added to the current scope. 
Decl *Param = diff --git a/clang/test/Parser/function-parameter-limit.cpp b/clang/test/Parser/function-parameter-limit.cpp new file mode 100644 index 00000000000000..29f5dde294715c --- /dev/null +++ b/clang/test/Parser/function-parameter-limit.cpp @@ -0,0 +1,29 @@ +// RUN: %clang_cc1 -verify %s + +#define P_10(x) x##0, x##1, x##2, x##3, x##4, x##5, x##6, x##7, x##8, x##9, +#define P_100(x) P_10(x##0) P_10(x##1) P_10(x##2) P_10(x##3) P_10(x##4) \ + P_10(x##5) P_10(x##6) P_10(x##7) P_10(x##8) P_10(x##9) +#define P_1000(x) P_100(x##0) P_100(x##1) P_100(x##2) P_100(x##3) P_100(x##4) \ + P_100(x##5) P_100(x##6) P_100(x##7) P_100(x##8) P_100(x##9) +#define P_10000(x) P_1000(x##0) P_1000(x##1) P_1000(x##2) P_1000(x##3) P_1000(x##4) \ + P_1000(x##5) P_1000(x##6) P_1000(x##7) P_1000(x##8) P_1000(x##9) + +void func ( + P_10000(int p) + P_10000(int q) + P_10000(int r) + P_10000(int s) + P_10000(int t) + P_10000(int u) + P_10000(int v) // expected-error {{too many function parameters; subsequent parameters will be ignored}} + int w); + +extern double(*func2)( + P_10000(int p) + P_10000(int q) + P_10000(int r) + P_10000(int s) + P_10000(int t) + P_10000(int u) + P_10000(int v) // expected-error {{too many function parameters; subsequent parameters will be ignored}} + int w); From 7ad7f8f7a3d443f4c17264d7e14cccdc020976b9 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 21 Aug 2024 08:48:53 -0500 Subject: [PATCH 063/426] [libcxx] Add `LIBCXX_HAS_TERMINAL_AVAILABLE` CMake option to disable `print` terminal checks (#99259) Adds a new CMake option called `LIBCXX_HAS_TERMINAL_AVAILABLE` that prevents us from checking for `isatty`. 
--- .github/workflows/libcxx-build-and-test.yaml | 1 + libcxx/CMakeLists.txt | 3 +++ libcxx/cmake/caches/Generic-no-terminal.cmake | 5 +++++ libcxx/include/__config_site.in | 1 + libcxx/include/print | 2 +- libcxx/utils/ci/run-buildbot | 5 +++++ libcxx/utils/libcxx/test/features.py | 1 + 7 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 libcxx/cmake/caches/Generic-no-terminal.cmake diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 1456f245cf7c0c..8c6d7c6c3c4dec 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -146,6 +146,7 @@ jobs: 'generic-no-experimental', 'generic-no-filesystem', 'generic-no-localization', + 'generic-no-terminal', 'generic-no-random_device', 'generic-no-threads', 'generic-no-tzdb', diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index 6168c76bff6d99..273b2238f34851 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -97,6 +97,8 @@ option(LIBCXX_ENABLE_UNICODE "Whether to include support for Unicode in the library. Disabling Unicode can be useful when porting to platforms that don't support UTF-8 encoding (e.g. embedded)." ON) +option(LIBCXX_HAS_TERMINAL_AVAILABLE + "Build libc++ with support for checking whether a stream is a terminal." ON) option(LIBCXX_ENABLE_WIDE_CHARACTERS "Whether to include support for wide characters in the library. 
Disabling wide character support can be useful when porting to platforms that don't @@ -744,6 +746,7 @@ config_define_if(LIBCXX_ABI_FORCE_ITANIUM _LIBCPP_ABI_FORCE_ITANIUM) config_define_if(LIBCXX_ABI_FORCE_MICROSOFT _LIBCPP_ABI_FORCE_MICROSOFT) config_define_if_not(LIBCXX_ENABLE_THREADS _LIBCPP_HAS_NO_THREADS) config_define_if_not(LIBCXX_ENABLE_MONOTONIC_CLOCK _LIBCPP_HAS_NO_MONOTONIC_CLOCK) +config_define_if_not(LIBCXX_HAS_TERMINAL_AVAILABLE _LIBCPP_HAS_NO_TERMINAL) if (NOT LIBCXX_TYPEINFO_COMPARISON_IMPLEMENTATION STREQUAL "default") config_define("${LIBCXX_TYPEINFO_COMPARISON_IMPLEMENTATION}" _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION) endif() diff --git a/libcxx/cmake/caches/Generic-no-terminal.cmake b/libcxx/cmake/caches/Generic-no-terminal.cmake new file mode 100644 index 00000000000000..9f712ebce02dbf --- /dev/null +++ b/libcxx/cmake/caches/Generic-no-terminal.cmake @@ -0,0 +1,5 @@ +set(LIBCXX_HAS_TERMINAL_AVAILABLE OFF CACHE BOOL "") + +# Speed up the CI +set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "") +set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "") diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in index 67022146c9082b..bf2d31d8eeb1b9 100644 --- a/libcxx/include/__config_site.in +++ b/libcxx/include/__config_site.in @@ -15,6 +15,7 @@ #cmakedefine _LIBCPP_ABI_FORCE_MICROSOFT #cmakedefine _LIBCPP_HAS_NO_THREADS #cmakedefine _LIBCPP_HAS_NO_MONOTONIC_CLOCK +#cmakedefine _LIBCPP_HAS_NO_TERMINAL #cmakedefine _LIBCPP_HAS_MUSL_LIBC #cmakedefine _LIBCPP_HAS_THREAD_API_PTHREAD #cmakedefine _LIBCPP_HAS_THREAD_API_EXTERNAL diff --git a/libcxx/include/print b/libcxx/include/print index 1a579daff270f7..2798a6bda26262 100644 --- a/libcxx/include/print +++ b/libcxx/include/print @@ -199,7 +199,7 @@ _LIBCPP_HIDE_FROM_ABI inline bool __is_terminal([[maybe_unused]] FILE* __stream) // the behavior in the test. This is not part of the public API. 
# ifdef _LIBCPP_TESTING_PRINT_IS_TERMINAL return _LIBCPP_TESTING_PRINT_IS_TERMINAL(__stream); -# elif _LIBCPP_AVAILABILITY_HAS_PRINT == 0 +# elif _LIBCPP_AVAILABILITY_HAS_PRINT == 0 || defined(_LIBCPP_HAS_NO_TERMINAL) return false; # elif defined(_LIBCPP_WIN32API) return std::__is_windows_terminal(__stream); diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index f1c20b9d721904..6353c87c3d865d 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -469,6 +469,11 @@ generic-no-localization) generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-no-localization.cmake" check-runtimes ;; +generic-no-terminal) + clean + generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-no-terminal.cmake" + check-runtimes +;; generic-no-unicode) clean generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-no-unicode.cmake" diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py index 97cdb0349885d6..6857a28eb32995 100644 --- a/libcxx/utils/libcxx/test/features.py +++ b/libcxx/utils/libcxx/test/features.py @@ -378,6 +378,7 @@ def _mingwSupportsModules(cfg): "_LIBCPP_HAS_NO_FILESYSTEM": "no-filesystem", "_LIBCPP_HAS_NO_RANDOM_DEVICE": "no-random-device", "_LIBCPP_HAS_NO_LOCALIZATION": "no-localization", + "_LIBCPP_HAS_NO_TERMINAL": "no-terminal", "_LIBCPP_HAS_NO_WIDE_CHARACTERS": "no-wide-characters", "_LIBCPP_HAS_NO_TIME_ZONE_DATABASE": "no-tzdb", "_LIBCPP_HAS_NO_UNICODE": "libcpp-has-no-unicode", From 87eeed1f0ebe57abffde560c25dd9829dc6038f3 Mon Sep 17 00:00:00 2001 From: "Ivan R. 
Ivanov" Date: Wed, 21 Aug 2024 22:59:11 +0900 Subject: [PATCH 064/426] [flang][NFC] Move OpenMP related passes into a separate directory (#104732) --- flang/docs/OpenMP-declare-target.md | 4 +- flang/docs/OpenMP-descriptor-management.md | 4 +- flang/include/flang/Optimizer/CMakeLists.txt | 1 + .../flang/Optimizer/OpenMP/CMakeLists.txt | 4 ++ flang/include/flang/Optimizer/OpenMP/Passes.h | 30 ++++++++++++++ .../include/flang/Optimizer/OpenMP/Passes.td | 40 +++++++++++++++++++ .../flang/Optimizer/Transforms/Passes.td | 26 ------------ flang/include/flang/Tools/CLOptions.inc | 7 ++-- flang/lib/Frontend/CMakeLists.txt | 1 + flang/lib/Optimizer/CMakeLists.txt | 1 + flang/lib/Optimizer/OpenMP/CMakeLists.txt | 25 ++++++++++++ .../FunctionFiltering.cpp} | 18 ++++----- .../MapInfoFinalization.cpp} | 21 +++++----- .../MarkDeclareTarget.cpp} | 26 ++++++++---- flang/lib/Optimizer/Transforms/CMakeLists.txt | 3 -- flang/tools/bbc/CMakeLists.txt | 1 + flang/tools/fir-opt/CMakeLists.txt | 1 + flang/tools/fir-opt/fir-opt.cpp | 2 + flang/tools/tco/CMakeLists.txt | 1 + 19 files changed, 153 insertions(+), 63 deletions(-) create mode 100644 flang/include/flang/Optimizer/OpenMP/CMakeLists.txt create mode 100644 flang/include/flang/Optimizer/OpenMP/Passes.h create mode 100644 flang/include/flang/Optimizer/OpenMP/Passes.td create mode 100644 flang/lib/Optimizer/OpenMP/CMakeLists.txt rename flang/lib/Optimizer/{Transforms/OMPFunctionFiltering.cpp => OpenMP/FunctionFiltering.cpp} (90%) rename flang/lib/Optimizer/{Transforms/OMPMapInfoFinalization.cpp => OpenMP/MapInfoFinalization.cpp} (96%) rename flang/lib/Optimizer/{Transforms/OMPMarkDeclareTarget.cpp => OpenMP/MarkDeclareTarget.cpp} (80%) diff --git a/flang/docs/OpenMP-declare-target.md b/flang/docs/OpenMP-declare-target.md index d29a46807e1eaf..45062469007b65 100644 --- a/flang/docs/OpenMP-declare-target.md +++ b/flang/docs/OpenMP-declare-target.md @@ -149,7 +149,7 @@ flang/lib/Lower/OpenMP.cpp function `genDeclareTargetIntGlobal`. 
There are currently two passes within Flang that are related to the processing of `declare target`: -* `OMPMarkDeclareTarget` - This pass is in charge of marking functions captured +* `MarkDeclareTarget` - This pass is in charge of marking functions captured (called from) in `target` regions or other `declare target` marked functions as `declare target`. It does so recursively, i.e. nested calls will also be implicitly marked. It currently will try to mark things as conservatively as @@ -157,7 +157,7 @@ possible, e.g. if captured in a `target` region it will apply `nohost`, unless it encounters a `host` `declare target` in which case it will apply the `any` device type. Functions are handled similarly, except we utilise the parent's device type where possible. -* `OMPFunctionFiltering` - This is executed after the `OMPMarkDeclareTarget` +* `FunctionFiltering` - This is executed after the `MarkDeclareTarget` pass, and its job is to conservatively remove host functions from the module where possible when compiling for the device. This helps make sure that most incompatible code for the host is not lowered for the diff --git a/flang/docs/OpenMP-descriptor-management.md b/flang/docs/OpenMP-descriptor-management.md index d0eb01b00f9bb9..66c153914f70da 100644 --- a/flang/docs/OpenMP-descriptor-management.md +++ b/flang/docs/OpenMP-descriptor-management.md @@ -44,7 +44,7 @@ Currently, Flang will lower these descriptor types in the OpenMP lowering (lower to all other map types, generating an omp.MapInfoOp containing relevant information required for lowering the OpenMP dialect to LLVM-IR during the final stages of the MLIR lowering. 
However, after the lowering to FIR/HLFIR has been performed an OpenMP dialect specific pass for Fortran, -`OMPMapInfoFinalizationPass` (Optimizer/OMPMapInfoFinalization.cpp) will expand the +`MapInfoFinalizationPass` (Optimizer/OpenMP/MapInfoFinalization.cpp) will expand the `omp.MapInfoOp`'s containing descriptors (which currently will be a `BoxType` or `BoxAddrOp`) into multiple mappings, with one extra per pointer member in the descriptor that is supported on top of the original descriptor map operation. These pointers members are linked to the parent descriptor by adding them to @@ -53,7 +53,7 @@ owning operation's (`omp.TargetOp`, `omp.TargetDataOp` etc.) map operand list an operation is `IsolatedFromAbove`, it also inserts them as `BlockArgs` to canonicalize the mappings and simplify lowering. -An example transformation by the `OMPMapInfoFinalizationPass`: +An example transformation by the `MapInfoFinalizationPass`: ``` diff --git a/flang/include/flang/Optimizer/CMakeLists.txt b/flang/include/flang/Optimizer/CMakeLists.txt index 89e43a9ee8d621..3336ac935e1012 100644 --- a/flang/include/flang/Optimizer/CMakeLists.txt +++ b/flang/include/flang/Optimizer/CMakeLists.txt @@ -2,3 +2,4 @@ add_subdirectory(CodeGen) add_subdirectory(Dialect) add_subdirectory(HLFIR) add_subdirectory(Transforms) +add_subdirectory(OpenMP) diff --git a/flang/include/flang/Optimizer/OpenMP/CMakeLists.txt b/flang/include/flang/Optimizer/OpenMP/CMakeLists.txt new file mode 100644 index 00000000000000..d59573f0f7fd91 --- /dev/null +++ b/flang/include/flang/Optimizer/OpenMP/CMakeLists.txt @@ -0,0 +1,4 @@ +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls -name FlangOpenMP) + +add_public_tablegen_target(FlangOpenMPPassesIncGen) diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.h b/flang/include/flang/Optimizer/OpenMP/Passes.h new file mode 100644 index 00000000000000..403d79667bf448 --- /dev/null +++ b/flang/include/flang/Optimizer/OpenMP/Passes.h @@ -0,0 
+1,30 @@ +//===- Passes.h - OpenMP pass entry points ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header declares the flang OpenMP passes. +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_OPTIMIZER_OPENMP_PASSES_H +#define FORTRAN_OPTIMIZER_OPENMP_PASSES_H + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassRegistry.h" + +#include <memory> + +namespace flangomp { +#define GEN_PASS_DECL +#define GEN_PASS_REGISTRATION +#include "flang/Optimizer/OpenMP/Passes.h.inc" + +} // namespace flangomp + +#endif // FORTRAN_OPTIMIZER_OPENMP_PASSES_H diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td new file mode 100644 index 00000000000000..395178e26a5762 --- /dev/null +++ b/flang/include/flang/Optimizer/OpenMP/Passes.td @@ -0,0 +1,40 @@ +//===-- Passes.td - flang OpenMP pass definition -----------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_OPTIMIZER_OPENMP_PASSES +#define FORTRAN_OPTIMIZER_OPENMP_PASSES + +include "mlir/Pass/PassBase.td" + +def MapInfoFinalizationPass + : Pass<"omp-map-info-finalization"> { + let summary = "expands OpenMP MapInfo operations containing descriptors"; + let description = [{ + Expands MapInfo operations containing descriptor types into multiple + MapInfo's for each pointer element in the descriptor that requires + explicit individual mapping by the OpenMP runtime. + }]; + let dependentDialects = ["mlir::omp::OpenMPDialect"]; +} + +def MarkDeclareTargetPass + : Pass<"omp-mark-declare-target", "mlir::ModuleOp"> { + let summary = "Marks all functions called by an OpenMP declare target function as declare target"; + let dependentDialects = ["mlir::omp::OpenMPDialect"]; +} + +def FunctionFiltering : Pass<"omp-function-filtering"> { + let summary = "Filters out functions intended for the host when compiling " + "for the target device."; + let dependentDialects = [ + "mlir::func::FuncDialect", + "fir::FIROpsDialect" + ]; +} + +#endif //FORTRAN_OPTIMIZER_OPENMP_PASSES diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index c703a62c03b7d9..53a1b55450972e 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -340,32 +340,6 @@ def LoopVersioning : Pass<"loop-versioning", "mlir::func::FuncOp"> { let dependentDialects = [ "fir::FIROpsDialect" ]; } -def OMPMapInfoFinalizationPass - : Pass<"omp-map-info-finalization"> { - let summary = "expands OpenMP MapInfo operations containing descriptors"; - let description = [{ - Expands MapInfo operations containing descriptor types into multiple - MapInfo's for each pointer element in the descriptor that requires - explicit individual mapping by 
the OpenMP runtime. - }]; - let dependentDialects = ["mlir::omp::OpenMPDialect"]; -} - -def OMPMarkDeclareTargetPass - : Pass<"omp-mark-declare-target", "mlir::ModuleOp"> { - let summary = "Marks all functions called by an OpenMP declare target function as declare target"; - let dependentDialects = ["mlir::omp::OpenMPDialect"]; -} - -def OMPFunctionFiltering : Pass<"omp-function-filtering"> { - let summary = "Filters out functions intended for the host when compiling " - "for the target device."; - let dependentDialects = [ - "mlir::func::FuncDialect", - "fir::FIROpsDialect" - ]; -} - def VScaleAttr : Pass<"vscale-attr", "mlir::func::FuncOp"> { let summary = "Add vscale_range attribute to functions"; let description = [{ diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index 7df50449494631..05b2f31711add2 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -17,6 +17,7 @@ #include "mlir/Transforms/Passes.h" #include "flang/Optimizer/CodeGen/CodeGen.h" #include "flang/Optimizer/HLFIR/Passes.h" +#include "flang/Optimizer/OpenMP/Passes.h" #include "flang/Optimizer/Transforms/Passes.h" #include "llvm/Passes/OptimizationLevel.h" #include "llvm/Support/CommandLine.h" @@ -358,10 +359,10 @@ inline void createHLFIRToFIRPassPipeline( inline void createOpenMPFIRPassPipeline( mlir::PassManager &pm, bool isTargetDevice) { addNestedPassToAllTopLevelOperations( - pm, fir::createOMPMapInfoFinalizationPass); - pm.addPass(fir::createOMPMarkDeclareTargetPass()); + pm, flangomp::createMapInfoFinalizationPass); + pm.addPass(flangomp::createMarkDeclareTargetPass()); if (isTargetDevice) - pm.addPass(fir::createOMPFunctionFiltering()); + pm.addPass(flangomp::createFunctionFiltering()); } #if !defined(FLANG_EXCLUDE_CODEGEN) diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt index c20b9096aff496..ecdcc73d61ec1f 100644 --- a/flang/lib/Frontend/CMakeLists.txt +++ 
b/flang/lib/Frontend/CMakeLists.txt @@ -38,6 +38,7 @@ add_flang_library(flangFrontend FIRTransforms HLFIRDialect HLFIRTransforms + FlangOpenMPTransforms MLIRTransforms MLIRBuiltinToLLVMIRTranslation MLIRLLVMToLLVMIRTranslation diff --git a/flang/lib/Optimizer/CMakeLists.txt b/flang/lib/Optimizer/CMakeLists.txt index 4a602162ed2b77..dd153ac33c0fbb 100644 --- a/flang/lib/Optimizer/CMakeLists.txt +++ b/flang/lib/Optimizer/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(HLFIR) add_subdirectory(Support) add_subdirectory(Transforms) add_subdirectory(Analysis) +add_subdirectory(OpenMP) diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt new file mode 100644 index 00000000000000..a8984d256b8f6a --- /dev/null +++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt @@ -0,0 +1,25 @@ +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) + +add_flang_library(FlangOpenMPTransforms + FunctionFiltering.cpp + MapInfoFinalization.cpp + MarkDeclareTarget.cpp + + DEPENDS + FIRDialect + HLFIROpsIncGen + FlangOpenMPPassesIncGen + + LINK_LIBS + FIRAnalysis + FIRBuilder + FIRCodeGen + FIRDialect + FIRDialectSupport + FIRSupport + FortranCommon + MLIRFuncDialect + MLIROpenMPDialect + HLFIRDialect + MLIRIR +) diff --git a/flang/lib/Optimizer/Transforms/OMPFunctionFiltering.cpp b/flang/lib/Optimizer/OpenMP/FunctionFiltering.cpp similarity index 90% rename from flang/lib/Optimizer/Transforms/OMPFunctionFiltering.cpp rename to flang/lib/Optimizer/OpenMP/FunctionFiltering.cpp index 0c472246c2a44c..bd9005d3e2df6f 100644 --- a/flang/lib/Optimizer/Transforms/OMPFunctionFiltering.cpp +++ b/flang/lib/Optimizer/OpenMP/FunctionFiltering.cpp @@ -1,4 +1,4 @@ -//===- OMPFunctionFiltering.cpp -------------------------------------------===// +//===- FunctionFiltering.cpp -------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. @@ -13,7 +13,7 @@ #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROpsSupport.h" -#include "flang/Optimizer/Transforms/Passes.h" +#include "flang/Optimizer/OpenMP/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" @@ -21,18 +21,18 @@ #include "mlir/IR/BuiltinOps.h" #include "llvm/ADT/SmallVector.h" -namespace fir { -#define GEN_PASS_DEF_OMPFUNCTIONFILTERING -#include "flang/Optimizer/Transforms/Passes.h.inc" -} // namespace fir +namespace flangomp { +#define GEN_PASS_DEF_FUNCTIONFILTERING +#include "flang/Optimizer/OpenMP/Passes.h.inc" +} // namespace flangomp using namespace mlir; namespace { -class OMPFunctionFilteringPass - : public fir::impl::OMPFunctionFilteringBase { +class FunctionFilteringPass + : public flangomp::impl::FunctionFilteringBase { public: - OMPFunctionFilteringPass() = default; + FunctionFilteringPass() = default; void runOnOperation() override { MLIRContext *context = &getContext(); diff --git a/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp similarity index 96% rename from flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp rename to flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index ddaa3c5f404f0b..6e9cd03dca8f3f 100644 --- a/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -1,5 +1,4 @@ -//===- OMPMapInfoFinalization.cpp -//---------------------------------------------------===// +//===- MapInfoFinalization.cpp -----------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -28,7 +27,7 @@ #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Dialect/Support/KindMapping.h" -#include "flang/Optimizer/Transforms/Passes.h" +#include "flang/Optimizer/OpenMP/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/BuiltinDialect.h" @@ -41,15 +40,15 @@ #include "llvm/Frontend/OpenMP/OMPConstants.h" #include -namespace fir { -#define GEN_PASS_DEF_OMPMAPINFOFINALIZATIONPASS -#include "flang/Optimizer/Transforms/Passes.h.inc" -} // namespace fir +namespace flangomp { +#define GEN_PASS_DEF_MAPINFOFINALIZATIONPASS +#include "flang/Optimizer/OpenMP/Passes.h.inc" +} // namespace flangomp namespace { -class OMPMapInfoFinalizationPass - : public fir::impl::OMPMapInfoFinalizationPassBase< - OMPMapInfoFinalizationPass> { +class MapInfoFinalizationPass + : public flangomp::impl::MapInfoFinalizationPassBase< + MapInfoFinalizationPass> { void genDescriptorMemberMaps(mlir::omp::MapInfoOp op, fir::FirOpBuilder &builder, @@ -245,7 +244,7 @@ class OMPMapInfoFinalizationPass // all users appropriately, making sure to only add a single member link // per new generation for the original originating descriptor MapInfoOp. 
assert(llvm::hasSingleElement(op->getUsers()) && - "OMPMapInfoFinalization currently only supports single users " + "MapInfoFinalization currently only supports single users " "of a MapInfoOp"); if (!op.getMembers().empty()) { diff --git a/flang/lib/Optimizer/Transforms/OMPMarkDeclareTarget.cpp b/flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp similarity index 80% rename from flang/lib/Optimizer/Transforms/OMPMarkDeclareTarget.cpp rename to flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp index 4946e13b22865d..a7ffd5fda82b7f 100644 --- a/flang/lib/Optimizer/Transforms/OMPMarkDeclareTarget.cpp +++ b/flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp @@ -1,4 +1,16 @@ -#include "flang/Optimizer/Transforms/Passes.h" +//===- MarkDeclareTarget.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Mark functions called from explicit target code as implicitly declare target. 
+// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/OpenMP/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" @@ -10,14 +22,14 @@ #include "mlir/Support/LLVM.h" #include "llvm/ADT/SmallPtrSet.h" -namespace fir { -#define GEN_PASS_DEF_OMPMARKDECLARETARGETPASS -#include "flang/Optimizer/Transforms/Passes.h.inc" -} // namespace fir +namespace flangomp { +#define GEN_PASS_DEF_MARKDECLARETARGETPASS +#include "flang/Optimizer/OpenMP/Passes.h.inc" +} // namespace flangomp namespace { -class OMPMarkDeclareTargetPass - : public fir::impl::OMPMarkDeclareTargetPassBase { +class MarkDeclareTargetPass + : public flangomp::impl::MarkDeclareTargetPassBase { void markNestedFuncs(mlir::omp::DeclareTargetDeviceType parentDevTy, mlir::omp::DeclareTargetCaptureClause parentCapClause, diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 3869633bd98e02..a6fc8e999d44da 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -21,9 +21,6 @@ add_flang_library(FIRTransforms AddDebugInfo.cpp PolymorphicOpConversion.cpp LoopVersioning.cpp - OMPFunctionFiltering.cpp - OMPMapInfoFinalization.cpp - OMPMarkDeclareTarget.cpp StackReclaim.cpp VScaleAttr.cpp FunctionAttr.cpp diff --git a/flang/tools/bbc/CMakeLists.txt b/flang/tools/bbc/CMakeLists.txt index 9410fd00566006..69316d4dc61de3 100644 --- a/flang/tools/bbc/CMakeLists.txt +++ b/flang/tools/bbc/CMakeLists.txt @@ -25,6 +25,7 @@ FIRTransforms FIRBuilder HLFIRDialect HLFIRTransforms +FlangOpenMPTransforms ${dialect_libs} ${extension_libs} MLIRAffineToStandard diff --git a/flang/tools/fir-opt/CMakeLists.txt b/flang/tools/fir-opt/CMakeLists.txt index 43679a9d535782..4c6dbf7d9c8c37 100644 --- a/flang/tools/fir-opt/CMakeLists.txt +++ b/flang/tools/fir-opt/CMakeLists.txt @@ -19,6 
+19,7 @@ target_link_libraries(fir-opt PRIVATE FIRCodeGen HLFIRDialect HLFIRTransforms + FlangOpenMPTransforms FIRAnalysis ${test_libs} ${dialect_libs} diff --git a/flang/tools/fir-opt/fir-opt.cpp b/flang/tools/fir-opt/fir-opt.cpp index 1846c1b317848f..f75fba27c68f08 100644 --- a/flang/tools/fir-opt/fir-opt.cpp +++ b/flang/tools/fir-opt/fir-opt.cpp @@ -14,6 +14,7 @@ #include "mlir/Tools/mlir-opt/MlirOptMain.h" #include "flang/Optimizer/CodeGen/CodeGen.h" #include "flang/Optimizer/HLFIR/Passes.h" +#include "flang/Optimizer/OpenMP/Passes.h" #include "flang/Optimizer/Support/InitFIR.h" #include "flang/Optimizer/Transforms/Passes.h" @@ -34,6 +35,7 @@ int main(int argc, char **argv) { fir::registerOptCodeGenPasses(); fir::registerOptTransformPasses(); hlfir::registerHLFIRPasses(); + flangomp::registerFlangOpenMPPasses(); #ifdef FLANG_INCLUDE_TESTS fir::test::registerTestFIRAliasAnalysisPass(); mlir::registerSideEffectTestPasses(); diff --git a/flang/tools/tco/CMakeLists.txt b/flang/tools/tco/CMakeLists.txt index 808219ac361f2a..698a398547c773 100644 --- a/flang/tools/tco/CMakeLists.txt +++ b/flang/tools/tco/CMakeLists.txt @@ -17,6 +17,7 @@ target_link_libraries(tco PRIVATE FIRBuilder HLFIRDialect HLFIRTransforms + FlangOpenMPTransforms ${dialect_libs} ${extension_libs} MLIRIR From e1912a15b6b05aab36b7bcbe617980e8d808bd80 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 21 Aug 2024 07:10:17 -0700 Subject: [PATCH 065/426] [NFC][ADT] Format StringRefTest.cpp to fit in 80 columns. 
(#105502) --- llvm/unittests/ADT/StringRefTest.cpp | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/llvm/unittests/ADT/StringRefTest.cpp b/llvm/unittests/ADT/StringRefTest.cpp index b3c206a336962d..40351c99d0185c 100644 --- a/llvm/unittests/ADT/StringRefTest.cpp +++ b/llvm/unittests/ADT/StringRefTest.cpp @@ -939,16 +939,17 @@ struct GetDoubleStrings { bool AllowInexact; bool ShouldFail; double D; -} DoubleStrings[] = {{"0", false, false, 0.0}, - {"0.0", false, false, 0.0}, - {"-0.0", false, false, -0.0}, - {"123.45", false, true, 123.45}, - {"123.45", true, false, 123.45}, - {"1.8e308", true, false, std::numeric_limits::infinity()}, - {"1.8e308", false, true, std::numeric_limits::infinity()}, - {"0x0.0000000000001P-1023", false, true, 0.0}, - {"0x0.0000000000001P-1023", true, false, 0.0}, - }; +} DoubleStrings[] = { + {"0", false, false, 0.0}, + {"0.0", false, false, 0.0}, + {"-0.0", false, false, -0.0}, + {"123.45", false, true, 123.45}, + {"123.45", true, false, 123.45}, + {"1.8e308", true, false, std::numeric_limits::infinity()}, + {"1.8e308", false, true, std::numeric_limits::infinity()}, + {"0x0.0000000000001P-1023", false, true, 0.0}, + {"0x0.0000000000001P-1023", true, false, 0.0}, +}; TEST(StringRefTest, getAsDouble) { for (const auto &Entry : DoubleStrings) { @@ -1117,7 +1118,8 @@ TEST(StringRefTest, StringLiteral) { constexpr StringRef StringRefs[] = {"Foo", "Bar"}; EXPECT_EQ(StringRef("Foo"), StringRefs[0]); EXPECT_EQ(3u, (std::integral_constant::value)); - EXPECT_EQ(false, (std::integral_constant::value)); + EXPECT_EQ(false, + (std::integral_constant::value)); EXPECT_EQ(StringRef("Bar"), StringRefs[1]); constexpr StringLiteral Strings[] = {"Foo", "Bar"}; From 3c8f139fb73a8610680b184afc88fe4b1485add0 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 21 Aug 2024 16:11:57 +0200 Subject: [PATCH 066/426] [InstCombine] Add tests for icmp of select of cmp (NFC) --- .../test/Transforms/InstCombine/select-cmp.ll | 
104 ++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/select-cmp.ll b/llvm/test/Transforms/InstCombine/select-cmp.ll index 7c1a32e7b5eb70..697010b90db584 100644 --- a/llvm/test/Transforms/InstCombine/select-cmp.ll +++ b/llvm/test/Transforms/InstCombine/select-cmp.ll @@ -480,4 +480,108 @@ define i1 @test_select_inverse_nonconst4(i64 %x, i64 %y, i64 %z, i1 %cond) { ret i1 %sel } +define i1 @sel_icmp_two_cmp(i1 %c, i32 %a1, i32 %a2, i32 %a3, i32 %a4) { +; CHECK-LABEL: @sel_icmp_two_cmp( +; CHECK-NEXT: [[V1:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A1:%.*]], i32 [[A2:%.*]]) +; CHECK-NEXT: [[V2:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A3:%.*]], i32 [[A4:%.*]]) +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i8 [[V1]], i8 [[V2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SEL]], 1 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = call i8 @llvm.ucmp(i32 %a1, i32 %a2) + %v2 = call i8 @llvm.scmp(i32 %a3, i32 %a4) + %sel = select i1 %c, i8 %v1, i8 %v2 + %cmp = icmp sle i8 %sel, 0 + ret i1 %cmp +} + +define i1 @sel_icmp_two_cmp_extra_use1(i1 %c, i32 %a1, i32 %a2, i32 %a3, i32 %a4) { +; CHECK-LABEL: @sel_icmp_two_cmp_extra_use1( +; CHECK-NEXT: [[V1:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A1:%.*]], i32 [[A2:%.*]]) +; CHECK-NEXT: [[V2:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A3:%.*]], i32 [[A4:%.*]]) +; CHECK-NEXT: call void @use.i8(i8 [[V1]]) +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i8 [[V1]], i8 [[V2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SEL]], 1 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = call i8 @llvm.ucmp(i32 %a1, i32 %a2) + %v2 = call i8 @llvm.scmp(i32 %a3, i32 %a4) + call void @use.i8(i8 %v1) + %sel = select i1 %c, i8 %v1, i8 %v2 + %cmp = icmp sle i8 %sel, 0 + ret i1 %cmp +} + +define i1 @sel_icmp_two_cmp_extra_use2(i1 %c, i32 %a1, i32 %a2, i32 %a3, i32 %a4) { +; CHECK-LABEL: @sel_icmp_two_cmp_extra_use2( +; CHECK-NEXT: [[V1:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A1:%.*]], i32 [[A2:%.*]]) +; CHECK-NEXT: [[V2:%.*]] = call 
i8 @llvm.scmp.i8.i32(i32 [[A3:%.*]], i32 [[A4:%.*]]) +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i8 [[V1]], i8 [[V2]] +; CHECK-NEXT: call void @use.i8(i8 [[SEL]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SEL]], 1 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = call i8 @llvm.ucmp(i32 %a1, i32 %a2) + %v2 = call i8 @llvm.scmp(i32 %a3, i32 %a4) + %sel = select i1 %c, i8 %v1, i8 %v2 + call void @use.i8(i8 %sel) + %cmp = icmp sle i8 %sel, 0 + ret i1 %cmp +} + +define i1 @sel_icmp_two_cmp_not_const(i1 %c, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i8 %b) { +; CHECK-LABEL: @sel_icmp_two_cmp_not_const( +; CHECK-NEXT: [[V1:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A1:%.*]], i32 [[A2:%.*]]) +; CHECK-NEXT: [[V2:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A3:%.*]], i32 [[A4:%.*]]) +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i8 [[V1]], i8 [[V2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sle i8 [[SEL]], [[B:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %v1 = call i8 @llvm.ucmp(i32 %a1, i32 %a2) + %v2 = call i8 @llvm.scmp(i32 %a3, i32 %a4) + %sel = select i1 %c, i8 %v1, i8 %v2 + %cmp = icmp sle i8 %sel, %b + ret i1 %cmp +} + +define i1 @sel_icmp_cmp_and_simplify(i1 %c, i32 %a1, i32 %a2) { +; CHECK-LABEL: @sel_icmp_cmp_and_simplify( +; CHECK-NEXT: [[CMP1:%.*]] = icmp ule i32 [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: [[NOT_C:%.*]] = xor i1 [[C:%.*]], true +; CHECK-NEXT: [[CMP:%.*]] = select i1 [[NOT_C]], i1 true, i1 [[CMP1]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %v = call i8 @llvm.ucmp(i32 %a1, i32 %a2) + %sel = select i1 %c, i8 %v, i8 0 + %cmp = icmp sle i8 %sel, 0 + ret i1 %cmp +} + +define i1 @sel_icmp_cmp_and_no_simplify(i1 %c, i32 %a1, i32 %a2, i8 %b) { +; CHECK-LABEL: @sel_icmp_cmp_and_no_simplify( +; CHECK-NEXT: [[V:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A1:%.*]], i32 [[A2:%.*]]) +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i8 [[V]], i8 [[B:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SEL]], 1 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v = call i8 @llvm.ucmp(i32 %a1, i32 %a2) + %sel = select i1 %c, 
i8 %v, i8 %b + %cmp = icmp sle i8 %sel, 0 + ret i1 %cmp +} + +define i1 @sel_icmp_cmp_and_no_simplify_comm(i1 %c, i32 %a1, i32 %a2, i8 %b) { +; CHECK-LABEL: @sel_icmp_cmp_and_no_simplify_comm( +; CHECK-NEXT: [[V:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A1:%.*]], i32 [[A2:%.*]]) +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i8 [[B:%.*]], i8 [[V]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SEL]], 1 +; CHECK-NEXT: ret i1 [[CMP]] +; + %v = call i8 @llvm.ucmp(i32 %a1, i32 %a2) + %sel = select i1 %c, i8 %b, i8 %v + %cmp = icmp sle i8 %sel, 0 + ret i1 %cmp +} + declare void @use(i1) +declare void @use.i8(i8) From 68e21e16d21deee0f0226b4c771ff8b4731b7370 Mon Sep 17 00:00:00 2001 From: Tomas Matheson Date: Wed, 21 Aug 2024 15:15:49 +0100 Subject: [PATCH 067/426] [AArch64] Add support for ACTLR_EL12 system register (#105497) Documentation can be found here: https://developer.arm.com/documentation/ddi0601/2024-06/AArch64-Registers/ACTLR-EL1--Auxiliary-Control-Register--EL1- --- llvm/lib/Target/AArch64/AArch64SystemOperands.td | 1 + llvm/test/MC/AArch64/arm64-system-encoding.s | 4 ++++ llvm/test/MC/Disassembler/AArch64/basic-a64-instructions.txt | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index 7476ab852a923b..dd0ce1cf47a792 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -939,6 +939,7 @@ def : RWSysReg<"SCTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b000>; def : RWSysReg<"SCTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b000>; def : RWSysReg<"SCTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b000>; def : RWSysReg<"ACTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b001>; +def : RWSysReg<"ACTLR_EL12", 0b11, 0b101, 0b0001, 0b0000, 0b001>; def : RWSysReg<"ACTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b001>; def : RWSysReg<"ACTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b001>; def : RWSysReg<"HCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 
0b000>; diff --git a/llvm/test/MC/AArch64/arm64-system-encoding.s b/llvm/test/MC/AArch64/arm64-system-encoding.s index c58a8f0cb841cb..d38f3ac9871fe5 100644 --- a/llvm/test/MC/AArch64/arm64-system-encoding.s +++ b/llvm/test/MC/AArch64/arm64-system-encoding.s @@ -59,6 +59,7 @@ foo: ; MSR/MRS instructions ;----------------------------------------------------------------------------- msr ACTLR_EL1, x3 + msr ACTLR_EL12, x3 msr ACTLR_EL2, x3 msr ACTLR_EL3, x3 msr AFSR0_EL1, x3 @@ -167,6 +168,7 @@ foo: msr S0_0_C0_C0_0, x0 msr S1_2_C3_C4_5, x2 ; CHECK: msr ACTLR_EL1, x3 ; encoding: [0x23,0x10,0x18,0xd5] +; CHECK: msr ACTLR_EL12, x3 ; encoding: [0x23,0x10,0x1d,0xd5] ; CHECK: msr ACTLR_EL2, x3 ; encoding: [0x23,0x10,0x1c,0xd5] ; CHECK: msr ACTLR_EL3, x3 ; encoding: [0x23,0x10,0x1e,0xd5] ; CHECK: msr AFSR0_EL1, x3 ; encoding: [0x03,0x51,0x18,0xd5] @@ -280,6 +282,7 @@ foo: ; CHECK-ERRORS: :[[@LINE-1]]:7: error: expected writable system register or pstate mrs x3, ACTLR_EL1 + mrs x3, ACTLR_EL12 mrs x3, ACTLR_EL2 mrs x3, ACTLR_EL3 mrs x3, AFSR0_EL1 @@ -501,6 +504,7 @@ foo: mrs x3, S3_3_c11_c1_4 ; CHECK: mrs x3, ACTLR_EL1 ; encoding: [0x23,0x10,0x38,0xd5] +; CHECK: mrs x3, ACTLR_EL12 ; encoding: [0x23,0x10,0x3d,0xd5] ; CHECK: mrs x3, ACTLR_EL2 ; encoding: [0x23,0x10,0x3c,0xd5] ; CHECK: mrs x3, ACTLR_EL3 ; encoding: [0x23,0x10,0x3e,0xd5] ; CHECK: mrs x3, AFSR0_EL1 ; encoding: [0x03,0x51,0x38,0xd5] diff --git a/llvm/test/MC/Disassembler/AArch64/basic-a64-instructions.txt b/llvm/test/MC/Disassembler/AArch64/basic-a64-instructions.txt index f46301e8c1c15b..5ffabfc692ad10 100644 --- a/llvm/test/MC/Disassembler/AArch64/basic-a64-instructions.txt +++ b/llvm/test/MC/Disassembler/AArch64/basic-a64-instructions.txt @@ -3245,6 +3245,7 @@ # CHECK: msr {{sctlr_el2|SCTLR_EL2}}, x12 # CHECK: msr {{sctlr_el3|SCTLR_EL3}}, x12 # CHECK: msr {{actlr_el1|ACTLR_EL1}}, x12 +# CHECK: msr {{actlr_el12|ACTLR_EL12}}, x12 # CHECK: msr {{actlr_el2|ACTLR_EL2}}, x12 # CHECK: msr {{actlr_el3|ACTLR_EL3}}, x12 # 
CHECK: msr {{cpacr_el1|CPACR_EL1}}, x12 @@ -3575,6 +3576,7 @@ # CHECK: mrs x9, {{sctlr_el2|SCTLR_EL2}} # CHECK: mrs x9, {{sctlr_el3|SCTLR_EL3}} # CHECK: mrs x9, {{actlr_el1|ACTLR_EL1}} +# CHECK: mrs x9, {{actlr_el12|ACTLR_EL12}} # CHECK: mrs x9, {{actlr_el2|ACTLR_EL2}} # CHECK: mrs x9, {{actlr_el3|ACTLR_EL3}} # CHECK: mrs x9, {{cpacr_el1|CPACR_EL1}} @@ -3867,6 +3869,7 @@ 0xc 0x10 0x1c 0xd5 0xc 0x10 0x1e 0xd5 0x2c 0x10 0x18 0xd5 +0x2c 0x10 0x1d 0xd5 0x2c 0x10 0x1c 0xd5 0x2c 0x10 0x1e 0xd5 0x4c 0x10 0x18 0xd5 @@ -4199,6 +4202,7 @@ 0x9 0x10 0x3c 0xd5 0x9 0x10 0x3e 0xd5 0x29 0x10 0x38 0xd5 +0x29 0x10 0x3d 0xd5 0x29 0x10 0x3c 0xd5 0x29 0x10 0x3e 0xd5 0x49 0x10 0x38 0xd5 From bccb22709324ae329e3d80cf8af9dd225799bc17 Mon Sep 17 00:00:00 2001 From: Ivan Radanov Ivanov Date: Wed, 21 Aug 2024 23:16:52 +0900 Subject: [PATCH 068/426] Revert "[flang][NFC] Move OpenMP related passes into a separate directory (#104732)" This reverts commit 87eeed1f0ebe57abffde560c25dd9829dc6038f3. --- flang/docs/OpenMP-declare-target.md | 4 +- flang/docs/OpenMP-descriptor-management.md | 4 +- flang/include/flang/Optimizer/CMakeLists.txt | 1 - .../flang/Optimizer/OpenMP/CMakeLists.txt | 4 -- flang/include/flang/Optimizer/OpenMP/Passes.h | 30 -------------- .../include/flang/Optimizer/OpenMP/Passes.td | 40 ------------------- .../flang/Optimizer/Transforms/Passes.td | 26 ++++++++++++ flang/include/flang/Tools/CLOptions.inc | 7 ++-- flang/lib/Frontend/CMakeLists.txt | 1 - flang/lib/Optimizer/CMakeLists.txt | 1 - flang/lib/Optimizer/OpenMP/CMakeLists.txt | 25 ------------ flang/lib/Optimizer/Transforms/CMakeLists.txt | 3 ++ .../OMPFunctionFiltering.cpp} | 18 ++++----- .../OMPMapInfoFinalization.cpp} | 21 +++++----- .../OMPMarkDeclareTarget.cpp} | 26 ++++-------- flang/tools/bbc/CMakeLists.txt | 1 - flang/tools/fir-opt/CMakeLists.txt | 1 - flang/tools/fir-opt/fir-opt.cpp | 2 - flang/tools/tco/CMakeLists.txt | 1 - 19 files changed, 63 insertions(+), 153 deletions(-) delete mode 100644 
flang/include/flang/Optimizer/OpenMP/CMakeLists.txt delete mode 100644 flang/include/flang/Optimizer/OpenMP/Passes.h delete mode 100644 flang/include/flang/Optimizer/OpenMP/Passes.td delete mode 100644 flang/lib/Optimizer/OpenMP/CMakeLists.txt rename flang/lib/Optimizer/{OpenMP/FunctionFiltering.cpp => Transforms/OMPFunctionFiltering.cpp} (90%) rename flang/lib/Optimizer/{OpenMP/MapInfoFinalization.cpp => Transforms/OMPMapInfoFinalization.cpp} (96%) rename flang/lib/Optimizer/{OpenMP/MarkDeclareTarget.cpp => Transforms/OMPMarkDeclareTarget.cpp} (80%) diff --git a/flang/docs/OpenMP-declare-target.md b/flang/docs/OpenMP-declare-target.md index 45062469007b65..d29a46807e1eaf 100644 --- a/flang/docs/OpenMP-declare-target.md +++ b/flang/docs/OpenMP-declare-target.md @@ -149,7 +149,7 @@ flang/lib/Lower/OpenMP.cpp function `genDeclareTargetIntGlobal`. There are currently two passes within Flang that are related to the processing of `declare target`: -* `MarkDeclareTarget` - This pass is in charge of marking functions captured +* `OMPMarkDeclareTarget` - This pass is in charge of marking functions captured (called from) in `target` regions or other `declare target` marked functions as `declare target`. It does so recursively, i.e. nested calls will also be implicitly marked. It currently will try to mark things as conservatively as @@ -157,7 +157,7 @@ possible, e.g. if captured in a `target` region it will apply `nohost`, unless it encounters a `host` `declare target` in which case it will apply the `any` device type. Functions are handled similarly, except we utilise the parent's device type where possible. -* `FunctionFiltering` - This is executed after the `MarkDeclareTarget` +* `OMPFunctionFiltering` - This is executed after the `OMPMarkDeclareTarget` pass, and its job is to conservatively remove host functions from the module where possible when compiling for the device. 
This helps make sure that most incompatible code for the host is not lowered for the diff --git a/flang/docs/OpenMP-descriptor-management.md b/flang/docs/OpenMP-descriptor-management.md index 66c153914f70da..d0eb01b00f9bb9 100644 --- a/flang/docs/OpenMP-descriptor-management.md +++ b/flang/docs/OpenMP-descriptor-management.md @@ -44,7 +44,7 @@ Currently, Flang will lower these descriptor types in the OpenMP lowering (lower to all other map types, generating an omp.MapInfoOp containing relevant information required for lowering the OpenMP dialect to LLVM-IR during the final stages of the MLIR lowering. However, after the lowering to FIR/HLFIR has been performed an OpenMP dialect specific pass for Fortran, -`MapInfoFinalizationPass` (Optimizer/OpenMP/MapInfoFinalization.cpp) will expand the +`OMPMapInfoFinalizationPass` (Optimizer/OMPMapInfoFinalization.cpp) will expand the `omp.MapInfoOp`'s containing descriptors (which currently will be a `BoxType` or `BoxAddrOp`) into multiple mappings, with one extra per pointer member in the descriptor that is supported on top of the original descriptor map operation. These pointers members are linked to the parent descriptor by adding them to @@ -53,7 +53,7 @@ owning operation's (`omp.TargetOp`, `omp.TargetDataOp` etc.) map operand list an operation is `IsolatedFromAbove`, it also inserts them as `BlockArgs` to canonicalize the mappings and simplify lowering. 
-An example transformation by the `MapInfoFinalizationPass`: +An example transformation by the `OMPMapInfoFinalizationPass`: ``` diff --git a/flang/include/flang/Optimizer/CMakeLists.txt b/flang/include/flang/Optimizer/CMakeLists.txt index 3336ac935e1012..89e43a9ee8d621 100644 --- a/flang/include/flang/Optimizer/CMakeLists.txt +++ b/flang/include/flang/Optimizer/CMakeLists.txt @@ -2,4 +2,3 @@ add_subdirectory(CodeGen) add_subdirectory(Dialect) add_subdirectory(HLFIR) add_subdirectory(Transforms) -add_subdirectory(OpenMP) diff --git a/flang/include/flang/Optimizer/OpenMP/CMakeLists.txt b/flang/include/flang/Optimizer/OpenMP/CMakeLists.txt deleted file mode 100644 index d59573f0f7fd91..00000000000000 --- a/flang/include/flang/Optimizer/OpenMP/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS Passes.td) -mlir_tablegen(Passes.h.inc -gen-pass-decls -name FlangOpenMP) - -add_public_tablegen_target(FlangOpenMPPassesIncGen) diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.h b/flang/include/flang/Optimizer/OpenMP/Passes.h deleted file mode 100644 index 403d79667bf448..00000000000000 --- a/flang/include/flang/Optimizer/OpenMP/Passes.h +++ /dev/null @@ -1,30 +0,0 @@ -//===- Passes.h - OpenMP pass entry points ----------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This header declares the flang OpenMP passes. 
-// -//===----------------------------------------------------------------------===// - -#ifndef FORTRAN_OPTIMIZER_OPENMP_PASSES_H -#define FORTRAN_OPTIMIZER_OPENMP_PASSES_H - -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Pass/PassRegistry.h" - -#include - -namespace flangomp { -#define GEN_PASS_DECL -#define GEN_PASS_REGISTRATION -#include "flang/Optimizer/OpenMP/Passes.h.inc" - -} // namespace flangomp - -#endif // FORTRAN_OPTIMIZER_OPENMP_PASSES_H diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td deleted file mode 100644 index 395178e26a5762..00000000000000 --- a/flang/include/flang/Optimizer/OpenMP/Passes.td +++ /dev/null @@ -1,40 +0,0 @@ -//===-- Passes.td - flang OpenMP pass definition -----------*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef FORTRAN_OPTIMIZER_OPENMP_PASSES -#define FORTRAN_OPTIMIZER_OPENMP_PASSES - -include "mlir/Pass/PassBase.td" - -def MapInfoFinalizationPass - : Pass<"omp-map-info-finalization"> { - let summary = "expands OpenMP MapInfo operations containing descriptors"; - let description = [{ - Expands MapInfo operations containing descriptor types into multiple - MapInfo's for each pointer element in the descriptor that requires - explicit individual mapping by the OpenMP runtime. 
- }]; - let dependentDialects = ["mlir::omp::OpenMPDialect"]; -} - -def MarkDeclareTargetPass - : Pass<"omp-mark-declare-target", "mlir::ModuleOp"> { - let summary = "Marks all functions called by an OpenMP declare target function as declare target"; - let dependentDialects = ["mlir::omp::OpenMPDialect"]; -} - -def FunctionFiltering : Pass<"omp-function-filtering"> { - let summary = "Filters out functions intended for the host when compiling " - "for the target device."; - let dependentDialects = [ - "mlir::func::FuncDialect", - "fir::FIROpsDialect" - ]; -} - -#endif //FORTRAN_OPTIMIZER_OPENMP_PASSES diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index 53a1b55450972e..c703a62c03b7d9 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -340,6 +340,32 @@ def LoopVersioning : Pass<"loop-versioning", "mlir::func::FuncOp"> { let dependentDialects = [ "fir::FIROpsDialect" ]; } +def OMPMapInfoFinalizationPass + : Pass<"omp-map-info-finalization"> { + let summary = "expands OpenMP MapInfo operations containing descriptors"; + let description = [{ + Expands MapInfo operations containing descriptor types into multiple + MapInfo's for each pointer element in the descriptor that requires + explicit individual mapping by the OpenMP runtime. 
+ }]; + let dependentDialects = ["mlir::omp::OpenMPDialect"]; +} + +def OMPMarkDeclareTargetPass + : Pass<"omp-mark-declare-target", "mlir::ModuleOp"> { + let summary = "Marks all functions called by an OpenMP declare target function as declare target"; + let dependentDialects = ["mlir::omp::OpenMPDialect"]; +} + +def OMPFunctionFiltering : Pass<"omp-function-filtering"> { + let summary = "Filters out functions intended for the host when compiling " + "for the target device."; + let dependentDialects = [ + "mlir::func::FuncDialect", + "fir::FIROpsDialect" + ]; +} + def VScaleAttr : Pass<"vscale-attr", "mlir::func::FuncOp"> { let summary = "Add vscale_range attribute to functions"; let description = [{ diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index 05b2f31711add2..7df50449494631 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -17,7 +17,6 @@ #include "mlir/Transforms/Passes.h" #include "flang/Optimizer/CodeGen/CodeGen.h" #include "flang/Optimizer/HLFIR/Passes.h" -#include "flang/Optimizer/OpenMP/Passes.h" #include "flang/Optimizer/Transforms/Passes.h" #include "llvm/Passes/OptimizationLevel.h" #include "llvm/Support/CommandLine.h" @@ -359,10 +358,10 @@ inline void createHLFIRToFIRPassPipeline( inline void createOpenMPFIRPassPipeline( mlir::PassManager &pm, bool isTargetDevice) { addNestedPassToAllTopLevelOperations( - pm, flangomp::createMapInfoFinalizationPass); - pm.addPass(flangomp::createMarkDeclareTargetPass()); + pm, fir::createOMPMapInfoFinalizationPass); + pm.addPass(fir::createOMPMarkDeclareTargetPass()); if (isTargetDevice) - pm.addPass(flangomp::createFunctionFiltering()); + pm.addPass(fir::createOMPFunctionFiltering()); } #if !defined(FLANG_EXCLUDE_CODEGEN) diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt index ecdcc73d61ec1f..c20b9096aff496 100644 --- a/flang/lib/Frontend/CMakeLists.txt +++ 
b/flang/lib/Frontend/CMakeLists.txt @@ -38,7 +38,6 @@ add_flang_library(flangFrontend FIRTransforms HLFIRDialect HLFIRTransforms - FlangOpenMPTransforms MLIRTransforms MLIRBuiltinToLLVMIRTranslation MLIRLLVMToLLVMIRTranslation diff --git a/flang/lib/Optimizer/CMakeLists.txt b/flang/lib/Optimizer/CMakeLists.txt index dd153ac33c0fbb..4a602162ed2b77 100644 --- a/flang/lib/Optimizer/CMakeLists.txt +++ b/flang/lib/Optimizer/CMakeLists.txt @@ -5,4 +5,3 @@ add_subdirectory(HLFIR) add_subdirectory(Support) add_subdirectory(Transforms) add_subdirectory(Analysis) -add_subdirectory(OpenMP) diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt deleted file mode 100644 index a8984d256b8f6a..00000000000000 --- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) - -add_flang_library(FlangOpenMPTransforms - FunctionFiltering.cpp - MapInfoFinalization.cpp - MarkDeclareTarget.cpp - - DEPENDS - FIRDialect - HLFIROpsIncGen - FlangOpenMPPassesIncGen - - LINK_LIBS - FIRAnalysis - FIRBuilder - FIRCodeGen - FIRDialect - FIRDialectSupport - FIRSupport - FortranCommon - MLIRFuncDialect - MLIROpenMPDialect - HLFIRDialect - MLIRIR -) diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index a6fc8e999d44da..3869633bd98e02 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -21,6 +21,9 @@ add_flang_library(FIRTransforms AddDebugInfo.cpp PolymorphicOpConversion.cpp LoopVersioning.cpp + OMPFunctionFiltering.cpp + OMPMapInfoFinalization.cpp + OMPMarkDeclareTarget.cpp StackReclaim.cpp VScaleAttr.cpp FunctionAttr.cpp diff --git a/flang/lib/Optimizer/OpenMP/FunctionFiltering.cpp b/flang/lib/Optimizer/Transforms/OMPFunctionFiltering.cpp similarity index 90% rename from flang/lib/Optimizer/OpenMP/FunctionFiltering.cpp rename to 
flang/lib/Optimizer/Transforms/OMPFunctionFiltering.cpp index bd9005d3e2df6f..0c472246c2a44c 100644 --- a/flang/lib/Optimizer/OpenMP/FunctionFiltering.cpp +++ b/flang/lib/Optimizer/Transforms/OMPFunctionFiltering.cpp @@ -1,4 +1,4 @@ -//===- FunctionFiltering.cpp -------------------------------------------===// +//===- OMPFunctionFiltering.cpp -------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -13,7 +13,7 @@ #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROpsSupport.h" -#include "flang/Optimizer/OpenMP/Passes.h" +#include "flang/Optimizer/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" @@ -21,18 +21,18 @@ #include "mlir/IR/BuiltinOps.h" #include "llvm/ADT/SmallVector.h" -namespace flangomp { -#define GEN_PASS_DEF_FUNCTIONFILTERING -#include "flang/Optimizer/OpenMP/Passes.h.inc" -} // namespace flangomp +namespace fir { +#define GEN_PASS_DEF_OMPFUNCTIONFILTERING +#include "flang/Optimizer/Transforms/Passes.h.inc" +} // namespace fir using namespace mlir; namespace { -class FunctionFilteringPass - : public flangomp::impl::FunctionFilteringBase { +class OMPFunctionFilteringPass + : public fir::impl::OMPFunctionFilteringBase { public: - FunctionFilteringPass() = default; + OMPFunctionFilteringPass() = default; void runOnOperation() override { MLIRContext *context = &getContext(); diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp similarity index 96% rename from flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp rename to flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp index 6e9cd03dca8f3f..ddaa3c5f404f0b 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp @@ -1,4 +1,5 @@ 
-//===- MapInfoFinalization.cpp -----------------------------------------===// +//===- OMPMapInfoFinalization.cpp +//---------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -27,7 +28,7 @@ #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Dialect/Support/KindMapping.h" -#include "flang/Optimizer/OpenMP/Passes.h" +#include "flang/Optimizer/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/BuiltinDialect.h" @@ -40,15 +41,15 @@ #include "llvm/Frontend/OpenMP/OMPConstants.h" #include -namespace flangomp { -#define GEN_PASS_DEF_MAPINFOFINALIZATIONPASS -#include "flang/Optimizer/OpenMP/Passes.h.inc" -} // namespace flangomp +namespace fir { +#define GEN_PASS_DEF_OMPMAPINFOFINALIZATIONPASS +#include "flang/Optimizer/Transforms/Passes.h.inc" +} // namespace fir namespace { -class MapInfoFinalizationPass - : public flangomp::impl::MapInfoFinalizationPassBase< - MapInfoFinalizationPass> { +class OMPMapInfoFinalizationPass + : public fir::impl::OMPMapInfoFinalizationPassBase< + OMPMapInfoFinalizationPass> { void genDescriptorMemberMaps(mlir::omp::MapInfoOp op, fir::FirOpBuilder &builder, @@ -244,7 +245,7 @@ class MapInfoFinalizationPass // all users appropriately, making sure to only add a single member link // per new generation for the original originating descriptor MapInfoOp. 
assert(llvm::hasSingleElement(op->getUsers()) && - "MapInfoFinalization currently only supports single users " + "OMPMapInfoFinalization currently only supports single users " "of a MapInfoOp"); if (!op.getMembers().empty()) { diff --git a/flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp b/flang/lib/Optimizer/Transforms/OMPMarkDeclareTarget.cpp similarity index 80% rename from flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp rename to flang/lib/Optimizer/Transforms/OMPMarkDeclareTarget.cpp index a7ffd5fda82b7f..4946e13b22865d 100644 --- a/flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp +++ b/flang/lib/Optimizer/Transforms/OMPMarkDeclareTarget.cpp @@ -1,16 +1,4 @@ -//===- MarkDeclareTarget.cpp -------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Mark functions called from explicit target code as implicitly declare target. 
-// -//===----------------------------------------------------------------------===// - -#include "flang/Optimizer/OpenMP/Passes.h" +#include "flang/Optimizer/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" @@ -22,14 +10,14 @@ #include "mlir/Support/LLVM.h" #include "llvm/ADT/SmallPtrSet.h" -namespace flangomp { -#define GEN_PASS_DEF_MARKDECLARETARGETPASS -#include "flang/Optimizer/OpenMP/Passes.h.inc" -} // namespace flangomp +namespace fir { +#define GEN_PASS_DEF_OMPMARKDECLARETARGETPASS +#include "flang/Optimizer/Transforms/Passes.h.inc" +} // namespace fir namespace { -class MarkDeclareTargetPass - : public flangomp::impl::MarkDeclareTargetPassBase { +class OMPMarkDeclareTargetPass + : public fir::impl::OMPMarkDeclareTargetPassBase { void markNestedFuncs(mlir::omp::DeclareTargetDeviceType parentDevTy, mlir::omp::DeclareTargetCaptureClause parentCapClause, diff --git a/flang/tools/bbc/CMakeLists.txt b/flang/tools/bbc/CMakeLists.txt index 69316d4dc61de3..9410fd00566006 100644 --- a/flang/tools/bbc/CMakeLists.txt +++ b/flang/tools/bbc/CMakeLists.txt @@ -25,7 +25,6 @@ FIRTransforms FIRBuilder HLFIRDialect HLFIRTransforms -FlangOpenMPTransforms ${dialect_libs} ${extension_libs} MLIRAffineToStandard diff --git a/flang/tools/fir-opt/CMakeLists.txt b/flang/tools/fir-opt/CMakeLists.txt index 4c6dbf7d9c8c37..43679a9d535782 100644 --- a/flang/tools/fir-opt/CMakeLists.txt +++ b/flang/tools/fir-opt/CMakeLists.txt @@ -19,7 +19,6 @@ target_link_libraries(fir-opt PRIVATE FIRCodeGen HLFIRDialect HLFIRTransforms - FlangOpenMPTransforms FIRAnalysis ${test_libs} ${dialect_libs} diff --git a/flang/tools/fir-opt/fir-opt.cpp b/flang/tools/fir-opt/fir-opt.cpp index f75fba27c68f08..1846c1b317848f 100644 --- a/flang/tools/fir-opt/fir-opt.cpp +++ b/flang/tools/fir-opt/fir-opt.cpp @@ -14,7 +14,6 @@ #include "mlir/Tools/mlir-opt/MlirOptMain.h" #include 
"flang/Optimizer/CodeGen/CodeGen.h" #include "flang/Optimizer/HLFIR/Passes.h" -#include "flang/Optimizer/OpenMP/Passes.h" #include "flang/Optimizer/Support/InitFIR.h" #include "flang/Optimizer/Transforms/Passes.h" @@ -35,7 +34,6 @@ int main(int argc, char **argv) { fir::registerOptCodeGenPasses(); fir::registerOptTransformPasses(); hlfir::registerHLFIRPasses(); - flangomp::registerFlangOpenMPPasses(); #ifdef FLANG_INCLUDE_TESTS fir::test::registerTestFIRAliasAnalysisPass(); mlir::registerSideEffectTestPasses(); diff --git a/flang/tools/tco/CMakeLists.txt b/flang/tools/tco/CMakeLists.txt index 698a398547c773..808219ac361f2a 100644 --- a/flang/tools/tco/CMakeLists.txt +++ b/flang/tools/tco/CMakeLists.txt @@ -17,7 +17,6 @@ target_link_libraries(tco PRIVATE FIRBuilder HLFIRDialect HLFIRTransforms - FlangOpenMPTransforms ${dialect_libs} ${extension_libs} MLIRIR From d6d8243dcd4ea768549904036ed31b8e59e14c73 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 21 Aug 2024 07:20:23 -0700 Subject: [PATCH 069/426] [LTO] Use DenseSet in computeLTOCacheKey (NFC) (#105466) The two instances of std::set are used only for membership checking purposes in computeLTOCacheKey. We do not need std::set's strengths like iterators staying valid or the ability to traverse in a sorted order. This patch changes them to DenseSet. While I am at it, this patch replaces count with contains for slightly increased readability. 
--- llvm/include/llvm/LTO/LTO.h | 4 ++-- llvm/lib/LTO/LTO.cpp | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index 0781d57feb5a64..949e80a43f0e88 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -68,8 +68,8 @@ std::string computeLTOCacheKey( const FunctionImporter::ExportSetTy &ExportList, const std::map &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, - const std::set &CfiFunctionDefs = {}, - const std::set &CfiFunctionDecls = {}); + const DenseSet &CfiFunctionDefs = {}, + const DenseSet &CfiFunctionDecls = {}); namespace lto { diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index f69e089edf42e7..cb3369d93754d5 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -95,8 +95,8 @@ std::string llvm::computeLTOCacheKey( const FunctionImporter::ExportSetTy &ExportList, const std::map &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, - const std::set &CfiFunctionDefs, - const std::set &CfiFunctionDecls) { + const DenseSet &CfiFunctionDefs, + const DenseSet &CfiFunctionDecls) { // Compute the unique hash for this entry. 
// This is based on the current compiler version, the module itself, the // export list, the hash for every single module in the import list, the @@ -237,9 +237,9 @@ std::string llvm::computeLTOCacheKey( std::set UsedTypeIds; auto AddUsedCfiGlobal = [&](GlobalValue::GUID ValueGUID) { - if (CfiFunctionDefs.count(ValueGUID)) + if (CfiFunctionDefs.contains(ValueGUID)) UsedCfiDefs.insert(ValueGUID); - if (CfiFunctionDecls.count(ValueGUID)) + if (CfiFunctionDecls.contains(ValueGUID)) UsedCfiDecls.insert(ValueGUID); }; @@ -1429,8 +1429,8 @@ class InProcessThinBackend : public ThinBackendProc { DefaultThreadPool BackendThreadPool; AddStreamFn AddStream; FileCache Cache; - std::set CfiFunctionDefs; - std::set CfiFunctionDecls; + DenseSet CfiFunctionDefs; + DenseSet CfiFunctionDecls; std::optional Err; std::mutex ErrMu; From 5ddc79b093f2afaaf2c69d20d7d44448da04458a Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 21 Aug 2024 07:23:30 -0700 Subject: [PATCH 070/426] [LTO] Use a range-based for loop (NFC) (#105467) --- llvm/lib/LTO/LTO.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index cb3369d93754d5..e5545860c329d4 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -330,8 +330,8 @@ std::string llvm::computeLTOCacheKey( // Include the hash for all type identifiers used by this module. 
for (GlobalValue::GUID TId : UsedTypeIds) { auto TidIter = Index.typeIds().equal_range(TId); - for (auto It = TidIter.first; It != TidIter.second; ++It) - AddTypeIdSummary(It->second.first, It->second.second); + for (const auto &I : make_range(TidIter)) + AddTypeIdSummary(I.second.first, I.second.second); } AddUnsigned(UsedCfiDefs.size()); From 70e8c982d0589b1a56faf0768b45596c2da3a510 Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Wed, 21 Aug 2024 15:27:09 +0100 Subject: [PATCH 071/426] [AArch64] Bail out for scalable vecs in areExtractShuffleVectors (#105484) The added test triggers the following assert in `areExtractShuffleVectors` that is called from `shouldSinkOperands`: Assertion `(!isScalable() || isZero()) && "Request for a fixed element count on a scalable object"' failed. I don't think scalable types can be extract shuffles, so bail early if this is the case. --- .../Target/AArch64/AArch64ISelLowering.cpp | 4 ++++ .../AArch64/sink-free-instructions.ll | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e1d265fdf0d1a8..dbe9413f05d013 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -16149,6 +16149,10 @@ static bool isSplatShuffle(Value *V) { /// or upper half of the vector elements. static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat = false) { + // Scalable types can't be extract shuffle vectors. 
+ if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy()) + return false; + auto areTypesHalfed = [](Value *FullV, Value *HalfV) { auto *FullTy = FullV->getType(); auto *HalfTy = HalfV->getType(); diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll index d6629bf4b1849b..0ccfd9c20c12ef 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll @@ -984,3 +984,22 @@ if.else: ret <5 x float> %r.4 } +; This ran in an assert in `areExtractShuffleVectors`. +define @scalable_types_cannot_be_extract_shuffle() { +; CHECK-LABEL: @scalable_types_cannot_be_extract_shuffle( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BROADCAST_SPLAT68:%.*]] = shufflevector zeroinitializer, poison, zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = zext [[BROADCAST_SPLAT68]] to +; CHECK-NEXT: [[BROADCAST_SPLAT70:%.*]] = shufflevector zeroinitializer, poison, zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = zext [[BROADCAST_SPLAT70]] to +; CHECK-NEXT: [[TMP2:%.*]] = sub [[TMP0]], [[TMP1]] +; CHECK-NEXT: ret [[TMP2]] +; +entry: + %broadcast.splat68 = shufflevector zeroinitializer, poison, zeroinitializer + %0 = zext %broadcast.splat68 to + %broadcast.splat70 = shufflevector zeroinitializer, poison, zeroinitializer + %1 = zext %broadcast.splat70 to + %2 = sub %0, %1 + ret %2 +} From 32c38dd85ee27fc7c2dd6a749fc1f7af4abdbea1 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 21 Aug 2024 10:29:10 -0400 Subject: [PATCH 072/426] [libc++] Mark C++14 as complete and remove the status pages (#105514) We already documented that libc++ was C++14 complete, but we still documented the status of C++14. Since that is redundant (and I suspect the C++14 status page was missing some stuff), simply remove them. 
--- libcxx/docs/Status/Cxx14.rst | 50 ------ libcxx/docs/Status/Cxx14Issues.csv | 157 ------------------- libcxx/docs/Status/Cxx14Papers.csv | 32 ---- libcxx/docs/index.rst | 3 +- libcxx/utils/synchronize_csv_status_files.py | 2 - 5 files changed, 1 insertion(+), 243 deletions(-) delete mode 100644 libcxx/docs/Status/Cxx14.rst delete mode 100644 libcxx/docs/Status/Cxx14Issues.csv delete mode 100644 libcxx/docs/Status/Cxx14Papers.csv diff --git a/libcxx/docs/Status/Cxx14.rst b/libcxx/docs/Status/Cxx14.rst deleted file mode 100644 index 0557bdc285d707..00000000000000 --- a/libcxx/docs/Status/Cxx14.rst +++ /dev/null @@ -1,50 +0,0 @@ -.. _cxx14-status: - -================================ -libc++ C++14 Status -================================ - -.. include:: ../Helpers/Styles.rst - -.. contents:: - :local: - - -Overview -================================ - -In April 2013, the C++ standard committee approved the draft for the next version of the C++ standard, initially known as "C++1y". - -The draft standard includes papers and issues that were voted on at the previous three meetings (Kona, Portland, and Bristol). - -In August 2014, this draft was approved by ISO as C++14. - -This page shows the status of libc++; the status of clang's support of the language features is `here `__. - -The groups that have contributed papers: - -- CWG - Core Language Working group -- LWG - Library working group -- SG1 - Study group #1 (Concurrency working group) - - -.. _paper-status-cxx14: - -Paper Status -==================================== - -.. csv-table:: - :file: Cxx14Papers.csv - :header-rows: 1 - :widths: auto - - -.. _issues-status-cxx14: - -Library Working Group Issues Status -==================================== - -.. 
csv-table:: - :file: Cxx14Issues.csv - :header-rows: 1 - :widths: auto diff --git a/libcxx/docs/Status/Cxx14Issues.csv b/libcxx/docs/Status/Cxx14Issues.csv deleted file mode 100644 index aff88b89774e48..00000000000000 --- a/libcxx/docs/Status/Cxx14Issues.csv +++ /dev/null @@ -1,157 +0,0 @@ -"Issue #","Issue Name","Meeting","Status","First released version","Labels" -"`LWG1214 `__","Insufficient/inconsistent key immutability requirements for associative containers","2012-02 (Kona)","|Complete|","","" -"`LWG2009 `__","Reporting out-of-bound values on numeric string conversions","2012-02 (Kona)","|Complete|","","" -"`LWG2010 `__","``is_*``\ traits for binding operations can't be meaningfully specialized","2012-02 (Kona)","|Complete|","","" -"`LWG2015 `__","Incorrect pre-conditions for some type traits","2012-02 (Kona)","|Complete|","","" -"`LWG2021 `__","Further incorrect usages of result_of","2012-02 (Kona)","|Complete|","","" -"`LWG2028 `__","messages_base::catalog overspecified","2012-02 (Kona)","|Complete|","","" -"`LWG2033 `__","Preconditions of reserve, shrink_to_fit, and resize functions","2012-02 (Kona)","|Complete|","","" -"`LWG2039 `__","Issues with std::reverse and std::copy_if","2012-02 (Kona)","|Complete|","","" -"`LWG2044 `__","No definition of ""Stable"" for copy algorithms","2012-02 (Kona)","|Complete|","","" -"`LWG2045 `__","forward_list::merge and forward_list::splice_after with unequal allocators","2012-02 (Kona)","|Complete|","","" -"`LWG2047 `__","Incorrect ""mixed"" move-assignment semantics of unique_ptr","2012-02 (Kona)","|Complete|","","" -"`LWG2050 `__","Unordered associative containers do not use allocator_traits to define member types","2012-02 (Kona)","|Complete|","","" -"`LWG2053 `__","Errors in regex bitmask types","2012-02 (Kona)","|Complete|","","" -"`LWG2061 `__","make_move_iterator and arrays","2012-02 (Kona)","|Complete|","","" -"`LWG2064 `__","More noexcept issues in basic_string","2012-02 (Kona)","|Complete|","","" -"`LWG2065 
`__","Minimal allocator interface","2012-02 (Kona)","|Complete|","","" -"`LWG2067 `__","packaged_task should have deleted copy c'tor with const parameter","2012-02 (Kona)","|Complete|","","" -"`LWG2069 `__","Inconsistent exception spec for basic_string move constructor","2012-02 (Kona)","|Complete|","","" -"`LWG2096 `__","Incorrect constraints of future::get in regard to MoveAssignable","2012-02 (Kona)","|Complete|","","" -"`LWG2102 `__","Why is std::launch an implementation-defined type?","2012-02 (Kona)","|Complete|","","" -"","","","","","" -"`LWG2071 `__","std::valarray move-assignment","2012-10 (Portland)","|Complete|","","" -"`LWG2074 `__","Off by one error in std::reverse_copy","2012-10 (Portland)","|Complete|","","" -"`LWG2081 `__","Allocator requirements should include CopyConstructible","2012-10 (Portland)","|Complete|","","" -"`LWG2083 `__","const-qualification on weak_ptr::owner_before","2012-10 (Portland)","|Complete|","","" -"`LWG2086 `__","Overly generic type support for math functions","2012-10 (Portland)","|Complete|","","" -"`LWG2099 `__","Unnecessary constraints of va_start() usage","2012-10 (Portland)","|Complete|","","" -"`LWG2103 `__","std::allocator_traits>::propagate_on_container_move_assignment","2012-10 (Portland)","|Complete|","","" -"`LWG2105 `__","Inconsistent requirements on ``const_iterator``'s value_type","2012-10 (Portland)","|Complete|","","" -"`LWG2110 `__","remove can't swap but note says it might","2012-10 (Portland)","|Complete|","","" -"`LWG2123 `__","merge() allocator requirements for lists versus forward lists","2012-10 (Portland)","|Complete|","","" -"`LWG2005 `__","unordered_map::insert(T&&) protection should apply to map too","2012-10 (Portland)","|Complete|","","" -"`LWG2011 `__","Unexpected output required of strings","2012-10 (Portland)","|Complete|","","" -"`LWG2048 `__","Unnecessary mem_fn overloads","2012-10 (Portland)","|Complete|","","" -"`LWG2049 `__","``is_destructible``\ is underspecified","2012-10 
(Portland)","|Complete|","","" -"`LWG2056 `__","future_errc enums start with value 0 (invalid value for broken_promise)","2012-10 (Portland)","|Complete|","","" -"`LWG2058 `__","valarray and begin/end","2012-10 (Portland)","|Complete|","","" -"","","","","","" -"`LWG2091 `__","Misplaced effect in m.try_lock_for()","2013-04 (Bristol)","|Complete|","","" -"`LWG2092 `__","Vague Wording for condition_variable_any","2013-04 (Bristol)","|Complete|","","" -"`LWG2093 `__","Throws clause of condition_variable::wait with predicate","2013-04 (Bristol)","|Complete|","","" -"`LWG2094 `__","duration conversion overflow shouldn't participate in overload resolution","2013-04 (Bristol)","|Complete|","","" -"`LWG2122 `__","merge() stability for lists versus forward lists","2013-04 (Bristol)","|Complete|","","" -"`LWG2128 `__","Absence of global functions cbegin/cend","2013-04 (Bristol)","|Complete|","","" -"`LWG2145 `__","error_category default constructor","2013-04 (Bristol)","|Complete|","","" -"`LWG2147 `__","Unclear hint type in Allocator's allocate function","2013-04 (Bristol)","|Complete|","","" -"`LWG2148 `__","Hashing enums should be supported directly by std::hash","2013-04 (Bristol)","|Complete|","","" -"`LWG2149 `__","Concerns about 20.8/5","2013-04 (Bristol)","|Complete|","","" -"`LWG2162 `__","allocator_traits::max_size missing noexcept","2013-04 (Bristol)","|Complete|","","" -"`LWG2163 `__","nth_element requires inconsistent post-conditions","2013-04 (Bristol)","|Complete|","","" -"`LWG2169 `__","Missing reset() requirements in unique_ptr specialization","2013-04 (Bristol)","|Complete|","","" -"`LWG2172 `__","Does ``atomic_compare_exchange_*``\ accept v == nullptr arguments?","2013-04 (Bristol)","|Complete|","","" -"`LWG2080 `__","Specify when once_flag becomes invalid","2013-04 (Bristol)","|Complete|","","" -"`LWG2098 `__","promise throws clauses","2013-04 (Bristol)","|Complete|","","" -"`LWG2109 `__","Incorrect requirements for hash specializations","2013-04 
(Bristol)","|Complete|","","" -"`LWG2130 `__","missing ordering constraints for fences","2013-04 (Bristol)","|Complete|","","" -"`LWG2138 `__","atomic_flag::clear ordering constraints","2013-04 (Bristol)","|Complete|","","" -"`LWG2140 `__","notify_all_at_thread_exit synchronization","2013-04 (Bristol)","|Complete|","","" -"`LWG2144 `__","Missing noexcept specification in type_index","2013-04 (Bristol)","|Complete|","","" -"`LWG2174 `__","wstring_convert::converted() should be noexcept","2013-04 (Bristol)","|Complete|","","" -"`LWG2175 `__","string_convert and wbuffer_convert validity","2013-04 (Bristol)","|Complete|","","" -"`LWG2176 `__","Special members for wstring_convert and wbuffer_convert","2013-04 (Bristol)","|Complete|","","" -"`LWG2177 `__","Requirements on Copy/MoveInsertable","2013-04 (Bristol)","|Complete|","","" -"`LWG2185 `__","Missing throws clause for future/shared_future::wait_for/wait_until","2013-04 (Bristol)","|Complete|","","" -"`LWG2187 `__","vector is missing emplace and emplace_back member functions","2013-04 (Bristol)","|Complete|","","" -"`LWG2190 `__","ordering of condition variable operations, reflects Posix discussion","2013-04 (Bristol)","|Complete|","","" -"`LWG2196 `__","Specification of ``is_*[copy/move]_[constructible/assignable]``\ unclear for non-referencable types","2013-04 (Bristol)","|Complete|","","" -"`LWG2197 `__","Specification of ``is_[un]signed``\ unclear for non-arithmetic types","2013-04 (Bristol)","|Complete|","","" -"`LWG2200 `__","Data race avoidance for all containers, not only for sequences","2013-04 (Bristol)","|Complete|","","" -"`LWG2203 `__","scoped_allocator_adaptor uses wrong argument types for piecewise construction","2013-04 (Bristol)","|Complete|","","" -"`LWG2207 `__","basic_string::at should not have a Requires clause","2013-04 (Bristol)","|Complete|","","" -"`LWG2209 `__","assign() overspecified for sequence containers","2013-04 (Bristol)","|Complete|","","" -"`LWG2210 `__","Missing allocator-extended 
constructor for allocator-aware containers","2013-04 (Bristol)","|Complete|","","" -"`LWG2211 `__","Replace ambiguous use of ""Allocator"" in container requirements","2013-04 (Bristol)","|Complete|","","" -"`LWG2222 `__","Inconsistency in description of forward_list::splice_after single-element overload","2013-04 (Bristol)","|Complete|","","" -"`LWG2225 `__","Unrealistic header inclusion checks required","2013-04 (Bristol)","|Complete|","","" -"`LWG2229 `__","Standard code conversion facets underspecified","2013-04 (Bristol)","|Complete|","","" -"`LWG2231 `__","DR 704 removes complexity guarantee for clear()","2013-04 (Bristol)","|Complete|","","" -"`LWG2235 `__","Undefined behavior without proper requirements on basic_string constructors","2013-04 (Bristol)","|Complete|","","" -"","","","","","" -"`LWG2141 `__","common_type trait produces reference types","2013-09 (Chicago)","|Complete|","","" -"`LWG2246 `__","unique_ptr assignment effects w.r.t. deleter","2013-09 (Chicago)","|Complete|","","" -"`LWG2247 `__","Type traits and std::nullptr_t","2013-09 (Chicago)","|Complete|","","" -"`LWG2085 `__","Wrong description of effect 1 of basic_istream::ignore","2013-09 (Chicago)","|Complete|","","" -"`LWG2087 `__","iostream_category() and noexcept","2013-09 (Chicago)","|Complete|","","" -"`LWG2143 `__","ios_base::xalloc should be thread-safe","2013-09 (Chicago)","|Complete|","","" -"`LWG2150 `__","Unclear specification of find_end","2013-09 (Chicago)","|Complete|","","" -"`LWG2180 `__","Exceptions from std::seed_seq operations","2013-09 (Chicago)","|Complete|","","" -"`LWG2194 `__","Impossible container requirements for adaptor types","2013-09 (Chicago)","|Complete|","","" -"`LWG2013 `__","Do library implementers have the freedom to add constexpr?","2013-09 (Chicago)","|Complete|","","" -"`LWG2018 `__","regex_traits::isctype Returns clause is wrong","2013-09 (Chicago)","|Complete|","","" -"`LWG2078 `__","Throw specification of async() incomplete","2013-09 
(Chicago)","|Complete|","","" -"`LWG2097 `__","packaged_task constructors should be constrained","2013-09 (Chicago)","|Complete|","","" -"`LWG2100 `__","Timed waiting functions cannot timeout if launch::async policy used","2013-09 (Chicago)","|Complete|","","" -"`LWG2120 `__","What should async do if neither 'async' nor 'deferred' is set in policy?","2013-09 (Chicago)","|Complete|","","" -"`LWG2159 `__","atomic_flag initialization","2013-09 (Chicago)","|Complete|","","" -"`LWG2275 `__","Why is forward_as_tuple not constexpr?","2013-09 (Chicago)","|Complete|","","" -"`LWG2284 `__","Inconsistency in allocator_traits::max_size","2013-09 (Chicago)","|Complete|","","" -"`LWG2298 `__","``is_nothrow_constructible``\ is always false because of create<>","2013-09 (Chicago)","|Complete|","","" -"`LWG2300 `__","Redundant sections for map and multimap members should be removed","2013-09 (Chicago)","|Complete|","","" -"`LWG2249 `__","NB comment GB9: Remove gets from C++14","2013-09 (Chicago)","|Complete|","","" -"","","","","","" -"`LWG2135 `__","Unclear requirement for exceptions thrown in condition_variable::wait()","2014-02 (Issaquah)","|Complete|","","" -"`LWG2291 `__","std::hash is vulnerable to collision DoS attack","2014-02 (Issaquah)","|Complete|","","" -"`LWG2142 `__","packaged_task::operator() synchronization too broad?","2014-02 (Issaquah)","|Complete|","","" -"`LWG2240 `__","Probable misuse of term ""function scope"" in [thread.condition]","2014-02 (Issaquah)","|Complete|","","" -"`LWG2252 `__","Strong guarantee on vector::push_back() still broken with C++11?","2014-02 (Issaquah)","|Complete|","","" -"`LWG2257 `__","Simplify container requirements with the new algorithms","2014-02 (Issaquah)","|Complete|","","" -"`LWG2268 `__","Setting a default argument in the declaration of a member function assign of std::basic_string","2014-02 (Issaquah)","|Complete|","","" -"`LWG2271 `__","regex_traits::lookup_classname specification unclear","2014-02 
(Issaquah)","|Complete|","","" -"`LWG2272 `__","quoted should use char_traits::eq for character comparison","2014-02 (Issaquah)","|Complete|","","" -"`LWG2278 `__","User-defined literals for Standard Library types","2014-02 (Issaquah)","|Complete|","","" -"`LWG2280 `__","begin / end for arrays should be constexpr and noexcept","2014-02 (Issaquah)","|Complete|","","" -"`LWG2285 `__","make_reverse_iterator","2014-02 (Issaquah)","|Complete|","","" -"`LWG2299 `__","Effects of inaccessible ``key_compare::is_transparent``\ type are not clear","2014-02 (Issaquah)","|Complete|","","" -"`LWG1450 `__","Contradiction in regex_constants","2014-02 (Issaquah)","|Complete|","","" -"`LWG2003 `__","String exception inconsistency in erase.","2014-02 (Issaquah)","|Complete|","","" -"`LWG2112 `__","User-defined classes that cannot be derived from","2014-02 (Issaquah)","|Complete|","","" -"`LWG2132 `__","std::function ambiguity","2014-02 (Issaquah)","|Complete|","","" -"`LWG2182 `__","``Container::[const_]reference`` types are misleadingly specified","2014-02 (Issaquah)","|Complete|","","" -"`LWG2188 `__","Reverse iterator does not fully support targets that overload operator&","2014-02 (Issaquah)","|Complete|","","" -"`LWG2193 `__","Default constructors for standard library containers are explicit","2014-02 (Issaquah)","|Complete|","","" -"`LWG2205 `__","Problematic postconditions of regex_match and regex_search","2014-02 (Issaquah)","|Complete|","","" -"`LWG2213 `__","Return value of std::regex_replace","2014-02 (Issaquah)","|Complete|","","" -"`LWG2258 `__","a.erase(q1, q2) unable to directly return q2","2014-02 (Issaquah)","|Complete|","","" -"`LWG2263 `__","Comparing iterators and allocator pointers with different const-character","2014-02 (Issaquah)","|Complete|","","" -"`LWG2293 `__","Wrong facet used by num_put::do_put","2014-02 (Issaquah)","|Complete|","","" -"`LWG2301 `__","Why is std::tie not constexpr?","2014-02 (Issaquah)","|Complete|","","" -"`LWG2304 `__","Complexity of 
count in unordered associative containers","2014-02 (Issaquah)","|Complete|","","" -"`LWG2306 `__","match_results::reference should be value_type&, not const value_type&","2014-02 (Issaquah)","|Complete|","","" -"`LWG2308 `__","Clarify container destructor requirements w.r.t. std::array","2014-02 (Issaquah)","|Complete|","","" -"`LWG2313 `__","tuple_size should always derive from integral_constant","2014-02 (Issaquah)","|Complete|","","" -"`LWG2314 `__","apply() should return decltype(auto) and use decay_t before tuple_size","2014-02 (Issaquah)","|Complete|","","" -"`LWG2315 `__","weak_ptr should be movable","2014-02 (Issaquah)","|Complete|","","" -"`LWG2316 `__","weak_ptr::lock() should be atomic","2014-02 (Issaquah)","|Complete|","","" -"`LWG2317 `__","The type property queries should be UnaryTypeTraits returning size_t","2014-02 (Issaquah)","|Complete|","","" -"`LWG2320 `__","select_on_container_copy_construction() takes allocators, not containers","2014-02 (Issaquah)","|Complete|","","" -"`LWG2322 `__","Associative(initializer_list, stuff) constructors are underspecified","2014-02 (Issaquah)","|Complete|","","" -"`LWG2323 `__","vector::resize(n, t)'s specification should be simplified","2014-02 (Issaquah)","|Complete|","","" -"`LWG2324 `__","Insert iterator constructors should use addressof()","2014-02 (Issaquah)","|Complete|","","" -"`LWG2329 `__","regex_match()/regex_search() with match_results should forbid temporary strings","2014-02 (Issaquah)","|Complete|","","" -"`LWG2330 `__","regex(""meow"", regex::icase) is technically forbidden but should be permitted","2014-02 (Issaquah)","|Complete|","","" -"`LWG2332 `__","regex_iterator/regex_token_iterator should forbid temporary regexes","2014-02 (Issaquah)","|Complete|","","" -"`LWG2339 `__","Wording issue in nth_element","2014-02 (Issaquah)","|Complete|","","" -"`LWG2341 `__","Inconsistency between basic_ostream::seekp(pos) and basic_ostream::seekp(off, dir)","2014-02 (Issaquah)","|Complete|","","" -"`LWG2344 
`__","quoted()'s interaction with padding is unclear","2014-02 (Issaquah)","|Complete|","","" -"`LWG2346 `__","integral_constant's member functions should be marked noexcept","2014-02 (Issaquah)","|Complete|","","" -"`LWG2350 `__","min, max, and minmax should be constexpr","2014-02 (Issaquah)","|Complete|","","" -"`LWG2356 `__","Stability of erasure in unordered associative containers","2014-02 (Issaquah)","|Complete|","","" -"`LWG2357 `__","Remaining ""Assignable"" requirement","2014-02 (Issaquah)","|Complete|","","" -"`LWG2359 `__","How does regex_constants::nosubs affect basic_regex::mark_count()?","2014-02 (Issaquah)","|Complete|","","" -"`LWG2360 `__","``reverse_iterator::operator*()``\ is unimplementable","2014-02 (Issaquah)","|Complete|","","" -"`LWG2104 `__","unique_lock move-assignment should not be noexcept","2014-02 (Issaquah)","|Complete|","","" -"`LWG2186 `__","Incomplete action on async/launch::deferred","2014-02 (Issaquah)","|Complete|","","" -"`LWG2075 `__","Progress guarantees, lock-free property, and scheduling assumptions","2014-02 (Issaquah)","|Complete|","","" -"`LWG2288 `__","Inconsistent requirements for shared mutexes","2014-02 (Issaquah)","|Complete|","","" diff --git a/libcxx/docs/Status/Cxx14Papers.csv b/libcxx/docs/Status/Cxx14Papers.csv deleted file mode 100644 index 3dc670ca0a5dc4..00000000000000 --- a/libcxx/docs/Status/Cxx14Papers.csv +++ /dev/null @@ -1,32 +0,0 @@ -"Paper #","Paper Name","Meeting","Status","First released version","Labels" -"`N3346 `__","Terminology for Container Element Requirements - Rev 1","2012-02 (Kona)","|Complete|","3.4","" -"","","","","","" -"`N3421 `__","Making Operator Functors greater<>","2012-10 (Portland)","|Complete|","3.4","" -"`N3462 `__","std::result_of and SFINAE","2012-10 (Portland)","|Complete|","3.4","" -"`N3469 `__","Constexpr Library Additions: chrono, v3","2012-10 (Portland)","|Complete|","3.4","" -"`N3470 `__","Constexpr Library Additions: containers, v2","2012-10 
(Portland)","|Complete|","3.4","" -"`N3471 `__","Constexpr Library Additions: utilities, v3","2012-10 (Portland)","|Complete|","3.4","" -"`N3302 `__","Constexpr Library Additions: complex, v2","2012-10 (Portland)","|Complete|","3.4","" -"","","","","","" -"`N3545 `__","An Incremental Improvement to integral_constant","2013-04 (Bristol)","|Complete|","3.4","" -"`N3644 `__","Null Forward Iterators","2013-04 (Bristol)","|Complete|","3.4","" -"`N3668 `__","std::exchange()","2013-04 (Bristol)","|Complete|","3.4","" -"`N3658 `__","Compile-time integer sequences","2013-04 (Bristol)","|Complete|","3.4","" -"`N3670 `__","Addressing Tuples by Type","2013-04 (Bristol)","|Complete|","3.4","" -"`N3671 `__","Making non-modifying sequence operations more robust","2013-04 (Bristol)","|Complete|","3.4","" -"`N3656 `__","make_unique","2013-04 (Bristol)","|Complete|","3.4","" -"`N3654 `__","Quoted Strings","2013-04 (Bristol)","|Complete|","3.4","" -"`N3642 `__","User-defined Literals","2013-04 (Bristol)","|Complete|","3.4","" -"`N3655 `__","TransformationTraits Redux (excluding part 4)","2013-04 (Bristol)","|Complete|","3.4","" -"`N3657 `__","Adding heterogeneous comparison lookup to associative containers","2013-04 (Bristol)","|Complete|","3.4","" -"`N3672 `__","A proposal to add a utility class to represent optional objects","2013-04 (Bristol)","*Removed from Draft Standard*","n/a","" -"`N3669 `__","Fixing constexpr member functions without const","2013-04 (Bristol)","|Complete|","3.4","" -"`N3662 `__","C++ Dynamic Arrays (dynarray)","2013-04 (Bristol)","*Removed from Draft Standard*","n/a","" -"`N3659 `__","Shared Locking in C++","2013-04 (Bristol)","|Complete|","3.4","" -"","","","","","" -"`N3779 `__","User-defined Literals for std::complex","2013-09 (Chicago)","|Complete|","3.4","" -"`N3789 `__","Constexpr Library Additions: functional","2013-09 (Chicago)","|Complete|","3.4","" -"","","","","","" -"`N3924 `__","Discouraging rand() in C++14","2014-02 
(Issaquah)","|Complete|","3.5","" -"`N3887 `__","Consistent Metafunction Aliases","2014-02 (Issaquah)","|Complete|","3.5","" -"`N3891 `__","A proposal to rename shared_mutex to shared_timed_mutex","2014-02 (Issaquah)","|Complete|","3.5","" diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index 4bca3ccc8fa063..c3b724568bc51e 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -43,7 +43,6 @@ Getting Started with libc++ Modules Hardening ReleaseProcedure - Status/Cxx14 Status/Cxx17 Status/Cxx20 Status/Cxx23 @@ -173,7 +172,7 @@ C++ Dialect Support =================== * C++11 - Complete -* :ref:`C++14 - Complete ` +* C++14 - Complete * :ref:`C++17 - In Progress ` * :ref:`C++20 - In Progress ` * :ref:`C++23 - In Progress ` diff --git a/libcxx/utils/synchronize_csv_status_files.py b/libcxx/utils/synchronize_csv_status_files.py index 9228fc6ed20198..8c1e8cea0f394d 100755 --- a/libcxx/utils/synchronize_csv_status_files.py +++ b/libcxx/utils/synchronize_csv_status_files.py @@ -204,8 +204,6 @@ def sync_csv(rows: List[Tuple], from_github: List[PaperInfo]) -> List[Tuple]: return results CSV_FILES_TO_SYNC = [ - 'Cxx14Issues.csv', - 'Cxx14Papers.csv', 'Cxx17Issues.csv', 'Cxx17Papers.csv', 'Cxx20Issues.csv', From bf71c64839c0082e761a4f070ed92e01ced0187c Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Wed, 21 Aug 2024 16:28:25 +0200 Subject: [PATCH 073/426] Speculative fix for asan/TestCases/Darwin/cstring_section.c It's been failing since https://green.lab.llvm.org/job/llvm.org/job/clang-stage1-RA/1812 It seems __TEXT,__cstring now comes before __TEXT,__const. 
--- compiler-rt/test/asan/TestCases/Darwin/cstring_section.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/test/asan/TestCases/Darwin/cstring_section.c b/compiler-rt/test/asan/TestCases/Darwin/cstring_section.c index d72b0ba8a8bb33..e40c4b1b8ed6ba 100644 --- a/compiler-rt/test/asan/TestCases/Darwin/cstring_section.c +++ b/compiler-rt/test/asan/TestCases/Darwin/cstring_section.c @@ -6,10 +6,10 @@ // Check that "Hello.\n" is in __asan_cstring and not in __cstring. // CHECK: Contents of section {{.*}}__asan_cstring: // CHECK: 48656c6c {{.*}} Hello. -// CHECK: Contents of section {{.*}}__const: -// CHECK-NOT: 48656c6c {{.*}} Hello. // CHECK: Contents of section {{.*}}__cstring: // CHECK-NOT: 48656c6c {{.*}} Hello. +// CHECK: Contents of section {{.*}}__const: +// CHECK-NOT: 48656c6c {{.*}} Hello. int main(int argc, char *argv[]) { argv[0] = "Hello.\n"; From 8d4891591fb41780c2af6e18abd590faf1f5626c Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 21 Aug 2024 10:35:10 -0400 Subject: [PATCH 074/426] [gn] port 7ad7f8f7a3d4 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index cc759d2337516d..f49c964b4128fb 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -35,6 +35,7 @@ if (current_toolchain == default_toolchain) { "_LIBCPP_HAS_NO_UNICODE=", "_LIBCPP_HAS_NO_WIDE_CHARACTERS=", "_LIBCPP_HAS_NO_STD_MODULES=", + "_LIBCPP_HAS_NO_TERMINAL=", "_LIBCPP_INSTRUMENTED_WITH_ASAN=", "_LIBCPP_ABI_DEFINES=", "_LIBCPP_HARDENING_MODE_DEFAULT=_LIBCPP_HARDENING_MODE_NONE", From f0a3f8a370e3c85ee00cbc5e5d1c29e8ad3c51da Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 21 Aug 2024 08:54:27 -0400 Subject: [PATCH 075/426] [libc++] Enable C++23 and C++26 issues to be synchronized As a drive-by, also switch to printing dangling issues 
instead of killing the script, since those can be fairly common. --- libcxx/utils/synchronize_csv_status_files.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libcxx/utils/synchronize_csv_status_files.py b/libcxx/utils/synchronize_csv_status_files.py index 8c1e8cea0f394d..68df5756e884d6 100755 --- a/libcxx/utils/synchronize_csv_status_files.py +++ b/libcxx/utils/synchronize_csv_status_files.py @@ -182,7 +182,8 @@ def sync_csv(rows: List[Tuple], from_github: List[PaperInfo]) -> List[Tuple]: if paper.is_implemented(): dangling = [gh for gh in from_github if gh.paper_number == paper.paper_number and not gh.is_implemented()] if dangling: - raise RuntimeError(f"We found the following open tracking issues for a row which is already marked as implemented:\nrow: {row}\ntracking issues: {dangling}") + print(f"We found the following open tracking issues for a row which is already marked as implemented:\nrow: {row}\ntracking issues: {dangling}") + print("The Github issue should be closed if the work has indeed been done.") results.append(paper.for_printing()) else: # Find any Github issues tracking this paper @@ -208,11 +209,10 @@ def sync_csv(rows: List[Tuple], from_github: List[PaperInfo]) -> List[Tuple]: 'Cxx17Papers.csv', 'Cxx20Issues.csv', 'Cxx20Papers.csv', - # TODO: The Github issues are not created yet. - # 'Cxx23Issues.csv', - # 'Cxx23Papers.csv', - # 'Cxx2cIssues.csv', - # 'Cxx2cPapers.csv', + 'Cxx23Issues.csv', + 'Cxx23Papers.csv', + 'Cxx2cIssues.csv', + 'Cxx2cPapers.csv', ] def main(): From ddb5480e6799d0de72c2cd34c1e7f9ffd154e660 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Wed, 21 Aug 2024 10:47:36 -0400 Subject: [PATCH 076/426] [AMDGPU][True16][MC] added VOPC realtrue/faketrue flag and fake16 instructions (#104739) VOPC instructions were defined with HasTrue16BitInst flag while these true16 instructions are actually implemented with fake16 profile. 
Seperate them to true16 version and fake16 version by adding UseRealTrue16 and UseFakeTrue16 flag and fake16 instructions. The code default to use fake16. This is preparing for the upcoming changes in MC to support realtrue 16bit operands and vdst. The true16 and fake16 profile will be modified in the later patches. --- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 149 ++++++++++++- .../GlobalISel/inst-select-fcmp.s16.mir | 200 ++++++++++-------- 2 files changed, 256 insertions(+), 93 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 62ca6261c47c80..be862b44917e15 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -87,6 +87,17 @@ class VOPC_Profile sched, ValueType vt0, ValueType vt1 = vt multiclass VOPC_Profile_t16 sched, ValueType vt0, ValueType vt1 = vt0> { def NAME : VOPC_Profile; def _t16 : VOPC_Profile { + let IsTrue16 = 1; + let IsRealTrue16 = 1; + let Src1RC32 = getVregSrcForVT.ret; + let Src0DPP = getVregSrcForVT.ret; + let Src1DPP = getVregSrcForVT.ret; + let Src2DPP = getVregSrcForVT.ret; + let Src0ModDPP = getSrcModDPP_t16.ret; + let Src1ModDPP = getSrcModDPP_t16.ret; + let Src2ModDPP = getSrcModDPP_t16.ret; + } + def _fake16: VOPC_Profile { let IsTrue16 = 1; let Src1RC32 = getVregSrcForVT.ret; let Src0DPP = getVregSrcForVT.ret; @@ -117,6 +128,17 @@ class VOPC_NoSdst_Profile sched, ValueType vt0, multiclass VOPC_NoSdst_Profile_t16 sched, ValueType vt0, ValueType vt1 = vt0> { def NAME : VOPC_NoSdst_Profile; def _t16 : VOPC_NoSdst_Profile { + let IsTrue16 = 1; + let IsRealTrue16 = 1; + let Src1RC32 = getVregSrcForVT.ret; + let Src0DPP = getVregSrcForVT.ret; + let Src1DPP = getVregSrcForVT.ret; + let Src2DPP = getVregSrcForVT.ret; + let Src0ModDPP = getSrcModDPP_t16.ret; + let Src1ModDPP = getSrcModDPP_t16.ret; + let Src2ModDPP = getSrcModDPP_t16.ret; + } + def _fake16 : VOPC_NoSdst_Profile { let IsTrue16 = 1; let Src1RC32 = 
getVregSrcForVT.ret; let Src0DPP = getVregSrcForVT.ret; @@ -412,9 +434,12 @@ multiclass VOPC_F16 ; } - let OtherPredicates = [HasTrue16BitInsts] in { + let True16Predicate = UseRealTrue16Insts in { defm _t16 : VOPC_Pseudos ; } + let True16Predicate = UseFakeTrue16Insts in { + defm _fake16 : VOPC_Pseudos ; + } } multiclass VOPC_F32 : @@ -428,9 +453,12 @@ multiclass VOPC_I16 ; } - let OtherPredicates = [HasTrue16BitInsts] in { + let True16Predicate = UseRealTrue16Insts in { defm _t16 : VOPC_Pseudos ; } + let True16Predicate = UseFakeTrue16Insts in { + defm _fake16 : VOPC_Pseudos ; + } } multiclass VOPC_I32 : @@ -445,9 +473,12 @@ multiclass VOPCX_F16 { let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { defm NAME : VOPCX_Pseudos ; } - let OtherPredicates = [HasTrue16BitInsts] in { + let True16Predicate = UseRealTrue16Insts in { defm _t16 : VOPCX_Pseudos ; } + let True16Predicate = UseFakeTrue16Insts in { + defm _fake16 : VOPCX_Pseudos ; + } } multiclass VOPCX_F32 : @@ -460,9 +491,12 @@ multiclass VOPCX_I16 { let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { defm NAME : VOPCX_Pseudos ; } - let OtherPredicates = [HasTrue16BitInsts] in { + let True16Predicate = UseRealTrue16Insts in { defm _t16 : VOPCX_Pseudos ; } + let True16Predicate = UseFakeTrue16Insts in { + defm _fake16 : VOPCX_Pseudos ; + } } multiclass VOPCX_I32 : @@ -795,6 +829,18 @@ class VOPC_Class_Profile sched, ValueType src0VT, ValueType multiclass VOPC_Class_Profile_t16 sched> { def NAME : VOPC_Class_Profile; def _t16 : VOPC_Class_Profile { + let IsTrue16 = 1; + let IsRealTrue16 = 1; + let Src1RC32 = getVregSrcForVT.ret; + let Src1RC64 = VSrc_b32; + let Src0DPP = getVregSrcForVT.ret; + let Src1DPP = getVregSrcForVT.ret; + let Src2DPP = getVregSrcForVT.ret; + let Src0ModDPP = getSrcModDPP_t16.ret; + let Src1ModDPP = getSrcModDPP_t16.ret; + let Src2ModDPP = getSrcModDPP_t16.ret; + } + def _fake16 : VOPC_Class_Profile { let IsTrue16 = 1; let 
Src1RC32 = getVregSrcForVT.ret; let Src1RC64 = VSrc_b32; @@ -822,6 +868,18 @@ class VOPC_Class_NoSdst_Profile sched, ValueType src0VT, Va multiclass VOPC_Class_NoSdst_Profile_t16 sched> { def NAME : VOPC_Class_NoSdst_Profile; def _t16 : VOPC_Class_NoSdst_Profile { + let IsTrue16 = 1; + let IsRealTrue16 = 1; + let Src1RC32 = getVregSrcForVT.ret; + let Src1RC64 = VSrc_b32; + let Src0DPP = getVregSrcForVT.ret; + let Src1DPP = getVregSrcForVT.ret; + let Src2DPP = getVregSrcForVT.ret; + let Src0ModDPP = getSrcModDPP_t16.ret; + let Src1ModDPP = getSrcModDPP_t16.ret; + let Src2ModDPP = getSrcModDPP_t16.ret; + } + def _fake16 : VOPC_Class_NoSdst_Profile { let IsTrue16 = 1; let Src1RC32 = getVregSrcForVT.ret; let Src1RC64 = VSrc_b32; @@ -948,18 +1006,24 @@ multiclass VOPC_CLASS_F16 { let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { defm NAME : VOPC_Class_Pseudos ; } - let OtherPredicates = [HasTrue16BitInsts] in { + let OtherPredicates = [UseRealTrue16Insts] in { defm _t16 : VOPC_Class_Pseudos ; } + let OtherPredicates = [UseFakeTrue16Insts] in { + defm _fake16 : VOPC_Class_Pseudos ; + } } multiclass VOPCX_CLASS_F16 { let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { defm NAME : VOPCX_Class_Pseudos ; } - let OtherPredicates = [HasTrue16BitInsts] in { + let OtherPredicates = [UseRealTrue16Insts] in { defm _t16 : VOPCX_Class_Pseudos ; } + let OtherPredicates = [UseFakeTrue16Insts] in { + defm _fake16 : VOPCX_Class_Pseudos ; + } } multiclass VOPC_CLASS_F32 : @@ -1401,7 +1465,7 @@ multiclass VOPC_Real_with_name op, string OpName, pseudo_mnemonic), asm_name, ps64.AsmVariantName>; - let DecoderNamespace = Gen.DecoderNamespace in { + let DecoderNamespace = Gen.DecoderNamespace # !if(ps32.Pfl.IsRealTrue16, "", "_FAKE16") in { def _e32#Gen.Suffix : // 32 and 64 bit forms of the instruction have _e32 and _e64 // respectively appended to their assembly mnemonic. 
@@ -1530,7 +1594,7 @@ multiclass VOPCX_Real_with_name op, string OpName, pseudo_mnemonic), asm_name, ps64.AsmVariantName>; - let DecoderNamespace = Gen.DecoderNamespace in { + let DecoderNamespace = Gen.DecoderNamespace # !if(ps32.Pfl.IsRealTrue16, "", "_FAKE16") in { def _e32#Gen.Suffix : VOPC_Real, VOPCe { @@ -1623,7 +1687,25 @@ defm V_CMP_NGT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00b, "v_cmp_ngt_f16">; defm V_CMP_NLE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00c, "v_cmp_nle_f16">; defm V_CMP_NEQ_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00d, "v_cmp_neq_f16">; defm V_CMP_NLT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00e, "v_cmp_nlt_f16">; -defm V_CMP_T_F16_t16 : VOPC_Real_with_name_gfx11<0x00f, "V_CMP_TRU_F16_t16", "v_cmp_t_f16", "v_cmp_tru_f16">; +defm V_CMP_T_F16_t16 : VOPC_Real_t16_gfx11<0x00f, "v_cmp_t_f16", "V_CMP_TRU_F16_t16", "v_cmp_tru_f16">; + +defm V_CMP_F_F16_fake16 : VOPC_Real_t16_gfx11<0x000, "v_cmp_f_f16">; +defm V_CMP_LT_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x001, "v_cmp_lt_f16">; +defm V_CMP_EQ_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x002, "v_cmp_eq_f16">; +defm V_CMP_LE_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x003, "v_cmp_le_f16">; +defm V_CMP_GT_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x004, "v_cmp_gt_f16">; +defm V_CMP_LG_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x005, "v_cmp_lg_f16">; +defm V_CMP_GE_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x006, "v_cmp_ge_f16">; +defm V_CMP_O_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x007, "v_cmp_o_f16">; +defm V_CMP_U_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x008, "v_cmp_u_f16">; +defm V_CMP_NGE_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x009, "v_cmp_nge_f16">; +defm V_CMP_NLG_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x00a, "v_cmp_nlg_f16">; +defm V_CMP_NGT_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x00b, "v_cmp_ngt_f16">; +defm V_CMP_NLE_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x00c, "v_cmp_nle_f16">; +defm V_CMP_NEQ_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x00d, "v_cmp_neq_f16">; +defm V_CMP_NLT_F16_fake16 : 
VOPC_Real_t16_gfx11_gfx12<0x00e, "v_cmp_nlt_f16">; +defm V_CMP_T_F16_fake16 : VOPC_Real_t16_gfx11<0x00f, "v_cmp_t_f16", "V_CMP_TRU_F16_fake16", "v_cmp_tru_f16">; + defm V_CMP_F_F32 : VOPC_Real_gfx11<0x010>; defm V_CMP_LT_F32 : VOPC_Real_gfx11_gfx12<0x011>; defm V_CMP_EQ_F32 : VOPC_Real_gfx11_gfx12<0x012>; @@ -1641,6 +1723,7 @@ defm V_CMP_NEQ_F32 : VOPC_Real_gfx11_gfx12<0x01d>; defm V_CMP_NLT_F32 : VOPC_Real_gfx11_gfx12<0x01e>; defm V_CMP_T_F32 : VOPC_Real_with_name_gfx11<0x01f, "V_CMP_TRU_F32", "v_cmp_t_f32">; defm V_CMP_T_F64 : VOPC_Real_with_name_gfx11<0x02f, "V_CMP_TRU_F64", "v_cmp_t_f64">; + defm V_CMP_LT_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x031, "v_cmp_lt_i16">; defm V_CMP_EQ_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x032, "v_cmp_eq_i16">; defm V_CMP_LE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x033, "v_cmp_le_i16">; @@ -1653,6 +1736,20 @@ defm V_CMP_LE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03b, "v_cmp_le_u16">; defm V_CMP_GT_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03c, "v_cmp_gt_u16">; defm V_CMP_NE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03d, "v_cmp_ne_u16">; defm V_CMP_GE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03e, "v_cmp_ge_u16">; + +defm V_CMP_LT_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x031, "v_cmp_lt_i16">; +defm V_CMP_EQ_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x032, "v_cmp_eq_i16">; +defm V_CMP_LE_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x033, "v_cmp_le_i16">; +defm V_CMP_GT_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x034, "v_cmp_gt_i16">; +defm V_CMP_NE_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x035, "v_cmp_ne_i16">; +defm V_CMP_GE_I16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x036, "v_cmp_ge_i16">; +defm V_CMP_LT_U16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x039, "v_cmp_lt_u16">; +defm V_CMP_EQ_U16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x03a, "v_cmp_eq_u16">; +defm V_CMP_LE_U16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x03b, "v_cmp_le_u16">; +defm V_CMP_GT_U16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x03c, "v_cmp_gt_u16">; +defm V_CMP_NE_U16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x03d, 
"v_cmp_ne_u16">; +defm V_CMP_GE_U16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x03e, "v_cmp_ge_u16">; + defm V_CMP_F_I32 : VOPC_Real_gfx11<0x040>; defm V_CMP_LT_I32 : VOPC_Real_gfx11_gfx12<0x041>; defm V_CMP_EQ_I32 : VOPC_Real_gfx11_gfx12<0x042>; @@ -1688,6 +1785,7 @@ defm V_CMP_GE_U64 : VOPC_Real_gfx11_gfx12<0x05e>; defm V_CMP_T_U64 : VOPC_Real_gfx11<0x05f>; defm V_CMP_CLASS_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x07d, "v_cmp_class_f16">; +defm V_CMP_CLASS_F16_fake16 : VOPC_Real_t16_gfx11_gfx12<0x07d, "v_cmp_class_f16">; defm V_CMP_CLASS_F32 : VOPC_Real_gfx11_gfx12<0x07e>; defm V_CMP_CLASS_F64 : VOPC_Real_gfx11_gfx12<0x07f>; @@ -1707,6 +1805,24 @@ defm V_CMPX_NLE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08c, "v_cmpx_nle_f16">; defm V_CMPX_NEQ_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08d, "v_cmpx_neq_f16">; defm V_CMPX_NLT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08e, "v_cmpx_nlt_f16">; defm V_CMPX_T_F16_t16 : VOPCX_Real_with_name_gfx11<0x08f, "V_CMPX_TRU_F16_t16", "v_cmpx_t_f16", "v_cmpx_tru_f16">; + +defm V_CMPX_F_F16_fake16 : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">; +defm V_CMPX_LT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">; +defm V_CMPX_EQ_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">; +defm V_CMPX_LE_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x083, "v_cmpx_le_f16">; +defm V_CMPX_GT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x084, "v_cmpx_gt_f16">; +defm V_CMPX_LG_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x085, "v_cmpx_lg_f16">; +defm V_CMPX_GE_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x086, "v_cmpx_ge_f16">; +defm V_CMPX_O_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x087, "v_cmpx_o_f16">; +defm V_CMPX_U_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x088, "v_cmpx_u_f16">; +defm V_CMPX_NGE_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x089, "v_cmpx_nge_f16">; +defm V_CMPX_NLG_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x08a, "v_cmpx_nlg_f16">; +defm V_CMPX_NGT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x08b, "v_cmpx_ngt_f16">; +defm V_CMPX_NLE_F16_fake16 : 
VOPCX_Real_t16_gfx11_gfx12<0x08c, "v_cmpx_nle_f16">; +defm V_CMPX_NEQ_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x08d, "v_cmpx_neq_f16">; +defm V_CMPX_NLT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x08e, "v_cmpx_nlt_f16">; +defm V_CMPX_T_F16_fake16 : VOPCX_Real_with_name_gfx11<0x08f, "V_CMPX_TRU_F16_fake16", "v_cmpx_t_f16", "v_cmpx_tru_f16">; + defm V_CMPX_F_F32 : VOPCX_Real_gfx11<0x090>; defm V_CMPX_LT_F32 : VOPCX_Real_gfx11_gfx12<0x091>; defm V_CMPX_EQ_F32 : VOPCX_Real_gfx11_gfx12<0x092>; @@ -1753,6 +1869,20 @@ defm V_CMPX_LE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bb, "v_cmpx_le_u16">; defm V_CMPX_GT_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bc, "v_cmpx_gt_u16">; defm V_CMPX_NE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bd, "v_cmpx_ne_u16">; defm V_CMPX_GE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0be, "v_cmpx_ge_u16">; + +defm V_CMPX_LT_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b1, "v_cmpx_lt_i16">; +defm V_CMPX_EQ_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b2, "v_cmpx_eq_i16">; +defm V_CMPX_LE_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b3, "v_cmpx_le_i16">; +defm V_CMPX_GT_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b4, "v_cmpx_gt_i16">; +defm V_CMPX_NE_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b5, "v_cmpx_ne_i16">; +defm V_CMPX_GE_I16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b6, "v_cmpx_ge_i16">; +defm V_CMPX_LT_U16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0b9, "v_cmpx_lt_u16">; +defm V_CMPX_EQ_U16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0ba, "v_cmpx_eq_u16">; +defm V_CMPX_LE_U16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0bb, "v_cmpx_le_u16">; +defm V_CMPX_GT_U16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0bc, "v_cmpx_gt_u16">; +defm V_CMPX_NE_U16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0bd, "v_cmpx_ne_u16">; +defm V_CMPX_GE_U16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0be, "v_cmpx_ge_u16">; + defm V_CMPX_F_I32 : VOPCX_Real_gfx11<0x0c0>; defm V_CMPX_LT_I32 : VOPCX_Real_gfx11_gfx12<0x0c1>; defm V_CMPX_EQ_I32 : VOPCX_Real_gfx11_gfx12<0x0c2>; @@ -1787,6 +1917,7 @@ defm V_CMPX_NE_U64 : 
VOPCX_Real_gfx11_gfx12<0x0dd>; defm V_CMPX_GE_U64 : VOPCX_Real_gfx11_gfx12<0x0de>; defm V_CMPX_T_U64 : VOPCX_Real_gfx11<0x0df>; defm V_CMPX_CLASS_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0fd, "v_cmpx_class_f16">; +defm V_CMPX_CLASS_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x0fd, "v_cmpx_class_f16">; defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx11_gfx12<0x0fe>; defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx11_gfx12<0x0ff>; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir index 04c3f050d165a3..5c387baf467524 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir @@ -20,6 +20,7 @@ body: | ; WAVE64-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) ; WAVE64-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(false), [[TRUNC]](s16), [[TRUNC1]] ; WAVE64-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1) + ; ; WAVE32-LABEL: name: fcmp_false_s16_vv ; WAVE32: liveins: $vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} @@ -29,6 +30,7 @@ body: | ; WAVE32-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) ; WAVE32-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(false), [[TRUNC]](s16), [[TRUNC1]] ; WAVE32-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1) + ; ; GFX11-LABEL: name: fcmp_false_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} @@ -59,22 +61,24 @@ body: | ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE64-NEXT: %4:sreg_64_xexec = nofpexcept V_CMP_EQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE64-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE64-NEXT: [[V_CMP_EQ_F16_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_EQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_e64_]] + ; ; WAVE32-LABEL: name: fcmp_oeq_s16_vv ; WAVE32: liveins: 
$vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE32-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE32-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE32-NEXT: [[V_CMP_EQ_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_e64_]] + ; ; GFX11-LABEL: name: fcmp_oeq_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_t16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %4 + ; GFX11-NEXT: [[V_CMP_EQ_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -96,22 +100,24 @@ body: | ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE64-NEXT: %4:sreg_64_xexec = nofpexcept V_CMP_GT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE64-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE64-NEXT: [[V_CMP_GT_F16_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_GT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_CMP_GT_F16_e64_]] + ; ; WAVE32-LABEL: name: fcmp_ogt_s16_vv ; WAVE32: liveins: $vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE32-NEXT: 
%4:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE32-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE32-NEXT: [[V_CMP_GT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_GT_F16_e64_]] + ; ; GFX11-LABEL: name: fcmp_ogt_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_t16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %4 + ; GFX11-NEXT: [[V_CMP_GT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GT_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -133,22 +139,24 @@ body: | ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE64-NEXT: %4:sreg_64_xexec = nofpexcept V_CMP_GE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE64-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE64-NEXT: [[V_CMP_GE_F16_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_GE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_CMP_GE_F16_e64_]] + ; ; WAVE32-LABEL: name: fcmp_oge_s16_vv ; WAVE32: liveins: $vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE32-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE32-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE32-NEXT: 
[[V_CMP_GE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_GE_F16_e64_]] + ; ; GFX11-LABEL: name: fcmp_oge_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_t16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %4 + ; GFX11-NEXT: [[V_CMP_GE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GE_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -170,22 +178,24 @@ body: | ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE64-NEXT: %4:sreg_64_xexec = nofpexcept V_CMP_LT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE64-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE64-NEXT: [[V_CMP_LT_F16_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_LT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_CMP_LT_F16_e64_]] + ; ; WAVE32-LABEL: name: fcmp_olt_s16_vv ; WAVE32: liveins: $vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE32-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE32-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE32-NEXT: [[V_CMP_LT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE32-NEXT: S_ENDPGM 0, implicit 
[[V_CMP_LT_F16_e64_]] + ; ; GFX11-LABEL: name: fcmp_olt_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %4 + ; GFX11-NEXT: [[V_CMP_LT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LT_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -207,22 +217,24 @@ body: | ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE64-NEXT: %4:sreg_64_xexec = nofpexcept V_CMP_LE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE64-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE64-NEXT: [[V_CMP_LE_F16_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_LE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_CMP_LE_F16_e64_]] + ; ; WAVE32-LABEL: name: fcmp_ole_s16_vv ; WAVE32: liveins: $vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE32-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE32-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE32-NEXT: [[V_CMP_LE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_LE_F16_e64_]] + ; ; GFX11-LABEL: name: fcmp_ole_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; 
GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_t16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %4 + ; GFX11-NEXT: [[V_CMP_LE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LE_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -243,22 +255,24 @@ body: | ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE64-NEXT: %4:sreg_64_xexec = nofpexcept V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE64-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE64-NEXT: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_e64_]] + ; ; WAVE32-LABEL: name: fcmp_one_s16_vv ; WAVE32: liveins: $vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE32-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE32-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE32-NEXT: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_e64_]] + ; ; GFX11-LABEL: name: fcmp_one_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_t16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit 
$exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %4 + ; GFX11-NEXT: [[V_CMP_LG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -280,22 +294,24 @@ body: | ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE64-NEXT: %4:sreg_64_xexec = nofpexcept V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE64-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE64-NEXT: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_e64_]] + ; ; WAVE32-LABEL: name: fcmp_ord_s16_vv ; WAVE32: liveins: $vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE32-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE32-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE32-NEXT: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_e64_]] + ; ; GFX11-LABEL: name: fcmp_ord_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_t16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %4 + ; GFX11-NEXT: [[V_CMP_LG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_fake16_e64 0, [[COPY]], 0, 
[[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -317,22 +333,24 @@ body: | ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE64-NEXT: %4:sreg_64_xexec = nofpexcept V_CMP_U_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE64-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE64-NEXT: [[V_CMP_U_F16_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_CMP_U_F16_e64_]] + ; ; WAVE32-LABEL: name: fcmp_uno_s16_vv ; WAVE32: liveins: $vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE32-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE32-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE32-NEXT: [[V_CMP_U_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_U_F16_e64_]] + ; ; GFX11-LABEL: name: fcmp_uno_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_t16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %4 + ; GFX11-NEXT: [[V_CMP_U_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_U_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 
@@ -354,22 +372,24 @@ body: | ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE64-NEXT: %4:sreg_64_xexec = nofpexcept V_CMP_NLG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE64-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE64-NEXT: [[V_CMP_NLG_F16_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_NLG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_e64_]] + ; ; WAVE32-LABEL: name: fcmp_ueq_s16_vv ; WAVE32: liveins: $vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE32-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE32-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE32-NEXT: [[V_CMP_NLG_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_e64_]] + ; ; GFX11-LABEL: name: fcmp_ueq_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_t16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %4 + ; GFX11-NEXT: [[V_CMP_NLG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -391,22 +411,24 @@ body: | ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; 
WAVE64-NEXT: %4:sreg_64_xexec = nofpexcept V_CMP_NLE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE64-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE64-NEXT: [[V_CMP_NLE_F16_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_NLE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_e64_]] + ; ; WAVE32-LABEL: name: fcmp_ugt_s16_vv ; WAVE32: liveins: $vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE32-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE32-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE32-NEXT: [[V_CMP_NLE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_e64_]] + ; ; GFX11-LABEL: name: fcmp_ugt_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_t16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %4 + ; GFX11-NEXT: [[V_CMP_NLE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -428,22 +450,24 @@ body: | ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE64-NEXT: %4:sreg_64_xexec = nofpexcept V_CMP_NLT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE64-NEXT: S_ENDPGM 0, implicit %4 + ; 
WAVE64-NEXT: [[V_CMP_NLT_F16_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_NLT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_e64_]] + ; ; WAVE32-LABEL: name: fcmp_uge_s16_vv ; WAVE32: liveins: $vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE32-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE32-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE32-NEXT: [[V_CMP_NLT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_e64_]] + ; ; GFX11-LABEL: name: fcmp_uge_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_t16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %4 + ; GFX11-NEXT: [[V_CMP_NLT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -465,22 +489,24 @@ body: | ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE64-NEXT: %4:sreg_64_xexec = nofpexcept V_CMP_NGE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE64-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE64-NEXT: [[V_CMP_NGE_F16_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_NGE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE64-NEXT: 
S_ENDPGM 0, implicit [[V_CMP_NGE_F16_e64_]] + ; ; WAVE32-LABEL: name: fcmp_ult_s16_vv ; WAVE32: liveins: $vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE32-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE32-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE32-NEXT: [[V_CMP_NGE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_e64_]] + ; ; GFX11-LABEL: name: fcmp_ult_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_t16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %4 + ; GFX11-NEXT: [[V_CMP_NGE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -502,22 +528,24 @@ body: | ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE64-NEXT: %4:sreg_64_xexec = nofpexcept V_CMP_NGT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE64-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE64-NEXT: [[V_CMP_NGT_F16_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_NGT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_e64_]] + ; ; WAVE32-LABEL: name: fcmp_ule_s16_vv ; WAVE32: liveins: $vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE32-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE32-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE32-NEXT: [[V_CMP_NGT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_e64_]] + ; ; GFX11-LABEL: name: fcmp_ule_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_t16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %4 + ; GFX11-NEXT: [[V_CMP_NGT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -539,22 +567,24 @@ body: | ; WAVE64-NEXT: {{ $}} ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE64-NEXT: %4:sreg_64_xexec = nofpexcept V_CMP_NEQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE64-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE64-NEXT: [[V_CMP_NEQ_F16_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_NEQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_e64_]] + ; ; WAVE32-LABEL: name: fcmp_une_s16_vv ; WAVE32: liveins: $vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; WAVE32-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_e64 0, 
[[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; WAVE32-NEXT: S_ENDPGM 0, implicit %4 + ; WAVE32-NEXT: [[V_CMP_NEQ_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_e64_]] + ; ; GFX11-LABEL: name: fcmp_une_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: %4:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_t16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %4 + ; GFX11-NEXT: [[V_CMP_NEQ_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_TRUNC %0 @@ -580,6 +610,7 @@ body: | ; WAVE64-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) ; WAVE64-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(true), [[TRUNC]](s16), [[TRUNC1]] ; WAVE64-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1) + ; ; WAVE32-LABEL: name: fcmp_true_s16_vv ; WAVE32: liveins: $vgpr0, $vgpr1 ; WAVE32-NEXT: {{ $}} @@ -589,6 +620,7 @@ body: | ; WAVE32-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) ; WAVE32-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(true), [[TRUNC]](s16), [[TRUNC1]] ; WAVE32-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1) + ; ; GFX11-LABEL: name: fcmp_true_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} From c9ba6d35c19022a582516e9455af3f0d79101adf Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 21 Aug 2024 07:53:47 -0700 Subject: [PATCH 077/426] [RISCV] Add coverage for fp reductions of <2^N-1 x FP> vectors --- .../RISCV/rvv/fixed-vectors-reduction-fp.ll | 375 +++++++++++++----- 1 file changed, 283 
insertions(+), 92 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index a6763fa22822ed..e9e147861df564 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -91,6 +91,26 @@ define half @vreduce_ord_fadd_v4f16(ptr %x, half %s) { ret half %red } +declare half @llvm.vector.reduce.fadd.v7f16(half, <7 x half>) + +define half @vreduce_fadd_v7f16(ptr %x, half %s) { +; CHECK-LABEL: vreduce_fadd_v7f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 7, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a0, 1048568 +; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 7 +; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret + %v = load <7 x half>, ptr %x + %red = call reassoc half @llvm.vector.reduce.fadd.v7f16(half %s, <7 x half> %v) + ret half %red +} + declare half @llvm.vector.reduce.fadd.v8f16(half, <8 x half>) define half @vreduce_fadd_v8f16(ptr %x, half %s) { @@ -443,6 +463,45 @@ define float @vreduce_ord_fwadd_v4f32(ptr %x, float %s) { ret float %red } +declare float @llvm.vector.reduce.fadd.v7f32(float, <7 x float>) + +define float @vreduce_fadd_v7f32(ptr %x, float %s) { +; CHECK-LABEL: vreduce_fadd_v7f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 7 +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret + %v = load <7 x float>, ptr %x + %red = call reassoc float @llvm.vector.reduce.fadd.v7f32(float %s, <7 x float> %v) + ret float %red +} + +define float @vreduce_ord_fadd_v7f32(ptr 
%x, float %s) { +; CHECK-LABEL: vreduce_ord_fadd_v7f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 7 +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfredosum.vs v8, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret + %v = load <7 x float>, ptr %x + %red = call float @llvm.vector.reduce.fadd.v7f32(float %s, <7 x float> %v) + ret float %red +} + + declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>) define float @vreduce_fadd_v8f32(ptr %x, float %s) { @@ -1250,6 +1309,26 @@ define float @vreduce_fmin_v4f32_nonans_noinfs(ptr %x) { ret float %red } +declare float @llvm.vector.reduce.fmin.v7f32(<7 x float>) + +define float @vreduce_fmin_v7f32(ptr %x) { +; CHECK-LABEL: vreduce_fmin_v7f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vmv.v.v v12, v8 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v12, v10, 7 +; CHECK-NEXT: vfredmin.vs v8, v12, v8 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret + %v = load <7 x float>, ptr %x + %red = call float @llvm.vector.reduce.fmin.v7f32(<7 x float> %v) + ret float %red +} + declare float @llvm.vector.reduce.fmin.v128f32(<128 x float>) define float @vreduce_fmin_v128f32(ptr %x) { @@ -1480,6 +1559,26 @@ define float @vreduce_fmax_v4f32_nonans_noinfs(ptr %x) { ret float %red } +declare float @llvm.vector.reduce.fmax.v7f32(<7 x float>) + +define float @vreduce_fmax_v7f32(ptr %x) { +; CHECK-LABEL: vreduce_fmax_v7f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lui a0, 1047552 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vmv.v.v v12, v8 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; 
CHECK-NEXT: vslideup.vi v12, v10, 7 +; CHECK-NEXT: vfredmax.vs v8, v12, v8 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret + %v = load <7 x float>, ptr %x + %red = call float @llvm.vector.reduce.fmax.v7f32(<7 x float> %v) + ret float %red +} + declare float @llvm.vector.reduce.fmax.v128f32(<128 x float>) define float @vreduce_fmax_v128f32(ptr %x) { @@ -1602,12 +1701,12 @@ define float @vreduce_fminimum_v2f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB99_2 +; CHECK-NEXT: beqz a0, .LBB104_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB99_2: +; CHECK-NEXT: .LBB104_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1638,12 +1737,12 @@ define float @vreduce_fminimum_v4f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB101_2 +; CHECK-NEXT: beqz a0, .LBB106_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB101_2: +; CHECK-NEXT: .LBB106_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1665,6 +1764,52 @@ define float @vreduce_fminimum_v4f32_nonans(ptr %x) { ret float %red } +declare float @llvm.vector.reduce.fminimum.v7f32(<7 x float>) + +define float @vreduce_fminimum_v7f32(ptr %x) { +; CHECK-LABEL: vreduce_fminimum_v7f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lui a0, 522240 +; CHECK-NEXT: vmv.s.x v12, a0 +; CHECK-NEXT: vmv.v.v v10, v8 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v10, v12, 7 +; CHECK-NEXT: vmfne.vv v9, v10, v10 +; CHECK-NEXT: vcpop.m a0, v9 +; CHECK-NEXT: beqz a0, .LBB108_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x 
fa0, a0 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB108_2: +; CHECK-NEXT: vfredmin.vs v8, v10, v8 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret + %v = load <7 x float>, ptr %x + %red = call float @llvm.vector.reduce.fminimum.v7f32(<7 x float> %v) + ret float %red +} + +define float @vreduce_fminimum_v7f32_nonans(ptr %x) { +; CHECK-LABEL: vreduce_fminimum_v7f32_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lui a0, 522240 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vmv.v.v v12, v8 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v12, v10, 7 +; CHECK-NEXT: vfredmin.vs v8, v12, v8 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret + %v = load <7 x float>, ptr %x + %red = call nnan float @llvm.vector.reduce.fminimum.v7f32(<7 x float> %v) + ret float %red +} + declare float @llvm.vector.reduce.fminimum.v8f32(<8 x float>) define float @vreduce_fminimum_v8f32(ptr %x) { @@ -1674,12 +1819,12 @@ define float @vreduce_fminimum_v8f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10 -; CHECK-NEXT: beqz a0, .LBB103_2 +; CHECK-NEXT: beqz a0, .LBB110_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB103_2: +; CHECK-NEXT: .LBB110_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1710,12 +1855,12 @@ define float @vreduce_fminimum_v16f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v12, v8, v8 ; CHECK-NEXT: vcpop.m a0, v12 -; CHECK-NEXT: beqz a0, .LBB105_2 +; CHECK-NEXT: beqz a0, .LBB112_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB105_2: +; CHECK-NEXT: .LBB112_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1747,12 +1892,12 @@ define float @vreduce_fminimum_v32f32(ptr %x) { ; 
CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB107_2 +; CHECK-NEXT: beqz a0, .LBB114_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB107_2: +; CHECK-NEXT: .LBB114_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1802,15 +1947,15 @@ define float @vreduce_fminimum_v64f32(ptr %x) { ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB109_2 +; CHECK-NEXT: beqz a0, .LBB116_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 -; CHECK-NEXT: j .LBB109_3 -; CHECK-NEXT: .LBB109_2: +; CHECK-NEXT: j .LBB116_3 +; CHECK-NEXT: .LBB116_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB109_3: +; CHECK-NEXT: .LBB116_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -1924,15 +2069,15 @@ define float @vreduce_fminimum_v128f32(ptr %x) { ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB111_2 +; CHECK-NEXT: beqz a0, .LBB118_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 -; CHECK-NEXT: j .LBB111_3 -; CHECK-NEXT: .LBB111_2: +; CHECK-NEXT: j .LBB118_3 +; CHECK-NEXT: .LBB118_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB111_3: +; CHECK-NEXT: .LBB118_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -1978,12 +2123,12 @@ define double @vreduce_fminimum_v2f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB113_2 +; CHECK-NEXT: beqz a0, .LBB120_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI113_0) -; CHECK-NEXT: fld fa0, 
%lo(.LCPI113_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI120_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI120_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB113_2: +; CHECK-NEXT: .LBB120_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2014,12 +2159,12 @@ define double @vreduce_fminimum_v4f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10 -; CHECK-NEXT: beqz a0, .LBB115_2 +; CHECK-NEXT: beqz a0, .LBB122_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI115_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI115_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI122_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI122_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB115_2: +; CHECK-NEXT: .LBB122_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2050,12 +2195,12 @@ define double @vreduce_fminimum_v8f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v12, v8, v8 ; CHECK-NEXT: vcpop.m a0, v12 -; CHECK-NEXT: beqz a0, .LBB117_2 +; CHECK-NEXT: beqz a0, .LBB124_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI117_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI117_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI124_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI124_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB117_2: +; CHECK-NEXT: .LBB124_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2086,12 +2231,12 @@ define double @vreduce_fminimum_v16f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB119_2 +; CHECK-NEXT: beqz a0, .LBB126_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI119_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI119_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI126_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI126_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB119_2: +; CHECK-NEXT: .LBB126_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: 
ret @@ -2139,15 +2284,15 @@ define double @vreduce_fminimum_v32f64(ptr %x) { ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB121_2 +; CHECK-NEXT: beqz a0, .LBB128_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI121_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI121_0)(a0) -; CHECK-NEXT: j .LBB121_3 -; CHECK-NEXT: .LBB121_2: +; CHECK-NEXT: lui a0, %hi(.LCPI128_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI128_0)(a0) +; CHECK-NEXT: j .LBB128_3 +; CHECK-NEXT: .LBB128_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB121_3: +; CHECK-NEXT: .LBB128_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -2259,15 +2404,15 @@ define double @vreduce_fminimum_v64f64(ptr %x) { ; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB123_2 +; CHECK-NEXT: beqz a0, .LBB130_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI123_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI123_0)(a0) -; CHECK-NEXT: j .LBB123_3 -; CHECK-NEXT: .LBB123_2: +; CHECK-NEXT: lui a0, %hi(.LCPI130_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI130_0)(a0) +; CHECK-NEXT: j .LBB130_3 +; CHECK-NEXT: .LBB130_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB123_3: +; CHECK-NEXT: .LBB130_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -2312,12 +2457,12 @@ define float @vreduce_fmaximum_v2f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB125_2 +; CHECK-NEXT: beqz a0, .LBB132_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB125_2: +; CHECK-NEXT: .LBB132_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2348,12 +2493,12 @@ define float 
@vreduce_fmaximum_v4f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB127_2 +; CHECK-NEXT: beqz a0, .LBB134_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB127_2: +; CHECK-NEXT: .LBB134_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2375,6 +2520,52 @@ define float @vreduce_fmaximum_v4f32_nonans(ptr %x) { ret float %red } +declare float @llvm.vector.reduce.fmaximum.v7f32(<7 x float>) + +define float @vreduce_fmaximum_v7f32(ptr %x) { +; CHECK-LABEL: vreduce_fmaximum_v7f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lui a0, 1046528 +; CHECK-NEXT: vmv.s.x v12, a0 +; CHECK-NEXT: vmv.v.v v10, v8 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v10, v12, 7 +; CHECK-NEXT: vmfne.vv v9, v10, v10 +; CHECK-NEXT: vcpop.m a0, v9 +; CHECK-NEXT: beqz a0, .LBB136_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB136_2: +; CHECK-NEXT: vfredmax.vs v8, v10, v8 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret + %v = load <7 x float>, ptr %x + %red = call float @llvm.vector.reduce.fmaximum.v7f32(<7 x float> %v) + ret float %red +} + +define float @vreduce_fmaximum_v7f32_nonans(ptr %x) { +; CHECK-LABEL: vreduce_fmaximum_v7f32_nonans: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lui a0, 1046528 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vmv.v.v v12, v8 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v12, v10, 7 +; CHECK-NEXT: vfredmax.vs v8, v12, v8 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret + %v = load <7 x float>, ptr %x + %red = call nnan float @llvm.vector.reduce.fmaximum.v7f32(<7 x float> %v) + ret 
float %red +} + declare float @llvm.vector.reduce.fmaximum.v8f32(<8 x float>) define float @vreduce_fmaximum_v8f32(ptr %x) { @@ -2384,12 +2575,12 @@ define float @vreduce_fmaximum_v8f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10 -; CHECK-NEXT: beqz a0, .LBB129_2 +; CHECK-NEXT: beqz a0, .LBB138_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB129_2: +; CHECK-NEXT: .LBB138_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2420,12 +2611,12 @@ define float @vreduce_fmaximum_v16f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v12, v8, v8 ; CHECK-NEXT: vcpop.m a0, v12 -; CHECK-NEXT: beqz a0, .LBB131_2 +; CHECK-NEXT: beqz a0, .LBB140_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB131_2: +; CHECK-NEXT: .LBB140_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2457,12 +2648,12 @@ define float @vreduce_fmaximum_v32f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB133_2 +; CHECK-NEXT: beqz a0, .LBB142_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB133_2: +; CHECK-NEXT: .LBB142_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2512,15 +2703,15 @@ define float @vreduce_fmaximum_v64f32(ptr %x) { ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB135_2 +; CHECK-NEXT: beqz a0, .LBB144_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 -; CHECK-NEXT: j .LBB135_3 -; CHECK-NEXT: .LBB135_2: +; CHECK-NEXT: j .LBB144_3 +; CHECK-NEXT: .LBB144_2: ; CHECK-NEXT: vfredmax.vs 
v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB135_3: +; CHECK-NEXT: .LBB144_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -2634,15 +2825,15 @@ define float @vreduce_fmaximum_v128f32(ptr %x) { ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB137_2 +; CHECK-NEXT: beqz a0, .LBB146_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 -; CHECK-NEXT: j .LBB137_3 -; CHECK-NEXT: .LBB137_2: +; CHECK-NEXT: j .LBB146_3 +; CHECK-NEXT: .LBB146_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB137_3: +; CHECK-NEXT: .LBB146_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -2688,12 +2879,12 @@ define double @vreduce_fmaximum_v2f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v9, v8, v8 ; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB139_2 +; CHECK-NEXT: beqz a0, .LBB148_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI139_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI139_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI148_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI148_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB139_2: +; CHECK-NEXT: .LBB148_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2724,12 +2915,12 @@ define double @vreduce_fmaximum_v4f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v10, v8, v8 ; CHECK-NEXT: vcpop.m a0, v10 -; CHECK-NEXT: beqz a0, .LBB141_2 +; CHECK-NEXT: beqz a0, .LBB150_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI141_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI141_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI150_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI150_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB141_2: +; CHECK-NEXT: .LBB150_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2760,12 +2951,12 @@ 
define double @vreduce_fmaximum_v8f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v12, v8, v8 ; CHECK-NEXT: vcpop.m a0, v12 -; CHECK-NEXT: beqz a0, .LBB143_2 +; CHECK-NEXT: beqz a0, .LBB152_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI143_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI143_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI152_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI152_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB143_2: +; CHECK-NEXT: .LBB152_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2796,12 +2987,12 @@ define double @vreduce_fmaximum_v16f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB145_2 +; CHECK-NEXT: beqz a0, .LBB154_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI145_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI145_0)(a0) +; CHECK-NEXT: lui a0, %hi(.LCPI154_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI154_0)(a0) ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB145_2: +; CHECK-NEXT: .LBB154_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2849,15 +3040,15 @@ define double @vreduce_fmaximum_v32f64(ptr %x) { ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB147_2 +; CHECK-NEXT: beqz a0, .LBB156_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI147_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI147_0)(a0) -; CHECK-NEXT: j .LBB147_3 -; CHECK-NEXT: .LBB147_2: +; CHECK-NEXT: lui a0, %hi(.LCPI156_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI156_0)(a0) +; CHECK-NEXT: j .LBB156_3 +; CHECK-NEXT: .LBB156_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB147_3: +; CHECK-NEXT: .LBB156_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -2969,15 +3160,15 @@ define double @vreduce_fmaximum_v64f64(ptr %x) { ; CHECK-NEXT: vfmax.vv v8, v8, v16 ; 
CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB149_2 +; CHECK-NEXT: beqz a0, .LBB158_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI149_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI149_0)(a0) -; CHECK-NEXT: j .LBB149_3 -; CHECK-NEXT: .LBB149_2: +; CHECK-NEXT: lui a0, %hi(.LCPI158_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI158_0)(a0) +; CHECK-NEXT: j .LBB158_3 +; CHECK-NEXT: .LBB158_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB149_3: +; CHECK-NEXT: .LBB158_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 From c0d222219a8d01d3945100114256d26cfe833a1c Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Wed, 21 Aug 2024 08:10:26 -0700 Subject: [PATCH 078/426] Fix bug with -ffp-contract=fast-honor-pragmas (#104857) This fixes a problem which caused clang to assert in the Sema pragma handling if it encountered "#pragma STDC FP_CONTRACT DEFAULT" when compiling with the -ffp-contract=fast-honor-pragmas option. This fixes https://github.com/llvm/llvm-project/issues/104830 --- clang/docs/ReleaseNotes.rst | 3 + clang/lib/Sema/SemaAttr.cpp | 3 +- .../ffp-contract-fast-honor-pramga-option.cpp | 37 +++++ .../ffp-contract-fhp-pragma-override.cpp | 151 ++++++++++++++++++ 4 files changed, 192 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/ffp-contract-fast-honor-pramga-option.cpp create mode 100644 clang/test/CodeGen/ffp-contract-fhp-pragma-override.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8f98167dff31ef..5c156a9c073a9c 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -310,6 +310,9 @@ Miscellaneous Clang Crashes Fixed - Fixed a crash caused by long chains of ``sizeof`` and other similar operators that can be followed by a non-parenthesized expression. (#GH45061) +- Fixed an crash when compiling ``#pragma STDC FP_CONTRACT DEFAULT`` with + ``-ffp-contract=fast-honor-pragmas``. 
(#GH104830) + - Fixed a crash when function has more than 65536 parameters. Now a diagnostic is emitted. (#GH35741) diff --git a/clang/lib/Sema/SemaAttr.cpp b/clang/lib/Sema/SemaAttr.cpp index b0c239678d0b01..a1724820472b59 100644 --- a/clang/lib/Sema/SemaAttr.cpp +++ b/clang/lib/Sema/SemaAttr.cpp @@ -1269,13 +1269,12 @@ void Sema::ActOnPragmaFPContract(SourceLocation Loc, NewFPFeatures.setAllowFPContractWithinStatement(); break; case LangOptions::FPM_Fast: + case LangOptions::FPM_FastHonorPragmas: NewFPFeatures.setAllowFPContractAcrossStatement(); break; case LangOptions::FPM_Off: NewFPFeatures.setDisallowFPContract(); break; - case LangOptions::FPM_FastHonorPragmas: - llvm_unreachable("Should not happen"); } FpPragmaStack.Act(Loc, Sema::PSK_Set, StringRef(), NewFPFeatures); CurFPFeatures = NewFPFeatures.applyOverrides(getLangOpts()); diff --git a/clang/test/CodeGen/ffp-contract-fast-honor-pramga-option.cpp b/clang/test/CodeGen/ffp-contract-fast-honor-pramga-option.cpp new file mode 100644 index 00000000000000..fef4da1edf1fc9 --- /dev/null +++ b/clang/test/CodeGen/ffp-contract-fast-honor-pramga-option.cpp @@ -0,0 +1,37 @@ +// RUN: %clang_cc1 -O3 -ffp-contract=fast-honor-pragmas -triple %itanium_abi_triple -emit-llvm -o - %s | FileCheck %s + +float fp_contract_1(float a, float b, float c) { + // CHECK-LABEL: fp_contract_1fff( + // CHECK: fmul contract float + // CHECK: fadd contract float + return a * b + c; +} + +float fp_contract_2(float a, float b, float c) { + // CHECK-LABEL: fp_contract_2fff( + // CHECK: fmul contract float + // CHECK: fsub contract float + return a * b - c; +} + +void fp_contract_3(float *a, float b, float c) { + // CHECK-LABEL: fp_contract_3Pfff( + // CHECK: fmul contract float + // CHECK: fadd contract float + a[0] += b * c; +} + +void fp_contract_4(float *a, float b, float c) { + // CHECK-LABEL: fp_contract_4Pfff( + // CHECK: fmul contract float + // CHECK: fsub contract float + a[0] -= b * c; +} + +float fp_contract_5(float a, float b, 
float c) { + // CHECK-LABEL: fp_contract_5fff( + // CHECK: fmul contract float + // CHECK: fadd contract float + float t = a * b; + return t + c; +} diff --git a/clang/test/CodeGen/ffp-contract-fhp-pragma-override.cpp b/clang/test/CodeGen/ffp-contract-fhp-pragma-override.cpp new file mode 100644 index 00000000000000..ff35c9204c79cd --- /dev/null +++ b/clang/test/CodeGen/ffp-contract-fhp-pragma-override.cpp @@ -0,0 +1,151 @@ +// RUN: %clang_cc1 -O3 -ffp-contract=fast-honor-pragmas -triple %itanium_abi_triple -emit-llvm -o - %s | FileCheck %s + +float fp_contract_on_1(float a, float b, float c) { + // CHECK-LABEL: fp_contract_on_1fff( + // CHECK: call float @llvm.fmuladd.f32(float {{.*}}, float {{.*}}, float {{.*}}) + #pragma STDC FP_CONTRACT ON + return a * b + c; +} + +float fp_contract_on_2(float a, float b, float c) { + // CHECK-LABEL: fp_contract_on_2fff( + // CHECK: fmul float + // CHECK: fadd float + #pragma STDC FP_CONTRACT ON + float t = a * b; + return t + c; +} + +float fp_contract_off_1(float a, float b, float c) { + // CHECK-LABEL: fp_contract_off_1fff( + // CHECK: fmul float + // CHECK: fadd float + #pragma STDC FP_CONTRACT OFF + return a * b + c; +} + +float fp_contract_off_2(float a, float b, float c) { + // CHECK-LABEL: fp_contract_off_2fff( + // CHECK: fmul float + // CHECK: fadd float + #pragma STDC FP_CONTRACT OFF + float t = a * b; + return t + c; +} + +float fp_contract_default_1(float a, float b, float c) { + // CHECK-LABEL: fp_contract_default_1fff( + // CHECK: fmul contract float + // CHECK: fadd contract float + #pragma STDC FP_CONTRACT DEFAULT + return a * b + c; +} + +float fp_contract_default_2(float a, float b, float c) { + // CHECK-LABEL: fp_contract_default_2fff( + // CHECK: fmul contract float + // CHECK: fadd contract float + #pragma STDC FP_CONTRACT DEFAULT + float t = a * b; + return t + c; +} + +float fp_contract_clang_on_1(float a, float b, float c) { + // CHECK-LABEL: fp_contract_clang_on_1fff( + // CHECK: call float 
@llvm.fmuladd.f32(float {{.*}}, float {{.*}}, float {{.*}}) + #pragma clang fp contract(on) + return a * b + c; +} + +float fp_contract_clang_on_2(float a, float b, float c) { + // CHECK-LABEL: fp_contract_clang_on_2fff( + // CHECK: fmul float + // CHECK: fadd float + #pragma clang fp contract(on) + float t = a * b; + return t + c; +} + +float fp_contract_clang_off_1(float a, float b, float c) { + // CHECK-LABEL: fp_contract_clang_off_1fff( + // CHECK: fmul float + // CHECK: fadd float + #pragma clang fp contract(off) + return a * b + c; +} + +float fp_contract_clang_off_2(float a, float b, float c) { + // CHECK-LABEL: fp_contract_clang_off_2fff( + // CHECK: fmul float + // CHECK: fadd float + #pragma clang fp contract(off) + float t = a * b; + return t + c; +} + +float fp_contract_clang_fast_1(float a, float b, float c) { + // CHECK-LABEL: fp_contract_clang_fast_1fff( + // CHECK: fmul contract float + // CHECK: fadd contract float + #pragma clang fp contract(fast) + return a * b + c; +} + +float fp_contract_clang_fast_2(float a, float b, float c) { + // CHECK-LABEL: fp_contract_clang_fast_2fff( + // CHECK: fmul contract float + // CHECK: fadd contract float + #pragma clang fp contract(fast) + float t = a * b; + return t + c; +} + +#pragma STDC FP_CONTRACT ON + +float fp_contract_global_on_1(float a, float b, float c) { + // CHECK-LABEL: fp_contract_global_on_1fff( + // CHECK: call float @llvm.fmuladd.f32(float {{.*}}, float {{.*}}, float {{.*}}) + return a * b + c; +} + +float fp_contract_global_on_2(float a, float b, float c) { + // CHECK-LABEL: fp_contract_global_on_2fff( + // CHECK: fmul float + // CHECK: fadd float + float t = a * b; + return t + c; +} + +#pragma STDC FP_CONTRACT OFF + +float fp_contract_global_off_1(float a, float b, float c) { + // CHECK-LABEL: fp_contract_global_off_1fff( + // CHECK: fmul float + // CHECK: fadd float + return a * b + c; +} + +float fp_contract_global_off_2(float a, float b, float c) { + // CHECK-LABEL: 
fp_contract_global_off_2fff( + // CHECK: fmul float + // CHECK: fadd float + float t = a * b; + return t + c; +} + +#pragma STDC FP_CONTRACT DEFAULT + +float fp_contract_global_default_1(float a, float b, float c) { + // CHECK-LABEL: fp_contract_global_default_1fff( + // CHECK: fmul contract float + // CHECK: fadd contract float + return a * b + c; +} + +float fp_contract_global_default_2(float a, float b, float c) { + // CHECK-LABEL: fp_contract_global_default_2fff( + // CHECK: fmul contract float + // CHECK: fadd contract float + float t = a * b; + return t + c; +} From 278fc8efdf004a1959a31bb4c208df5ee733d5c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Pettersson?= Date: Wed, 21 Aug 2024 17:56:27 +0200 Subject: [PATCH 079/426] [DAGCombiner] Fix ReplaceAllUsesOfValueWith mutation bug in visitFREEZE (#104924) In visitFREEZE we have been collecting a set/vector of MaybePoisonOperands that later was iterated over, applying a freeze to those operands. However, C-level fuzzy testing has discovered that the recursiveness of ReplaceAllUsesOfValueWith may cause later operands in the MaybePoisonOperands vector to be replaced when replacing an earlier operand. That would then turn up as Assertion `N1.getOpcode() != ISD::DELETED_NODE && "Operand is DELETED_NODE!"' failed. failures when trying to freeze those later operands. So we need to make sure that the vector with MaybePoisonOperands is mutated as well when needed. Or as the solution used in this patch, make sure to keep track of operand numbers that should be frozen instead of having a vector of SDValues. And then we can refetch the operands while iterating over operand numbers. The problem was seen after adding SELECT_CC to the set of operations including in "AllowMultipleMaybePoisonOperands". I'm not sure, but I guess that this could happen for other operations as well for which we allow multiple maybe poison operands. 
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 22 ++++++++++--- .../CodeGen/AArch64/dag-combine-freeze.ll | 31 +++++++++++++++++++ 2 files changed, 49 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/dag-combine-freeze.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 4180dcc8a720d5..c9ab7e7a66079c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15808,13 +15808,16 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { } } - SmallSetVector MaybePoisonOperands; - for (SDValue Op : N0->ops()) { + SmallSet MaybePoisonOperands; + SmallVector MaybePoisonOperandNumbers; + for (auto [OpNo, Op] : enumerate(N0->ops())) { if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false, /*Depth*/ 1)) continue; bool HadMaybePoisonOperands = !MaybePoisonOperands.empty(); - bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op); + bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op).second; + if (IsNewMaybePoisonOperand) + MaybePoisonOperandNumbers.push_back(OpNo); if (!HadMaybePoisonOperands) continue; if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) { @@ -15826,7 +15829,18 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { // it could create undef or poison due to it's poison-generating flags. // So not finding any maybe-poison operands is fine. - for (SDValue MaybePoisonOperand : MaybePoisonOperands) { + for (unsigned OpNo : MaybePoisonOperandNumbers) { + // N0 can mutate during iteration, so make sure to refetch the maybe poison + // operands via the operand numbers. 
The typical scenario is that we have + // something like this + // t262: i32 = freeze t181 + // t150: i32 = ctlz_zero_undef t262 + // t184: i32 = ctlz_zero_undef t181 + // t268: i32 = select_cc t181, Constant:i32<0>, t184, t186, setne:ch + // When freezing the t181 operand we get t262 back, and then the + // ReplaceAllUsesOfValueWith call will not only replace t181 by t262, but + // also recursively replace t184 by t150. + SDValue MaybePoisonOperand = N->getOperand(0).getOperand(OpNo); // Don't replace every single UNDEF everywhere with frozen UNDEF, though. if (MaybePoisonOperand.getOpcode() == ISD::UNDEF) continue; diff --git a/llvm/test/CodeGen/AArch64/dag-combine-freeze.ll b/llvm/test/CodeGen/AArch64/dag-combine-freeze.ll new file mode 100644 index 00000000000000..4f0c3d0ce18006 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/dag-combine-freeze.ll @@ -0,0 +1,31 @@ +; RUN: llc -mtriple aarch64 -o /dev/null %s + +; This used to fail with: +; Assertion `N1.getOpcode() != ISD::DELETED_NODE && +; "Operand is DELETED_NODE!"' failed. +; Just make sure we do not crash here. +define void @test_fold_freeze_over_select_cc(i15 %a, ptr %p1, ptr %p2) { +entry: + %a2 = add nsw i15 %a, 1 + %sext = sext i15 %a2 to i32 + %ashr = ashr i32 %sext, 31 + %lshr = lshr i32 %ashr, 7 + ; Setup an already frozen input to ctlz. + %freeze = freeze i32 %lshr + %ctlz = call i32 @llvm.ctlz.i32(i32 %freeze, i1 true) + store i32 %ctlz, ptr %p1, align 1 + ; Here is another ctlz, which is used by a frozen select. + ; DAGCombiner::visitFREEZE will to try to fold the freeze over a SELECT_CC, + ; and when dealing with the condition operand the other SELECT_CC operands + ; will be replaced/simplified as well. So the SELECT_CC is mutated while + ; freezing the "maybe poison operands". This needs to be handled by + ; DAGCombiner::visitFREEZE, as it can't store the list of SDValues that + ; should be frozen in a separate data structure that isn't updated when the + ; SELECT_CC is mutated. 
+ %ctlz1 = call i32 @llvm.ctlz.i32(i32 %lshr, i1 true) + %icmp = icmp ne i32 %lshr, 0 + %select = select i1 %icmp, i32 %ctlz1, i32 0 + %freeze1 = freeze i32 %select + store i32 %freeze1, ptr %p2, align 1 + ret void +} From 6fd46089c9fbd5b22bb67ac3d6196fe70ba684c6 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Wed, 21 Aug 2024 16:57:08 +0100 Subject: [PATCH 080/426] [flang][debug] Allow non default array lower bounds. (#104467) As mentioned in #98877, we currently always use 1 as lower bound for fixed size arrays. This PR removes this restriction. It passes along `DeclareOp` to type conversion functions and uses the shift information (if present) to get the lower bound value. This was suggested by @jeanPerier in https://github.com/llvm/llvm-project/pull/96746#issuecomment-2195164553 This PR also adds a small cleanup that type conversion functions don't take Location now. It was initially added so that location of derived types can be passed. But that information can be extracted from typeInfo objects and we don't need to pass it along. This PR will handle the problem for local and global variable. We may need a bit more work for derived type once the support for derived types lands. Fixes #98877. 
--- .../lib/Optimizer/Transforms/AddDebugInfo.cpp | 23 ++++---- .../Transforms/DebugTypeGenerator.cpp | 54 ++++++++++--------- .../Optimizer/Transforms/DebugTypeGenerator.h | 26 ++++----- .../Integration/debug-fixed-array-type-2.f90 | 38 ++++++++----- .../Transforms/debug-fixed-array-type.fir | 7 +++ 5 files changed, 87 insertions(+), 61 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index 3c067bf946cfc9..30fc4185575e61 100644 --- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -65,7 +65,8 @@ class AddDebugInfoPass : public fir::impl::AddDebugInfoBase { void handleGlobalOp(fir::GlobalOp glocalOp, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, - mlir::SymbolTable *symbolTable); + mlir::SymbolTable *symbolTable, + fir::cg::XDeclareOp declOp); void handleFuncOp(mlir::func::FuncOp funcOp, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DICompileUnitAttr cuAttr, mlir::SymbolTable *symbolTable); @@ -100,10 +101,9 @@ void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp, if (result.first != fir::NameUniquer::NameKind::VARIABLE) return; - // If this DeclareOp actually represents a global then treat it as such. 
if (auto global = symbolTable->lookup(declOp.getUniqName())) { - handleGlobalOp(global, fileAttr, scopeAttr, symbolTable); + handleGlobalOp(global, fileAttr, scopeAttr, symbolTable, declOp); return; } @@ -127,7 +127,7 @@ void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp, } auto tyAttr = typeGen.convertType(fir::unwrapRefType(declOp.getType()), - fileAttr, scopeAttr, declOp.getLoc()); + fileAttr, scopeAttr, declOp); auto localVarAttr = mlir::LLVM::DILocalVariableAttr::get( context, scopeAttr, mlir::StringAttr::get(context, result.second.name), @@ -160,7 +160,8 @@ mlir::LLVM::DIModuleAttr AddDebugInfoPass::getOrCreateModuleAttr( void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, - mlir::SymbolTable *symbolTable) { + mlir::SymbolTable *symbolTable, + fir::cg::XDeclareOp declOp) { if (debugInfoIsAlreadySet(globalOp.getLoc())) return; mlir::ModuleOp module = getOperation(); @@ -200,8 +201,8 @@ void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp, scope = getOrCreateModuleAttr(result.second.modules[0], fileAttr, scope, line - 1, !globalOp.isInitialized()); } - mlir::LLVM::DITypeAttr diType = typeGen.convertType( - globalOp.getType(), fileAttr, scope, globalOp.getLoc()); + mlir::LLVM::DITypeAttr diType = + typeGen.convertType(globalOp.getType(), fileAttr, scope, declOp); auto gvAttr = mlir::LLVM::DIGlobalVariableAttr::get( context, scope, mlir::StringAttr::get(context, result.second.name), mlir::StringAttr::get(context, globalOp.getName()), fileAttr, line, @@ -246,12 +247,13 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp, llvm::SmallVector types; fir::DebugTypeGenerator typeGen(module); for (auto resTy : funcOp.getResultTypes()) { - auto tyAttr = typeGen.convertType(resTy, fileAttr, cuAttr, funcOp.getLoc()); + auto tyAttr = + typeGen.convertType(resTy, fileAttr, cuAttr, /*declOp=*/nullptr); types.push_back(tyAttr); } for (auto inTy : 
funcOp.getArgumentTypes()) { auto tyAttr = typeGen.convertType(fir::unwrapRefType(inTy), fileAttr, - cuAttr, funcOp.getLoc()); + cuAttr, /*declOp=*/nullptr); types.push_back(tyAttr); } @@ -358,7 +360,8 @@ void AddDebugInfoPass::runOnOperation() { if (debugLevel == mlir::LLVM::DIEmissionKind::Full) { // Process 'GlobalOp' only if full debug info is requested. for (auto globalOp : module.getOps()) - handleGlobalOp(globalOp, fileAttr, cuAttr, &symbolTable); + handleGlobalOp(globalOp, fileAttr, cuAttr, &symbolTable, + /*declOp=*/nullptr); } } diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index db559731552df2..860c16c9a13ce9 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -83,8 +83,8 @@ static mlir::LLVM::DITypeAttr genPlaceholderType(mlir::MLIRContext *context) { mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( fir::SequenceType seqTy, mlir::LLVM::DIFileAttr fileAttr, - mlir::LLVM::DIScopeAttr scope, mlir::Location loc, bool genAllocated, - bool genAssociated) { + mlir::LLVM::DIScopeAttr scope, fir::cg::XDeclareOp declOp, + bool genAllocated, bool genAssociated) { mlir::MLIRContext *context = module.getContext(); // FIXME: Assumed rank arrays not supported yet @@ -114,7 +114,7 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( llvm::SmallVector elements; mlir::LLVM::DITypeAttr elemTy = - convertType(seqTy.getEleTy(), fileAttr, scope, loc); + convertType(seqTy.getEleTy(), fileAttr, scope, declOp); unsigned offset = dimsOffset; const unsigned indexSize = dimsSize / 3; for ([[maybe_unused]] auto _ : seqTy.getShape()) { @@ -156,13 +156,14 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( mlir::LLVM::DITypeAttr DebugTypeGenerator::convertSequenceType( fir::SequenceType seqTy, mlir::LLVM::DIFileAttr fileAttr, - mlir::LLVM::DIScopeAttr scope, mlir::Location 
loc) { + mlir::LLVM::DIScopeAttr scope, fir::cg::XDeclareOp declOp) { mlir::MLIRContext *context = module.getContext(); llvm::SmallVector elements; mlir::LLVM::DITypeAttr elemTy = - convertType(seqTy.getEleTy(), fileAttr, scope, loc); + convertType(seqTy.getEleTy(), fileAttr, scope, declOp); + unsigned index = 0; for (fir::SequenceType::Extent dim : seqTy.getShape()) { if (dim == seqTy.getUnknownExtent()) { // FIXME: This path is taken for assumed size arrays but also for arrays @@ -174,20 +175,20 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertSequenceType( elements.push_back(subrangeTy); } else { auto intTy = mlir::IntegerType::get(context, 64); - // FIXME: Only supporting lower bound of 1 at the moment. The - // 'SequenceType' has information about the shape but not the shift. In - // cases where the conversion originated during the processing of - // 'DeclareOp', it may be possible to pass on this information. But the - // type conversion should ideally be based on what information present in - // the type class so that it works from everywhere (e.g. when it is part - // of a module or a derived type.) + int64_t shift = 1; + if (declOp && declOp.getShift().size() > index) { + if (std::optional optint = + getIntIfConstant(declOp.getShift()[index])) + shift = *optint; + } auto countAttr = mlir::IntegerAttr::get(intTy, llvm::APInt(64, dim)); - auto lowerAttr = mlir::IntegerAttr::get(intTy, llvm::APInt(64, 1)); + auto lowerAttr = mlir::IntegerAttr::get(intTy, llvm::APInt(64, shift)); auto subrangeTy = mlir::LLVM::DISubrangeAttr::get( context, countAttr, lowerAttr, /*upperBound=*/nullptr, /*stride=*/nullptr); elements.push_back(subrangeTy); } + ++index; } // Apart from arrays, the `DICompositeTypeAttr` is used for other things like // structure types. 
Many of its fields which are not applicable to arrays @@ -203,7 +204,8 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertSequenceType( mlir::LLVM::DITypeAttr DebugTypeGenerator::convertCharacterType( fir::CharacterType charTy, mlir::LLVM::DIFileAttr fileAttr, - mlir::LLVM::DIScopeAttr scope, mlir::Location loc, bool hasDescriptor) { + mlir::LLVM::DIScopeAttr scope, fir::cg::XDeclareOp declOp, + bool hasDescriptor) { mlir::MLIRContext *context = module.getContext(); // DWARF 5 says the following about the character encoding in 5.1.1.2. @@ -250,21 +252,21 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertCharacterType( mlir::LLVM::DITypeAttr DebugTypeGenerator::convertPointerLikeType( mlir::Type elTy, mlir::LLVM::DIFileAttr fileAttr, - mlir::LLVM::DIScopeAttr scope, mlir::Location loc, bool genAllocated, - bool genAssociated) { + mlir::LLVM::DIScopeAttr scope, fir::cg::XDeclareOp declOp, + bool genAllocated, bool genAssociated) { mlir::MLIRContext *context = module.getContext(); // Arrays and character need different treatment because DWARF have special // constructs for them to get the location from the descriptor. Rest of // types are handled like pointer to underlying type. 
if (auto seqTy = mlir::dyn_cast_or_null(elTy)) - return convertBoxedSequenceType(seqTy, fileAttr, scope, loc, genAllocated, - genAssociated); + return convertBoxedSequenceType(seqTy, fileAttr, scope, declOp, + genAllocated, genAssociated); if (auto charTy = mlir::dyn_cast_or_null(elTy)) - return convertCharacterType(charTy, fileAttr, scope, loc, + return convertCharacterType(charTy, fileAttr, scope, declOp, /*hasDescriptor=*/true); - mlir::LLVM::DITypeAttr elTyAttr = convertType(elTy, fileAttr, scope, loc); + mlir::LLVM::DITypeAttr elTyAttr = convertType(elTy, fileAttr, scope, declOp); return mlir::LLVM::DIDerivedTypeAttr::get( context, llvm::dwarf::DW_TAG_pointer_type, @@ -276,7 +278,7 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertPointerLikeType( mlir::LLVM::DITypeAttr DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, - mlir::Location loc) { + fir::cg::XDeclareOp declOp) { mlir::MLIRContext *context = module.getContext(); if (Ty.isInteger()) { return genBasicType(context, mlir::StringAttr::get(context, "integer"), @@ -306,22 +308,22 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, return genBasicType(context, mlir::StringAttr::get(context, "complex"), bitWidth * 2, llvm::dwarf::DW_ATE_complex_float); } else if (auto seqTy = mlir::dyn_cast_or_null(Ty)) { - return convertSequenceType(seqTy, fileAttr, scope, loc); + return convertSequenceType(seqTy, fileAttr, scope, declOp); } else if (auto charTy = mlir::dyn_cast_or_null(Ty)) { - return convertCharacterType(charTy, fileAttr, scope, loc, + return convertCharacterType(charTy, fileAttr, scope, declOp, /*hasDescriptor=*/false); } else if (auto boxTy = mlir::dyn_cast_or_null(Ty)) { auto elTy = boxTy.getElementType(); if (auto seqTy = mlir::dyn_cast_or_null(elTy)) - return convertBoxedSequenceType(seqTy, fileAttr, scope, loc, false, + return convertBoxedSequenceType(seqTy, fileAttr, scope, declOp, false, false); if (auto 
heapTy = mlir::dyn_cast_or_null(elTy)) return convertPointerLikeType(heapTy.getElementType(), fileAttr, scope, - loc, /*genAllocated=*/true, + declOp, /*genAllocated=*/true, /*genAssociated=*/false); if (auto ptrTy = mlir::dyn_cast_or_null(elTy)) return convertPointerLikeType(ptrTy.getElementType(), fileAttr, scope, - loc, /*genAllocated=*/false, + declOp, /*genAllocated=*/false, /*genAssociated=*/true); return genPlaceholderType(context); } else { diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h index ec881e8be7cadc..5ab6ca5e9f880e 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h @@ -13,6 +13,7 @@ #ifndef FORTRAN_OPTIMIZER_TRANSFORMS_DEBUGTYPEGENERATOR_H #define FORTRAN_OPTIMIZER_TRANSFORMS_DEBUGTYPEGENERATOR_H +#include "flang/Optimizer/CodeGen/CGOps.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/Dialect/Support/KindMapping.h" @@ -28,33 +29,34 @@ class DebugTypeGenerator { mlir::LLVM::DITypeAttr convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, - mlir::Location loc); + fir::cg::XDeclareOp declOp); private: mlir::LLVM::DITypeAttr convertSequenceType(fir::SequenceType seqTy, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, - mlir::Location loc); + fir::cg::XDeclareOp declOp); /// The 'genAllocated' is true when we want to generate 'allocated' field /// in the DICompositeType. It is needed for the allocatable arrays. /// Similarly, 'genAssociated' is used with 'pointer' type to generate /// 'associated' field. 
- mlir::LLVM::DITypeAttr - convertBoxedSequenceType(fir::SequenceType seqTy, - mlir::LLVM::DIFileAttr fileAttr, - mlir::LLVM::DIScopeAttr scope, mlir::Location loc, - bool genAllocated, bool genAssociated); + mlir::LLVM::DITypeAttr convertBoxedSequenceType( + fir::SequenceType seqTy, mlir::LLVM::DIFileAttr fileAttr, + mlir::LLVM::DIScopeAttr scope, fir::cg::XDeclareOp declOp, + bool genAllocated, bool genAssociated); mlir::LLVM::DITypeAttr convertCharacterType(fir::CharacterType charTy, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, - mlir::Location loc, + fir::cg::XDeclareOp declOp, bool hasDescriptor); - mlir::LLVM::DITypeAttr - convertPointerLikeType(mlir::Type elTy, mlir::LLVM::DIFileAttr fileAttr, - mlir::LLVM::DIScopeAttr scope, mlir::Location loc, - bool genAllocated, bool genAssociated); + mlir::LLVM::DITypeAttr convertPointerLikeType(mlir::Type elTy, + mlir::LLVM::DIFileAttr fileAttr, + mlir::LLVM::DIScopeAttr scope, + fir::cg::XDeclareOp declOp, + bool genAllocated, + bool genAssociated); mlir::ModuleOp module; KindMapping kindMapping; diff --git a/flang/test/Integration/debug-fixed-array-type-2.f90 b/flang/test/Integration/debug-fixed-array-type-2.f90 index b34413458ad8d3..705c1da593c705 100644 --- a/flang/test/Integration/debug-fixed-array-type-2.f90 +++ b/flang/test/Integration/debug-fixed-array-type-2.f90 @@ -1,19 +1,22 @@ ! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s -program mn - +module test integer d1(3) - integer d2(2, 5) - real d3(6, 8, 7) + integer d2(1:4, -1:3) + real d3(-2:6, 0:5, 3:7) +end + +program mn + use test i8 = fn1(d1, d2, d3) contains function fn1(a1, b1, c1) result (res) integer a1(3) - integer b1(2, 5) - real c1(6, 8, 7) + integer b1(-1:0, 5:9) + real c1(-2:6, 0:5, 3:7) integer res - res = a1(1) + b1(1,2) + c1(3, 3, 4) + res = a1(1) + b1(0,6) + c1(3, 3, 4) end function end program @@ -24,17 +27,26 @@ function fn1(a1, b1, c1) result (res) ! CHECK-DAG: ![[SUB1:.*]] = !{![[R1]]} ! 
CHECK-DAG: ![[D1TY:.*]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[INT]], elements: ![[SUB1]]) -! CHECK-DAG: ![[R21:.*]] = !DISubrange(count: 2, lowerBound: 1) -! CHECK-DAG: ![[R22:.*]] = !DISubrange(count: 5, lowerBound: 1) +! CHECK-DAG: ![[R21:.*]] = !DISubrange(count: 4, lowerBound: 1) +! CHECK-DAG: ![[R22:.*]] = !DISubrange(count: 5, lowerBound: -1) ! CHECK-DAG: ![[SUB2:.*]] = !{![[R21]], ![[R22]]} ! CHECK-DAG: ![[D2TY:.*]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[INT]], elements: ![[SUB2]]) -! CHECK-DAG: ![[R31:.*]] = !DISubrange(count: 6, lowerBound: 1) -! CHECK-DAG: ![[R32:.*]] = !DISubrange(count: 8, lowerBound: 1) -! CHECK-DAG: ![[R33:.*]] = !DISubrange(count: 7, lowerBound: 1) +! CHECK-DAG: ![[R31:.*]] = !DISubrange(count: 9, lowerBound: -2) +! CHECK-DAG: ![[R32:.*]] = !DISubrange(count: 6, lowerBound: 0) +! CHECK-DAG: ![[R33:.*]] = !DISubrange(count: 5, lowerBound: 3) ! CHECK-DAG: ![[SUB3:.*]] = !{![[R31]], ![[R32]], ![[R33]]} ! CHECK-DAG: ![[D3TY:.*]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[REAL]], elements: ![[SUB3]]) +! CHECK-DAG: ![[B11:.*]] = !DISubrange(count: 2, lowerBound: -1) +! CHECK-DAG: ![[B12:.*]] = !DISubrange(count: 5, lowerBound: 5) +! CHECK-DAG: ![[B1:.*]] = !{![[B11]], ![[B12]]} +! CHECK-DAG: ![[B1TY:.*]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[INT]], elements: ![[B1]]) + +! CHECK-DAG: {{.*}}!DIGlobalVariable(name: "d1"{{.*}}type: ![[D1TY]]{{.*}}) +! CHECK-DAG: {{.*}}!DIGlobalVariable(name: "d2"{{.*}}type: ![[D2TY]]{{.*}}) +! CHECK-DAG: {{.*}}!DIGlobalVariable(name: "d3"{{.*}}type: ![[D3TY]]{{.*}}) + ! CHECK-DAG: !DILocalVariable(name: "a1", arg: 1{{.*}}type: ![[D1TY]]) -! CHECK-DAG: !DILocalVariable(name: "b1", arg: 2{{.*}}type: ![[D2TY]]) +! CHECK-DAG: !DILocalVariable(name: "b1", arg: 2{{.*}}type: ![[B1TY]]) ! 
CHECK-DAG: !DILocalVariable(name: "c1", arg: 3{{.*}}type: ![[D3TY]]) diff --git a/flang/test/Transforms/debug-fixed-array-type.fir b/flang/test/Transforms/debug-fixed-array-type.fir index d4ed0b97020898..1a7d8115908a07 100644 --- a/flang/test/Transforms/debug-fixed-array-type.fir +++ b/flang/test/Transforms/debug-fixed-array-type.fir @@ -8,12 +8,16 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<>} { %c5 = arith.constant 5 : index %c2 = arith.constant 2 : index %c3 = arith.constant 3 : index + %c-2 = arith.constant -2 : index loc(#loc3) + %c4 = arith.constant 4 : index loc(#loc3) %0 = fir.alloca !fir.array<3xi32> {bindc_name = "d1", uniq_name = "_QFEd1"} %1 = fircg.ext_declare %0(%c3) {uniq_name = "_QFEd1"} : (!fir.ref>, index) -> !fir.ref> loc(#loc1) %2 = fir.address_of(@_QFEd2) : !fir.ref> %3 = fircg.ext_declare %2(%c2, %c5) {uniq_name = "_QFEd2"} : (!fir.ref>, index, index) -> !fir.ref> loc(#loc2) %4 = fir.address_of(@_QFEd3) : !fir.ref> %5 = fircg.ext_declare %4(%c6, %c8, %c7) {uniq_name = "_QFEd3"} : (!fir.ref>, index, index, index) -> !fir.ref> loc(#loc3) + %6 = fir.address_of(@_QFEd4) : !fir.ref> + %7 = fircg.ext_declare %6(%c6, %c7) origin %c-2, %c4 {uniq_name = "_QFEd4"} : (!fir.ref>, index, index, index, index) -> !fir.ref> loc(#loc5) return } loc(#loc4) } @@ -22,6 +26,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<>} { #loc2 = loc("test.f90":6:11) #loc3 = loc("test.f90":7:11) #loc4 = loc("test.f90":2:8) +#loc5 = loc("test.f90":8:11) // CHECK-DAG: #[[INT:.*]] = #llvm.di_basic_type @@ -29,6 +34,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<>} { // CHECK-DAG: #[[D1TY:.*]] = #llvm.di_composite_type> // CHECK-DAG: #[[D2TY:.*]] = #llvm.di_composite_type, #llvm.di_subrange> // CHECK-DAG: #[[D3TY:.*]] = #llvm.di_composite_type, #llvm.di_subrange, #llvm.di_subrange> +// CHECK-DAG: #[[D4TY:.*]] = #llvm.di_composite_type, #llvm.di_subrange> // CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "d1"{{.*}}type = #[[D1TY]]> // CHECK-DAG: 
#llvm.di_local_variable<{{.*}}name = "d2"{{.*}}type = #[[D2TY]]> // CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "d3"{{.*}}type = #[[D3TY]]> +// CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "d4"{{.*}}type = #[[D4TY]]> From 839275d0536f992591f4c5d81e13a26e6095dda6 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Wed, 21 Aug 2024 16:57:31 +0100 Subject: [PATCH 081/426] [MLIR][OpenMP] Add missing OpenMP to LLVM conversion patterns (#104440) This patch adds conversion patterns to LLVM for the following OpenMP dialect operations: - `omp.critical.declare` - `omp.cancel` - `omp.cancellation_point` - `omp.distribute` - `omp.teams` - `omp.ordered` - `omp.taskloop` Also, arbitrary sorting of operations when passing them as template argument lists when configuring that pass is replaced by alphabetical sorting. --- .../Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp | 69 ++++++++------ .../OpenMPToLLVM/convert-to-llvmir.mlir | 94 +++++++++++++++++++ 2 files changed, 132 insertions(+), 31 deletions(-) diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp index f6a6d1d7228a06..d6b4ec8584b082 100644 --- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp +++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp @@ -223,22 +223,21 @@ void MultiRegionOpConversion::forwardOpAttrs( void mlir::configureOpenMPToLLVMConversionLegality( ConversionTarget &target, LLVMTypeConverter &typeConverter) { target.addDynamicallyLegalOp< - mlir::omp::AtomicReadOp, mlir::omp::AtomicWriteOp, mlir::omp::FlushOp, - mlir::omp::ThreadprivateOp, mlir::omp::YieldOp, - mlir::omp::TargetEnterDataOp, mlir::omp::TargetExitDataOp, - mlir::omp::TargetUpdateOp, mlir::omp::MapBoundsOp, mlir::omp::MapInfoOp>( - [&](Operation *op) { - return typeConverter.isLegal(op->getOperandTypes()) && - typeConverter.isLegal(op->getResultTypes()); - }); + omp::AtomicReadOp, omp::AtomicWriteOp, omp::CancellationPointOp, + omp::CancelOp, omp::CriticalDeclareOp, 
omp::FlushOp, omp::MapBoundsOp, + omp::MapInfoOp, omp::OrderedOp, omp::TargetEnterDataOp, + omp::TargetExitDataOp, omp::TargetUpdateOp, omp::ThreadprivateOp, + omp::YieldOp>([&](Operation *op) { + return typeConverter.isLegal(op->getOperandTypes()) && + typeConverter.isLegal(op->getResultTypes()); + }); target.addDynamicallyLegalOp< - mlir::omp::AtomicUpdateOp, mlir::omp::CriticalOp, mlir::omp::TargetOp, - mlir::omp::TargetDataOp, mlir::omp::LoopNestOp, - mlir::omp::OrderedRegionOp, mlir::omp::ParallelOp, mlir::omp::WsloopOp, - mlir::omp::SimdOp, mlir::omp::MasterOp, mlir::omp::SectionOp, - mlir::omp::SectionsOp, mlir::omp::SingleOp, mlir::omp::TaskgroupOp, - mlir::omp::TaskOp, mlir::omp::DeclareReductionOp, - mlir::omp::PrivateClauseOp>([&](Operation *op) { + omp::AtomicUpdateOp, omp::CriticalOp, omp::DeclareReductionOp, + omp::DistributeOp, omp::LoopNestOp, omp::MasterOp, omp::OrderedRegionOp, + omp::ParallelOp, omp::PrivateClauseOp, omp::SectionOp, omp::SectionsOp, + omp::SimdOp, omp::SingleOp, omp::TargetDataOp, omp::TargetOp, + omp::TaskgroupOp, omp::TaskloopOp, omp::TaskOp, omp::TeamsOp, + omp::WsloopOp>([&](Operation *op) { return std::all_of(op->getRegions().begin(), op->getRegions().end(), [&](Region ®ion) { return typeConverter.isLegal(®ion); @@ -260,23 +259,31 @@ void mlir::populateOpenMPToLLVMConversionPatterns(LLVMTypeConverter &converter, AtomicReadOpConversion, MapInfoOpConversion, MultiRegionOpConversion, MultiRegionOpConversion, - RegionOpConversion, RegionOpConversion, - RegionOpConversion, - RegionOpConversion, - RegionOpConversion, RegionOpConversion, - RegionOpConversion, RegionOpConversion, - RegionOpConversion, RegionOpConversion, - RegionOpConversion, RegionOpConversion, - RegionOpConversion, RegionOpConversion, - RegionLessOpWithVarOperandsConversion, - RegionOpWithVarOperandsConversion, - RegionLessOpWithVarOperandsConversion, - RegionLessOpWithVarOperandsConversion, - RegionLessOpConversion, + RegionLessOpConversion, + 
RegionLessOpConversion, + RegionLessOpConversion, + RegionLessOpConversion, RegionLessOpConversion, RegionLessOpConversion, RegionLessOpConversion, - RegionLessOpWithVarOperandsConversion>(converter); + RegionLessOpConversion, + RegionLessOpWithVarOperandsConversion, + RegionLessOpWithVarOperandsConversion, + RegionLessOpWithVarOperandsConversion, + RegionLessOpWithVarOperandsConversion, + RegionOpConversion, + RegionOpConversion, + RegionOpConversion, + RegionOpConversion, RegionOpConversion, + RegionOpConversion, + RegionOpConversion, + RegionOpConversion, RegionOpConversion, + RegionOpConversion, RegionOpConversion, + RegionOpConversion, RegionOpConversion, + RegionOpConversion, RegionOpConversion, + RegionOpConversion, RegionOpConversion, + RegionOpConversion, RegionOpConversion, + RegionOpWithVarOperandsConversion>(converter); } namespace { @@ -301,8 +308,8 @@ void ConvertOpenMPToLLVMPass::runOnOperation() { populateOpenMPToLLVMConversionPatterns(converter, patterns); LLVMConversionTarget target(getContext()); - target.addLegalOp(); + target.addLegalOp(); configureOpenMPToLLVMConversionLegality(target, converter); if (failed(applyPartialConversion(module, target, std::move(patterns)))) signalPassFailure(); diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir index d81487daf34f68..5afdbaa2a56af3 100644 --- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir +++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir @@ -18,6 +18,20 @@ func.func @critical_block_arg() { // ----- +// CHECK: omp.critical.declare @[[MUTEX:.*]] hint(contended, speculative) +omp.critical.declare @mutex hint(contended, speculative) + +// CHECK: llvm.func @critical_declare +func.func @critical_declare() { + // CHECK: omp.critical(@[[MUTEX]]) + omp.critical(@mutex) { + omp.terminator + } + return +} + +// ----- + // CHECK-LABEL: llvm.func @master_block_arg func.func @master_block_arg() { // CHECK: 
omp.master @@ -523,3 +537,83 @@ omp.private {type = firstprivate} @y.privatizer : index alloc { // CHECK: omp.yield(%arg0 : i64) omp.yield(%arg0 : index) } + +// ----- + +// CHECK-LABEL: llvm.func @omp_cancel_cancellation_point() +func.func @omp_cancel_cancellation_point() -> () { + omp.parallel { + // CHECK: omp.cancel cancellation_construct_type(parallel) + omp.cancel cancellation_construct_type(parallel) + // CHECK: omp.cancellation_point cancellation_construct_type(parallel) + omp.cancellation_point cancellation_construct_type(parallel) + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: llvm.func @omp_distribute( +// CHECK-SAME: %[[ARG0:.*]]: i64) +func.func @omp_distribute(%arg0 : index) -> () { + // CHECK: omp.distribute dist_schedule_static dist_schedule_chunk_size(%[[ARG0]] : i64) { + omp.distribute dist_schedule_static dist_schedule_chunk_size(%arg0 : index) { + omp.loop_nest (%iv) : index = (%arg0) to (%arg0) step (%arg0) { + omp.yield + } + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: llvm.func @omp_teams( +// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr, %[[ARG1:.*]]: !llvm.ptr, %[[ARG2:.*]]: i64) +func.func @omp_teams(%arg0 : memref) -> () { + // CHECK: omp.teams allocate(%{{.*}} : !llvm.struct<(ptr, ptr, i64)> -> %{{.*}} : !llvm.struct<(ptr, ptr, i64)>) + omp.teams allocate(%arg0 : memref -> %arg0 : memref) { + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: llvm.func @omp_ordered( +// CHECK-SAME: %[[ARG0:.*]]: i64) +func.func @omp_ordered(%arg0 : index) -> () { + omp.wsloop ordered(1) { + omp.loop_nest (%iv) : index = (%arg0) to (%arg0) step (%arg0) { + // CHECK: omp.ordered depend_vec(%[[ARG0]] : i64) {doacross_num_loops = 1 : i64} + omp.ordered depend_vec(%arg0 : index) {doacross_num_loops = 1 : i64} + omp.yield + } + omp.terminator + } + return +} + +// ----- + +// CHECK-LABEL: @omp_taskloop( +// CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: !llvm.ptr, %[[ARG2:.*]]: !llvm.ptr, %[[ARG3:.*]]: i64) +func.func 
@omp_taskloop(%arg0: index, %arg1 : memref) { + // CHECK: omp.parallel { + omp.parallel { + // CHECK: omp.taskloop allocate(%{{.*}} : !llvm.struct<(ptr, ptr, i64)> -> %{{.*}} : !llvm.struct<(ptr, ptr, i64)>) { + omp.taskloop allocate(%arg1 : memref -> %arg1 : memref) { + // CHECK: omp.loop_nest (%[[IV:.*]]) : i64 = (%[[ARG0]]) to (%[[ARG0]]) step (%[[ARG0]]) { + omp.loop_nest (%iv) : index = (%arg0) to (%arg0) step (%arg0) { + // CHECK-DAG: %[[CAST_IV:.*]] = builtin.unrealized_conversion_cast %[[IV]] : i64 to index + // CHECK: "test.payload"(%[[CAST_IV]]) : (index) -> () + "test.payload"(%iv) : (index) -> () + omp.yield + } + omp.terminator + } + omp.terminator + } + return +} From 6816a137985bfa38cda20b9cd4e23c361c3bd0de Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Wed, 21 Aug 2024 16:58:57 +0100 Subject: [PATCH 082/426] [flang][Driver] Remove misleading test comment (#105528) The test initially worked on ArmPL but this was changed during code review and I neglected to fix this comment. Thanks for pointing this out @banach-space --- flang/test/Driver/fveclib-codegen.f90 | 1 - 1 file changed, 1 deletion(-) diff --git a/flang/test/Driver/fveclib-codegen.f90 b/flang/test/Driver/fveclib-codegen.f90 index 3a96c29ac70854..3720b9e597f5b5 100644 --- a/flang/test/Driver/fveclib-codegen.f90 +++ b/flang/test/Driver/fveclib-codegen.f90 @@ -1,5 +1,4 @@ ! test that -fveclib= is passed to the backend -! -target aarch64 so that ArmPL is available ! RUN: %if aarch64-registered-target %{ %flang -S -Ofast -target aarch64-unknown-linux-gnu -fveclib=LIBMVEC -o - %s | FileCheck %s %} ! RUN: %if x86-registered-target %{ %flang -S -Ofast -target x86_64-unknown-linux-gnu -fveclib=LIBMVEC -o - %s | FileCheck %s %} ! 
RUN: %flang -S -Ofast -fveclib=NoLibrary -o - %s | FileCheck %s --check-prefix=NOLIB From e49068624c48f4d906707b32b31f6a1d561605be Mon Sep 17 00:00:00 2001 From: Alex Rice Date: Wed, 21 Aug 2024 17:14:33 +0100 Subject: [PATCH 083/426] [mlir] [tablegen] Make `hasSummary` and `hasDescription` useful (#105531) The `hasSummary` and `hasDescription` functions are currently useless as they check if the corresponding `summary` and `description` are present. However, these values are set to a default value of `""`, and so these functions always return true. This PR changes these functions to check if the summary and description are just whitespace, which is presumably closer to their original intent. @math-fehr @zero9178 --- mlir/lib/TableGen/Operator.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp index bd3e3b1c6b7ccf..76af82a827da13 100644 --- a/mlir/lib/TableGen/Operator.cpp +++ b/mlir/lib/TableGen/Operator.cpp @@ -798,14 +798,14 @@ const InferredResultType &Operator::getInferredResultType(int index) const { ArrayRef Operator::getLoc() const { return def.getLoc(); } bool Operator::hasDescription() const { - return def.getValue("description") != nullptr; + return !getDescription().trim().empty(); } StringRef Operator::getDescription() const { return def.getValueAsString("description"); } -bool Operator::hasSummary() const { return def.getValue("summary") != nullptr; } +bool Operator::hasSummary() const { return !getSummary().trim().empty(); } StringRef Operator::getSummary() const { return def.getValueAsString("summary"); From 625841c3be4dbaab089c01217726a2906f3a8103 Mon Sep 17 00:00:00 2001 From: magic-akari Date: Thu, 22 Aug 2024 00:22:21 +0800 Subject: [PATCH 084/426] [clang-format] Use double hyphen for multiple-letter flags (#100978) - Closes: #100974 --- clang/tools/clang-format/clang-format-diff.py | 8 ++++---- clang/tools/clang-format/clang-format-sublime.py | 8 ++++---- 
clang/tools/clang-format/clang-format.el | 14 +++++++------- clang/tools/clang-format/clang-format.py | 16 ++++++++-------- clang/tools/clang-format/git-clang-format | 6 +++--- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/clang/tools/clang-format/clang-format-diff.py b/clang/tools/clang-format/clang-format-diff.py index 3a74b90e731578..9eec0f3c89de37 100755 --- a/clang/tools/clang-format/clang-format-diff.py +++ b/clang/tools/clang-format/clang-format-diff.py @@ -134,7 +134,7 @@ def main(): if line_count != 0: end_line += line_count - 1 lines_by_file.setdefault(filename, []).extend( - ["-lines", str(start_line) + ":" + str(end_line)] + ["--lines", str(start_line) + ":" + str(end_line)] ) # Reformat files containing changes in place. @@ -146,12 +146,12 @@ def main(): if args.i: command.append("-i") if args.sort_includes: - command.append("-sort-includes") + command.append("--sort-includes") command.extend(lines) if args.style: - command.extend(["-style", args.style]) + command.extend(["--style", args.style]) if args.fallback_style: - command.extend(["-fallback-style", args.fallback_style]) + command.extend(["--fallback-style", args.fallback_style]) try: p = subprocess.Popen( diff --git a/clang/tools/clang-format/clang-format-sublime.py b/clang/tools/clang-format/clang-format-sublime.py index dcd72e68e94faa..8d41da332c1889 100644 --- a/clang/tools/clang-format/clang-format-sublime.py +++ b/clang/tools/clang-format/clang-format-sublime.py @@ -35,18 +35,18 @@ def run(self, edit): regions = [] command = [binary] if style: - command.extend(["-style", style]) + command.extend(["--style", style]) for region in self.view.sel(): regions.append(region) region_offset = min(region.a, region.b) region_length = abs(region.b - region.a) command.extend( [ - "-offset", + "--offset", str(region_offset), - "-length", + "--length", str(region_length), - "-assume-filename", + "--assume-filename", str(self.view.file_name()), ] ) diff --git 
a/clang/tools/clang-format/clang-format.el b/clang/tools/clang-format/clang-format.el index f43bf063c62970..f3da5415f8672b 100644 --- a/clang/tools/clang-format/clang-format.el +++ b/clang/tools/clang-format/clang-format.el @@ -166,19 +166,19 @@ uses the function `buffer-file-name'." (let ((status (apply #'call-process-region nil nil clang-format-executable nil `(,temp-buffer ,temp-file) nil - `("-output-replacements-xml" + `("--output-replacements-xml" ;; Guard against a nil assume-file-name. ;; If the clang-format option -assume-filename ;; is given a blank string it will crash as per ;; the following bug report ;; https://bugs.llvm.org/show_bug.cgi?id=34667 ,@(and assume-file-name - (list "-assume-filename" assume-file-name)) - ,@(and style (list "-style" style)) - "-fallback-style" ,clang-format-fallback-style - "-offset" ,(number-to-string file-start) - "-length" ,(number-to-string (- file-end file-start)) - "-cursor" ,(number-to-string cursor)))) + (list "--assume-filename" assume-file-name)) + ,@(and style (list "--style" style)) + "--fallback-style" ,clang-format-fallback-style + "--offset" ,(number-to-string file-start) + "--length" ,(number-to-string (- file-end file-start)) + "--cursor" ,(number-to-string cursor)))) (stderr (with-temp-buffer (unless (zerop (cadr (insert-file-contents temp-file))) (insert ": ")) diff --git a/clang/tools/clang-format/clang-format.py b/clang/tools/clang-format/clang-format.py index 28e0d14a552fd1..07eebd27f49d11 100644 --- a/clang/tools/clang-format/clang-format.py +++ b/clang/tools/clang-format/clang-format.py @@ -78,7 +78,7 @@ def main(): # Determine range to format. 
if vim.eval('exists("l:lines")') == "1": - lines = ["-lines", vim.eval("l:lines")] + lines = ["--lines", vim.eval("l:lines")] elif vim.eval('exists("l:formatdiff")') == "1" and os.path.exists( vim.current.buffer.name ): @@ -88,12 +88,12 @@ def main(): lines = [] for op in reversed(sequence.get_opcodes()): if op[0] not in ["equal", "delete"]: - lines += ["-lines", "%s:%s" % (op[3] + 1, op[4])] + lines += ["--lines", "%s:%s" % (op[3] + 1, op[4])] if lines == []: return else: lines = [ - "-lines", + "--lines", "%s:%s" % (vim.current.range.start + 1, vim.current.range.end + 1), ] @@ -116,15 +116,15 @@ def main(): startupinfo.wShowWindow = subprocess.SW_HIDE # Call formatter. - command = [binary, "-cursor", str(cursor_byte)] - if lines != ["-lines", "all"]: + command = [binary, "--cursor", str(cursor_byte)] + if lines != ["--lines", "all"]: command += lines if style: - command.extend(["-style", style]) + command.extend(["--style", style]) if fallback_style: - command.extend(["-fallback-style", fallback_style]) + command.extend(["--fallback-style", fallback_style]) if vim.current.buffer.name: - command.extend(["-assume-filename", vim.current.buffer.name]) + command.extend(["--assume-filename", vim.current.buffer.name]) p = subprocess.Popen( command, stdout=subprocess.PIPE, diff --git a/clang/tools/clang-format/git-clang-format b/clang/tools/clang-format/git-clang-format index 714ba8a6e77d51..bacbd8de245666 100755 --- a/clang/tools/clang-format/git-clang-format +++ b/clang/tools/clang-format/git-clang-format @@ -510,12 +510,12 @@ def clang_format_to_blob(filename, line_ranges, revision=None, Returns the object ID (SHA-1) of the created blob.""" clang_format_cmd = [binary] if style: - clang_format_cmd.extend(['-style='+style]) + clang_format_cmd.extend(['--style='+style]) clang_format_cmd.extend([ - '-lines=%s:%s' % (start_line, start_line+line_count-1) + '--lines=%s:%s' % (start_line, start_line+line_count-1) for start_line, line_count in line_ranges]) if revision is not 
None: - clang_format_cmd.extend(['-assume-filename='+filename]) + clang_format_cmd.extend(['--assume-filename='+filename]) git_show_cmd = ['git', 'cat-file', 'blob', '%s:%s' % (revision, filename)] git_show = subprocess.Popen(git_show_cmd, env=env, stdin=subprocess.PIPE, stdout=subprocess.PIPE) From f7bbc40b0736cc417f57cd039b098b504cf6a71f Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Wed, 7 Aug 2024 14:29:27 +0000 Subject: [PATCH 085/426] [ELF,test] Enhance hip-section-layout.s Check different object file order Change-Id: I6096c12e29e9ddb6b3053f977e4cbb24eea9b7d3 --- lld/test/ELF/hip-section-layout.s | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lld/test/ELF/hip-section-layout.s b/lld/test/ELF/hip-section-layout.s index c76df50919e6d0..b76141c6b41aec 100644 --- a/lld/test/ELF/hip-section-layout.s +++ b/lld/test/ELF/hip-section-layout.s @@ -7,8 +7,10 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux --defsym=NON_HIP_SECTIONS=1 %s -o %t.1.o # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux --defsym=HIP_SECTIONS=1 %s -o %t.2.o -# RUN: ld.lld %t.1.o %t.2.o -o %t.s.out -# RUN: llvm-readobj --sections %t.s.out | FileCheck %s +# RUN: ld.lld %t.1.o %t.2.o -o %t.1.s.out +# RUN: llvm-readobj --sections %t.1.s.out | FileCheck %s +# RUN: ld.lld %t.2.o %t.1.o -o %t.2.s.out +# RUN: llvm-readobj --sections %t.2.s.out | FileCheck %s .ifdef HIP_SECTIONS .section .hipFatBinSegment,"aw",@progbits; .space 1 From f03b7830902225a8910d2972c39143355795efa9 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee Date: Wed, 21 Aug 2024 09:32:51 -0700 Subject: [PATCH 086/426] [CGData] Rename CodeGenDataTests to CGDataTests (#105463) This addresses the comment for https://github.com/llvm/llvm-project/pull/101461. 
--- llvm/unittests/CGData/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/unittests/CGData/CMakeLists.txt b/llvm/unittests/CGData/CMakeLists.txt index 9cedab56d3f6bc..792b323130b474 100644 --- a/llvm/unittests/CGData/CMakeLists.txt +++ b/llvm/unittests/CGData/CMakeLists.txt @@ -6,9 +6,9 @@ set(LLVM_LINK_COMPONENTS Support ) -add_llvm_unittest(CodeGenDataTests +add_llvm_unittest(CGDataTests OutlinedHashTreeRecordTest.cpp OutlinedHashTreeTest.cpp ) -target_link_libraries(CodeGenDataTests PRIVATE LLVMTestingSupport) +target_link_libraries(CGDataTests PRIVATE LLVMTestingSupport) From 216d6a06524e4a8ebd6de2806c473b92d3349c4e Mon Sep 17 00:00:00 2001 From: Chenguang Wang Date: Wed, 21 Aug 2024 09:54:57 -0700 Subject: [PATCH 087/426] [bazel] Fix mlir build broken by 681ae097. (#105552) The cmake config creates two targets, `MLIRTensorMeshShardingExtensions` and `MLIRTensorAllExtensions`; but for bazel, with the `Func` dialect we only have a single `FuncExtensions`. Here I am following the `Func` dialect convension to only create a single `TensorExtensions`. 
--- .../llvm-project-overlay/mlir/BUILD.bazel | 39 +++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 57b08448ae9294..ddb08f12f04976 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3337,25 +3337,6 @@ cc_library( ], ) -cc_library( - name = "TensorShardingInterfaceImpl", - srcs = ["lib/Dialect/Mesh/Interfaces/TensorShardingInterfaceImpl.cpp"], - hdrs = [ - "include/mlir/Dialect/Mesh/IR/TensorShardingInterfaceImpl.h", - ], - includes = ["include"], - deps = [ - ":DialectUtils", - ":IR", - ":MeshDialect", - ":MeshShardingInterface", - ":MeshShardingInterfaceIncGen", - ":Support", - ":TensorDialect", - "//llvm:Support", - ], -) - cc_library( name = "MeshDialect", srcs = ["lib/Dialect/Mesh/IR/MeshOps.cpp"], @@ -4890,6 +4871,7 @@ cc_library( ":ROCDLToLLVMIRTranslation", ":SCFTransformOps", ":SparseTensorTransformOps", + ":TensorExtensions", ":TensorTransformOps", ":TransformDebugExtension", ":TransformIRDLExtension", @@ -7600,6 +7582,7 @@ cc_library( "lib/Dialect/Tensor/IR/ValueBoundsOpInterfaceImpl.cpp", ], hdrs = [ + "include/mlir/Dialect/Tensor/IR/ShardingInterfaceImpl.h", "include/mlir/Dialect/Tensor/IR/Tensor.h", "include/mlir/Dialect/Tensor/IR/ValueBoundsOpInterfaceImpl.h", ], @@ -7669,6 +7652,23 @@ cc_library( ], ) +cc_library( + name = "TensorExtensions", + srcs = glob(["lib/Dialect/Tensor/Extensions/*.cpp"]), + hdrs = glob(["include/mlir/Dialect/Tensor/Extensions/*.h"]), + includes = ["include"], + deps = [ + ":DialectUtils", + ":IR", + ":MeshDialect", + ":MeshShardingInterface", + ":MeshShardingInterfaceIncGen", + ":Support", + ":TensorDialect", + "//llvm:Support", + ], +) + cc_library( name = "TensorUtils", srcs = ["lib/Dialect/Tensor/Utils/Utils.cpp"], @@ -9603,7 +9603,6 @@ cc_library( ":SparseTensorTransforms", ":TensorDialect", 
":TensorInferTypeOpInterfaceImpl", - ":TensorShardingInterfaceImpl", ":TensorTilingInterfaceImpl", ":TensorTransformOps", ":TensorTransforms", From 3b7611594f010ecd5233ab9580b2feb88837f9ef Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Wed, 21 Aug 2024 10:01:35 -0700 Subject: [PATCH 088/426] [Offload] Improve error reporting on memory faults (#104254) Since we can already track allocations, we can diagnose memory faults to some degree. If the fault happens in a prior allocation (use after free) or "close but outside" one, we can provide that information to the user. Note that the fault address might be page aligned, and not all accesses trigger a fault, especially for allocations that are backed by a MemoryManager. Still, if people disable the MemoryManager or the allocation is big enough, we can sometimes provide valueable feedback. --- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 12 +++- .../common/include/ErrorReporting.h | 67 +++++++++++++++++-- .../common/include/PluginInterface.h | 46 +++++++++++-- offload/test/sanitizer/double_free.c | 6 +- offload/test/sanitizer/double_free_racy.c | 2 +- offload/test/sanitizer/free_wrong_ptr_kind.c | 2 +- .../test/sanitizer/free_wrong_ptr_kind.cpp | 2 +- offload/test/sanitizer/ptr_outside_alloc_1.c | 40 +++++++++++ offload/test/sanitizer/ptr_outside_alloc_2.c | 26 +++++++ offload/test/sanitizer/use_after_free_1.c | 39 +++++++++++ offload/test/sanitizer/use_after_free_2.c | 32 +++++++++ 11 files changed, 256 insertions(+), 18 deletions(-) create mode 100644 offload/test/sanitizer/ptr_outside_alloc_1.c create mode 100644 offload/test/sanitizer/ptr_outside_alloc_2.c create mode 100644 offload/test/sanitizer/use_after_free_1.c create mode 100644 offload/test/sanitizer/use_after_free_2.c diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index a434a0089d5f94..86df4584db0914 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ 
b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3264,8 +3264,18 @@ struct AMDGPUPluginTy final : public GenericPluginTy { } if (DeviceNode != Node) continue; - + void *DevicePtr = (void *)Event->memory_fault.virtual_address; + std::string S; + llvm::raw_string_ostream OS(S); + OS << llvm::format("Memory access fault by GPU %" PRIu32 + " (agent 0x%" PRIx64 + ") at virtual address %p. Reasons: %s", + Node, Event->memory_fault.agent.handle, + (void *)Event->memory_fault.virtual_address, + llvm::join(Reasons, ", ").c_str()); ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord); + ErrorReporter::reportMemoryAccessError(AMDGPUDevice, DevicePtr, S, + /*Abort*/ true); } // Abort the execution since we do not recover from this error. diff --git a/offload/plugins-nextgen/common/include/ErrorReporting.h b/offload/plugins-nextgen/common/include/ErrorReporting.h index e557b32c2c24f8..8478977a8f86af 100644 --- a/offload/plugins-nextgen/common/include/ErrorReporting.h +++ b/offload/plugins-nextgen/common/include/ErrorReporting.h @@ -157,10 +157,13 @@ class ErrorReporter { if (ATI->HostPtr) print(BoldLightPurple, - "Last allocation of size %lu for host pointer %p:\n", ATI->Size, - ATI->HostPtr); + "Last allocation of size %lu for host pointer %p -> device pointer " + "%p:\n", + ATI->Size, ATI->HostPtr, ATI->DevicePtr); else - print(BoldLightPurple, "Last allocation of size %lu:\n", ATI->Size); + print(BoldLightPurple, + "Last allocation of size %lu -> device pointer %p:\n", ATI->Size, + ATI->DevicePtr); reportStackTrace(ATI->AllocationTrace); if (!ATI->LastAllocationInfo) return; @@ -174,10 +177,13 @@ class ErrorReporter { ATI->Size); reportStackTrace(ATI->DeallocationTrace); if (ATI->HostPtr) - print(BoldLightPurple, " #%u Prior allocation for host pointer %p:\n", - I, ATI->HostPtr); + print( + BoldLightPurple, + " #%u Prior allocation for host pointer %p -> device pointer %p:\n", + I, ATI->HostPtr, ATI->DevicePtr); else - print(BoldLightPurple, " #%u Prior 
allocation:\n", I); + print(BoldLightPurple, " #%u Prior allocation -> device pointer %p:\n", + I, ATI->DevicePtr); reportStackTrace(ATI->AllocationTrace); ++I; } @@ -219,6 +225,55 @@ class ErrorReporter { #undef DEALLOCATION_ERROR } + static void reportMemoryAccessError(GenericDeviceTy &Device, void *DevicePtr, + std::string &ErrorStr, bool Abort) { + reportError(ErrorStr.c_str()); + + if (!Device.OMPX_TrackAllocationTraces) { + print(Yellow, "Use '%s=true' to track device allocations\n", + Device.OMPX_TrackAllocationTraces.getName().data()); + if (Abort) + abortExecution(); + return; + } + uintptr_t Distance = false; + auto *ATI = + Device.getClosestAllocationTraceInfoForAddr(DevicePtr, Distance); + if (!ATI) { + print(Cyan, + "No host-issued allocations; device pointer %p might be " + "a global, stack, or shared location\n", + DevicePtr); + if (Abort) + abortExecution(); + return; + } + if (!Distance) { + print(Cyan, "Device pointer %p points into%s host-issued allocation:\n", + DevicePtr, ATI->DeallocationTrace.empty() ? "" : " prior"); + reportAllocationInfo(ATI); + if (Abort) + abortExecution(); + return; + } + + bool IsClose = Distance < (1L << 29L /*512MB=*/); + print(Cyan, + "Device pointer %p does not point into any (current or prior) " + "host-issued allocation%s.\n", + DevicePtr, + IsClose ? "" : " (might be a global, stack, or shared location)"); + if (IsClose) { + print(Cyan, + "Closest host-issued allocation (distance %" PRIuPTR + " byte%s; might be by page):\n", + Distance, Distance > 1 ? "s" : ""); + reportAllocationInfo(ATI); + } + if (Abort) + abortExecution(); + } + /// Report that a kernel encountered a trap instruction. 
static void reportTrapInKernel( GenericDeviceTy &Device, KernelTraceInfoRecordTy &KTIR, diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 81823338fe2112..7e3e788fa52dc9 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -938,6 +938,42 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// been deallocated, both for error reporting purposes. ProtectedObj> AllocationTraces; + /// Return the allocation trace info for a device pointer, that is the + /// allocation into which this device pointer points to (or pointed into). + AllocationTraceInfoTy *getAllocationTraceInfoForAddr(void *DevicePtr) { + auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor(); + for (auto &It : *AllocationTraceMap) { + if (It.first <= DevicePtr && + advanceVoidPtr(It.first, It.second->Size) > DevicePtr) + return It.second; + } + return nullptr; + } + + /// Return the allocation trace info for a device pointer, that is the + /// allocation into which this device pointer points to (or pointed into). + AllocationTraceInfoTy * + getClosestAllocationTraceInfoForAddr(void *DevicePtr, uintptr_t &Distance) { + Distance = 0; + if (auto *ATI = getAllocationTraceInfoForAddr(DevicePtr)) { + return ATI; + } + + AllocationTraceInfoTy *ATI = nullptr; + uintptr_t DevicePtrI = uintptr_t(DevicePtr); + auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor(); + for (auto &It : *AllocationTraceMap) { + uintptr_t Begin = uintptr_t(It.second->DevicePtr); + uintptr_t End = Begin + It.second->Size - 1; + uintptr_t ItDistance = std::min(Begin - DevicePtrI, DevicePtrI - End); + if (ATI && ItDistance > Distance) + continue; + ATI = It.second; + Distance = ItDistance; + } + return ATI; + } + /// Map to record kernel have been launchedl, for error reporting purposes. 
ProtectedObj KernelLaunchTraces; @@ -946,6 +982,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy { UInt32Envar OMPX_TrackNumKernelLaunches = UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0); + /// Environment variable to determine if stack traces for allocations and + /// deallocations are tracked. + BoolEnvar OMPX_TrackAllocationTraces = + BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false); + private: /// Get and set the stack size and heap size for the device. If not used, the /// plugin can implement the setters as no-op and setting the output @@ -996,11 +1037,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { UInt32Envar OMPX_InitialNumStreams; UInt32Envar OMPX_InitialNumEvents; - /// Environment variable to determine if stack traces for allocations and - /// deallocations are tracked. - BoolEnvar OMPX_TrackAllocationTraces = - BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false); - /// Array of images loaded into the device. Images are automatically /// deallocated by the allocator. 
llvm::SmallVector LoadedImages; diff --git a/offload/test/sanitizer/double_free.c b/offload/test/sanitizer/double_free.c index ca7310e34fc9d0..a3d8b06f1c7381 100644 --- a/offload/test/sanitizer/double_free.c +++ b/offload/test/sanitizer/double_free.c @@ -36,7 +36,7 @@ int main(void) { // NDEBG: main // DEBUG: main {{.*}}double_free.c:24 // -// CHECK: Last allocation of size 8: +// CHECK: Last allocation of size 8 -> device pointer // CHECK: dataAlloc // CHECK: omp_target_alloc // NDEBG: main @@ -49,7 +49,7 @@ int main(void) { // NDEBG: main // DEBUG: main {{.*}}double_free.c:22 // -// CHECK: #0 Prior allocation: +// CHECK: #0 Prior allocation -> device pointer // CHECK: dataAlloc // CHECK: omp_target_alloc // NDEBG: main @@ -61,7 +61,7 @@ int main(void) { // NDEBG: main // DEBUG: main {{.*}}double_free.c:20 // -// CHECK: #1 Prior allocation: +// CHECK: #1 Prior allocation -> device pointer // CHECK: dataAlloc // CHECK: omp_target_alloc // NDEBG: main diff --git a/offload/test/sanitizer/double_free_racy.c b/offload/test/sanitizer/double_free_racy.c index 3b4f2d5c51571c..4ebd8f36efa10c 100644 --- a/offload/test/sanitizer/double_free_racy.c +++ b/offload/test/sanitizer/double_free_racy.c @@ -28,6 +28,6 @@ int main(void) { // CHECK: dataDelete // CHECK: omp_target_free -// CHECK: Last allocation of size 8: +// CHECK: Last allocation of size 8 -> device pointer // CHECK: dataAlloc // CHECK: omp_target_alloc diff --git a/offload/test/sanitizer/free_wrong_ptr_kind.c b/offload/test/sanitizer/free_wrong_ptr_kind.c index 0c178541db1170..7c5a4ff7085024 100644 --- a/offload/test/sanitizer/free_wrong_ptr_kind.c +++ b/offload/test/sanitizer/free_wrong_ptr_kind.c @@ -28,7 +28,7 @@ int main(void) { // NDEBG: main // DEBUG: main {{.*}}free_wrong_ptr_kind.c:22 // -// CHECK: Last allocation of size 8: +// CHECK: Last allocation of size 8 -> device pointer // CHECK: dataAlloc // CHECK: llvm_omp_target_alloc_host // NDEBG: main diff --git 
a/offload/test/sanitizer/free_wrong_ptr_kind.cpp b/offload/test/sanitizer/free_wrong_ptr_kind.cpp index 87a52c5d4baf23..7ebb8c438433a9 100644 --- a/offload/test/sanitizer/free_wrong_ptr_kind.cpp +++ b/offload/test/sanitizer/free_wrong_ptr_kind.cpp @@ -31,7 +31,7 @@ int main(void) { // NDEBG: main // DEBUG: main {{.*}}free_wrong_ptr_kind.cpp:25 // -// CHECK: Last allocation of size 8: +// CHECK: Last allocation of size 8 -> device pointer // CHECK: dataAlloc // CHECK: llvm_omp_target_alloc_shared // NDEBG: main diff --git a/offload/test/sanitizer/ptr_outside_alloc_1.c b/offload/test/sanitizer/ptr_outside_alloc_1.c new file mode 100644 index 00000000000000..38742b783e8e9b --- /dev/null +++ b/offload/test/sanitizer/ptr_outside_alloc_1.c @@ -0,0 +1,40 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NTRCE +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum); +void llvm_omp_target_free_host(void *Ptr, int DeviceNum); + +int main() { + int N = (1 << 30); + char *A = (char *)llvm_omp_target_alloc_host(N, omp_get_default_device()); + char *P; +#pragma omp target map(from : P) + { + P = &A[0]; + *P = 3; + } + // clang-format off +// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. 
Reasons: {{.*}} +// NTRCE: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations +// TRACE: Device pointer [[PTR]] does not point into any (current or prior) host-issued allocation. +// TRACE: Closest host-issued allocation (distance 4096 bytes; might be by page): +// TRACE: Last allocation of size 1073741824 +// clang-format on +#pragma omp target + { P[-4] = 5; } + + llvm_omp_target_free_host(A, omp_get_default_device()); +} diff --git a/offload/test/sanitizer/ptr_outside_alloc_2.c b/offload/test/sanitizer/ptr_outside_alloc_2.c new file mode 100644 index 00000000000000..ac47c8922f09ef --- /dev/null +++ b/offload/test/sanitizer/ptr_outside_alloc_2.c @@ -0,0 +1,26 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +int main() { + int N = (1 << 30); + char *A = (char *)malloc(N); +#pragma omp target map(A[ : N]) + { A[N] = 3; } + // clang-format off +// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}} +// CHECK: Device pointer [[PTR]] does not point into any (current or prior) host-issued allocation. 
+// CHECK: Closest host-issued allocation (distance 1 byte; might be by page): +// CHECK: Last allocation of size 1073741824 +// clang-format on +} diff --git a/offload/test/sanitizer/use_after_free_1.c b/offload/test/sanitizer/use_after_free_1.c new file mode 100644 index 00000000000000..cebcdee1803475 --- /dev/null +++ b/offload/test/sanitizer/use_after_free_1.c @@ -0,0 +1,39 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NTRCE +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum); +void llvm_omp_target_free_host(void *Ptr, int DeviceNum); + +int main() { + int N = (1 << 30); + char *A = (char *)llvm_omp_target_alloc_host(N, omp_get_default_device()); + char *P; +#pragma omp target map(from : P) + { + P = &A[N / 2]; + *P = 3; + } + llvm_omp_target_free_host(A, omp_get_default_device()); + // clang-format off +// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. 
Reasons: {{.*}} +// NTRCE: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations +// TRACE: Device pointer [[PTR]] points into prior host-issued allocation: +// TRACE: Last deallocation: +// TRACE: Last allocation of size 1073741824 +// clang-format on +#pragma omp target + { *P = 5; } +} diff --git a/offload/test/sanitizer/use_after_free_2.c b/offload/test/sanitizer/use_after_free_2.c new file mode 100644 index 00000000000000..587d04a6ff3528 --- /dev/null +++ b/offload/test/sanitizer/use_after_free_2.c @@ -0,0 +1,32 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +int main() { + int N = (1 << 30); + char *A = (char *)malloc(N); + char *P; +#pragma omp target map(A[ : N]) map(from : P) + { + P = &A[N / 2]; + *P = 3; + } + // clang-format off +// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}} +// CHECK: Device pointer [[PTR]] points into prior host-issued allocation: +// CHECK: Last deallocation: +// CHECK: Last allocation of size 1073741824 +// clang-format on +#pragma omp target + { *P = 5; } +} From 1c9d8a62cb208afe1bc87669c7dd5d9590e615b2 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 21 Aug 2024 12:07:27 -0500 Subject: [PATCH 089/426] [libcxx] Add cache file for the GPU build (#99348) Summary: This patch adds a CMake cache config file for the GPU build. This cache will set the default required options when used from the LLVM runtime interface or directly. 
These options pretty much disable everything the GPU can't handle. With this and the following patches: #99259, #99243, #99287, and #99333, we will be able to build `libc++` targeting the GPU with an invocation like this. ``` $ cmake ../llvm -DRUNTIMES_nvptx64-nvidia-cuda_CACHE_FILES=${LLVM_SRC}/../libcxx/cmake/caches/NVPTX.cmake \ -DRUNTIMES_amdgcn-amd-amdhsa_CACHE_FILES=${LLVM_SRC}/../libcxx/cmake/caches/AMDGPU.cmake \ -DRUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES=compiler-rt;libc;libcxx \ -DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=compiler-rt;libc;libcxx \ -DLLVM_RUNTIME_TARGETS=amdgcn-amd-amdhsa;nvptx64-nvidia-cuda \ ``` This will then install the libraries and headers into the appropriate locations for use with `clang`. --- libcxx/cmake/caches/AMDGPU.cmake | 36 ++++++++++++++++++++++++++++++++ libcxx/cmake/caches/NVPTX.cmake | 36 ++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 libcxx/cmake/caches/AMDGPU.cmake create mode 100644 libcxx/cmake/caches/NVPTX.cmake diff --git a/libcxx/cmake/caches/AMDGPU.cmake b/libcxx/cmake/caches/AMDGPU.cmake new file mode 100644 index 00000000000000..0cd2eebfb9c16a --- /dev/null +++ b/libcxx/cmake/caches/AMDGPU.cmake @@ -0,0 +1,36 @@ +# Configuration options for libcxx. 
+set(LIBCXX_ABI_VERSION 2 CACHE STRING "") +set(LIBCXX_CXX_ABI libcxxabi CACHE STRING "") +set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "") +set(LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "") +set(LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "") +set(LIBCXX_ENABLE_MONOTONIC_CLOCK ON CACHE BOOL "") +set(LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS ON CACHE BOOL "") +set(LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "") +set(LIBCXX_ENABLE_RTTI OFF CACHE BOOL "") +set(LIBCXX_ENABLE_SHARED OFF CACHE BOOL "") +set(LIBCXX_ENABLE_STATIC_ABI_LIBRARY ON CACHE BOOL "") +set(LIBCXX_ENABLE_STATIC ON CACHE BOOL "") +set(LIBCXX_ENABLE_THREADS OFF CACHE BOOL "") +set(LIBCXX_ENABLE_UNICODE OFF CACHE BOOL "") +set(LIBCXX_ENABLE_WIDE_CHARACTERS OFF CACHE BOOL "") +set(LIBCXX_HAS_TERMINAL_AVAILABLE OFF CACHE BOOL "") +set(LIBCXX_INSTALL_LIBRARY ON CACHE BOOL "") +set(LIBCXX_LIBC "llvm-libc" CACHE STRING "") +set(LIBCXX_STATICALLY_LINK_ABI_IN_STATIC_LIBRARY ON CACHE BOOL "") +set(LIBCXX_USE_COMPILER_RT ON CACHE BOOL "") + +# Configuration options for libcxxabi. +set(LIBCXXABI_BAREMETAL ON CACHE BOOL "") +set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "") +set(LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS OFF CACHE BOOL "") +set(LIBCXXABI_ENABLE_SHARED OFF CACHE BOOL "") +set(LIBCXXABI_ENABLE_THREADS OFF CACHE BOOL "") +set(LIBCXXABI_USE_LLVM_UNWINDER OFF CACHE BOOL "") + +# Necessary compile flags for AMDGPU. +set(LIBCXX_ADDITIONAL_COMPILE_FLAGS + "-nogpulib;-flto;-fconvergent-functions;-Xclang;-mcode-object-version=none" CACHE STRING "") +set(LIBCXXABI_ADDITIONAL_COMPILE_FLAGS + "-nogpulib;-flto;-fconvergent-functions;-Xclang;-mcode-object-version=none" CACHE STRING "") +set(CMAKE_REQUIRED_FLAGS "-nogpulib -nodefaultlibs" CACHE STRING "") diff --git a/libcxx/cmake/caches/NVPTX.cmake b/libcxx/cmake/caches/NVPTX.cmake new file mode 100644 index 00000000000000..47a24a349e996e --- /dev/null +++ b/libcxx/cmake/caches/NVPTX.cmake @@ -0,0 +1,36 @@ +# Configuration options for libcxx. 
+set(LIBCXX_ABI_VERSION 2 CACHE STRING "") +set(LIBCXX_CXX_ABI libcxxabi CACHE STRING "") +set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "") +set(LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "") +set(LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "") +set(LIBCXX_ENABLE_MONOTONIC_CLOCK ON CACHE BOOL "") +set(LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS ON CACHE BOOL "") +set(LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "") +set(LIBCXX_ENABLE_RTTI OFF CACHE BOOL "") +set(LIBCXX_ENABLE_SHARED OFF CACHE BOOL "") +set(LIBCXX_ENABLE_STATIC_ABI_LIBRARY ON CACHE BOOL "") +set(LIBCXX_ENABLE_STATIC ON CACHE BOOL "") +set(LIBCXX_ENABLE_THREADS OFF CACHE BOOL "") +set(LIBCXX_ENABLE_UNICODE OFF CACHE BOOL "") +set(LIBCXX_ENABLE_WIDE_CHARACTERS OFF CACHE BOOL "") +set(LIBCXX_HAS_TERMINAL_AVAILABLE OFF CACHE BOOL "") +set(LIBCXX_INSTALL_LIBRARY ON CACHE BOOL "") +set(LIBCXX_LIBC "llvm-libc" CACHE STRING "") +set(LIBCXX_STATICALLY_LINK_ABI_IN_STATIC_LIBRARY ON CACHE BOOL "") +set(LIBCXX_USE_COMPILER_RT ON CACHE BOOL "") + +# Configuration options for libcxxabi. +set(LIBCXXABI_BAREMETAL ON CACHE BOOL "") +set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "") +set(LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS OFF CACHE BOOL "") +set(LIBCXXABI_ENABLE_SHARED OFF CACHE BOOL "") +set(LIBCXXABI_ENABLE_THREADS OFF CACHE BOOL "") +set(LIBCXXABI_USE_LLVM_UNWINDER OFF CACHE BOOL "") + +# Necessary compile flags for NVPTX. +set(LIBCXX_ADDITIONAL_COMPILE_FLAGS + "-nogpulib;-flto;-fconvergent-functions;--cuda-feature=+ptx63" CACHE STRING "") +set(LIBCXXABI_ADDITIONAL_COMPILE_FLAGS + "-nogpulib;-flto;-fconvergent-functions;--cuda-feature=+ptx63" CACHE STRING "") +set(CMAKE_REQUIRED_FLAGS "-nogpulib -nodefaultlibs -flto -c" CACHE STRING "") From c61d565721d0cf03e2658ec65a3526dd89142e52 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 21 Aug 2024 18:10:16 +0100 Subject: [PATCH 090/426] [AArch64] Set scalar fneg to free for fnmul (#104814) A fneg(fmul(..)) or fmul(fneg(..)) can be folded into a fnmul under AArch64. 
https://clang.godbolt.org/z/znPj34Mae This discounts the cost of the fneg in such patterns to be free. --- .../Target/AArch64/AArch64TargetTransformInfo.cpp | 9 +++++++++ llvm/test/Analysis/CostModel/AArch64/arith-fp.ll | 12 ++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a782c9c4351237..f31e1fa9ab3045 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3242,6 +3242,15 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( return LT.first; case ISD::FNEG: + // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul + if ((Ty->isFloatTy() || Ty->isDoubleTy() || + (Ty->isHalfTy() && ST->hasFullFP16())) && + CxtI && + ((CxtI->hasOneUse() && + match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) || + match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value())))) + return 0; + [[fallthrough]]; case ISD::FADD: case ISD::FSUB: // Increase the cost for half and bfloat types if not architecturally diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll index 84150765d77973..aaffd97b92b2de 100644 --- a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll +++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll @@ -133,7 +133,7 @@ define i32 @fneg(i32 %arg) { define i32 @fmulfneg(i32 %arg) { ; CHECK-LABEL: 'fmulfneg' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = fneg half undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F16 = fneg half undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16M = fmul half %F16, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fneg <2 x half> undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16M = fmul <2 x 
half> %V2F16, undef @@ -143,7 +143,7 @@ define i32 @fmulfneg(i32 %arg) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16M = fmul <8 x half> %V8F16, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fneg <16 x half> undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16M = fmul <16 x half> %V16F16, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = fneg float undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F32 = fneg float undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32M = fmul float %F32, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fneg <2 x float> undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32M = fmul <2 x float> %V2F32, undef @@ -151,7 +151,7 @@ define i32 @fmulfneg(i32 %arg) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32M = fmul <4 x float> %V4F32, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fneg <8 x float> undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32M = fmul <8 x float> %V8F32, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = fneg double undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F64 = fneg double undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64M = fmul double %F64, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fneg <2 x double> undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64M = fmul <2 x double> %V2F64, undef @@ -192,7 +192,7 @@ define i32 @fmulfneg(i32 %arg) { define i32 @fnegfmul(i32 %arg) { ; CHECK-LABEL: 'fnegfmul' ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %F16M = fmul half undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = fneg half %F16M +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F16 = fneg half %F16M ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16M = fmul <2 x half> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fneg <2 x half> %V2F16M ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16M = fmul <4 x half> undef, undef @@ -202,7 +202,7 @@ define i32 @fnegfmul(i32 %arg) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16M = fmul <16 x half> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fneg <16 x half> %V16F16M ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32M = fmul float undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = fneg float %F32M +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %F32 = fneg float %F32M ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32M = fmul <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fneg <2 x float> %V2F32M ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32M = fmul <4 x float> undef, undef @@ -210,7 +210,7 @@ define i32 @fnegfmul(i32 %arg) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32M = fmul <8 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fneg <8 x float> %V8F32M ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64M = fmul double undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = fneg double %F64M +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %F64 = fneg double %F64M ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64M = fmul <2 x double> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fneg <2 x double> %V2F64M ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64M = fmul <4 x double> undef, undef From e78156a0e225673e592920410c8cadc94f19aa66 Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Wed, 21 Aug 2024 12:13:56 -0500 Subject: [PATCH 091/426] Scalarize the vector inputs to llvm.lround intrinsic by default. (#101054) Verifier is updated in a different patch to let the vector types for llvm.lround and llvm.llround intrinsics. --- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 2 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 10 +- .../SelectionDAG/LegalizeFloatTypes.cpp | 2 + llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 +- .../SelectionDAG/LegalizeVectorOps.cpp | 2 + .../SelectionDAG/LegalizeVectorTypes.cpp | 16 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 + llvm/lib/CodeGen/TargetLoweringBase.cpp | 5 +- llvm/test/CodeGen/AMDGPU/lround.ll | 479 ++++++++++++++---- 9 files changed, 424 insertions(+), 96 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index bdbef20e20960d..3fece81df1f2fd 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -4921,6 +4921,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_INTRINSIC_LLRINT: case G_INTRINSIC_ROUND: case G_INTRINSIC_ROUNDEVEN: + case G_LROUND: + case G_LLROUND: case G_INTRINSIC_TRUNC: case G_FCOS: case G_FSIN: diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c9ab7e7a66079c..11935cbc309f01 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ 
b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -507,7 +507,7 @@ namespace { SDValue visitUINT_TO_FP(SDNode *N); SDValue visitFP_TO_SINT(SDNode *N); SDValue visitFP_TO_UINT(SDNode *N); - SDValue visitXRINT(SDNode *N); + SDValue visitXROUND(SDNode *N); SDValue visitFP_ROUND(SDNode *N); SDValue visitFP_EXTEND(SDNode *N); SDValue visitFNEG(SDNode *N); @@ -1929,8 +1929,10 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::UINT_TO_FP: return visitUINT_TO_FP(N); case ISD::FP_TO_SINT: return visitFP_TO_SINT(N); case ISD::FP_TO_UINT: return visitFP_TO_UINT(N); + case ISD::LROUND: + case ISD::LLROUND: case ISD::LRINT: - case ISD::LLRINT: return visitXRINT(N); + case ISD::LLRINT: return visitXROUND(N); case ISD::FP_ROUND: return visitFP_ROUND(N); case ISD::FP_EXTEND: return visitFP_EXTEND(N); case ISD::FNEG: return visitFNEG(N); @@ -17998,15 +18000,17 @@ SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) { return FoldIntToFPToInt(N, DAG); } -SDValue DAGCombiner::visitXRINT(SDNode *N) { +SDValue DAGCombiner::visitXROUND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (lrint|llrint undef) -> undef + // fold (lround|llround undef) -> undef if (N0.isUndef()) return DAG.getUNDEF(VT); // fold (lrint|llrint c1fp) -> c1 + // fold (lround|llround c1fp) -> c1 if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index ad0c054d3ccd50..221dcfe145594f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -2441,6 +2441,8 @@ bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) { case ISD::FCOPYSIGN: R = PromoteFloatOp_FCOPYSIGN(N, OpNo); break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: + case ISD::LROUND: + case ISD::LLROUND: case ISD::LRINT: case ISD::LLRINT: R = PromoteFloatOp_UnaryOp(N, OpNo); 
break; case ISD::FP_TO_SINT_SAT: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 27dd4ae241bd10..1088db4bdbe0b3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -1052,7 +1052,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecRes_Convert(SDNode *N); SDValue WidenVecRes_Convert_StrictFP(SDNode *N); SDValue WidenVecRes_FP_TO_XINT_SAT(SDNode *N); - SDValue WidenVecRes_XRINT(SDNode *N); + SDValue WidenVecRes_XROUND(SDNode *N); SDValue WidenVecRes_FCOPYSIGN(SDNode *N); SDValue WidenVecRes_UnarySameEltsWithScalarArg(SDNode *N); SDValue WidenVecRes_ExpOp(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 57843f0959ac28..3f104baed97b1a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -473,6 +473,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { Node->getValueType(0), Scale); break; } + case ISD::LROUND: + case ISD::LLROUND: case ISD::LRINT: case ISD::LLRINT: case ISD::SINT_TO_FP: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index aad0047b4839a8..8315efcb6750f9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -110,6 +110,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::LLRINT: case ISD::FROUND: case ISD::FROUNDEVEN: + case ISD::LROUND: + case ISD::LLROUND: case ISD::FSIN: case ISD::FSINH: case ISD::FSQRT: @@ -752,6 +754,8 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) { case ISD::FP_TO_UINT: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: + case ISD::LROUND: + case ISD::LLROUND: case ISD::LRINT: case ISD::LLRINT: Res = 
ScalarizeVecOp_UnaryOp(N); @@ -1215,6 +1219,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::VP_FROUND: case ISD::FROUNDEVEN: case ISD::VP_FROUNDEVEN: + case ISD::LROUND: + case ISD::LLROUND: case ISD::FSIN: case ISD::FSINH: case ISD::FSQRT: case ISD::VP_SQRT: @@ -3270,6 +3276,8 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::FTRUNC: + case ISD::LROUND: + case ISD::LLROUND: case ISD::LRINT: case ISD::LLRINT: Res = SplitVecOp_UnaryOp(N); @@ -4594,7 +4602,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::LLRINT: case ISD::VP_LRINT: case ISD::VP_LLRINT: - Res = WidenVecRes_XRINT(N); + case ISD::LROUND: + case ISD::LLROUND: + Res = WidenVecRes_XROUND(N); break; case ISD::FABS: @@ -5231,7 +5241,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_FP_TO_XINT_SAT(SDNode *N) { return DAG.getNode(N->getOpcode(), dl, WidenVT, Src, N->getOperand(1)); } -SDValue DAGTypeLegalizer::WidenVecRes_XRINT(SDNode *N) { +SDValue DAGTypeLegalizer::WidenVecRes_XROUND(SDNode *N) { SDLoc dl(N); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); ElementCount WidenNumElts = WidenVT.getVectorElementCount(); @@ -6480,6 +6490,8 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { case ISD::VSELECT: Res = WidenVecOp_VSELECT(N); break; case ISD::FLDEXP: case ISD::FCOPYSIGN: + case ISD::LROUND: + case ISD::LLROUND: case ISD::LRINT: case ISD::LLRINT: Res = WidenVecOp_UnrollVectorOp(N); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 18a3b7bce104a7..27675dce70c260 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5436,6 +5436,8 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const case ISD::FCEIL: case ISD::FROUND: case ISD::FROUNDEVEN: + case 
ISD::LROUND: + case ISD::LLROUND: case ISD::FRINT: case ISD::LRINT: case ISD::LLRINT: diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 4ff8617f740c89..35d6304cf9b400 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -774,8 +774,9 @@ void TargetLoweringBase::initActions() { setOperationAction( {ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, ISD::ANY_EXTEND_VECTOR_INREG, ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG, - ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::FTAN, ISD::FACOS, - ISD::FASIN, ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH}, + ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::LROUND, + ISD::LLROUND, ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN, + ISD::FCOSH, ISD::FSINH, ISD::FTANH}, VT, Expand); // Constrained floating-point operations default to expand. diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll index d45d83026013df..072ee70b840d83 100644 --- a/llvm/test/CodeGen/AMDGPU/lround.ll +++ b/llvm/test/CodeGen/AMDGPU/lround.ll @@ -6,94 +6,6 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-GISEL %s -declare float @llvm.round.f32(float) -declare i32 @llvm.lround.i32.f32(float) -declare i32 @llvm.lround.i32.f64(double) -declare i64 @llvm.lround.i64.f32(float) -declare i64 @llvm.lround.i64.f64(double) -declare i64 @llvm.llround.i64.f32(float) -declare half @llvm.round.f16(half) -declare i32 @llvm.lround.i32.f16(half %arg) - -define float @intrinsic_fround(float %arg) { -; GFX9-SDAG-LABEL: intrinsic_fround: -; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_trunc_f32_e32 v1, v0 -; GFX9-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 -; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 -; GFX9-SDAG-NEXT: 
v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] -; GFX9-SDAG-NEXT: s_brev_b32 s4, -2 -; GFX9-SDAG-NEXT: v_bfi_b32 v0, s4, v2, v0 -; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: intrinsic_fround: -; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_trunc_f32_e32 v1, v0 -; GFX9-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] -; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-GISEL-NEXT: v_and_or_b32 v0, v0, v3, v2 -; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: intrinsic_fround: -; GFX10-SDAG: ; %bb.0: ; %entry -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_trunc_f32_e32 v1, v0 -; GFX10-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 -; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v2|, 0.5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4 -; GFX10-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 -; GFX10-SDAG-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: intrinsic_fround: -; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_trunc_f32_e32 v1, v0 -; GFX10-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v2|, 0.5 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4 -; GFX10-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v2 -; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: intrinsic_fround: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_trunc_f32_e32 v1, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 -; GFX11-SDAG-NEXT: 
v_cmp_ge_f32_e64 s0, |v2|, 0.5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0 -; GFX11-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: intrinsic_fround: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_trunc_f32_e32 v1, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0 -; GFX11-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -entry: - %res = tail call float @llvm.round.f32(float %arg) - ret float %res -} - define i32 @intrinsic_lround_i32_f32(float %arg) { ; GFX9-SDAG-LABEL: intrinsic_lround_i32_f32: ; GFX9-SDAG: ; %bb.0: ; %entry @@ -1034,3 +946,394 @@ entry: ret i32 %res } +define <2 x i32> @intrinsic_lround_v2i32_v2f32(<2 x float> %arg) { +; GFX9-SDAG-LABEL: intrinsic_lround_v2i32_v2f32: +; GFX9-SDAG: ; %bb.0: ; %entry +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_trunc_f32_e32 v2, v0 +; GFX9-SDAG-NEXT: v_sub_f32_e32 v3, v0, v2 +; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5] +; GFX9-SDAG-NEXT: s_brev_b32 s6, -2 +; GFX9-SDAG-NEXT: v_bfi_b32 v0, s6, v3, v0 +; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-SDAG-NEXT: v_trunc_f32_e32 v2, v1 +; GFX9-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 +; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], 
|v3|, 0.5 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5] +; GFX9-SDAG-NEXT: v_bfi_b32 v1, s6, v3, v1 +; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX9-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: intrinsic_lround_v2i32_v2f32: +; GFX9-GISEL: ; %bb.0: ; %entry +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v0 +; GFX9-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2 +; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5] +; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX9-GISEL-NEXT: v_and_or_b32 v0, v0, v4, v3 +; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v1 +; GFX9-GISEL-NEXT: v_sub_f32_e32 v3, v1, v2 +; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5] +; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, v4, v3 +; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: intrinsic_lround_v2i32_v2f32: +; GFX10-SDAG: ; %bb.0: ; %entry +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_trunc_f32_e32 v2, v0 +; GFX10-SDAG-NEXT: v_trunc_f32_e32 v3, v1 +; GFX10-SDAG-NEXT: v_sub_f32_e32 v4, v0, v2 +; GFX10-SDAG-NEXT: v_sub_f32_e32 v5, v1, v3 +; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4 +; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5 +; GFX10-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s4 +; GFX10-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1 +; GFX10-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-SDAG-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-SDAG-NEXT: v_cvt_i32_f32_e32 
v1, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: intrinsic_lround_v2i32_v2f32: +; GFX10-GISEL: ; %bb.0: ; %entry +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_trunc_f32_e32 v2, v0 +; GFX10-GISEL-NEXT: v_trunc_f32_e32 v3, v1 +; GFX10-GISEL-NEXT: v_sub_f32_e32 v4, v0, v2 +; GFX10-GISEL-NEXT: v_sub_f32_e32 v5, v1, v3 +; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4 +; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5 +; GFX10-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s4 +; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5 +; GFX10-GISEL-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-GISEL-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: intrinsic_lround_v2i32_v2f32: +; GFX11-SDAG: ; %bb.0: ; %entry +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_trunc_f32_e32 v2, v0 +; GFX11-SDAG-NEXT: v_trunc_f32_e32 v3, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3 +; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0 +; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1 +; GFX11-SDAG-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1 +; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: intrinsic_lround_v2i32_v2f32: +; GFX11-GISEL: ; %bb.0: ; %entry +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_trunc_f32_e32 v2, v0 +; GFX11-GISEL-NEXT: v_trunc_f32_e32 v3, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3 +; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0 +; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5 +; GFX11-GISEL-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %res = tail call <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %arg) + ret <2 x i32> %res +} + +define <2 x i64> @intrinsic_lround_v2i64_v2f32(<2 x float> %arg) { +; GFX9-SDAG-LABEL: intrinsic_lround_v2i64_v2f32: +; GFX9-SDAG: ; %bb.0: ; %entry +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_trunc_f32_e32 v2, v0 +; GFX9-SDAG-NEXT: v_sub_f32_e32 v3, v0, v2 +; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5 +; 
GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5] +; GFX9-SDAG-NEXT: s_brev_b32 s6, -2 +; GFX9-SDAG-NEXT: v_bfi_b32 v0, s6, v3, v0 +; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-SDAG-NEXT: v_trunc_f32_e32 v0, v0 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0x2f800000 +; GFX9-SDAG-NEXT: v_mul_f32_e64 v2, |v0|, s7 +; GFX9-SDAG-NEXT: v_floor_f32_e32 v2, v2 +; GFX9-SDAG-NEXT: s_mov_b32 s8, 0xcf800000 +; GFX9-SDAG-NEXT: v_fma_f32 v3, v2, s8, |v0| +; GFX9-SDAG-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-SDAG-NEXT: v_ashrrev_i32_e32 v4, 31, v0 +; GFX9-SDAG-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, v3, v4 +; GFX9-SDAG-NEXT: v_trunc_f32_e32 v3, v1 +; GFX9-SDAG-NEXT: v_sub_f32_e32 v5, v1, v3 +; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX9-SDAG-NEXT: v_bfi_b32 v1, s6, v5, v1 +; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX9-SDAG-NEXT: v_trunc_f32_e32 v3, v1 +; GFX9-SDAG-NEXT: v_mul_f32_e64 v1, |v3|, s7 +; GFX9-SDAG-NEXT: v_floor_f32_e32 v1, v1 +; GFX9-SDAG-NEXT: v_fma_f32 v5, v1, s8, |v3| +; GFX9-SDAG-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX9-SDAG-NEXT: v_cvt_u32_f32_e32 v6, v1 +; GFX9-SDAG-NEXT: v_xor_b32_e32 v2, v2, v4 +; GFX9-SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GFX9-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v3 +; GFX9-SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v4, vcc +; GFX9-SDAG-NEXT: v_xor_b32_e32 v2, v5, v3 +; GFX9-SDAG-NEXT: v_xor_b32_e32 v4, v6, v3 +; GFX9-SDAG-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX9-SDAG-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: intrinsic_lround_v2i64_v2f32: +; GFX9-GISEL: ; %bb.0: ; %entry +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v0 +; GFX9-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2 +; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5] +; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v4, 1 
+; GFX9-GISEL-NEXT: v_and_or_b32 v0, v0, v4, v3 +; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x2f800000 +; GFX9-GISEL-NEXT: v_mul_f32_e64 v5, |v2|, v3 +; GFX9-GISEL-NEXT: v_floor_f32_e32 v5, v5 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0xcf800000 +; GFX9-GISEL-NEXT: v_fma_f32 v2, v5, v6, |v2| +; GFX9-GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v0 +; GFX9-GISEL-NEXT: v_xor_b32_e32 v0, v2, v7 +; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, v5, v7 +; GFX9-GISEL-NEXT: v_trunc_f32_e32 v5, v1 +; GFX9-GISEL-NEXT: v_sub_f32_e32 v8, v1, v5 +; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] +; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, v4, v8 +; GFX9-GISEL-NEXT: v_add_f32_e32 v4, v5, v1 +; GFX9-GISEL-NEXT: v_trunc_f32_e32 v1, v4 +; GFX9-GISEL-NEXT: v_mul_f32_e64 v3, |v1|, v3 +; GFX9-GISEL-NEXT: v_floor_f32_e32 v3, v3 +; GFX9-GISEL-NEXT: v_fma_f32 v1, v3, v6, |v1| +; GFX9-GISEL-NEXT: v_cvt_u32_f32_e32 v5, v1 +; GFX9-GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v7 +; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v4 +; GFX9-GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v7, vcc +; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, v5, v4 +; GFX9-GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 +; GFX9-GISEL-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v4 +; GFX9-GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: intrinsic_lround_v2i64_v2f32: +; GFX10-SDAG: ; %bb.0: ; %entry +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_trunc_f32_e32 v2, v0 +; GFX10-SDAG-NEXT: v_trunc_f32_e32 v3, v1 +; GFX10-SDAG-NEXT: v_sub_f32_e32 v4, v0, v2 +; GFX10-SDAG-NEXT: v_sub_f32_e32 v5, v1, v3 +; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4 +; 
GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5 +; GFX10-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s4 +; GFX10-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1 +; GFX10-SDAG-NEXT: v_trunc_f32_e32 v0, v0 +; GFX10-SDAG-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-SDAG-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v0| +; GFX10-SDAG-NEXT: v_trunc_f32_e32 v1, v1 +; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v0 +; GFX10-SDAG-NEXT: v_floor_f32_e32 v2, v2 +; GFX10-SDAG-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v1| +; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; GFX10-SDAG-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v0| +; GFX10-SDAG-NEXT: v_floor_f32_e32 v3, v3 +; GFX10-SDAG-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX10-SDAG-NEXT: v_fma_f32 v0, 0xcf800000, v3, |v1| +; GFX10-SDAG-NEXT: v_cvt_u32_f32_e32 v1, v4 +; GFX10-SDAG-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v2, v2, v5 +; GFX10-SDAG-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v1, v1, v5 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v3, v3, v6 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v4, v0, v6 +; GFX10-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v5 +; GFX10-SDAG-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo +; GFX10-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6 +; GFX10-SDAG-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: intrinsic_lround_v2i64_v2f32: +; GFX10-GISEL: ; %bb.0: ; %entry +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_trunc_f32_e32 v2, v0 +; GFX10-GISEL-NEXT: v_trunc_f32_e32 v3, v1 +; GFX10-GISEL-NEXT: v_sub_f32_e32 v4, v0, v2 +; GFX10-GISEL-NEXT: v_sub_f32_e32 v5, v1, v3 +; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4 +; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5 +; GFX10-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4 +; GFX10-GISEL-NEXT: 
v_cndmask_b32_e64 v5, 0, 1.0, s4 +; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5 +; GFX10-GISEL-NEXT: v_trunc_f32_e32 v2, v0 +; GFX10-GISEL-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v0 +; GFX10-GISEL-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v2| +; GFX10-GISEL-NEXT: v_trunc_f32_e32 v3, v1 +; GFX10-GISEL-NEXT: v_floor_f32_e32 v4, v4 +; GFX10-GISEL-NEXT: v_mul_f32_e64 v5, 0x2f800000, |v3| +; GFX10-GISEL-NEXT: v_fma_f32 v2, 0xcf800000, v4, |v2| +; GFX10-GISEL-NEXT: v_floor_f32_e32 v5, v5 +; GFX10-GISEL-NEXT: v_fma_f32 v0, 0xcf800000, v5, |v3| +; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; GFX10-GISEL-NEXT: v_cvt_u32_f32_e32 v1, v2 +; GFX10-GISEL-NEXT: v_cvt_u32_f32_e32 v2, v4 +; GFX10-GISEL-NEXT: v_cvt_u32_f32_e32 v4, v5 +; GFX10-GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 +; GFX10-GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 +; GFX10-GISEL-NEXT: v_xor_b32_e32 v4, v4, v3 +; GFX10-GISEL-NEXT: v_xor_b32_e32 v5, v0, v3 +; GFX10-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v6 +; GFX10-GISEL-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo +; GFX10-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v5, v3 +; GFX10-GISEL-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: intrinsic_lround_v2i64_v2f32: +; GFX11-SDAG: ; %bb.0: ; %entry +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_trunc_f32_e32 v2, v0 +; GFX11-SDAG-NEXT: v_trunc_f32_e32 v3, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3 +; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0 +; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5 +; GFX11-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1 +; GFX11-SDAG-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_trunc_f32_e32 v0, v0 +; GFX11-SDAG-NEXT: v_trunc_f32_e32 v1, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v0| +; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v0 +; GFX11-SDAG-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v1| +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_floor_f32_e32 v2, v2 +; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; GFX11-SDAG-NEXT: v_floor_f32_e32 v3, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v0| +; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX11-SDAG-NEXT: v_fma_f32 v0, 0xcf800000, v3, |v1| +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v1, v4 +; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX11-SDAG-NEXT: v_xor_b32_e32 v2, v2, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-SDAG-NEXT: v_xor_b32_e32 v1, v1, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_xor_b32_e32 v3, v3, v6 +; GFX11-SDAG-NEXT: v_xor_b32_e32 v4, v0, v6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; 
GFX11-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v5 +; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo +; GFX11-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6 +; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: intrinsic_lround_v2i64_v2f32: +; GFX11-GISEL: ; %bb.0: ; %entry +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_trunc_f32_e32 v2, v0 +; GFX11-GISEL-NEXT: v_trunc_f32_e32 v3, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3 +; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0 +; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5 +; GFX11-GISEL-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_trunc_f32_e32 v2, v0 +; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v0 +; GFX11-GISEL-NEXT: v_trunc_f32_e32 v3, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v2| +; GFX11-GISEL-NEXT: v_mul_f32_e64 v5, 0x2f800000, |v3| +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_floor_f32_e32 v4, v4 +; GFX11-GISEL-NEXT: v_floor_f32_e32 v5, v5 +; GFX11-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_fma_f32 v2, 0xcf800000, v4, |v2| +; GFX11-GISEL-NEXT: v_fma_f32 v0, 0xcf800000, v5, |v3| +; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v1, v2 +; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v2, v4 +; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v4, v5 +; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 +; GFX11-GISEL-NEXT: v_xor_b32_e32 v4, v4, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_xor_b32_e32 v5, v0, v3 +; GFX11-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v6 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo +; GFX11-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v5, v3 +; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %res = tail call <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg) + ret <2 x i64> %res +} From 6cb14599ade843be3171fa7e4dd5f3601a3bb0de Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Wed, 21 Aug 2024 10:25:23 -0700 Subject: [PATCH 092/426] [LLDB][Minidump] Fix ProcessMinidump::GetMemoryRegions to include 64b regions when /proc/pid maps are missing. (#101086) This PR is in response to a bug my coworker @mbucko discovered where on MacOS Minidumps were being created where the 64b memory regions were readable, but were not being listed in `SBProcess.GetMemoryRegionList()`. This went unnoticed in #95312 due to all the linux testing including /proc/pid maps. 
On MacOS generated dumps (or any dump without access to /proc/pid) we would fail to properly map Memory Regions due to there being two independent methods for 32b and 64b mapping. In this PR I addressed this minor bug and merged the methods, but in order to add test coverage required additions to `obj2yaml` and `yaml2obj` which make up the bulk of this patch. Lastly, there are some non-required changes such as the addition of the `Memory64ListHeader` type, to make writing/reading the header section of the Memory64List easier. --- .../Minidump/MinidumpFileBuilder.cpp | 13 +- .../Process/minidump/MinidumpParser.cpp | 113 +++++++----------- .../Plugins/Process/minidump/MinidumpParser.h | 4 + .../Process/minidump/MinidumpTypes.cpp | 20 ---- .../Plugins/Process/minidump/MinidumpTypes.h | 3 - .../minidump-new/TestMiniDumpNew.py | 19 +++ .../minidump-new/linux-x86_64_mem64.yaml | 43 +++++++ 7 files changed, 119 insertions(+), 96 deletions(-) create mode 100644 lldb/test/API/functionalities/postmortem/minidump-new/linux-x86_64_mem64.yaml diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp index c0cc3af638a777..4b862d8d8e99b8 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp @@ -1014,15 +1014,17 @@ MinidumpFileBuilder::AddMemoryList_32(Process::CoreFileMemoryRanges &ranges) { // With a size of the number of ranges as a 32 bit num // And then the size of all the ranges error = AddDirectory(StreamType::MemoryList, - sizeof(llvm::support::ulittle32_t) + + sizeof(llvm::minidump::MemoryListHeader) + descriptors.size() * sizeof(llvm::minidump::MemoryDescriptor)); if (error.Fail()) return error; + llvm::minidump::MemoryListHeader list_header; llvm::support::ulittle32_t memory_ranges_num = static_cast(descriptors.size()); - m_data.AppendData(&memory_ranges_num, sizeof(llvm::support::ulittle32_t)); 
+ list_header.NumberOfMemoryRanges = memory_ranges_num; + m_data.AppendData(&list_header, sizeof(llvm::minidump::MemoryListHeader)); // For 32b we can get away with writing off the descriptors after the data. // This means no cleanup loop needed. m_data.AppendData(descriptors.data(), @@ -1044,9 +1046,10 @@ MinidumpFileBuilder::AddMemoryList_64(Process::CoreFileMemoryRanges &ranges) { if (error.Fail()) return error; + llvm::minidump::Memory64ListHeader list_header; llvm::support::ulittle64_t memory_ranges_num = static_cast(ranges.size()); - m_data.AppendData(&memory_ranges_num, sizeof(llvm::support::ulittle64_t)); + list_header.NumberOfMemoryRanges = memory_ranges_num; // Capture the starting offset for all the descriptors so we can clean them up // if needed. offset_t starting_offset = @@ -1058,8 +1061,8 @@ MinidumpFileBuilder::AddMemoryList_64(Process::CoreFileMemoryRanges &ranges) { (ranges.size() * sizeof(llvm::minidump::MemoryDescriptor_64)); llvm::support::ulittle64_t memory_ranges_base_rva = static_cast(base_rva); - m_data.AppendData(&memory_ranges_base_rva, - sizeof(llvm::support::ulittle64_t)); + list_header.BaseRVA = memory_ranges_base_rva; + m_data.AppendData(&list_header, sizeof(llvm::minidump::Memory64ListHeader)); bool cleanup_required = false; std::vector descriptors; diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp index be9fae938e2276..c099c28a620ecf 100644 --- a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp +++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp @@ -429,7 +429,6 @@ const minidump::ExceptionStream *MinidumpParser::GetExceptionStream() { std::optional MinidumpParser::FindMemoryRange(lldb::addr_t addr) { - llvm::ArrayRef data64 = GetStream(StreamType::Memory64List); Log *log = GetLog(LLDBLog::Modules); auto ExpectedMemory = GetMinidumpFile().getMemoryList(); @@ -457,33 +456,17 @@ MinidumpParser::FindMemoryRange(lldb::addr_t addr) { } } - // 
Some Minidumps have a Memory64ListStream that captures all the heap memory - // (full-memory Minidumps). We can't exactly use the same loop as above, - // because the Minidump uses slightly different data structures to describe - // those - - if (!data64.empty()) { - llvm::ArrayRef memory64_list; - uint64_t base_rva; - std::tie(memory64_list, base_rva) = - MinidumpMemoryDescriptor64::ParseMemory64List(data64); - - if (memory64_list.empty()) - return std::nullopt; - - for (const auto &memory_desc64 : memory64_list) { - const lldb::addr_t range_start = memory_desc64.start_of_memory_range; - const size_t range_size = memory_desc64.data_size; - - if (base_rva + range_size > GetData().size()) - return std::nullopt; - - if (range_start <= addr && addr < range_start + range_size) { - return minidump::Range(range_start, - GetData().slice(base_rva, range_size)); + if (!GetStream(StreamType::Memory64List).empty()) { + llvm::Error err = llvm::Error::success(); + for (const auto &memory_desc : GetMinidumpFile().getMemory64List(err)) { + if (memory_desc.first.StartOfMemoryRange <= addr + && addr < memory_desc.first.StartOfMemoryRange + memory_desc.first.DataSize) { + return minidump::Range(memory_desc.first.StartOfMemoryRange, memory_desc.second); } - base_rva += range_size; } + + if (err) + LLDB_LOG_ERROR(log, std::move(err), "Failed to read memory64 list: {0}"); } return std::nullopt; @@ -512,6 +495,11 @@ llvm::ArrayRef MinidumpParser::GetMemory(lldb::addr_t addr, return range->range_ref.slice(offset, overlap); } +llvm::iterator_range MinidumpParser::GetMemory64Iterator(llvm::Error &err) { + llvm::ErrorAsOutParameter ErrAsOutParam(&err); + return m_file->getMemory64List(err); +} + static bool CreateRegionsCacheFromMemoryInfoList(MinidumpParser &parser, std::vector ®ions) { @@ -553,53 +541,44 @@ static bool CreateRegionsCacheFromMemoryList(MinidumpParser &parser, std::vector ®ions) { Log *log = GetLog(LLDBLog::Modules); + // Cache the expected memory32 into an optional + // 
because it is possible to just have a memory64 list auto ExpectedMemory = parser.GetMinidumpFile().getMemoryList(); if (!ExpectedMemory) { LLDB_LOG_ERROR(log, ExpectedMemory.takeError(), "Failed to read memory list: {0}"); - return false; - } - regions.reserve(ExpectedMemory->size()); - for (const MemoryDescriptor &memory_desc : *ExpectedMemory) { - if (memory_desc.Memory.DataSize == 0) - continue; - MemoryRegionInfo region; - region.GetRange().SetRangeBase(memory_desc.StartOfMemoryRange); - region.GetRange().SetByteSize(memory_desc.Memory.DataSize); - region.SetReadable(MemoryRegionInfo::eYes); - region.SetMapped(MemoryRegionInfo::eYes); - regions.push_back(region); + } else { + for (const MemoryDescriptor &memory_desc : *ExpectedMemory) { + if (memory_desc.Memory.DataSize == 0) + continue; + MemoryRegionInfo region; + region.GetRange().SetRangeBase(memory_desc.StartOfMemoryRange); + region.GetRange().SetByteSize(memory_desc.Memory.DataSize); + region.SetReadable(MemoryRegionInfo::eYes); + region.SetMapped(MemoryRegionInfo::eYes); + regions.push_back(region); + } } - regions.shrink_to_fit(); - return !regions.empty(); -} - -static bool -CreateRegionsCacheFromMemory64List(MinidumpParser &parser, - std::vector ®ions) { - llvm::ArrayRef data = - parser.GetStream(StreamType::Memory64List); - if (data.empty()) - return false; - llvm::ArrayRef memory64_list; - uint64_t base_rva; - std::tie(memory64_list, base_rva) = - MinidumpMemoryDescriptor64::ParseMemory64List(data); - if (memory64_list.empty()) - return false; + if (!parser.GetStream(StreamType::Memory64List).empty()) { + llvm::Error err = llvm::Error::success(); + for (const auto &memory_desc : parser.GetMemory64Iterator(err)) { + if (memory_desc.first.DataSize == 0) + continue; + MemoryRegionInfo region; + region.GetRange().SetRangeBase(memory_desc.first.StartOfMemoryRange); + region.GetRange().SetByteSize(memory_desc.first.DataSize); + region.SetReadable(MemoryRegionInfo::eYes); + 
region.SetMapped(MemoryRegionInfo::eYes); + regions.push_back(region); + } - regions.reserve(memory64_list.size()); - for (const auto &memory_desc : memory64_list) { - if (memory_desc.data_size == 0) - continue; - MemoryRegionInfo region; - region.GetRange().SetRangeBase(memory_desc.start_of_memory_range); - region.GetRange().SetByteSize(memory_desc.data_size); - region.SetReadable(MemoryRegionInfo::eYes); - region.SetMapped(MemoryRegionInfo::eYes); - regions.push_back(region); + if (err) { + LLDB_LOG_ERROR(log, std::move(err), "Failed to read memory64 list: {0}"); + return false; + } } + regions.shrink_to_fit(); return !regions.empty(); } @@ -620,9 +599,7 @@ std::pair MinidumpParser::BuildMemoryRegions() { return return_sorted(true); if (CreateRegionsCacheFromMemoryInfoList(*this, result)) return return_sorted(true); - if (CreateRegionsCacheFromMemoryList(*this, result)) - return return_sorted(false); - CreateRegionsCacheFromMemory64List(*this, result); + CreateRegionsCacheFromMemoryList(*this, result); return return_sorted(false); } diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.h b/lldb/source/Plugins/Process/minidump/MinidumpParser.h index 050ba086f46f54..222c0ef47fb853 100644 --- a/lldb/source/Plugins/Process/minidump/MinidumpParser.h +++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.h @@ -47,6 +47,8 @@ struct Range { } }; +using FallibleMemory64Iterator = llvm::object::MinidumpFile::FallibleMemory64Iterator; + class MinidumpParser { public: static llvm::Expected @@ -92,6 +94,8 @@ class MinidumpParser { /// complete (includes all regions mapped into the process memory). 
std::pair BuildMemoryRegions(); + llvm::iterator_range GetMemory64Iterator(llvm::Error &err); + static llvm::StringRef GetStreamTypeAsString(StreamType stream_type); llvm::object::MinidumpFile &GetMinidumpFile() { return *m_file; } diff --git a/lldb/source/Plugins/Process/minidump/MinidumpTypes.cpp b/lldb/source/Plugins/Process/minidump/MinidumpTypes.cpp index 5b919828428fae..45dd2272aef041 100644 --- a/lldb/source/Plugins/Process/minidump/MinidumpTypes.cpp +++ b/lldb/source/Plugins/Process/minidump/MinidumpTypes.cpp @@ -57,23 +57,3 @@ LinuxProcStatus::Parse(llvm::ArrayRef &data) { } lldb::pid_t LinuxProcStatus::GetPid() const { return pid; } - -std::pair, uint64_t> -MinidumpMemoryDescriptor64::ParseMemory64List(llvm::ArrayRef &data) { - const llvm::support::ulittle64_t *mem_ranges_count; - Status error = consumeObject(data, mem_ranges_count); - if (error.Fail() || - *mem_ranges_count * sizeof(MinidumpMemoryDescriptor64) > data.size()) - return {}; - - const llvm::support::ulittle64_t *base_rva; - error = consumeObject(data, base_rva); - if (error.Fail()) - return {}; - - return std::make_pair( - llvm::ArrayRef( - reinterpret_cast(data.data()), - *mem_ranges_count), - *base_rva); -} diff --git a/lldb/source/Plugins/Process/minidump/MinidumpTypes.h b/lldb/source/Plugins/Process/minidump/MinidumpTypes.h index fe99abf9e24ed9..9a9f1cc1578336 100644 --- a/lldb/source/Plugins/Process/minidump/MinidumpTypes.h +++ b/lldb/source/Plugins/Process/minidump/MinidumpTypes.h @@ -62,9 +62,6 @@ Status consumeObject(llvm::ArrayRef &Buffer, const T *&Object) { struct MinidumpMemoryDescriptor64 { llvm::support::ulittle64_t start_of_memory_range; llvm::support::ulittle64_t data_size; - - static std::pair, uint64_t> - ParseMemory64List(llvm::ArrayRef &data); }; static_assert(sizeof(MinidumpMemoryDescriptor64) == 16, "sizeof MinidumpMemoryDescriptor64 is not correct!"); diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py 
b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py index 91fd2439492b54..2de3e36b507341 100644 --- a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py +++ b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py @@ -491,3 +491,22 @@ def test_minidump_sysroot(self): spec_dir_norm = os.path.normcase(module.GetFileSpec().GetDirectory()) exe_dir_norm = os.path.normcase(exe_dir) self.assertEqual(spec_dir_norm, exe_dir_norm) + + def test_minidump_memory64list(self): + """Test that lldb can read from the memory64list in a minidump.""" + self.process_from_yaml("linux-x86_64_mem64.yaml") + + region_count = 3 + region_info_list = self.process.GetMemoryRegions() + self.assertEqual(region_info_list.GetSize(), region_count) + + region = lldb.SBMemoryRegionInfo() + self.assertTrue(region_info_list.GetMemoryRegionAtIndex(0, region)) + self.assertEqual(region.GetRegionBase(), 0x7FFF12A84030) + self.assertTrue(region.GetRegionEnd(), 0x7FFF12A84030 + 0x2FD0) + self.assertTrue(region_info_list.GetMemoryRegionAtIndex(1, region)) + self.assertEqual(region.GetRegionBase(), 0x00007fff12a87000) + self.assertTrue(region.GetRegionEnd(), 0x00007fff12a87000 + 0x00000018) + self.assertTrue(region_info_list.GetMemoryRegionAtIndex(2, region)) + self.assertEqual(region.GetRegionBase(), 0x00007fff12a87018) + self.assertTrue(region.GetRegionEnd(), 0x00007fff12a87018 + 0x00000400) diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/linux-x86_64_mem64.yaml b/lldb/test/API/functionalities/postmortem/minidump-new/linux-x86_64_mem64.yaml new file mode 100644 index 00000000000000..df3c6477ae50a0 --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/minidump-new/linux-x86_64_mem64.yaml @@ -0,0 +1,43 @@ +--- !minidump +Streams: + - Type: SystemInfo + Processor Arch: AMD64 + Processor Level: 6 + Processor Revision: 15876 + Number of Processors: 40 + Platform ID: Linux + CSD Version: 'Linux 3.13.0-91-generic' + CPU: + 
Vendor ID: GenuineIntel + Version Info: 0x00000000 + Feature Info: 0x00000000 + - Type: LinuxProcStatus + Text: | + Name: linux-x86_64 + State: t (tracing stop) + Tgid: 29917 + Ngid: 0 + Pid: 29917 + PPid: 29370 + TracerPid: 29918 + Uid: 1001 1001 1001 1001 + Gid: 1001 1001 1001 1001 + - Type: ThreadList + Threads: + - Thread Id: 0x2896BB + Context: 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000700100000000000FFFFFFFF0000FFFFFFFFFFFFFFFFFFFF0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000B040A812FF7F00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000050D0A75BBA7F00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + Stack: + Start of Memory Range: 0x0 + Content: '' + - Type: Memory64List + Memory Ranges: + - Start of Memory Range: 0x7FFF12A84030 + Data Size: 0x2FD0 + Content : '' + - Start of Memory Range: 
0x00007fff12a87000 + Data Size: 0x00000018 + Content : '' + - Start of Memory Range: 0x00007fff12a87018 + Data Size: 0x00000400 + Content : '' +... From ec866638ff36b4a01b38a3ab8ef604596cb37178 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 21 Aug 2024 12:56:41 -0400 Subject: [PATCH 093/426] [libc++][NFC] A few mechanical adjustments to capitalization in status files Make sure that we consistently use `Nothing To Do`, and that we use the RST tags properly (e.g. '|Complete|' instead of 'Complete'). --- libcxx/docs/Status/Cxx17Issues.csv | 2 +- libcxx/docs/Status/Cxx17Papers.csv | 8 ++++---- libcxx/docs/Status/Cxx20Issues.csv | 2 +- libcxx/docs/Status/Cxx23Issues.csv | 20 ++++++++++---------- libcxx/docs/Status/Cxx23Papers.csv | 16 ++++++++-------- 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/libcxx/docs/Status/Cxx17Issues.csv b/libcxx/docs/Status/Cxx17Issues.csv index 35e42e5ec2d7ba..2e469dc0bfddec 100644 --- a/libcxx/docs/Status/Cxx17Issues.csv +++ b/libcxx/docs/Status/Cxx17Issues.csv @@ -160,7 +160,7 @@ "`LWG2685 `__","shared_ptr deleters must not throw on move construction","2016-06 (Oulu)","|Complete|","","" "`LWG2687 `__","{inclusive,exclusive}_scan misspecified","2016-06 (Oulu)","","","" "`LWG2688 `__","clamp misses preconditions and has extraneous condition on result","2016-06 (Oulu)","|Complete|","","" -"`LWG2689 `__","Parallel versions of std::copy and std::move shouldn't be in order","2016-06 (Oulu)","|Nothing to do|","","" +"`LWG2689 `__","Parallel versions of std::copy and std::move shouldn't be in order","2016-06 (Oulu)","|Nothing To Do|","","" "`LWG2698 `__","Effect of assign() on iterators/pointers/references","2016-06 (Oulu)","|Complete|","","" "`LWG2704 `__","recursive_directory_iterator's members should require '``*this`` is dereferenceable'","2016-06 (Oulu)","|Complete|","","" "`LWG2706 `__","Error reporting for recursive_directory_iterator::pop() is under-specified","2016-06 (Oulu)","|Complete|","","" diff --git 
a/libcxx/docs/Status/Cxx17Papers.csv b/libcxx/docs/Status/Cxx17Papers.csv index 614cc4ca73f63e..c2f0cb4be96822 100644 --- a/libcxx/docs/Status/Cxx17Papers.csv +++ b/libcxx/docs/Status/Cxx17Papers.csv @@ -98,16 +98,16 @@ "`P0452R1 `__","Unifying Parallel Algorithms","2017-02 (Kona)","|Partial| [#note-P0452]_","","" "`P0467R2 `__","Iterator Concerns for Parallel Algorithms","2017-02 (Kona)","|Partial|","","" "`P0492R2 `__","Proposed Resolution of C++17 National Body Comments for Filesystems","2017-02 (Kona)","|Complete|","7.0","" -"`P0518R1 `__","Allowing copies as arguments to function objects given to parallel algorithms in response to CH11","2017-02 (Kona)","|Nothing to do|","","" -"`P0523R1 `__","Wording for CH 10: Complexity of parallel algorithms","2017-02 (Kona)","|Nothing to do|","","" +"`P0518R1 `__","Allowing copies as arguments to function objects given to parallel algorithms in response to CH11","2017-02 (Kona)","|Nothing To Do|","","" +"`P0523R1 `__","Wording for CH 10: Complexity of parallel algorithms","2017-02 (Kona)","|Nothing To Do|","","" "`P0548R1 `__","common_type and duration","2017-02 (Kona)","|Complete|","5.0","" "`P0558R1 `__","Resolving atomic named base class inconsistencies","2017-02 (Kona)","|Complete|","","" -"`P0574R1 `__","Algorithm Complexity Constraints and Parallel Overloads","2017-02 (Kona)","|Nothing to do|","","" +"`P0574R1 `__","Algorithm Complexity Constraints and Parallel Overloads","2017-02 (Kona)","|Nothing To Do|","","" "`P0599R1 `__","noexcept for hash functions","2017-02 (Kona)","|Complete|","5.0","" "`P0604R0 `__","Resolving GB 55, US 84, US 85, US 86","2017-02 (Kona)","|Complete|","","" "`P0607R0 `__","Inline Variables for the Standard Library","2017-02 (Kona)","|In Progress| [#note-P0607]_","6.0","" "`P0618R0 `__","Deprecating ","2017-02 (Kona)","|Complete|","15.0","" -"`P0623R0 `__","Final C++17 Parallel Algorithms Fixes","2017-02 (Kona)","|Nothing to do|","","" +"`P0623R0 `__","Final C++17 Parallel Algorithms 
Fixes","2017-02 (Kona)","|Nothing To Do|","","" "","","","","","" "`P0682R1 `__","Repairing elementary string conversions","2017-07 (Toronto)","","","" "`P0739R0 `__","Some improvements to class template argument deduction integration into the standard library","2017-07 (Toronto)","|Complete|","5.0","" diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index 1d441de31f107b..bdc2b637efc348 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -193,7 +193,7 @@ "`LWG2859 `__","Definition of *reachable* in [ptr.launder] misses pointer arithmetic from pointer-interconvertible object","2020-02 (Prague)","","","" "`LWG3018 `__","``shared_ptr``\ of function type","2020-02 (Prague)","|Nothing To Do|","","" "`LWG3050 `__","Conversion specification problem in ``chrono::duration``\ constructor","2020-02 (Prague)","|Complete|","19.0","|chrono|" -"`LWG3141 `__","``CopyConstructible``\ doesn't preserve source values","2020-02 (Prague)","|Nothing to do|","","" +"`LWG3141 `__","``CopyConstructible``\ doesn't preserve source values","2020-02 (Prague)","|Nothing To Do|","","" "`LWG3150 `__","``UniformRandomBitGenerator``\ should validate ``min``\ and ``max``\ ","2020-02 (Prague)","|Complete|","13.0","|ranges|" "`LWG3175 `__","The ``CommonReference``\ requirement of concept ``SwappableWith``\ is not satisfied in the example","2020-02 (Prague)","|Complete|","13.0","" "`LWG3194 `__","``ConvertibleTo``\ prose does not match code","2020-02 (Prague)","|Complete|","13.0","" diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv index 0484c650e3c36c..16471406f41588 100644 --- a/libcxx/docs/Status/Cxx23Issues.csv +++ b/libcxx/docs/Status/Cxx23Issues.csv @@ -142,18 +142,18 @@ "`LWG3525 `__","``uses_allocator_construction_args`` fails to handle types convertible to ``pair``","2022-02 (Virtual)","","","" "`LWG3598 `__","``system_category().default_error_condition(0)`` is underspecified","2022-02 
(Virtual)","","","" "`LWG3601 `__","common_iterator's postfix-proxy needs ``indirectly_readable`` ","2022-02 (Virtual)","","","|ranges|" -"`LWG3607 `__","``contiguous_iterator`` should not be allowed to have custom ``iter_move`` and ``iter_swap`` behavior","2022-02 (Virtual)","|Nothing to do|","","|ranges|" +"`LWG3607 `__","``contiguous_iterator`` should not be allowed to have custom ``iter_move`` and ``iter_swap`` behavior","2022-02 (Virtual)","|Nothing To Do|","","|ranges|" "`LWG3610 `__","``iota_view::size`` sometimes rejects integer-class types","2022-02 (Virtual)","","","|ranges|" "`LWG3612 `__","Inconsistent pointer alignment in ``std::format`` ","2022-02 (Virtual)","|Complete|","14.0","|format|" "`LWG3616 `__","LWG 3498 seems to miss the non-member ``swap`` for ``basic_syncbuf`` ","2022-02 (Virtual)","|Complete|","18.0","" "`LWG3618 `__","Unnecessary ``iter_move`` for ``transform_view::iterator`` ","2022-02 (Virtual)","|Complete|","19.0","|ranges|" -"`LWG3619 `__","Specification of ``vformat_to`` contains ill-formed ``formatted_size`` calls","2022-02 (Virtual)","|Nothing to do|","","|format|" +"`LWG3619 `__","Specification of ``vformat_to`` contains ill-formed ``formatted_size`` calls","2022-02 (Virtual)","|Nothing To Do|","","|format|" "`LWG3621 `__","Remove feature-test macro ``__cpp_lib_monadic_optional`` ","2022-02 (Virtual)","|Complete|","15.0","" -"`LWG3632 `__","``unique_ptr`` ""Mandates: This constructor is not selected by class template argument deduction""","2022-02 (Virtual)","|Nothing to do|","","" +"`LWG3632 `__","``unique_ptr`` ""Mandates: This constructor is not selected by class template argument deduction""","2022-02 (Virtual)","|Nothing To Do|","","" "`LWG3643 `__","Missing ``constexpr`` in ``std::counted_iterator`` ","2022-02 (Virtual)","|Complete|","19.0","|ranges|" "`LWG3648 `__","``format`` should not print ``bool`` with ``'c'`` ","2022-02 (Virtual)","|Complete|","15.0","|format|" "`LWG3649 `__","[fund.ts.v2] Reinstate and bump 
``__cpp_lib_experimental_memory_resource`` feature test macro","2022-02 (Virtual)","","","" -"`LWG3650 `__","Are ``std::basic_string`` 's ``iterator`` and ``const_iterator`` constexpr iterators?","2022-02 (Virtual)","|Nothing to do|","","" +"`LWG3650 `__","Are ``std::basic_string`` 's ``iterator`` and ``const_iterator`` constexpr iterators?","2022-02 (Virtual)","|Nothing To Do|","","" "`LWG3654 `__","``basic_format_context::arg(size_t)`` should be ``noexcept`` ","2022-02 (Virtual)","|Complete|","15.0","|format|" "`LWG3657 `__","``std::hash`` is not enabled","2022-02 (Virtual)","|Complete|","17.0","" "`LWG3660 `__","``iterator_traits::pointer`` should conform to §[iterator.traits]","2022-02 (Virtual)","|Complete|","14.0","|ranges|" @@ -188,7 +188,7 @@ "","","","","","" "`LWG3028 `__","Container requirements tables should distinguish ``const`` and non-``const`` variables","2022-11 (Kona)","","","" "`LWG3118 `__","``fpos`` equality comparison unspecified","2022-11 (Kona)","","","" -"`LWG3177 `__","Limit permission to specialize variable templates to program-defined types","2022-11 (Kona)","|Nothing to do|","","" +"`LWG3177 `__","Limit permission to specialize variable templates to program-defined types","2022-11 (Kona)","|Nothing To Do|","","" "`LWG3515 `__","§[stacktrace.basic.nonmem]: ``operator<<`` should be less templatized","2022-11 (Kona)","","","" "`LWG3545 `__","``std::pointer_traits`` should be SFINAE-friendly","2022-11 (Kona)","|Complete|","18.0","" "`LWG3569 `__","``join_view`` fails to support ranges of ranges with non-default_initializable iterators","2022-11 (Kona)","","","|ranges|" @@ -200,7 +200,7 @@ "`LWG3646 `__","``std::ranges::view_interface::size`` returns a signed type","2022-11 (Kona)","|Complete|","16.0","|ranges|" "`LWG3677 `__","Is a cv-qualified ``pair`` specially handled in uses-allocator construction?","2022-11 (Kona)","|Complete|","18.0","" "`LWG3717 `__","``common_view::end`` should improve ``random_access_range`` case","2022-11 
(Kona)","","","|ranges|" -"`LWG3732 `__","``prepend_range`` and ``append_range`` can't be amortized constant time","2022-11 (Kona)","|Nothing to do|","","|ranges|" +"`LWG3732 `__","``prepend_range`` and ``append_range`` can't be amortized constant time","2022-11 (Kona)","|Nothing To Do|","","|ranges|" "`LWG3736 `__","``move_iterator`` missing ``disable_sized_sentinel_for`` specialization","2022-11 (Kona)","|Complete|","19.0","|ranges|" "`LWG3737 `__","``take_view::sentinel`` should provide ``operator-``","2022-11 (Kona)","","","|ranges|" "`LWG3738 `__","Missing preconditions for ``take_view`` constructor","2022-11 (Kona)","|Complete|","16.0","|ranges|" @@ -211,7 +211,7 @@ "`LWG3750 `__","Too many papers bump ``__cpp_lib_format``","2022-11 (Kona)","|Partial| [#note-LWG3750]_","","|format|" "`LWG3751 `__","Missing feature macro for ``flat_set``","2022-11 (Kona)","","","|flat_containers|" "`LWG3753 `__","Clarify entity vs. freestanding entity","2022-11 (Kona)","","","" -"`LWG3754 `__","Class template expected synopsis contains declarations that do not match the detailed description","2022-11 (Kona)","|Nothing to do|","","" +"`LWG3754 `__","Class template expected synopsis contains declarations that do not match the detailed description","2022-11 (Kona)","|Nothing To Do|","","" "`LWG3755 `__","``tuple-for-each`` can call ``user-defined`` ``operator,``","2022-11 (Kona)","|Complete|","17.0","" "`LWG3757 `__","What's the effect of ``std::forward_like(x)``?","2022-11 (Kona)","","","" "`LWG3759 `__","``ranges::rotate_copy`` should use ``std::move``","2022-11 (Kona)","|Complete|","15.0","|ranges|" @@ -226,7 +226,7 @@ "`LWG3774 `__","```` should include ````","2022-11 (Kona)","","","|flat_containers|" "`LWG3775 `__","Broken dependencies in the ``Cpp17Allocator`` requirements","2022-11 (Kona)","","","" "`LWG3778 `__","``vector`` missing exception specifications","2022-11 (Kona)","|Complete|","3.7","" -"`LWG3781 `__","The exposition-only alias templates ``cont-key-type`` and 
``cont-mapped-type`` should be removed","2022-11 (Kona)","|Nothing to do|","","" +"`LWG3781 `__","The exposition-only alias templates ``cont-key-type`` and ``cont-mapped-type`` should be removed","2022-11 (Kona)","|Nothing To Do|","","" "`LWG3782 `__","Should ```` declare ``::lerp``?","2022-11 (Kona)","|Complete|","17.0","" "`LWG3784 `__","std.compat should not provide ``::byte`` and its friends","2022-11 (Kona)","|Complete|","19.0","" "`LWG3785 `__","``ranges::to`` is over-constrained on the destination type being a range","2022-11 (Kona)","","","|ranges|" @@ -239,10 +239,10 @@ "`LWG3814 `__","Add freestanding items requested by NB comments","2022-11 (Kona)","","","" "`LWG3816 `__","``flat_map`` and ``flat_multimap`` should impose sequence container requirements","2022-11 (Kona)","","","|flat_containers|" "`LWG3817 `__","Missing preconditions on ``forward_list`` modifiers","2022-11 (Kona)","","","" -"`LWG3818 `__","Exposition-only concepts are not described in library intro","2022-11 (Kona)","|Nothing to do|","","" +"`LWG3818 `__","Exposition-only concepts are not described in library intro","2022-11 (Kona)","|Nothing To Do|","","" "`LWG3822 `__","Avoiding normalization in ``filesystem::weakly_canonical``","2022-11 (Kona)","","","" "`LWG3823 `__","Unnecessary precondition for ``is_aggregate``","2022-11 (Kona)","|Nothing To Do|","","" -"`LWG3824 `__","Number of ``bind`` placeholders is underspecified","2022-11 (Kona)","|Nothing to do|","","" +"`LWG3824 `__","Number of ``bind`` placeholders is underspecified","2022-11 (Kona)","|Nothing To Do|","","" "`LWG3826 `__","Redundant specification [for overload of yield_value]","2022-11 (Kona)","|Nothing To Do|","","" "","","","","","" "`LWG2195 `__","Missing constructors for ``match_results``","2023-02 (Issaquah)","","","" diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv index 9389f031b1842c..8e1544acb2ce0e 100644 --- a/libcxx/docs/Status/Cxx23Papers.csv +++ 
b/libcxx/docs/Status/Cxx23Papers.csv @@ -6,9 +6,9 @@ "","","","","","" "`P1682R3 `__","std::to_underlying for enumerations","2021-02 (Virtual)","|Complete|","13.0","" "`P2017R1 `__","Conditionally borrowed ranges","2021-02 (Virtual)","|Complete|","16.0","|ranges|" -"`P2160R1 `__","Locks lock lockables","2021-02 (Virtual)","Nothing to do","","" +"`P2160R1 `__","Locks lock lockables","2021-02 (Virtual)","|Nothing To Do|","","" "`P2162R2 `__","Inheriting from std::variant","2021-02 (Virtual)","|Complete|","13.0","" -"`P2212R2 `__","Relax Requirements for time_point::clock","2021-02 (Virtual)","Nothing to do","","" +"`P2212R2 `__","Relax Requirements for time_point::clock","2021-02 (Virtual)","|Nothing To Do|","","" "`P2259R1 `__","Repairing input range adaptors and counted_iterator","2021-02 (Virtual)","","","|ranges|" "","","","","","" "`P0401R6 `__","Providing size feedback in the Allocator interface","2021-06 (Virtual)","|Complete|","15.0","" @@ -29,17 +29,17 @@ "`P1072R10 `__","``basic_string::resize_and_overwrite``","2021-10 (Virtual)","|Complete|","14.0","" "`P1147R1 `__","Printing ``volatile`` Pointers","2021-10 (Virtual)","|Complete|","14.0","" "`P1272R4 `__","Byteswapping for fun&&nuf","2021-10 (Virtual)","|Complete|","14.0","" -"`P1675R2 `__","``rethrow_exception`` must be allowed to copy","2021-10 (Virtual)","Nothing to do","","" +"`P1675R2 `__","``rethrow_exception`` must be allowed to copy","2021-10 (Virtual)","|Nothing To Do|","","" "`P2077R3 `__","Heterogeneous erasure overloads for associative containers","2021-10 (Virtual)","","","" "`P2251R1 `__","Require ``span`` & ``basic_string_view`` to be Trivially Copyable","2021-10 (Virtual)","|Complete|","14.0","" "`P2301R1 `__","Add a ``pmr`` alias for ``std::stacktrace``","2021-10 (Virtual)","","","" "`P2321R2 `__","``zip``","2021-10 (Virtual)","|In Progress|","","|ranges|" -"`P2340R1 `__","Clarifying the status of the 'C headers'","2021-10 (Virtual)","Nothing to do","","" +"`P2340R1 `__","Clarifying the 
status of the 'C headers'","2021-10 (Virtual)","|Nothing To Do|","","" "`P2393R1 `__","Cleaning up ``integer``-class types","2021-10 (Virtual)","","","" "`P2401R0 `__","Add a conditional ``noexcept`` specification to ``std::exchange``","2021-10 (Virtual)","|Complete|","14.0","" "","","","","","" "`P0323R12 `__","``std::expected``","2022-02 (Virtual)","|Complete|","16.0","" -"`P0533R9 `__","``constexpr`` for ```` and ````","2022-02 (Virtual)","|In progress| [#note-P0533R9]_","","" +"`P0533R9 `__","``constexpr`` for ```` and ````","2022-02 (Virtual)","|In Progress| [#note-P0533R9]_","","" "`P0627R6 `__","Function to mark unreachable code","2022-02 (Virtual)","|Complete|","15.0","" "`P1206R7 `__","``ranges::to``: A function to convert any range to a container","2022-02 (Virtual)","|Complete|","17.0","|ranges|" "`P1413R3 `__","Deprecate ``std::aligned_storage`` and ``std::aligned_union``","2022-02 (Virtual)","|Complete| [#note-P1413R3]_","","" @@ -74,7 +74,7 @@ "`P2438R2 `__","``std::string::substr() &&``","2022-07 (Virtual)","|Complete|","16.0","" "`P2445R1 `__","``forward_like``","2022-07 (Virtual)","|Complete|","16.0","" "`P2446R2 `__","``views::as_rvalue``","2022-07 (Virtual)","|Complete|","16.0","|ranges|" -"`P2460R2 `__","Relax requirements on ``wchar_t`` to match existing practices","2022-07 (Virtual)","Nothing to do","","" +"`P2460R2 `__","Relax requirements on ``wchar_t`` to match existing practices","2022-07 (Virtual)","|Nothing To Do|","","" "`P2465R3 `__","Standard Library Modules ``std`` and ``std.compat``","2022-07 (Virtual)","|Complete|","19.0","" "`P2467R1 `__","Support exclusive mode for ``fstreams``","2022-07 (Virtual)","|Complete|","18.0","" "`P2474R2 `__","``views::repeat``","2022-07 (Virtual)","|Complete|","17.0","|ranges|" @@ -101,7 +101,7 @@ "`P2505R5 `__","Monadic Functions for ``std::expected``","2022-11 (Kona)","|Complete|","17.0","" "`P2539R4 `__","Should the output of ``std::print`` to a terminal be synchronized with the underlying 
stream?","2022-11 (Kona)","|Complete|","18.0","|format|" "`P2602R2 `__","Poison Pills are Too Toxic","2022-11 (Kona)","|Complete|","19.0","|ranges|" -"`P2708R1 `__","No Further Fundamentals TSes","2022-11 (Kona)","|Nothing to do|","","" +"`P2708R1 `__","No Further Fundamentals TSes","2022-11 (Kona)","|Nothing To Do|","","" "","","","","","" "`P0290R4 `__","``apply()`` for ``synchronized_value``","2023-02 (Issaquah)","","","|concurrency TS|" "`P2770R0 `__","Stashing stashing ``iterators`` for proper flattening","2023-02 (Issaquah)","|Partial| [#note-P2770R0]_","","|ranges|" @@ -120,4 +120,4 @@ "`P2614R2 `__","Deprecate ``numeric_limits::has_denorm``","2023-02 (Issaquah)","|Complete|","18.0","" "`P2588R3 `__","``barrier``’s phase completion guarantees","2023-02 (Issaquah)","","","" "`P2763R1 `__","``layout_stride`` static extents default constructor fix","2023-02 (Issaquah)","","","" -"`P2736R2 `__","Referencing The Unicode Standard","2023-02 (Issaquah)","Complete","19.0","|format|" +"`P2736R2 `__","Referencing The Unicode Standard","2023-02 (Issaquah)","|Complete|","19.0","|format|" From 7a28192ce1c1d9d0398348eabc46c94eadb317d8 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 21 Aug 2024 13:11:46 -0400 Subject: [PATCH 094/426] [libc++] Standardize how we track removed and superseded papers Instead of having various status entries like 'Superseded by XXX', we use '|Nothing To Do|' but we add a note explaining that the paper was pulled at another meeting. 
--- libcxx/docs/Status/Cxx17.rst | 7 ++++++- libcxx/docs/Status/Cxx17Issues.csv | 6 +++--- libcxx/docs/Status/Cxx17Papers.csv | 6 +++--- libcxx/docs/Status/Cxx20.rst | 9 +++++++++ libcxx/docs/Status/Cxx20Issues.csv | 10 +++++----- libcxx/docs/Status/Cxx20Papers.csv | 8 ++++---- libcxx/docs/Status/Cxx23.rst | 3 +++ libcxx/docs/Status/Cxx23Issues.csv | 6 +++--- 8 files changed, 36 insertions(+), 19 deletions(-) diff --git a/libcxx/docs/Status/Cxx17.rst b/libcxx/docs/Status/Cxx17.rst index 9263bd7c4af0c8..c1073c0b411b06 100644 --- a/libcxx/docs/Status/Cxx17.rst +++ b/libcxx/docs/Status/Cxx17.rst @@ -45,7 +45,12 @@ Paper Status .. [#note-P0607] P0607: The parts of P0607 that are not done are the ```` bits. .. [#note-P0154] P0154: The required macros are only implemented as of clang 19. .. [#note-P0452] P0452: The changes to ``std::transform_inclusive_scan`` and ``std::transform_exclusive_scan`` have not yet been implemented. - .. [#note-P0156] P0156: This paper was reverted in Kona. + .. [#note-P0156] P0156: That paper was pulled out of the draft at the 2017-01 meeting in Kona. + .. [#note-P0181] P0181: That paper was pulled out of the draft at the 2017-01 meeting in Kona. + .. [#note-P0067] P0067: That paper was resolved by `P0067R5 `__. + .. [#note-LWG2587] LWG2587: That LWG issue was resolved by `LWG2567 `__. + .. [#note-LWG2588] LWG2588: That LWG issue was resolved by `LWG2568 `__. + .. [#note-LWG2955] LWG2955: That LWG issue was resolved by `P0682R1 `__. .. 
_issues-status-cxx17: diff --git a/libcxx/docs/Status/Cxx17Issues.csv b/libcxx/docs/Status/Cxx17Issues.csv index 2e469dc0bfddec..902a3717e5a388 100644 --- a/libcxx/docs/Status/Cxx17Issues.csv +++ b/libcxx/docs/Status/Cxx17Issues.csv @@ -211,8 +211,8 @@ "`LWG2570 `__","[fund.ts.v2] conjunction and disjunction requirements are too strict","2016-11 (Issaquah)","","","" "`LWG2578 `__","Iterator requirements should reference iterator traits","2016-11 (Issaquah)","|Complete|","","" "`LWG2584 `__"," ECMAScript IdentityEscape is ambiguous","2016-11 (Issaquah)","","","" -"`LWG2587 `__","""Convertible to bool"" requirement in conjunction and disjunction","2016-11 (Issaquah)","Resolved by `LWG2567 `__","","" -"`LWG2588 `__","[fund.ts.v2] ""Convertible to bool"" requirement in conjunction and disjunction","2016-11 (Issaquah)","Resolved by `LWG2568 `__","","" +"`LWG2587 `__","""Convertible to bool"" requirement in conjunction and disjunction","2016-11 (Issaquah)","|Nothing To Do| [#note-LWG2587]_","","" +"`LWG2588 `__","[fund.ts.v2] ""Convertible to bool"" requirement in conjunction and disjunction","2016-11 (Issaquah)","|Nothing To Do| [#note-LWG2588]_","","" "`LWG2589 `__","match_results can't satisfy the requirements of a container","2016-11 (Issaquah)","|Complete|","","" "`LWG2591 `__","std::function's member template target() should not lead to undefined behaviour","2016-11 (Issaquah)","|Complete|","","" "`LWG2598 `__","addressof works on temporaries","2016-11 (Issaquah)","|Complete|","","" @@ -310,5 +310,5 @@ "`LWG2934 `__","optional doesn't compare with T","2017-02 (Kona)","|Complete|","","" "","","","","","" "`LWG2901 `__","Variants cannot properly support allocators","2017-07 (Toronto)","|Complete|","","" -"`LWG2955 `__","``to_chars / from_chars``\ depend on ``std::string``\ ","2017-07 (Toronto)","Resolved by `P0682R1 `__","","" +"`LWG2955 `__","``to_chars / from_chars``\ depend on ``std::string``\ ","2017-07 (Toronto)","|Nothing To Do| [#note-LWG2955]_","","" 
"`LWG2956 `__","``filesystem::canonical()``\ still defined in terms of ``absolute(p, base)``\ ","2017-07 (Toronto)","|Complete|","","" diff --git a/libcxx/docs/Status/Cxx17Papers.csv b/libcxx/docs/Status/Cxx17Papers.csv index c2f0cb4be96822..0aeb15f18b76bb 100644 --- a/libcxx/docs/Status/Cxx17Papers.csv +++ b/libcxx/docs/Status/Cxx17Papers.csv @@ -21,7 +21,7 @@ "`P0006R0 `__","Adopt Type Traits Variable Templates for C++17.","2015-10 (Kona)","|Complete|","3.8","" "`P0092R1 `__","Polishing ","2015-10 (Kona)","|Complete|","3.8","" "`P0007R1 `__","Constant View: A proposal for a ``std::as_const``\ helper function template.","2015-10 (Kona)","|Complete|","3.8","" -"`P0156R0 `__","Variadic lock_guard(rev 3).","2015-10 (Kona)","|Complete| [#note-P0156]_","3.9","" +"`P0156R0 `__","Variadic lock_guard(rev 3).","2015-10 (Kona)","|Nothing To Do| [#note-P0156]_","","" "`P0074R0 `__","Making ``std::owner_less``\ more flexible","2015-10 (Kona)","|Complete|","3.8","" "`P0013R1 `__","Logical type traits rev 2","2015-10 (Kona)","|Complete|","3.8","" "","","","","","" @@ -44,7 +44,7 @@ "`P0032R3 `__","Homogeneous interface for variant, any and optional","2016-06 (Oulu)","|Complete|","4.0","" "`P0040R3 `__","Extending memory management tools","2016-06 (Oulu)","|Complete|","4.0","" "`P0063R3 `__","C++17 should refer to C11 instead of C99","2016-06 (Oulu)","|Complete|","7.0","" -"`P0067R3 `__","Elementary string conversions","2016-06 (Oulu)","Now `P0067R5 `__","n/a","" +"`P0067R3 `__","Elementary string conversions","2016-06 (Oulu)","|Nothing To Do| [#note-P0067]_","n/a","" "`P0083R3 `__","Splicing Maps and Sets","2016-06 (Oulu)","|Complete|","8.0","" "`P0084R2 `__","Emplace Return Type","2016-06 (Oulu)","|Complete|","4.0","" "`P0088R3 `__","Variant: a type-safe union for C++17","2016-06 (Oulu)","|Complete|","4.0","" @@ -53,7 +53,7 @@ "`P0174R2 `__","Deprecating Vestigial Library Parts in C++17","2016-06 (Oulu)","|Complete|","15.0","" "`P0175R1 `__","Synopses for the C 
library","2016-06 (Oulu)","","","" "`P0180R2 `__","Reserve a New Library Namespace for Future Standardization","2016-06 (Oulu)","|Nothing To Do|","n/a","" -"`P0181R1 `__","Ordered by Default","2016-06 (Oulu)","*Removed in Kona*","n/a","" +"`P0181R1 `__","Ordered by Default","2016-06 (Oulu)","|Nothing To Do| [#note-P0181]_","n/a","" "`P0209R2 `__","make_from_tuple: apply for construction","2016-06 (Oulu)","|Complete|","3.9","" "`P0219R1 `__","Relative Paths for Filesystem","2016-06 (Oulu)","|Complete|","7.0","" "`P0254R2 `__","Integrating std::string_view and std::string","2016-06 (Oulu)","|Complete|","4.0","" diff --git a/libcxx/docs/Status/Cxx20.rst b/libcxx/docs/Status/Cxx20.rst index 5331af92dea594..f5b35d7ccc39e7 100644 --- a/libcxx/docs/Status/Cxx20.rst +++ b/libcxx/docs/Status/Cxx20.rst @@ -49,6 +49,15 @@ Paper Status .. [#note-P0883.2] P0883: ``ATOMIC_FLAG_INIT`` was marked deprecated in version 14.0, but was undeprecated with the implementation of LWG3659 in version 15.0. .. [#note-P0660] P0660: The paper is implemented but the features are experimental and can be enabled via ``-fexperimental-library``. .. [#note-P1614] P1614: ``std::strong_order(long double, long double)`` is partly implemented. + .. [#note-P0542] P0542: That paper was pulled out of the draft at the 2019-07 meeting in Cologne. + .. [#note-P0788] P0788: That paper was pulled out of the draft at the 2019-07 meeting in Cologne. + .. [#note-P0920] P0920: That paper was reverted by `P1661 `__. + .. [#note-P1424] P1424: That paper was superseded by `P1902 `__. + .. [#note-LWG2070] LWG2070: That LWG issue was resolved by `P0674R1 `__. + .. [#note-LWG2499] LWG2499: That LWG issue was resolved by `P0487R1 `__. + .. [#note-LWG2797] LWG2797: That LWG issue was resolved by `P1285R0 `__. + .. [#note-LWG3022] LWG3022: That LWG issue was resolved by `P1285R0 `__. + .. [#note-LWG3134] LWG3134: That LWG issue was resolved by `P1210R0 `__. .. 
[#note-P0355] P0355: The implementation status is: * ``Calendars`` mostly done in Clang 7 diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index bdc2b637efc348..d72a3682420620 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -1,5 +1,5 @@ "Issue #","Issue Name","Meeting","Status","First released version","Labels" -"`LWG2070 `__","``allocate_shared``\ should use ``allocator_traits::construct``\ ","2017-07 (Toronto)","Resolved by `P0674R1 `__","","" +"`LWG2070 `__","``allocate_shared``\ should use ``allocator_traits::construct``\ ","2017-07 (Toronto)","|Nothing To Do| [#note-LWG2070]_","","" "`LWG2444 `__","Inconsistent complexity for ``std::sort_heap``\ ","2017-07 (Toronto)","|Nothing To Do|","","" "`LWG2593 `__","Moved-from state of Allocators","2017-07 (Toronto)","","","" "`LWG2597 `__","``std::log``\ misspecified for complex numbers","2017-07 (Toronto)","","","" @@ -94,17 +94,17 @@ "`LWG2183 `__","Muddled allocator requirements for ``match_results``\ constructors","2018-11 (San Diego)","|Complete|","","" "`LWG2184 `__","Muddled allocator requirements for ``match_results``\ assignments","2018-11 (San Diego)","|Complete|","","" "`LWG2412 `__","``promise::set_value()``\ and ``promise::get_future()``\ should not race","2018-11 (San Diego)","","","" -"`LWG2499 `__","``operator>>(basic_istream&, CharT*)``\ makes it hard to avoid buffer overflows","2018-11 (San Diego)","Resolved by `P0487R1 `__","","" +"`LWG2499 `__","``operator>>(basic_istream&, CharT*)``\ makes it hard to avoid buffer overflows","2018-11 (San Diego)","|Nothing To Do| [#note-LWG2499]_","","" "`LWG2682 `__","``filesystem::copy()``\ won't create a symlink to a directory","2018-11 (San Diego)","|Nothing To Do|","","" "`LWG2697 `__","[concurr.ts] Behavior of ``future/shared_future``\ unwrapping constructor when given an invalid ``future``\ ","2018-11 (San Diego)","","","" -"`LWG2797 `__","Trait precondition violations","2018-11 
(San Diego)","Resolved by `P1285R0 `__","","" +"`LWG2797 `__","Trait precondition violations","2018-11 (San Diego)","|Nothing To Do| [#note-LWG2797]_","","" "`LWG2936 `__","Path comparison is defined in terms of the generic format","2018-11 (San Diego)","|Complete|","","" "`LWG2943 `__","Problematic specification of the wide version of ``basic_filebuf::open``\ ","2018-11 (San Diego)","|Nothing To Do|","","" "`LWG2960 `__","[fund.ts.v3] ``nonesuch``\ is insufficiently useless","2018-11 (San Diego)","|Complete|","","" "`LWG2995 `__","``basic_stringbuf``\ default constructor forbids it from using SSO capacity","2018-11 (San Diego)","|Complete|","20.0","" "`LWG2996 `__","Missing rvalue overloads for ``shared_ptr``\ operations","2018-11 (San Diego)","|Complete|","17.0","" "`LWG3008 `__","``make_shared``\ (sub)object destruction semantics are not specified","2018-11 (San Diego)","|Complete|","16.0","" -"`LWG3022 `__","``is_convertible``\ may lead to ODR","2018-11 (San Diego)","Resolved by `P1285R0 `__","","" +"`LWG3022 `__","``is_convertible``\ may lead to ODR","2018-11 (San Diego)","|Nothing To Do| [#note-LWG3022]_","","" "`LWG3025 `__","Map-like container deduction guides should use ``pair``\ , not ``pair``\ ","2018-11 (San Diego)","|Complete|","","" "`LWG3031 `__","Algorithms and predicates with non-const reference arguments","2018-11 (San Diego)","","","" "`LWG3037 `__","``polymorphic_allocator``\ and incomplete types","2018-11 (San Diego)","|Complete|","16.0","" @@ -120,7 +120,7 @@ "`LWG3130 `__","|sect|\ [input.output] needs many ``addressof``\ ","2018-11 (San Diego)","|Complete|","20.0","" "`LWG3131 `__","``addressof``\ all the things","2018-11 (San Diego)","","","" "`LWG3132 `__","Library needs to ban macros named ``expects``\ or ``ensures``\ ","2018-11 (San Diego)","|Nothing To Do|","","" -"`LWG3134 `__","[fund.ts.v3] LFTSv3 contains extraneous [meta] variable templates that should have been deleted by P09961","2018-11 (San Diego)","Resolved by `P1210R0 
`__","","" +"`LWG3134 `__","[fund.ts.v3] LFTSv3 contains extraneous [meta] variable templates that should have been deleted by P09961","2018-11 (San Diego)","|Nothing To Do| [#note-LWG3134]_","","" "`LWG3137 `__","Header for ``__cpp_lib_to_chars``\ ","2018-11 (San Diego)","|Complete|","","" "`LWG3140 `__","``COMMON_REF``\ is unimplementable as specified","2018-11 (San Diego)","|Nothing To Do|","","" "`LWG3145 `__","``file_clock``\ breaks ABI for C++17 implementations","2018-11 (San Diego)","|Complete|","","" diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv index 40442f3b6fa50f..8aeff47830ece2 100644 --- a/libcxx/docs/Status/Cxx20Papers.csv +++ b/libcxx/docs/Status/Cxx20Papers.csv @@ -32,7 +32,7 @@ "`P0475R1 `__","LWG 2511: guaranteed copy elision for piecewise construction","2018-06 (Rapperswil)","|Complete|","","" "`P0476R2 `__","Bit-casting object representations","2018-06 (Rapperswil)","|Complete|","14.0","" "`P0528R3 `__","The Curious Case of Padding Bits, Featuring Atomic Compare-and-Exchange","2018-06 (Rapperswil)","","","" -"`P0542R5 `__","Support for contract based programming in C++","2018-06 (Rapperswil)","*Removed in Cologne*","n/a","" +"`P0542R5 `__","Support for contract based programming in C++","2018-06 (Rapperswil)","|Nothing To Do| [#note-P0542]_","n/a","" "`P0556R3 `__","Integral power-of-2 operations","2018-06 (Rapperswil)","|Complete|","9.0","" "`P0619R4 `__","Reviewing Deprecated Facilities of C++17 for C++20","2018-06 (Rapperswil)","|Partial| [#note-P0619]_","","" "`P0646R1 `__","Improving the Return Value of Erase-Like Algorithms","2018-06 (Rapperswil)","|Complete|","10.0","" @@ -40,7 +40,7 @@ "`P0758R1 `__","Implicit conversion traits and utility functions","2018-06 (Rapperswil)","|Complete|","","" "`P0759R1 `__","fpos Requirements","2018-06 (Rapperswil)","|Complete|","11.0","" "`P0769R2 `__","Add shift to ","2018-06 (Rapperswil)","|Complete|","12.0","" -"`P0788R3 `__","Standard Library Specification in a 
Concepts and Contracts World","2018-06 (Rapperswil)","*Removed in Cologne*","n/a","" +"`P0788R3 `__","Standard Library Specification in a Concepts and Contracts World","2018-06 (Rapperswil)","|Nothing To Do| [#note-P0788]_","n/a","" "`P0879R0 `__","Constexpr for swap and swap related functions Also resolves LWG issue 2800.","2018-06 (Rapperswil)","|Complete|","13.0","" "`P0887R1 `__","The identity metafunction","2018-06 (Rapperswil)","|Complete|","8.0","" "`P0892R2 `__","explicit(bool)","2018-06 (Rapperswil)","","","" @@ -85,7 +85,7 @@ "`P0340R3 `__","Making std::underlying_type SFINAE-friendly","2019-02 (Kona)","|Complete|","9.0","" "`P0738R2 `__","I Stream, You Stream, We All Stream for istream_iterator","2019-02 (Kona)","","","" "`P0811R3 `__","Well-behaved interpolation for numbers and pointers","2019-02 (Kona)","|Complete|","9.0","" -"`P0920R2 `__","Precalculated hash values in lookup","2019-02 (Kona)","Reverted by `P1661 `__","","" +"`P0920R2 `__","Precalculated hash values in lookup","2019-02 (Kona)","|Nothing To Do| [#note-P0920]_","","" "`P1001R2 `__","Target Vectorization Policies from Parallelism V2 TS to C++20","2019-02 (Kona)","|Complete|","17.0","" "`P1024R3 `__","Usability Enhancements for std::span","2019-02 (Kona)","|Complete|","9.0","" "`P1164R1 `__","Make create_directory() Intuitive","2019-02 (Kona)","|Complete|","12.0","" @@ -117,7 +117,7 @@ "`P1355R2 `__","Exposing a narrow contract for ceil2","2019-07 (Cologne)","|Complete|","9.0","" "`P1361R2 `__","Integration of chrono with text formatting","2019-07 (Cologne)","|Partial|","","" "`P1423R3 `__","char8_t backward compatibility remediation","2019-07 (Cologne)","|Complete|","15.0","" -"`P1424R1 `__","'constexpr' feature macro concerns","2019-07 (Cologne)","Superseded by `P1902 `__","","" +"`P1424R1 `__","'constexpr' feature macro concerns","2019-07 (Cologne)","|Nothing To Do| [#note-P1424]_","","" "`P1466R3 `__","Miscellaneous minor fixes for chrono","2019-07 (Cologne)","","","" "`P1474R1 
`__","Helpful pointers for ContiguousIterator","2019-07 (Cologne)","|Complete|","15.0","|ranges|" "`P1502R1 `__","Standard library header units for C++20","2019-07 (Cologne)","","","" diff --git a/libcxx/docs/Status/Cxx23.rst b/libcxx/docs/Status/Cxx23.rst index 23d30c8128d71e..b3918149a735f1 100644 --- a/libcxx/docs/Status/Cxx23.rst +++ b/libcxx/docs/Status/Cxx23.rst @@ -46,6 +46,9 @@ Paper Status .. [#note-P2520R0] P2520R0: Libc++ implemented this paper as a DR in C++20 as well. .. [#note-P2711R1] P2711R1: ``join_with_view`` hasn't been done yet since this type isn't implemented yet. .. [#note-P2770R0] P2770R0: ``join_with_view`` hasn't been done yet since this type isn't implemented yet. + .. [#note-LWG3494] LWG3494: That LWG issue was superseded by `P2017R1 `__. + .. [#note-LWG3481] LWG3481: That LWG issue was superseded by `P2415R2 `__. + .. [#note-LWG3265] LWG3265: That LWG issue was resolved by `LWG3435 `__. .. [#note-P2693R1] P2693R1: The formatter for ``std::thread::id`` is implemented. The formatter for ``stacktrace`` is not implemented, since ``stacktrace`` is not implemented yet. 
diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv index 16471406f41588..a0a9ccdca48c3c 100644 --- a/libcxx/docs/Status/Cxx23Issues.csv +++ b/libcxx/docs/Status/Cxx23Issues.csv @@ -5,7 +5,7 @@ "`LWG3195 `__","What is the stored pointer value of an empty ``weak_ptr``?","2020-11 (Virtual)","|Nothing To Do|","","" "`LWG3211 `__","``std::tuple<>`` should be trivially constructible","2020-11 (Virtual)","|Complete|","9.0","" "`LWG3236 `__","Random access iterator requirements lack limiting relational operators domain to comparing those from the same range","2020-11 (Virtual)","|Nothing To Do|","","" -"`LWG3265 `__","``move_iterator``'s conversions are more broken after P1207","2020-11 (Virtual)","Resolved by `LWG3435 `__","","" +"`LWG3265 `__","``move_iterator``'s conversions are more broken after P1207","2020-11 (Virtual)","|Nothing To Do| [#note-LWG3265]_","","" "`LWG3435 `__","``three_way_comparable_with, reverse_iterator>``","2020-11 (Virtual)","|Complete|","13.0","" "`LWG3432 `__","Missing requirement for ``comparison_category``","2020-11 (Virtual)","|Complete|","16.0","|spaceship|" "`LWG3447 `__","Deduction guides for ``take_view`` and ``drop_view`` have different constraints","2020-11 (Virtual)","|Complete|","14.0","|ranges|" @@ -54,7 +54,7 @@ "`LWG3433 `__","``subrange::advance(n)`` has UB when ``n < 0``","2021-02 (Virtual)","|Complete|","14.0","|ranges|" "`LWG3490 `__","``ranges::drop_while_view::begin()`` is missing a precondition","2021-02 (Virtual)","|Nothing To Do|","","|ranges|" "`LWG3492 `__","Minimal improvements to ``elements_view::iterator``","2021-02 (Virtual)","|Complete|","16.0","|ranges|" -"`LWG3494 `__","Allow ranges to be conditionally borrowed","2021-02 (Virtual)","Superseded by `P2017R1 `__","","|ranges|" +"`LWG3494 `__","Allow ranges to be conditionally borrowed","2021-02 (Virtual)","|Nothing To Do| [#note-LWG3494]_","","|ranges|" "`LWG3495 `__","``constexpr launder`` makes pointers to inactive members of 
unions usable","2021-02 (Virtual)","|Nothing To Do|","","" "`LWG3500 `__","``join_view::iterator::operator->()`` is bogus","2021-02 (Virtual)","|Complete|","14.0","|ranges|" "`LWG3502 `__","``elements_view`` should not be allowed to return dangling reference","2021-02 (Virtual)","|Complete|","16.0","|ranges|" @@ -66,7 +66,7 @@ "`LWG3410 `__","``lexicographical_compare_three_way`` is overspecified","2021-06 (Virtual)","|Complete|","17.0","|spaceship|" "`LWG3430 `__","``std::fstream`` & co. should be constructible from string_view","2021-06 (Virtual)","|Complete|","19.0","" "`LWG3462 `__","§[formatter.requirements]: Formatter requirements forbid use of ``fc.arg()``","2021-06 (Virtual)","|Nothing To Do|","","|format|" -"`LWG3481 `__","``viewable_range`` mishandles lvalue move-only views","2021-06 (Virtual)","Superseded by `P2415R2 `__","","|ranges|" +"`LWG3481 `__","``viewable_range`` mishandles lvalue move-only views","2021-06 (Virtual)","|Nothing To Do| [#note-LWG3481]_","","|ranges|" "`LWG3506 `__","Missing allocator-extended constructors for ``priority_queue``","2021-06 (Virtual)","|Complete|","14.0","" "`LWG3517 `__","``join_view::iterator``'s ``iter_swap`` is underconstrained","2021-06 (Virtual)","|Complete|","14.0","|ranges|" "`LWG3518 `__","Exception requirements on char trait operations unclear","2021-06 (Virtual)","|Nothing To Do|","","" From ae48affd25ac8e211a5bc1c72ef208615fc7eb7d Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 21 Aug 2024 10:46:21 -0700 Subject: [PATCH 095/426] [RISCV] Minor style fixes in lowerVectorMaskVecReduction [nfc] Reuse existing routine to avoid duplication, and reduce variable scopes. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 66ea6423097ab2..670dee2edb1dfb 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -9409,10 +9409,7 @@ SDValue RISCVTargetLowering::lowerVectorMaskVecReduction(SDValue Op, getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); } - unsigned BaseOpc; ISD::CondCode CC; - SDValue Zero = DAG.getConstant(0, DL, XLenVT); - switch (Op.getOpcode()) { default: llvm_unreachable("Unhandled reduction"); @@ -9423,7 +9420,6 @@ SDValue RISCVTargetLowering::lowerVectorMaskVecReduction(SDValue Op, Vec = DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Vec, TrueMask, VL); Vec = DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, Vec, Mask, VL); CC = ISD::SETEQ; - BaseOpc = ISD::AND; break; } case ISD::VECREDUCE_OR: @@ -9431,7 +9427,6 @@ SDValue RISCVTargetLowering::lowerVectorMaskVecReduction(SDValue Op, // vcpop x != 0 Vec = DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, Vec, Mask, VL); CC = ISD::SETNE; - BaseOpc = ISD::OR; break; case ISD::VECREDUCE_XOR: case ISD::VP_REDUCE_XOR: { @@ -9440,11 +9435,11 @@ SDValue RISCVTargetLowering::lowerVectorMaskVecReduction(SDValue Op, Vec = DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, Vec, Mask, VL); Vec = DAG.getNode(ISD::AND, DL, XLenVT, Vec, One); CC = ISD::SETNE; - BaseOpc = ISD::XOR; break; } } + SDValue Zero = DAG.getConstant(0, DL, XLenVT); SDValue SetCC = DAG.getSetCC(DL, XLenVT, Vec, Zero, CC); SetCC = DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), SetCC); @@ -9457,6 +9452,7 @@ SDValue RISCVTargetLowering::lowerVectorMaskVecReduction(SDValue Op, // 0 for an inactive vector, and so we've already received the neutral value: // AND gives us (0 == 0) -> 1 and OR/XOR give us (0 != 0) -> 0. Therefore we // can simply include the start value. 
+ unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode()); return DAG.getNode(BaseOpc, DL, Op.getValueType(), SetCC, Op.getOperand(0)); } From c975dc1da03d684604ddf787b07b63fb8e903648 Mon Sep 17 00:00:00 2001 From: Harini0924 <79345568+Harini0924@users.noreply.github.com> Date: Wed, 21 Aug 2024 10:48:32 -0700 Subject: [PATCH 096/426] [clang] [test] Use lit Syntax for Environment Variables in Clang subproject (#102647) This patch updates the clang tests by replacing shell command substitutions with lit-compatible syntax for setting and referencing environment variables. Specifically, the use of shell-style variable substitution (e.g., `DEFAULT_TRIPLE=`and `EXPECTED_RESOURCE_DIR=`) has been replaced with `env` and `%{env}` to align with lit's internal shell requirements. These changes ensure that environment variables are properly set and accessed within the lit environment. When using the lit internal shell with the command `LIT_USE_INTERNAL_SHELL=1 ninja check-clang`, one common error encountered is: ``` FAIL: Clang :: Driver/program-path-priority.c (19 of 20640) ******************** TEST 'Clang :: Driver/program-path-priority.c' FAILED ******************** Exit Code: 127 Command Output (stdout): -- # RUN: at line 90 DEFAULT_TRIPLE=`/usr/local/google/home/harinidonthula/llvm-project/build/tools/clang/test/Driver/Output/program-path-priority.c.tmp/clang --version | grep "Target:" | cut -d ' ' -f2` # executed command: 'DEFAULT_TRIPLE=`/usr/local/google/home/harinidonthula/llvm-project/build/tools/clang/test/Driver/Output/program-path-priority.c.tmp/clang' --version # .---command stderr------------ # | 'DEFAULT_TRIPLE=`/usr/local/google/home/harinidonthula/llvm-project/build/tools/clang/test/Driver/Output/program-path-priority.c.tmp/clang': command not found # `----------------------------- # error: command failed with exit status: 127 ``` To fix this issue, the patch replaces traditional shell substitutions with lit's environment variable handling, ensuring 
compatibility with the lit internal shell framework. This update applies to both the handling of the `DEFAULT_TRIPLE` and `EXPECTED_RESOURCE_DIR` variables, allowing the tests to pass when using the lit internal shell. The patch also adds `env` to the `PWD` variable setting in the following command to ensure the environment variable is correctly set within the lit internal shell: ``` // RUN: %if system-linux %{ env PWD=/proc/self/cwd %clang -### -c --coverage %s -o foo/bar.o 2>&1 | FileCheck --check-prefix=PWD %s %} ``` fixes: #102395 [link to RFC](https://discourse.llvm.org/t/rfc-enabling-the-lit-internal-shell-by-default/80179) --- clang/test/ClangScanDeps/pr61006.cppm | 10 +++++----- clang/test/Driver/coverage.c | 4 ++-- clang/test/Driver/program-path-priority.c | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/clang/test/ClangScanDeps/pr61006.cppm b/clang/test/ClangScanDeps/pr61006.cppm index f75edd38c81ba9..9ce6edaf2010e1 100644 --- a/clang/test/ClangScanDeps/pr61006.cppm +++ b/clang/test/ClangScanDeps/pr61006.cppm @@ -6,13 +6,13 @@ // RUN: mkdir -p %t // RUN: split-file %s %t // -// RUN: EXPECTED_RESOURCE_DIR=`%clang -print-resource-dir` && \ +// RUN: %clang -print-resource-dir > %t/resource-dir.txt && \ // RUN: ln -s %clang++ %t/clang++ && \ -// RUN: sed "s|EXPECTED_RESOURCE_DIR|$EXPECTED_RESOURCE_DIR|g; s|DIR|%/t|g" %t/P1689.json.in > %t/P1689.json && \ -// RUN: clang-scan-deps -compilation-database %t/P1689.json -format=p1689 | FileCheck %t/a.cpp -DPREFIX=%/t && \ -// RUN: clang-scan-deps -format=p1689 \ +// RUN: sed "s|EXPECTED_RESOURCE_DIR|%{readfile:%t/resource-dir.txt}|g; s|DIR|%/t|g" %t/P1689.json.in > %t/P1689.json && \ +// RUN: env EXPECTED_RESOURCE_DIR=%{readfile:%t/resource-dir.txt} clang-scan-deps -compilation-database %t/P1689.json -format=p1689 | FileCheck %t/a.cpp -DPREFIX=%/t && \ +// RUN: env EXPECTED_RESOURCE_DIR=%{readfile:%t/resource-dir.txt} clang-scan-deps -format=p1689 \ // RUN: -- %t/clang++ -std=c++20 -c 
-fprebuilt-module-path=%t %t/a.cpp -o %t/a.o \ -// RUN: -resource-dir $EXPECTED_RESOURCE_DIR | FileCheck %t/a.cpp -DPREFIX=%/t +// RUN: -resource-dir %{env:EXPECTED_RESOURCE_DIR} | FileCheck %t/a.cpp -DPREFIX=%/t //--- P1689.json.in [ diff --git a/clang/test/Driver/coverage.c b/clang/test/Driver/coverage.c index e5ed064aab457c..ab791ada2d351a 100644 --- a/clang/test/Driver/coverage.c +++ b/clang/test/Driver/coverage.c @@ -18,7 +18,7 @@ // GCNO-LOCATION-REL: "-coverage-notes-file={{.*}}{{/|\\\\}}foo/bar.gcno" /// GCC allows PWD to change the paths. -// RUN: %if system-linux %{ PWD=/proc/self/cwd %clang -### -c --coverage %s -o foo/bar.o 2>&1 | FileCheck --check-prefix=PWD %s %} +// RUN: %if system-linux %{ env PWD=/proc/self/cwd %clang -### -c --coverage %s -o foo/bar.o 2>&1 | FileCheck --check-prefix=PWD %s %} // PWD: "-coverage-notes-file=/proc/self/cwd/foo/bar.gcno" "-coverage-data-file=/proc/self/cwd/foo/bar.gcda" /// Don't warn -Wunused-command-line-argument. @@ -50,6 +50,6 @@ // LINK2: -cc1{{.*}} "-coverage-notes-file={{.*}}{{/|\\\\}}f/gb.gcno" "-coverage-data-file={{.*}}{{/|\\\\}}f/gb.gcda" /// GCC allows PWD to change the paths. 
-// RUN: %if system-linux %{ PWD=/proc/self/cwd %clang -### --coverage d/a.c d/b.c -o e/x -fprofile-dir=f 2>&1 | FileCheck %s --check-prefix=LINK3 %} +// RUN: %if system-linux %{ env PWD=/proc/self/cwd %clang -### --coverage d/a.c d/b.c -o e/x -fprofile-dir=f 2>&1 | FileCheck %s --check-prefix=LINK3 %} // LINK3: -cc1{{.*}} "-coverage-notes-file=/proc/self/cwd/e/x-a.gcno" "-coverage-data-file=f/proc/self/cwd/e/x-a.gcda" // LINK3: -cc1{{.*}} "-coverage-notes-file=/proc/self/cwd/e/x-b.gcno" "-coverage-data-file=f/proc/self/cwd/e/x-b.gcda" diff --git a/clang/test/Driver/program-path-priority.c b/clang/test/Driver/program-path-priority.c index c940c4ced94420..358a06d7c6d1b5 100644 --- a/clang/test/Driver/program-path-priority.c +++ b/clang/test/Driver/program-path-priority.c @@ -87,8 +87,8 @@ /// -gcc has lowest priority so -gcc /// on PATH beats default triple in program path -// RUN: DEFAULT_TRIPLE=`%t/clang --version | grep "Target:" | cut -d ' ' -f2` -// RUN: touch %t/$DEFAULT_TRIPLE-gcc && chmod +x %t/$DEFAULT_TRIPLE-gcc +// RUN: %t/clang --version | grep "Target:" | cut -d ' ' -f2 > %t/default-triple.txt +// RUN: env DEFAULT_TRIPLE=%{readfile:%t/default-triple.txt} touch %t/%{env:DEFAULT_TRIPLE}-gcc && chmod +x %t/%{env:DEFAULT_TRIPLE}-gcc // RUN: touch %t/%target_triple-gcc && chmod +x %t/%target_triple-gcc // RUN: env "PATH=%t/env/" %t/clang -### -target notreal-none-elf %s 2>&1 | \ // RUN: FileCheck --check-prefix=DEFAULT_TRIPLE_GCC %s From b89fef8f67974ebcd4114fa75ac2e53fd687870c Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Wed, 21 Aug 2024 10:50:39 -0700 Subject: [PATCH 097/426] [libc][docs] Update docs to reflect new headergen (#102381) Since new headergen is now the default for building LLVM-libc, the docs need to be updated to reflect that. While I was editing those docs, I took a quick pass at updating other out-of-date pages. 
--- libc/docs/build_and_test.rst | 7 -- libc/docs/contributing.rst | 14 +-- libc/docs/dev/api_test.rst | 25 ---- libc/docs/dev/ground_truth_specification.rst | 11 -- libc/docs/dev/header_generation.rst | 2 + libc/docs/dev/index.rst | 3 - libc/docs/dev/mechanics_of_public_api.rst | 29 ----- libc/docs/dev/source_tree_layout.rst | 24 ++-- libc/docs/full_cross_build.rst | 115 +++---------------- libc/docs/full_host_build.rst | 87 ++++++++++++-- libc/docs/fullbuild_mode.rst | 5 + libc/docs/gpu/building.rst | 6 +- libc/docs/index.rst | 17 +-- libc/docs/overlay_mode.rst | 36 +++--- libc/docs/porting.rst | 15 --- 15 files changed, 154 insertions(+), 242 deletions(-) delete mode 100644 libc/docs/dev/api_test.rst delete mode 100644 libc/docs/dev/ground_truth_specification.rst delete mode 100644 libc/docs/dev/mechanics_of_public_api.rst diff --git a/libc/docs/build_and_test.rst b/libc/docs/build_and_test.rst index 22b09b07d9612d..ccd8b5bbee4759 100644 --- a/libc/docs/build_and_test.rst +++ b/libc/docs/build_and_test.rst @@ -38,13 +38,6 @@ The libc can be built and tested in two different modes: $> ninja libc-integration-tests - #. API verification test - See :ref:`api_test` for more information about - the API test. It can be run by the command: - - .. code-block:: sh - - $> ninja libc-api-test - Building with VSCode ==================== diff --git a/libc/docs/contributing.rst b/libc/docs/contributing.rst index bd7d9d79be57d7..a674290cf6dc03 100644 --- a/libc/docs/contributing.rst +++ b/libc/docs/contributing.rst @@ -4,7 +4,7 @@ Contributing to the libc Project ================================ -LLVM's libc is being developed as part of the LLVM project so contributions +LLVM-libc is being developed as part of the LLVM project so contributions to the libc project should also follow the general LLVM `contribution guidelines `_. Below is a list of open projects that one can start with: @@ -31,24 +31,12 @@ a list of open projects that one can start with: directory. 
So, a simple but mechanical project would be to move the parts following the old styles to the new style. -#. **Integrating with the rest of the LLVM project** - There are two parts to - this project: - - #. One is about adding CMake facilities to optionally link the libc's overlay - static archive (see :ref:`overlay_mode`) with other LLVM tools/executables. - #. The other is about putting plumbing in place to release the overlay static - archive (see :ref:`overlay_mode`) as part of the LLVM binary releases. - #. **Implement Linux syscall wrappers** - A large portion of the POSIX API can be implemented as syscall wrappers on Linux. A good number have already been implemented but many more are yet to be implemented. So, a project of medium complexity would be to implement syscall wrappers which have not yet been implemented. -#. **Add a better random number generator** - The current random number - generator has a very small range. This has to be improved or switched over - to a fast random number generator with a large range. - #. **Update the clang-tidy lint rules and use them in the build and/or CI** - Currently, the :ref:`clang_tidy_checks` have gone stale and are mostly unused by the developers and on the CI builders. This project is about updating diff --git a/libc/docs/dev/api_test.rst b/libc/docs/dev/api_test.rst deleted file mode 100644 index 3191a32b7e3fb1..00000000000000 --- a/libc/docs/dev/api_test.rst +++ /dev/null @@ -1,25 +0,0 @@ -.. _api_test: - -======== -API Test -======== - -.. warning:: - This page is severely out of date. Much of the information it contains may be - incorrect. Please only remove this warning once the page has been updated. - -The implementation of libc-project is unique because our public C header files -are generated using information from ground truth captured in TableGen files. 
-Unit tests only exercise the internal C++ implementations and don't ensure the -headers were generated by the build system and that the generated header files -contain the expected declarations and definitions. A simple solution is to have -contributors write an integration test for each individual function as a C -program; however, this would place a large burden on contributors and duplicates -some effort from the unit tests. - -Instead we automate the generation of what we call as an API test. This API test -ensures that public facing symbols are visible, that the header files are -generated as expected, and that each libc function has the correct function -prototype as specified by the standards. The API test cmake rules are located in -``test/src/CMakeLists.txt``. The source file for the API test is generated in -``/projects/libc/test/src/public_api_test.cpp`` diff --git a/libc/docs/dev/ground_truth_specification.rst b/libc/docs/dev/ground_truth_specification.rst deleted file mode 100644 index f2540b6f78e715..00000000000000 --- a/libc/docs/dev/ground_truth_specification.rst +++ /dev/null @@ -1,11 +0,0 @@ -The ground truth of standards -============================= - -Like any modern libc, LLVM libc also supports a wide number of standards and -extensions. To avoid developing headers, wrappers and sources in a disjointed -fashion, LLVM libc employs ground truth files. These files live under the -``spec`` directory and list ground truth corresponding the ISO C standard, the -POSIX extension standard, etc. For example, the path to the ground truth file -for the ISO C standard is ``spec/stdc.td``. Tools like the header generator -(described in the header generation document), docs generator, etc. use the -ground truth files to generate headers, docs etc. 
diff --git a/libc/docs/dev/header_generation.rst b/libc/docs/dev/header_generation.rst index 735db2d291ff16..ec4206217ca777 100644 --- a/libc/docs/dev/header_generation.rst +++ b/libc/docs/dev/header_generation.rst @@ -1,3 +1,5 @@ +.. _header_generation: + Generating Public and Internal headers ====================================== diff --git a/libc/docs/dev/index.rst b/libc/docs/dev/index.rst index 87712afcae2ac6..c16121feb3a45d 100644 --- a/libc/docs/dev/index.rst +++ b/libc/docs/dev/index.rst @@ -15,10 +15,7 @@ Navigate to the links below for information on the respective topics: config_options clang_tidy_checks fuzzing - ground_truth_specification header_generation implementation_standard undefined_behavior printf_behavior - api_test - mechanics_of_public_api diff --git a/libc/docs/dev/mechanics_of_public_api.rst b/libc/docs/dev/mechanics_of_public_api.rst deleted file mode 100644 index 257ab3d71bc17a..00000000000000 --- a/libc/docs/dev/mechanics_of_public_api.rst +++ /dev/null @@ -1,29 +0,0 @@ -The mechanics of the ``public_api`` command -=========================================== - -The build system, in combination with the header generation mechanism, -facilitates the fine grained ability to pick and choose the public API one wants -to expose on their platform. The public header files are always generated from -the corresponding ``.h.def`` files. A header generation command ``%%public_api`` -is listed in these files. In the generated header file, the header generator -replaces this command with the public API relevant for the target platform. - -Under the hood --------------- - -When the header generator sees the ``%%public_api`` command, it looks up the -API config file for the platform in the path ``config//api.td``. -The API config file lists two kinds of items: - -1. The list of standards from which the public entities available on the platform - are derived from. -2. 
For each header file exposed on the platform, the list of public members - provided in that header file. - -Note that, the header generator only learns the names of the public entities -from the header config file (the 2nd item from above.) The exact manner in which -the entities are to be declared is got from the standards (the 1st item from -above.) - -See the ground truth document for more information on how the standards are -formally listed in LLVM libc using LLVM table-gen files. diff --git a/libc/docs/dev/source_tree_layout.rst b/libc/docs/dev/source_tree_layout.rst index 0bcedc96a133c3..8b423a1712cc81 100644 --- a/libc/docs/dev/source_tree_layout.rst +++ b/libc/docs/dev/source_tree_layout.rst @@ -14,9 +14,10 @@ directories:: - docs - examples - fuzzing + - hdr - include - lib - - spec + - newhdrgen - src - startup - test @@ -62,6 +63,14 @@ The directory structure within this directory mirrors the directory structure of the top-level ``libc`` directory itself. For more details, see :doc:`fuzzing`. +The ``hdr`` directory +--------------------- + +This directory contains proxy headers which are included from the files in the +src directory. These proxy headers either include our internal type or macro +definitions, or the system's type or macro definitions, depending on if we are +in fullbuild or overlay mode. + The ``include`` directory ------------------------- @@ -80,13 +89,14 @@ The ``lib`` directory This directory contains a ``CMakeLists.txt`` file listing the targets for the public libraries ``libc.a``, ``libm.a`` etc. -The ``spec`` directory ----------------------- +The ``newhdrgen`` directory +--------------------------- -This directory contains the specifications for the types, macros, and entrypoint -functions. These definitions come from the various standards and extensions -LLVM-libc supports, and they are used along with the ``*.h.def`` files and the -config files to generate the headers for fullbuild mode. 
+This directory contains the sources and specifications for the types, macros +and entrypoint functions. These definitions are organized in the ``yaml`` +subdirectory and match the organization of the ``*.h.def`` files. This folder +also contains the python sources for new headergen, which is what generates the +headers. The ``src`` directory --------------------- diff --git a/libc/docs/full_cross_build.rst b/libc/docs/full_cross_build.rst index 100e17a977e764..5f57169d228ef7 100644 --- a/libc/docs/full_cross_build.rst +++ b/libc/docs/full_cross_build.rst @@ -8,35 +8,33 @@ Full Cross Build :depth: 1 :local: +.. note:: + Fullbuild requires running headergen, which is a python program that depends on + pyyaml. The minimum versions are listed on the :ref:`header_generation` + page, as well as additional information. + In this document, we will present recipes to cross build the full libc. When we say *cross build* a full libc, we mean that we will build the full libc for a target system which is not the same as the system on which the libc is being built. For example, you could be building for a bare metal aarch64 *target* on a Linux x86_64 *host*. -There are three main recipes to cross build the full libc. Each one serves a +There are two main recipes to cross build the full libc. Each one serves a different use case. Below is a short description of these recipes to help users pick the recipe that best suites their needs and contexts. * **Standalone cross build** - Using this recipe one can build the libc using a compiler of their choice. One should use this recipe if their compiler can build for the host as well as the target. -* **Runtimes cross build** - In this recipe, one will have to first build the - libc build tools for the host separately and then use those build tools to - build the libc. Users can use the compiler of their choice to build the - libc build tools as well as the libc. 
One should use this recipe if they - have to use a host compiler to build the build tools for the host and then - use a target compiler (which is different from the host compiler) to build - the libc. * **Bootstrap cross build** - In this recipe, one will build the ``clang`` compiler and the libc build tools for the host first, and then use them to - build the libc for the target. Unlike with the runtimes build recipe, the - user does not have explicitly build ``clang`` and other libc build tools. + build the libc for the target. Unlike with the standalone build recipe, the + user does not have explicitly build ``clang`` and other build tools. They get built automatically before building the libc. One should use this recipe if they intend use the built ``clang`` and the libc as part of their toolchain for the target. -The following sections present the three recipes in detail. +The following sections present the two recipes in detail. Standalone cross build ====================== @@ -61,9 +59,9 @@ Below is the CMake command to configure the standalone crossbuild of the libc. $> cd build $> C_COMPILER= # For example "clang" $> CXX_COMPILER= # For example "clang++" - $> cmake ../llvm \ + $> cmake ../runtimes \ -G Ninja \ - -DLLVM_ENABLE_PROJECTS=libc \ + -DLLVM_ENABLE_RUNTIMES=libc \ -DCMAKE_C_COMPILER=$C_COMPILER \ -DCMAKE_CXX_COMPILER=$CXX_COMPILER \ -DLLVM_LIBC_FULL_BUILD=ON \ @@ -72,8 +70,8 @@ Below is the CMake command to configure the standalone crossbuild of the libc. We will go over the special options passed to the ``cmake`` command above. -* **Enabled Projects** - Since we want to build the libc project, we list - ``libc`` as the enabled project. +* **Enabled Runtimes** - Since we want to build LLVM-libc, we list + ``libc`` as the enabled runtime. * **The full build option** - Since we want to build the full libc, we pass ``-DLLVM_LIBC_FULL_BUILD=ON``. 
* **The target triple** - This is the target triple of the target for which @@ -94,88 +92,6 @@ The above ``ninja`` command will build the libc static archives ``libc.a`` and ``libm.a`` for the target specified with ``-DLIBC_TARGET_TRIPLE`` in the CMake configure step. -.. _runtimes_cross_build: - -Runtimes cross build -==================== - -The *runtimes cross build* is very similar to the standalone crossbuild but the -user will have to first build the libc build tools for the host separately. One -should use this recipe if they want to use a different host and target compiler. -Note that the libc build tools MUST be in sync with the libc. That is, the -libc build tools and the libc, both should be built from the same source -revision. At the time of this writing, there is only one libc build tool that -has to be built separately. It is done as follows: - -.. code-block:: sh - - $> cd llvm-project # The llvm-project checkout - $> mkdir build-libc-tools # A different build directory for the build tools - $> cd build-libc-tools - $> HOST_C_COMPILER= # For example "clang" - $> HOST_CXX_COMPILER= # For example "clang++" - $> cmake ../llvm \ - -G Ninja \ - -DLLVM_ENABLE_PROJECTS=libc \ - -DCMAKE_C_COMPILER=$HOST_C_COMPILER \ - -DCMAKE_CXX_COMPILER=$HOST_CXX_COMPILER \ - -DLLVM_LIBC_FULL_BUILD=ON \ - -DCMAKE_BUILD_TYPE=Debug # User can choose to use "Release" build type - $> ninja libc-hdrgen - -The above commands should build a binary named ``libc-hdrgen``. Copy this binary -to a directory of your choice. - -CMake configure step --------------------- - -After copying the ``libc-hdrgen`` binary to say ``/path/to/libc-hdrgen``, -configure the libc build using the following command: - -.. 
code-block:: sh - - $> cd llvm-project # The llvm-project checkout - $> mkdir build - $> cd build - $> TARGET_C_COMPILER= - $> TARGET_CXX_COMPILER= - $> HDRGEN= - $> TARGET_TRIPLE= - $> cmake ../runtimes \ - -G Ninja \ - -DLLVM_ENABLE_RUNTIMES=libc \ - -DCMAKE_C_COMPILER=$TARGET_C_COMPILER \ - -DCMAKE_CXX_COMPILER=$TARGET_CXX_COMPILER \ - -DLLVM_LIBC_FULL_BUILD=ON \ - -DLIBC_HDRGEN_EXE=$HDRGEN \ - -DLIBC_TARGET_TRIPLE=$TARGET_TRIPLE \ - -DCMAKE_BUILD_TYPE=Debug # User can choose to use "Release" build type - -Note the differences in the above cmake command versus the one used in the -CMake configure step of the standalone build recipe: - -* Instead of listing ``libc`` in ``LLVM_ENABLED_PROJECTS``, we list it in - ``LLVM_ENABLED_RUNTIMES``. -* Instead of using ``llvm-project/llvm`` as the root CMake source directory, - we use ``llvm-project/runtimes`` as the root CMake source directory. -* The path to the ``libc-hdrgen`` binary built earlier is specified with - ``-DLIBC_HDRGEN_EXE=/path/to/libc-hdrgen``. - -Build step ----------- - -The build step in the runtimes build recipe is exactly the same as that of -the standalone build recipe: - -.. code-block:: sh - - $> ninja libc libm - -As with the standalone build recipe, the above ninja command will build the -libc static archives for the target specified with ``-DLIBC_TARGET_TRIPLE`` in -the CMake configure step. - - Bootstrap cross build ===================== @@ -203,8 +119,7 @@ CMake configure step -DLLVM_RUNTIME_TARGETS=$TARGET_TRIPLE \ -DCMAKE_BUILD_TYPE=Debug -Note how the above cmake command differs from the one used in the other two -recipes: +Note how the above cmake command differs from the one used in the other recipe: * ``clang`` is listed in ``-DLLVM_ENABLE_PROJECTS`` and ``libc`` is listed in ``-DLLVM_ENABLE_RUNTIMES``. @@ -214,7 +129,7 @@ recipes: Build step ---------- -The build step is similar to the other two recipes: +The build step is similar to the other recipe: .. 
code-block:: sh diff --git a/libc/docs/full_host_build.rst b/libc/docs/full_host_build.rst index 4fb3072590f322..f687c2fdab213e 100644 --- a/libc/docs/full_host_build.rst +++ b/libc/docs/full_host_build.rst @@ -8,17 +8,90 @@ Full Host Build :depth: 1 :local: +.. note:: + Fullbuild requires running headergen, which is a python program that depends on + pyyaml. The minimum versions are listed on the :ref:`header_generation` + page, as well as additional information. + In this document, we will present a recipe to build the full libc for the host. When we say *build the libc for the host*, the goal is to build the libc for -the same system on which the libc is being built. Also, we will take this -opportunity to demonstrate how one can set up a *sysroot* (see the documentation +the same system on which the libc is being built. First, we will explain how to +build for developing LLVM-libc, then we will explain how to build LLVM-libc as +part of a complete toolchain. + +Configure the build for development +=================================== + + +Below is the list of commands for a simple recipe to build LLVM-libc for +development. In this we've set the Ninja generator, set the build type to +"Debug", and enabled the Scudo allocator. This build also enables generating the +documentation and verbose cmake logging, which are useful development features. + +.. note:: + if your build fails with an error saying the compiler can't find + ```` or similar then you're probably missing the symlink from + ``/usr/include/asm`` to ``/usr/include//asm``. Installing the + ``gcc-multilib`` package creates this symlink, or you can do it manually with + this command: + ``sudo ln -s /usr/include//asm /usr/include/asm`` + (your host triple will probably be similar to ``x86_64-linux-gnu``) + +.. 
code-block:: sh + + $> cd llvm-project # The llvm-project checkout + $> mkdir build + $> cd build + $> cmake ../runtimes \ + -G Ninja \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DLLVM_ENABLE_RUNTIMES="libc;compiler-rt" \ + -DLLVM_LIBC_FULL_BUILD=ON \ + -DCMAKE_BUILD_TYPE=Debug \ + -DLLVM_LIBC_INCLUDE_SCUDO=ON \ + -DCOMPILER_RT_BUILD_SCUDO_STANDALONE_WITH_LLVM_LIBC=ON \ + -DCOMPILER_RT_BUILD_GWP_ASAN=OFF \ + -DCOMPILER_RT_SCUDO_STANDALONE_BUILD_SHARED=OFF \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -DLLVM_ENABLE_SPHINX=ON -DLIBC_INCLUDE_DOCS=ON \ + -DLIBC_CMAKE_VERBOSE_LOGGING=ON + +Build and test +============== + +After configuring the build with the above ``cmake`` command, one can build test +libc with the following command: + +.. code-block:: sh + + $> ninja libc libm check-libc + +To build the docs run this command: + + +.. code-block:: sh + + $> ninja docs-libc-html + +To run a specific test, use the following: + +.. code-block:: sh + + $> ninja libc.test.src.
._test.__unit__ + $> ninja libc.test.src.ctype.isalpha_test.__unit__ # EXAMPLE + +Configure the complete toolchain build +====================================== + +For a complete toolchain we recommend creating a *sysroot* (see the documentation of the ``--sysroot`` option here: ``_) which includes not only the components of LLVM's libc, but also a full LLVM only toolchain consisting of the `clang `_ compiler, the `lld `_ linker and the -`compiler-rt `_ runtime libraries. LLVM's libc is -not yet complete enough to allow using and linking a C++ application against +`compiler-rt `_ runtime libraries. LLVM-libc is +not quite complete enough to allow using and linking a C++ application against a C++ standard library (like libc++). Hence, we do not include `libc++ `_ in the sysroot. @@ -26,9 +99,6 @@ a C++ standard library (like libc++). Hence, we do not include `libc++ `_, libcxx-abi and libunwind in the LLVM only toolchain and use them to build and link C++ applications. -Configure the full libc build -=============================== - Below is the list of commands for a simple recipe to build and install the libc components along with other components of an LLVM only toolchain. In this we've set the Ninja generator, enabled a full compiler suite, set the build @@ -43,6 +113,7 @@ to use the freshly built lld and compiler-rt. this command: ``sudo ln -s /usr/include//asm /usr/include/asm`` +.. TODO: Move from projects to runtimes for libc, compiler-rt .. code-block:: sh $> cd llvm-project # The llvm-project checkout @@ -51,7 +122,7 @@ to use the freshly built lld and compiler-rt. $> SYSROOT=/path/to/sysroot # Remember to set this! 
$> cmake ../llvm \ -G Ninja \ - -DLLVM_ENABLE_PROJECTS="clang;libc;lld;compiler-rt" \ + -DLLVM_ENABLE_PROJECTS="clang;lld;libc;compiler-rt" \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ \ diff --git a/libc/docs/fullbuild_mode.rst b/libc/docs/fullbuild_mode.rst index b1151017fbc794..d5c62172dac8e7 100644 --- a/libc/docs/fullbuild_mode.rst +++ b/libc/docs/fullbuild_mode.rst @@ -4,6 +4,11 @@ Fullbuild Mode ============== +.. note:: + Fullbuild requires running headergen, which is a python program that depends on + pyyaml. The minimum versions are listed on the :ref:`header_generation` + page, as well as additional information. + The *fullbuild* mode of LLVM's libc is the mode in which it is to be used as the only libc (as opposed to the :ref:`overlay_mode` in which it is used along with the system libc.) In order to use it as the only libc, one will have to diff --git a/libc/docs/gpu/building.rst b/libc/docs/gpu/building.rst index 60498e348395a3..37dccdab6dc340 100644 --- a/libc/docs/gpu/building.rst +++ b/libc/docs/gpu/building.rst @@ -63,9 +63,13 @@ targeting the default host environment as well. Runtimes cross build -------------------- +.. note:: + These instructions need to be updated for new headergen. They may be + inaccurate. + For users wanting more direct control over the build process, the build steps can be done manually instead. This build closely follows the instructions in the -:ref:`main documentation` but is specialized for the GPU +:ref:`main documentation` but is specialized for the GPU build. We follow the same steps to first build the libc tools and a suitable compiler. These tools must all be up-to-date with the libc source. diff --git a/libc/docs/index.rst b/libc/docs/index.rst index 5b96987e0aada0..d089a800ab90ab 100644 --- a/libc/docs/index.rst +++ b/libc/docs/index.rst @@ -2,14 +2,16 @@ The LLVM C Library ================== -.. warning:: - The libc is not complete. 
If you need a fully functioning C library right - now, you should continue to use your standard system libraries. +.. note:: + LLVM-libc is not fully complete right now. Some programs may fail to build due + to missing functions (especially C++ ones). If you would like to help us + finish LLVM-libc, check out "Contributing to the libc project" in the sidebar + or ask on discord. Introduction ============ -The libc aspires to a unique place in the software ecosystem. The goals are: +LLVM-libc aspires to a unique place in the software ecosystem. The goals are: - Fully compliant with current C standards (C17 and upcoming C2x) and POSIX. - Easily decomposed and embedded: Supplement or replace system C library @@ -32,8 +34,9 @@ The libc aspires to a unique place in the software ecosystem. The goals are: Platform Support ================ -Most development is currently targeting x86_64 and aarch64 on Linux. Several -functions in the libc have been tested on Windows. The Fuchsia platform is +Most development is currently targeting Linux on x86_64, aarch64, arm, and +RISC-V. Embedded/baremetal targets are supported on arm and RISC-V, and Windows +and MacOS have limited support (may be broken). The Fuchsia platform is slowly replacing functions from its bundled libc with functions from this project. @@ -41,7 +44,7 @@ ABI Compatibility ================= The libc is written to be ABI independent. Interfaces are generated using -LLVM's tablegen, so supporting arbitrary ABIs is possible. In it's initial +headergen, so supporting arbitrary ABIs is possible. In it's initial stages there is no ABI stability in any form. .. 
toctree:: diff --git a/libc/docs/overlay_mode.rst b/libc/docs/overlay_mode.rst index 37368ffc1fea15..ca04c4c7674a3e 100644 --- a/libc/docs/overlay_mode.rst +++ b/libc/docs/overlay_mode.rst @@ -28,18 +28,18 @@ Also, if users choose to mix more than one libc with the system libc, then the name ``libllvmlibc.a`` makes it absolutely clear that it is the static archive of LLVM's libc. -Building the static archive with libc as a normal LLVM project --------------------------------------------------------------- +Building LLVM-libc as a standalone runtime +------------------------------------------ -We can treat the ``libc`` project as any other normal LLVM project and perform -the CMake configure step as follows: +We can treat the ``libc`` project like any other normal LLVM runtime library by +building it with the following cmake command: .. code-block:: sh $> cd llvm-project # The llvm-project checkout $> mkdir build $> cd build - $> cmake ../llvm -G Ninja -DLLVM_ENABLE_RUNTIMES="libc" \ + $> cmake ../runtimes -G Ninja -DLLVM_ENABLE_RUNTIMES="libc" \ -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_BUILD_TYPE= \ # Select build type -DCMAKE_INSTALL_PREFIX= # Optional @@ -50,24 +50,29 @@ Next, build the libc: $> ninja libc +Then, run the tests: + +.. code-block:: sh + + $> ninja check-libc + The build step will build the static archive the in the directory ``build/projects/libc/lib``. Notice that the above CMake configure step also -specified an install prefix. This is optional, but if one uses it, then they -can follow up the build step with an install step: +specified an install prefix. This is optional, but it's used, then the following +command will install the static archive to the install path: .. 
code-block:: sh - $> ninja install-llvmlibc + $> ninja install-libc Building the static archive as part of the bootstrap build ---------------------------------------------------------- The bootstrap build is a build mode in which runtime components like libc++, libcxx-abi, libc etc. are built using the ToT clang. The idea is that this build -produces an in-sync toolchain of compiler + runtime libraries. Such a synchrony -is not essential for the libc but can one still build the overlay static archive -as part of the bootstrap build if one wants to. The first step is to configure -appropriately: +produces an in-sync toolchain of compiler + runtime libraries. This ensures that +LLVM-libc has access to the latest clang features, which should provide the best +performance possible. .. code-block:: sh @@ -77,14 +82,13 @@ appropriately: -DCMAKE_BUILD_TYPE= \ # Select build type -DCMAKE_INSTALL_PREFIX= # Optional -The build and install steps are similar to the those used when configured -as a normal project. Note that the build step takes much longer this time -as ``clang`` will be built before building ``libllvmlibc.a``. +The build and install steps are the same as above, but the build step will take +much longer since ``clang`` will be built before building ``libllvmlibc.a``. .. code-block:: sh $> ninja libc - $> ninja install-llvmlibc + $> ninja check-libc Using the overlay static archive ================================ diff --git a/libc/docs/porting.rst b/libc/docs/porting.rst index ef7a2ff5cc8758..a4df4e8cf0719d 100644 --- a/libc/docs/porting.rst +++ b/libc/docs/porting.rst @@ -43,21 +43,6 @@ have their own config directory. config directory for Fuchsia as the bring up is being done in the Fuchsia source tree. -The api.td file ---------------- - -If the :ref:`fullbuild_mode` is to be supported on the new operating system, -then a file named ``api.td`` should be added in its config directory. It is -written in the -`LLVM tablegen language `_. 
-It lists all the relevant macros and type definitions we want in the -public libc header files. See the existing Linux -`api.td `_ -file as an example to prepare the ``api.td`` file for the new operating system. - -.. note:: In future, LLVM tablegen will be replaced with a different DSL to list - config information. - Architecture Subdirectory ========================= From 22d3fb182c9199ac3d51e5577c6647508a7a37f0 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Wed, 21 Aug 2024 10:52:10 -0700 Subject: [PATCH 098/426] [ctx_prof] Profile flatterner (#104539) Eventually we'll need to flatten the profile (at the end of all IPO) and lower to "vanilla" `MD_prof`. This is the first part of that. Issue #89287 --- llvm/include/llvm/Analysis/CtxProfAnalysis.h | 10 +++ llvm/lib/Analysis/CtxProfAnalysis.cpp | 40 ++++++++++++ .../Analysis/CtxProfAnalysis/full-cycle.ll | 65 ++++++++++++++++++- llvm/test/Analysis/CtxProfAnalysis/load.ll | 5 ++ 4 files changed, 117 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Analysis/CtxProfAnalysis.h b/llvm/include/llvm/Analysis/CtxProfAnalysis.h index 43587d953fc4ca..23abcbe2c6e9d2 100644 --- a/llvm/include/llvm/Analysis/CtxProfAnalysis.h +++ b/llvm/include/llvm/Analysis/CtxProfAnalysis.h @@ -9,6 +9,8 @@ #ifndef LLVM_ANALYSIS_CTXPROFANALYSIS_H #define LLVM_ANALYSIS_CTXPROFANALYSIS_H +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PassManager.h" @@ -18,6 +20,12 @@ namespace llvm { class CtxProfAnalysis; +// Setting initial capacity to 1 because all contexts must have at least 1 +// counter, and then, because all contexts belonging to a function have the same +// size, there'll be at most one other heap allocation. +using CtxProfFlatProfile = + DenseMap>; + /// The instrumented contextual profile, produced by the CtxProfAnalysis. 
class PGOContextualProfile { friend class CtxProfAnalysis; @@ -65,6 +73,8 @@ class PGOContextualProfile { return FuncInfo.find(getDefinedFunctionGUID(F))->second.NextCallsiteIndex++; } + const CtxProfFlatProfile flatten() const; + bool invalidate(Module &, const PreservedAnalyses &PA, ModuleAnalysisManager::Invalidator &) { // Check whether the analysis has been explicitly invalidated. Otherwise, diff --git a/llvm/lib/Analysis/CtxProfAnalysis.cpp b/llvm/lib/Analysis/CtxProfAnalysis.cpp index 51663196b13070..ceebb2cf06d235 100644 --- a/llvm/lib/Analysis/CtxProfAnalysis.cpp +++ b/llvm/lib/Analysis/CtxProfAnalysis.cpp @@ -184,6 +184,14 @@ PreservedAnalyses CtxProfAnalysisPrinterPass::run(Module &M, OS << "\nCurrent Profile:\n"; OS << formatv("{0:2}", JSONed); OS << "\n"; + OS << "\nFlat Profile:\n"; + auto Flat = C.flatten(); + for (const auto &[Guid, Counters] : Flat) { + OS << Guid << " : "; + for (auto V : Counters) + OS << V << " "; + OS << "\n"; + } return PreservedAnalyses::all(); } @@ -193,3 +201,35 @@ InstrProfCallsite *CtxProfAnalysis::getCallsiteInstrumentation(CallBase &CB) { return IPC; return nullptr; } + +static void +preorderVisit(const PGOCtxProfContext::CallTargetMapTy &Profiles, + function_ref Visitor) { + std::function Traverser = + [&](const auto &Ctx) { + Visitor(Ctx); + for (const auto &[_, SubCtxSet] : Ctx.callsites()) + for (const auto &[__, Subctx] : SubCtxSet) + Traverser(Subctx); + }; + for (const auto &[_, P] : Profiles) + Traverser(P); +} + +const CtxProfFlatProfile PGOContextualProfile::flatten() const { + assert(Profiles.has_value()); + CtxProfFlatProfile Flat; + preorderVisit(*Profiles, [&](const PGOCtxProfContext &Ctx) { + auto [It, Ins] = Flat.insert({Ctx.guid(), {}}); + if (Ins) { + llvm::append_range(It->second, Ctx.counters()); + return; + } + assert(It->second.size() == Ctx.counters().size() && + "All contexts corresponding to a function should have the exact " + "same number of counters."); + for (size_t I = 0, E = 
It->second.size(); I < E; ++I) + It->second[I] += Ctx.counters()[I]; + }); + return Flat; +} diff --git a/llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll b/llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll index 0cdf82bd96efcb..06ba8b3542f7d5 100644 --- a/llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll +++ b/llvm/test/Analysis/CtxProfAnalysis/full-cycle.ll @@ -4,6 +4,9 @@ ; RUN: split-file %s %t ; ; Test that the GUID metadata survives through thinlink. +; Also test that the flattener works correctly. f2 is called in 2 places, with +; different counter values, and we expect resulting flat profile to be the sum +; (of values at the same index). ; ; RUN: llvm-ctxprof-util fromJSON --input=%t/profile.json --output=%t/profile.ctxprofdata ; @@ -17,7 +20,9 @@ ; RUN: llvm-lto2 run %t/m1.bc %t/m2.bc -o %t/ -thinlto-distributed-indexes \ ; RUN: -use-ctx-profile=%t/profile.ctxprofdata \ ; RUN: -r %t/m1.bc,f1,plx \ +; RUN: -r %t/m1.bc,f3,plx \ ; RUN: -r %t/m2.bc,f1 \ +; RUN: -r %t/m2.bc,f3 \ ; RUN: -r %t/m2.bc,entrypoint,plx ; RUN: opt --passes='function-import,require,print' \ ; RUN: -summary-file=%t/m2.bc.thinlto.bc -use-ctx-profile=%t/profile.ctxprofdata %t/m2.bc \ @@ -38,6 +43,11 @@ define void @f1() #0 { ret void } +define void @f3() #0 { + call void @f2() + ret void +} + attributes #0 = { noinline } !0 = !{ i64 3087265239403591524 } @@ -48,9 +58,11 @@ target triple = "x86_64-pc-linux-gnu" source_filename = "random_path/m2.cc" declare void @f1() +declare void @f3() define void @entrypoint() { call void @f1() + call void @f3() ret void } ;--- profile.json @@ -63,7 +75,8 @@ define void @entrypoint() { [ { "Counters": [ - 10 + 10, + 7 ], "Guid": 3087265239403591524 } @@ -74,6 +87,25 @@ define void @entrypoint() { ], "Guid": 2072045998141807037 } + ], + [ + { + "Callsites": [ + [ + { + "Counters": [ + 1, + 2 + ], + "Guid": 3087265239403591524 + } + ] + ], + "Counters": [ + 2 + ], + "Guid": 4197650231481825559 + } ] ], "Counters": [ @@ -84,8 +116,9 @@ define void 
@entrypoint() { ] ;--- expected.txt Function Info: -10507721908651011566 : entrypoint. MaxCounterID: 1. MaxCallsiteID: 1 +10507721908651011566 : entrypoint. MaxCounterID: 1. MaxCallsiteID: 2 3087265239403591524 : f2.llvm.0. MaxCounterID: 1. MaxCallsiteID: 0 +4197650231481825559 : f3. MaxCounterID: 1. MaxCallsiteID: 1 2072045998141807037 : f1. MaxCounterID: 1. MaxCallsiteID: 1 Current Profile: @@ -98,7 +131,8 @@ Current Profile: [ { "Counters": [ - 10 + 10, + 7 ], "Guid": 3087265239403591524 } @@ -109,6 +143,25 @@ Current Profile: ], "Guid": 2072045998141807037 } + ], + [ + { + "Callsites": [ + [ + { + "Counters": [ + 1, + 2 + ], + "Guid": 3087265239403591524 + } + ] + ], + "Counters": [ + 2 + ], + "Guid": 4197650231481825559 + } ] ], "Counters": [ @@ -117,3 +170,9 @@ Current Profile: "Guid": 10507721908651011566 } ] + +Flat Profile: +10507721908651011566 : 1 +3087265239403591524 : 11 9 +4197650231481825559 : 2 +2072045998141807037 : 7 diff --git a/llvm/test/Analysis/CtxProfAnalysis/load.ll b/llvm/test/Analysis/CtxProfAnalysis/load.ll index 69806e334aaec9..fa09474f433151 100644 --- a/llvm/test/Analysis/CtxProfAnalysis/load.ll +++ b/llvm/test/Analysis/CtxProfAnalysis/load.ll @@ -86,6 +86,11 @@ Current Profile: "Guid": 12074870348631550642 } ] + +Flat Profile: +728453322856651412 : 6 7 +12074870348631550642 : 5 +11872291593386833696 : 1 ;--- example.ll declare void @bar() From a6bae5cb37919bb0b855dd468d4982340a5740d2 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 21 Aug 2024 19:11:02 +0100 Subject: [PATCH 099/426] [AMDGPU] Split GCNSubtarget into its own file. NFC. 
(#105525) --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 761 ----------------- llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 797 ++++++++++++++++++ .../AMDGPU/sramecc-subtarget-feature-any.ll | 6 +- .../sramecc-subtarget-feature-disabled.ll | 6 +- .../sramecc-subtarget-feature-enabled.ll | 6 +- .../AMDGPU/xnack-subtarget-feature-any.ll | 14 +- .../xnack-subtarget-feature-disabled.ll | 14 +- .../AMDGPU/xnack-subtarget-feature-enabled.ll | 14 +- 9 files changed, 828 insertions(+), 791 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 2e1bdf46924783..67d8715d3f1c26 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -17,7 +17,6 @@ #include "AMDGPULegalizerInfo.h" #include "AMDGPURegisterBankInfo.h" #include "AMDGPUTargetMachine.h" -#include "GCNSubtarget.h" #include "R600Subtarget.h" #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" @@ -36,308 +35,12 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-subtarget" -#define GET_SUBTARGETINFO_TARGET_DESC -#define GET_SUBTARGETINFO_CTOR -#define AMDGPUSubtarget GCNSubtarget -#include "AMDGPUGenSubtargetInfo.inc" -#undef AMDGPUSubtarget - -static cl::opt EnablePowerSched( - "amdgpu-enable-power-sched", - cl::desc("Enable scheduling to minimize mAI power bursts"), - cl::init(false)); - -static cl::opt EnableVGPRIndexMode( - "amdgpu-vgpr-index-mode", - cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), - cl::init(false)); - -static cl::opt UseAA("amdgpu-use-aa-in-codegen", - cl::desc("Enable the use of AA during codegen."), - cl::init(true)); - -static cl::opt NSAThreshold("amdgpu-nsa-threshold", - cl::desc("Number of addresses from which to enable MIMG NSA."), - cl::init(3), cl::Hidden); - -GCNSubtarget::~GCNSubtarget() = default; - -GCNSubtarget & 
-GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, - StringRef GPU, StringRef FS) { - // Determine default and user-specified characteristics - // - // We want to be able to turn these off, but making this a subtarget feature - // for SI has the unhelpful behavior that it unsets everything else if you - // disable it. - // - // Similarly we want enable-prt-strict-null to be on by default and not to - // unset everything else if it is disabled - - SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,"); - - // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default - if (isAmdHsaOS()) - FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,"; - - FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS - - // Disable mutually exclusive bits. - if (FS.contains_insensitive("+wavefrontsize")) { - if (!FS.contains_insensitive("wavefrontsize16")) - FullFS += "-wavefrontsize16,"; - if (!FS.contains_insensitive("wavefrontsize32")) - FullFS += "-wavefrontsize32,"; - if (!FS.contains_insensitive("wavefrontsize64")) - FullFS += "-wavefrontsize64,"; - } - - FullFS += FS; - - ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS); - - // Implement the "generic" processors, which acts as the default when no - // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to - // the first amdgcn target that supports flat addressing. Other OSes defaults - // to the first amdgcn target. - if (Gen == AMDGPUSubtarget::INVALID) { - Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS - : AMDGPUSubtarget::SOUTHERN_ISLANDS; - } - - if (!hasFeature(AMDGPU::FeatureWavefrontSize32) && - !hasFeature(AMDGPU::FeatureWavefrontSize64)) { - // If there is no default wave size it must be a generation before gfx10, - // these have FeatureWavefrontSize64 in their definition already. For gfx10+ - // set wave32 as a default. 
- ToggleFeature(AMDGPU::FeatureWavefrontSize32); - } - - // We don't support FP64 for EG/NI atm. - assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)); - - // Targets must either support 64-bit offsets for MUBUF instructions, and/or - // support flat operations, otherwise they cannot access a 64-bit global - // address space - assert(hasAddr64() || hasFlat()); - // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets - // that do not support ADDR64 variants of MUBUF instructions. Such targets - // cannot use a 64 bit offset with a MUBUF instruction to access the global - // address space - if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) { - ToggleFeature(AMDGPU::FeatureFlatForGlobal); - FlatForGlobal = true; - } - // Unless +-flat-for-global is specified, use MUBUF instructions for global - // address space access if flat operations are not available. - if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) { - ToggleFeature(AMDGPU::FeatureFlatForGlobal); - FlatForGlobal = false; - } - - // Set defaults if needed. - if (MaxPrivateElementSize == 0) - MaxPrivateElementSize = 4; - - if (LDSBankCount == 0) - LDSBankCount = 32; - - if (TT.getArch() == Triple::amdgcn) { - if (LocalMemorySize == 0) - LocalMemorySize = 32768; - - // Do something sensible for unspecified target. - if (!HasMovrel && !HasVGPRIndexMode) - HasMovrel = true; - } - - AddressableLocalMemorySize = LocalMemorySize; - - if (AMDGPU::isGFX10Plus(*this) && - !getFeatureBits().test(AMDGPU::FeatureCuMode)) - LocalMemorySize *= 2; - - // Don't crash on invalid devices. 
- if (WavefrontSizeLog2 == 0) - WavefrontSizeLog2 = 5; - - HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; - HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9; - - TargetID.setTargetIDFromFeaturesString(FS); - - LLVM_DEBUG(dbgs() << "xnack setting for subtarget: " - << TargetID.getXnackSetting() << '\n'); - LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: " - << TargetID.getSramEccSetting() << '\n'); - - return *this; -} - -void GCNSubtarget::checkSubtargetFeatures(const Function &F) const { - LLVMContext &Ctx = F.getContext(); - if (hasFeature(AMDGPU::FeatureWavefrontSize32) == - hasFeature(AMDGPU::FeatureWavefrontSize64)) { - Ctx.diagnose(DiagnosticInfoUnsupported( - F, "must specify exactly one of wavefrontsize32 and wavefrontsize64")); - } -} - AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {} bool AMDGPUSubtarget::useRealTrue16Insts() const { return hasTrue16BitInsts() && EnableRealTrue16Insts; } -GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, - const GCNTargetMachine &TM) - : // clang-format off - AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), - AMDGPUSubtarget(TT), - TargetTriple(TT), - TargetID(*this), - InstrItins(getInstrItineraryForCPU(GPU)), - InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), - TLInfo(TM, *this), - FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { - // clang-format on - MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); - EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this); - CallLoweringInfo = std::make_unique(*getTargetLowering()); - InlineAsmLoweringInfo = - std::make_unique(getTargetLowering()); - Legalizer = std::make_unique(*this, TM); - RegBankInfo = std::make_unique(*this); - InstSelector = - std::make_unique(*this, *RegBankInfo, TM); -} - -unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { - if (getGeneration() < GFX10) - return 1; - - switch (Opcode) { - case AMDGPU::V_LSHLREV_B64_e64: - 
case AMDGPU::V_LSHLREV_B64_gfx10: - case AMDGPU::V_LSHLREV_B64_e64_gfx11: - case AMDGPU::V_LSHLREV_B64_e32_gfx12: - case AMDGPU::V_LSHLREV_B64_e64_gfx12: - case AMDGPU::V_LSHL_B64_e64: - case AMDGPU::V_LSHRREV_B64_e64: - case AMDGPU::V_LSHRREV_B64_gfx10: - case AMDGPU::V_LSHRREV_B64_e64_gfx11: - case AMDGPU::V_LSHRREV_B64_e64_gfx12: - case AMDGPU::V_LSHR_B64_e64: - case AMDGPU::V_ASHRREV_I64_e64: - case AMDGPU::V_ASHRREV_I64_gfx10: - case AMDGPU::V_ASHRREV_I64_e64_gfx11: - case AMDGPU::V_ASHRREV_I64_e64_gfx12: - case AMDGPU::V_ASHR_I64_e64: - return 1; - } - - return 2; -} - -/// This list was mostly derived from experimentation. -bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const { - switch (Opcode) { - case AMDGPU::V_CVT_F16_F32_e32: - case AMDGPU::V_CVT_F16_F32_e64: - case AMDGPU::V_CVT_F16_U16_e32: - case AMDGPU::V_CVT_F16_U16_e64: - case AMDGPU::V_CVT_F16_I16_e32: - case AMDGPU::V_CVT_F16_I16_e64: - case AMDGPU::V_RCP_F16_e64: - case AMDGPU::V_RCP_F16_e32: - case AMDGPU::V_RSQ_F16_e64: - case AMDGPU::V_RSQ_F16_e32: - case AMDGPU::V_SQRT_F16_e64: - case AMDGPU::V_SQRT_F16_e32: - case AMDGPU::V_LOG_F16_e64: - case AMDGPU::V_LOG_F16_e32: - case AMDGPU::V_EXP_F16_e64: - case AMDGPU::V_EXP_F16_e32: - case AMDGPU::V_SIN_F16_e64: - case AMDGPU::V_SIN_F16_e32: - case AMDGPU::V_COS_F16_e64: - case AMDGPU::V_COS_F16_e32: - case AMDGPU::V_FLOOR_F16_e64: - case AMDGPU::V_FLOOR_F16_e32: - case AMDGPU::V_CEIL_F16_e64: - case AMDGPU::V_CEIL_F16_e32: - case AMDGPU::V_TRUNC_F16_e64: - case AMDGPU::V_TRUNC_F16_e32: - case AMDGPU::V_RNDNE_F16_e64: - case AMDGPU::V_RNDNE_F16_e32: - case AMDGPU::V_FRACT_F16_e64: - case AMDGPU::V_FRACT_F16_e32: - case AMDGPU::V_FREXP_MANT_F16_e64: - case AMDGPU::V_FREXP_MANT_F16_e32: - case AMDGPU::V_FREXP_EXP_I16_F16_e64: - case AMDGPU::V_FREXP_EXP_I16_F16_e32: - case AMDGPU::V_LDEXP_F16_e64: - case AMDGPU::V_LDEXP_F16_e32: - case AMDGPU::V_LSHLREV_B16_e64: - case AMDGPU::V_LSHLREV_B16_e32: - case AMDGPU::V_LSHRREV_B16_e64: - case 
AMDGPU::V_LSHRREV_B16_e32: - case AMDGPU::V_ASHRREV_I16_e64: - case AMDGPU::V_ASHRREV_I16_e32: - case AMDGPU::V_ADD_U16_e64: - case AMDGPU::V_ADD_U16_e32: - case AMDGPU::V_SUB_U16_e64: - case AMDGPU::V_SUB_U16_e32: - case AMDGPU::V_SUBREV_U16_e64: - case AMDGPU::V_SUBREV_U16_e32: - case AMDGPU::V_MUL_LO_U16_e64: - case AMDGPU::V_MUL_LO_U16_e32: - case AMDGPU::V_ADD_F16_e64: - case AMDGPU::V_ADD_F16_e32: - case AMDGPU::V_SUB_F16_e64: - case AMDGPU::V_SUB_F16_e32: - case AMDGPU::V_SUBREV_F16_e64: - case AMDGPU::V_SUBREV_F16_e32: - case AMDGPU::V_MUL_F16_e64: - case AMDGPU::V_MUL_F16_e32: - case AMDGPU::V_MAX_F16_e64: - case AMDGPU::V_MAX_F16_e32: - case AMDGPU::V_MIN_F16_e64: - case AMDGPU::V_MIN_F16_e32: - case AMDGPU::V_MAX_U16_e64: - case AMDGPU::V_MAX_U16_e32: - case AMDGPU::V_MIN_U16_e64: - case AMDGPU::V_MIN_U16_e32: - case AMDGPU::V_MAX_I16_e64: - case AMDGPU::V_MAX_I16_e32: - case AMDGPU::V_MIN_I16_e64: - case AMDGPU::V_MIN_I16_e32: - case AMDGPU::V_MAD_F16_e64: - case AMDGPU::V_MAD_U16_e64: - case AMDGPU::V_MAD_I16_e64: - case AMDGPU::V_FMA_F16_e64: - case AMDGPU::V_DIV_FIXUP_F16_e64: - // On gfx10, all 16-bit instructions preserve the high bits. - return getGeneration() <= AMDGPUSubtarget::GFX9; - case AMDGPU::V_MADAK_F16: - case AMDGPU::V_MADMK_F16: - case AMDGPU::V_MAC_F16_e64: - case AMDGPU::V_MAC_F16_e32: - case AMDGPU::V_FMAMK_F16: - case AMDGPU::V_FMAAK_F16: - case AMDGPU::V_FMAC_F16_e64: - case AMDGPU::V_FMAC_F16_e32: - // In gfx9, the preferred handling of the unused high 16-bits changed. Most - // instructions maintain the legacy behavior of 0ing. Some instructions - // changed to preserving the high bits. 
- return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS; - case AMDGPU::V_MAD_MIXLO_F16: - case AMDGPU::V_MAD_MIXHI_F16: - default: - return false; - } -} - // Returns the maximum per-workgroup LDS allocation size (in bytes) that still // allows the given function to achieve an occupancy of NWaves waves per // SIMD / EU, taking into account only the function's *maximum* workgroup size. @@ -650,391 +353,6 @@ AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const { : AMDGPUDwarfFlavour::Wave64; } -void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const { - // Track register pressure so the scheduler can try to decrease - // pressure once register usage is above the threshold defined by - // SIRegisterInfo::getRegPressureSetLimit() - Policy.ShouldTrackPressure = true; - - // Enabling both top down and bottom up scheduling seems to give us less - // register spills than just using one of these approaches on its own. - Policy.OnlyTopDown = false; - Policy.OnlyBottomUp = false; - - // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. - if (!enableSIScheduler()) - Policy.ShouldTrackLaneMasks = true; -} - -void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const { - if (isWave32()) { - // Fix implicit $vcc operands after MIParser has verified that they match - // the instruction definitions. 
- for (auto &MBB : MF) { - for (auto &MI : MBB) - InstrInfo.fixImplicitOperands(MI); - } - } -} - -bool GCNSubtarget::hasMadF16() const { - return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1; -} - -bool GCNSubtarget::useVGPRIndexMode() const { - return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode()); -} - -bool GCNSubtarget::useAA() const { return UseAA; } - -unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { - return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(), - getGeneration()); -} - -unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const { - return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs); -} - -unsigned -GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const { - if (getGeneration() >= AMDGPUSubtarget::GFX10) - return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs. - - if (HasFlatScratch || HasArchitectedFlatScratch) { - if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). - if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) - return 4; // FLAT_SCRATCH, VCC (in that order). - } - - if (isXNACKEnabled()) - return 4; // XNACK, VCC (in that order). - return 2; // VCC. -} - -unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { - const SIMachineFunctionInfo &MFI = *MF.getInfo(); - return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit()); -} - -unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const { - // In principle we do not need to reserve SGPR pair used for flat_scratch if - // we know flat instructions do not access the stack anywhere in the - // program. For now assume it's needed if we have flat instructions. 
- const bool KernelUsesFlatScratch = hasFlatAddressSpace(); - return getBaseReservedNumSGPRs(KernelUsesFlatScratch); -} - -unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, - unsigned NumSGPRs, - unsigned NumVGPRs) const { - unsigned Occupancy = - std::min(getMaxWavesPerEU(), - getOccupancyWithLocalMemSize(LDSSize, F)); - if (NumSGPRs) - Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs)); - if (NumVGPRs) - Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs)); - return Occupancy; -} - -unsigned GCNSubtarget::getBaseMaxNumSGPRs( - const Function &F, std::pair WavesPerEU, - unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const { - // Compute maximum number of SGPRs function can use using default/requested - // minimum number of waves per execution unit. - unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false); - unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true); - - // Check if maximum number of SGPRs was explicitly requested using - // "amdgpu-num-sgpr" attribute. - if (F.hasFnAttribute("amdgpu-num-sgpr")) { - unsigned Requested = - F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs); - - // Make sure requested value does not violate subtarget's specifications. - if (Requested && (Requested <= ReservedNumSGPRs)) - Requested = 0; - - // If more SGPRs are required to support the input user/system SGPRs, - // increase to accommodate them. - // - // FIXME: This really ends up using the requested number of SGPRs + number - // of reserved special registers in total. Theoretically you could re-use - // the last input registers for these special registers, but this would - // require a lot of complexity to deal with the weird aliasing. 
- unsigned InputNumSGPRs = PreloadedSGPRs; - if (Requested && Requested < InputNumSGPRs) - Requested = InputNumSGPRs; - - // Make sure requested value is compatible with values implied by - // default/requested minimum/maximum number of waves per execution unit. - if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false)) - Requested = 0; - if (WavesPerEU.second && - Requested && Requested < getMinNumSGPRs(WavesPerEU.second)) - Requested = 0; - - if (Requested) - MaxNumSGPRs = Requested; - } - - if (hasSGPRInitBug()) - MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; - - return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs); -} - -unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { - const Function &F = MF.getFunction(); - const SIMachineFunctionInfo &MFI = *MF.getInfo(); - return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(), - getReservedNumSGPRs(MF)); -} - -static unsigned getMaxNumPreloadedSGPRs() { - using USI = GCNUserSGPRUsageInfo; - // Max number of user SGPRs - const unsigned MaxUserSGPRs = - USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) + - USI::getNumUserSGPRForField(USI::DispatchPtrID) + - USI::getNumUserSGPRForField(USI::QueuePtrID) + - USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) + - USI::getNumUserSGPRForField(USI::DispatchIdID) + - USI::getNumUserSGPRForField(USI::FlatScratchInitID) + - USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID); - - // Max number of system SGPRs - const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX - 1 + // WorkGroupIDY - 1 + // WorkGroupIDZ - 1 + // WorkGroupInfo - 1; // private segment wave byte offset - - // Max number of synthetic SGPRs - const unsigned SyntheticSGPRs = 1; // LDSKernelId - - return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs; -} - -unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const { - return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(), - 
getReservedNumSGPRs(F)); -} - -unsigned GCNSubtarget::getBaseMaxNumVGPRs( - const Function &F, std::pair WavesPerEU) const { - // Compute maximum number of VGPRs function can use using default/requested - // minimum number of waves per execution unit. - unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); - - // Check if maximum number of VGPRs was explicitly requested using - // "amdgpu-num-vgpr" attribute. - if (F.hasFnAttribute("amdgpu-num-vgpr")) { - unsigned Requested = - F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs); - - if (hasGFX90AInsts()) - Requested *= 2; - - // Make sure requested value is compatible with values implied by - // default/requested minimum/maximum number of waves per execution unit. - if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) - Requested = 0; - if (WavesPerEU.second && - Requested && Requested < getMinNumVGPRs(WavesPerEU.second)) - Requested = 0; - - if (Requested) - MaxNumVGPRs = Requested; - } - - return MaxNumVGPRs; -} - -unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const { - return getBaseMaxNumVGPRs(F, getWavesPerEU(F)); -} - -unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { - const Function &F = MF.getFunction(); - const SIMachineFunctionInfo &MFI = *MF.getInfo(); - return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU()); -} - -void GCNSubtarget::adjustSchedDependency( - SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, - const TargetSchedModel *SchedModel) const { - if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || - !Def->isInstr() || !Use->isInstr()) - return; - - MachineInstr *DefI = Def->getInstr(); - MachineInstr *UseI = Use->getInstr(); - - if (DefI->isBundle()) { - const SIRegisterInfo *TRI = getRegisterInfo(); - auto Reg = Dep.getReg(); - MachineBasicBlock::const_instr_iterator I(DefI->getIterator()); - MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end()); - unsigned Lat = 0; - for (++I; I != E && 
I->isBundledWithPred(); ++I) { - if (I->modifiesRegister(Reg, TRI)) - Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I); - else if (Lat) - --Lat; - } - Dep.setLatency(Lat); - } else if (UseI->isBundle()) { - const SIRegisterInfo *TRI = getRegisterInfo(); - auto Reg = Dep.getReg(); - MachineBasicBlock::const_instr_iterator I(UseI->getIterator()); - MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end()); - unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI); - for (++I; I != E && I->isBundledWithPred() && Lat; ++I) { - if (I->readsRegister(Reg, TRI)) - break; - --Lat; - } - Dep.setLatency(Lat); - } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) { - // Work around the fact that SIInstrInfo::fixImplicitOperands modifies - // implicit operands which come from the MCInstrDesc, which can fool - // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit - // pseudo operands. - Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency( - DefI, DefOpIdx, UseI, UseOpIdx)); - } -} - -namespace { -struct FillMFMAShadowMutation : ScheduleDAGMutation { - const SIInstrInfo *TII; - - ScheduleDAGMI *DAG; - - FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {} - - bool isSALU(const SUnit *SU) const { - const MachineInstr *MI = SU->getInstr(); - return MI && TII->isSALU(*MI) && !MI->isTerminator(); - } - - bool isVALU(const SUnit *SU) const { - const MachineInstr *MI = SU->getInstr(); - return MI && TII->isVALU(*MI); - } - - // Link as many SALU instructions in chain as possible. Return the size - // of the chain. Links up to MaxChain instructions. 
- unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain, - SmallPtrSetImpl &Visited) const { - SmallVector Worklist({To}); - unsigned Linked = 0; - - while (!Worklist.empty() && MaxChain-- > 0) { - SUnit *SU = Worklist.pop_back_val(); - if (!Visited.insert(SU).second) - continue; - - LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From); - dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n'); - - if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From)) - if (DAG->addEdge(SU, SDep(From, SDep::Artificial))) - ++Linked; - - for (SDep &SI : From->Succs) { - SUnit *SUv = SI.getSUnit(); - if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) && - DAG->canAddEdge(SUv, SU)) - DAG->addEdge(SUv, SDep(SU, SDep::Artificial)); - } - - for (SDep &SI : SU->Succs) { - SUnit *Succ = SI.getSUnit(); - if (Succ != SU && isSALU(Succ)) - Worklist.push_back(Succ); - } - } - - return Linked; - } - - void apply(ScheduleDAGInstrs *DAGInstrs) override { - const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget(); - if (!ST.hasMAIInsts()) - return; - DAG = static_cast(DAGInstrs); - const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); - if (!TSchedModel || DAG->SUnits.empty()) - return; - - // Scan for MFMA long latency instructions and try to add a dependency - // of available SALU instructions to give them a chance to fill MFMA - // shadow. That is desirable to fill MFMA shadow with SALU instructions - // rather than VALU to prevent power consumption bursts and throttle. 
- auto LastSALU = DAG->SUnits.begin(); - auto E = DAG->SUnits.end(); - SmallPtrSet Visited; - for (SUnit &SU : DAG->SUnits) { - MachineInstr &MAI = *SU.getInstr(); - if (!TII->isMAI(MAI) || - MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || - MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64) - continue; - - unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1; - - LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU); - dbgs() << "Need " << Lat - << " instructions to cover latency.\n"); - - // Find up to Lat independent scalar instructions as early as - // possible such that they can be scheduled after this MFMA. - for ( ; Lat && LastSALU != E; ++LastSALU) { - if (Visited.count(&*LastSALU)) - continue; - - if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) || - !DAG->canAddEdge(&*LastSALU, &SU)) - continue; - - Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited); - } - } - } -}; -} // namespace - -void GCNSubtarget::getPostRAMutations( - std::vector> &Mutations) const { - Mutations.push_back(std::make_unique(&InstrInfo)); -} - -std::unique_ptr -GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const { - return EnablePowerSched ? std::make_unique(&InstrInfo) - : nullptr; -} - -unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const { - if (getGeneration() >= AMDGPUSubtarget::GFX12) - return 0; // Not MIMG encoding. 
- - if (NSAThreshold.getNumOccurrences() > 0) - return std::max(NSAThreshold.getValue(), 2u); - - int Value = MF.getFunction().getFnAttributeAsParsedInteger( - "amdgpu-nsa-threshold", -1); - if (Value > 0) - return std::max(Value, 2); - - return 3; -} - const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn) return static_cast(MF.getSubtarget()); @@ -1048,85 +366,6 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct TM.getSubtarget(F)); } -GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, - const GCNSubtarget &ST) - : ST(ST) { - const CallingConv::ID CC = F.getCallingConv(); - const bool IsKernel = - CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; - // FIXME: Should have analysis or something rather than attribute to detect - // calls. - const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); - // FIXME: This attribute is a hack, we just need an analysis on the function - // to look for allocas. - const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); - - if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)) - KernargSegmentPtr = true; - - bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); - if (IsAmdHsaOrMesa && !ST.enableFlatScratch()) - PrivateSegmentBuffer = true; - else if (ST.isMesaGfxShader(F)) - ImplicitBufferPtr = true; - - if (!AMDGPU::isGraphics(CC)) { - if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr")) - DispatchPtr = true; - - // FIXME: Can this always be disabled with < COv5? - if (!F.hasFnAttribute("amdgpu-no-queue-ptr")) - QueuePtr = true; - - if (!F.hasFnAttribute("amdgpu-no-dispatch-id")) - DispatchID = true; - } - - // TODO: This could be refined a lot. The attribute is a poor way of - // detecting calls or stack objects that may require it before argument - // lowering. 
- if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) && - (IsAmdHsaOrMesa || ST.enableFlatScratch()) && - (HasCalls || HasStackObjects || ST.enableFlatScratch()) && - !ST.flatScratchIsArchitected()) { - FlatScratchInit = true; - } - - if (hasImplicitBufferPtr()) - NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID); - - if (hasPrivateSegmentBuffer()) - NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID); - - if (hasDispatchPtr()) - NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID); - - if (hasQueuePtr()) - NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID); - - if (hasKernargSegmentPtr()) - NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID); - - if (hasDispatchID()) - NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID); - - if (hasFlatScratchInit()) - NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID); - - if (hasPrivateSegmentSize()) - NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID); -} - -void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) { - assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST)); - NumKernargPreloadSGPRs += NumSGPRs; - NumUsedUserSGPRs += NumSGPRs; -} - -unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() { - return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs; -} - SmallVector AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const { return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 85a59e01230237..18a8e917fbb71f 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -119,6 +119,7 @@ add_llvm_target(AMDGPUCodeGen GCNRegPressure.cpp GCNRewritePartialRegUses.cpp GCNSchedStrategy.cpp + GCNSubtarget.cpp GCNVOPDUtils.cpp R600AsmPrinter.cpp R600ClauseMergePass.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp new file mode 100644 index 00000000000000..b3872a6374261b --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -0,0 +1,797 @@ +//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Implements the GCN specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "GCNSubtarget.h" +#include "AMDGPUCallLowering.h" +#include "AMDGPUInstructionSelector.h" +#include "AMDGPULegalizerInfo.h" +#include "AMDGPURegisterBankInfo.h" +#include "AMDGPUTargetMachine.h" +#include "R600Subtarget.h" +#include "SIMachineFunctionInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsR600.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "gcn-subtarget" + +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#define AMDGPUSubtarget GCNSubtarget +#include "AMDGPUGenSubtargetInfo.inc" +#undef AMDGPUSubtarget + +static cl::opt + EnablePowerSched("amdgpu-enable-power-sched", + cl::desc("Enable scheduling to minimize mAI power bursts"), + cl::init(false)); + +static cl::opt EnableVGPRIndexMode( + "amdgpu-vgpr-index-mode", + cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), + cl::init(false)); + +static cl::opt UseAA("amdgpu-use-aa-in-codegen", + 
cl::desc("Enable the use of AA during codegen."), + cl::init(true)); + +static cl::opt + NSAThreshold("amdgpu-nsa-threshold", + cl::desc("Number of addresses from which to enable MIMG NSA."), + cl::init(3), cl::Hidden); + +GCNSubtarget::~GCNSubtarget() = default; + +GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, + StringRef GPU, + StringRef FS) { + // Determine default and user-specified characteristics + // + // We want to be able to turn these off, but making this a subtarget feature + // for SI has the unhelpful behavior that it unsets everything else if you + // disable it. + // + // Similarly we want enable-prt-strict-null to be on by default and not to + // unset everything else if it is disabled + + SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,"); + + // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by + // default + if (isAmdHsaOS()) + FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,"; + + FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS + + // Disable mutually exclusive bits. + if (FS.contains_insensitive("+wavefrontsize")) { + if (!FS.contains_insensitive("wavefrontsize16")) + FullFS += "-wavefrontsize16,"; + if (!FS.contains_insensitive("wavefrontsize32")) + FullFS += "-wavefrontsize32,"; + if (!FS.contains_insensitive("wavefrontsize64")) + FullFS += "-wavefrontsize64,"; + } + + FullFS += FS; + + ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS); + + // Implement the "generic" processors, which acts as the default when no + // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to + // the first amdgcn target that supports flat addressing. Other OSes defaults + // to the first amdgcn target. + if (Gen == AMDGPUSubtarget::INVALID) { + Gen = TT.getOS() == Triple::AMDHSA ? 
AMDGPUSubtarget::SEA_ISLANDS + : AMDGPUSubtarget::SOUTHERN_ISLANDS; + } + + if (!hasFeature(AMDGPU::FeatureWavefrontSize32) && + !hasFeature(AMDGPU::FeatureWavefrontSize64)) { + // If there is no default wave size it must be a generation before gfx10, + // these have FeatureWavefrontSize64 in their definition already. For gfx10+ + // set wave32 as a default. + ToggleFeature(AMDGPU::FeatureWavefrontSize32); + } + + // We don't support FP64 for EG/NI atm. + assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)); + + // Targets must either support 64-bit offsets for MUBUF instructions, and/or + // support flat operations, otherwise they cannot access a 64-bit global + // address space + assert(hasAddr64() || hasFlat()); + // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets + // that do not support ADDR64 variants of MUBUF instructions. Such targets + // cannot use a 64 bit offset with a MUBUF instruction to access the global + // address space + if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) { + ToggleFeature(AMDGPU::FeatureFlatForGlobal); + FlatForGlobal = true; + } + // Unless +-flat-for-global is specified, use MUBUF instructions for global + // address space access if flat operations are not available. + if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) { + ToggleFeature(AMDGPU::FeatureFlatForGlobal); + FlatForGlobal = false; + } + + // Set defaults if needed. + if (MaxPrivateElementSize == 0) + MaxPrivateElementSize = 4; + + if (LDSBankCount == 0) + LDSBankCount = 32; + + if (TT.getArch() == Triple::amdgcn) { + if (LocalMemorySize == 0) + LocalMemorySize = 32768; + + // Do something sensible for unspecified target. + if (!HasMovrel && !HasVGPRIndexMode) + HasMovrel = true; + } + + AddressableLocalMemorySize = LocalMemorySize; + + if (AMDGPU::isGFX10Plus(*this) && + !getFeatureBits().test(AMDGPU::FeatureCuMode)) + LocalMemorySize *= 2; + + // Don't crash on invalid devices. 
+ if (WavefrontSizeLog2 == 0) + WavefrontSizeLog2 = 5; + + HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; + HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9; + + TargetID.setTargetIDFromFeaturesString(FS); + + LLVM_DEBUG(dbgs() << "xnack setting for subtarget: " + << TargetID.getXnackSetting() << '\n'); + LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: " + << TargetID.getSramEccSetting() << '\n'); + + return *this; +} + +void GCNSubtarget::checkSubtargetFeatures(const Function &F) const { + LLVMContext &Ctx = F.getContext(); + if (hasFeature(AMDGPU::FeatureWavefrontSize32) == + hasFeature(AMDGPU::FeatureWavefrontSize64)) { + Ctx.diagnose(DiagnosticInfoUnsupported( + F, "must specify exactly one of wavefrontsize32 and wavefrontsize64")); + } +} + +GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, + const GCNTargetMachine &TM) + : // clang-format off + AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), + AMDGPUSubtarget(TT), + TargetTriple(TT), + TargetID(*this), + InstrItins(getInstrItineraryForCPU(GPU)), + InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), + TLInfo(TM, *this), + FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { + // clang-format on + MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); + EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this); + CallLoweringInfo = std::make_unique(*getTargetLowering()); + InlineAsmLoweringInfo = + std::make_unique(getTargetLowering()); + Legalizer = std::make_unique(*this, TM); + RegBankInfo = std::make_unique(*this); + InstSelector = + std::make_unique(*this, *RegBankInfo, TM); +} + +unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { + if (getGeneration() < GFX10) + return 1; + + switch (Opcode) { + case AMDGPU::V_LSHLREV_B64_e64: + case AMDGPU::V_LSHLREV_B64_gfx10: + case AMDGPU::V_LSHLREV_B64_e64_gfx11: + case AMDGPU::V_LSHLREV_B64_e32_gfx12: + case AMDGPU::V_LSHLREV_B64_e64_gfx12: + case 
AMDGPU::V_LSHL_B64_e64: + case AMDGPU::V_LSHRREV_B64_e64: + case AMDGPU::V_LSHRREV_B64_gfx10: + case AMDGPU::V_LSHRREV_B64_e64_gfx11: + case AMDGPU::V_LSHRREV_B64_e64_gfx12: + case AMDGPU::V_LSHR_B64_e64: + case AMDGPU::V_ASHRREV_I64_e64: + case AMDGPU::V_ASHRREV_I64_gfx10: + case AMDGPU::V_ASHRREV_I64_e64_gfx11: + case AMDGPU::V_ASHRREV_I64_e64_gfx12: + case AMDGPU::V_ASHR_I64_e64: + return 1; + } + + return 2; +} + +/// This list was mostly derived from experimentation. +bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const { + switch (Opcode) { + case AMDGPU::V_CVT_F16_F32_e32: + case AMDGPU::V_CVT_F16_F32_e64: + case AMDGPU::V_CVT_F16_U16_e32: + case AMDGPU::V_CVT_F16_U16_e64: + case AMDGPU::V_CVT_F16_I16_e32: + case AMDGPU::V_CVT_F16_I16_e64: + case AMDGPU::V_RCP_F16_e64: + case AMDGPU::V_RCP_F16_e32: + case AMDGPU::V_RSQ_F16_e64: + case AMDGPU::V_RSQ_F16_e32: + case AMDGPU::V_SQRT_F16_e64: + case AMDGPU::V_SQRT_F16_e32: + case AMDGPU::V_LOG_F16_e64: + case AMDGPU::V_LOG_F16_e32: + case AMDGPU::V_EXP_F16_e64: + case AMDGPU::V_EXP_F16_e32: + case AMDGPU::V_SIN_F16_e64: + case AMDGPU::V_SIN_F16_e32: + case AMDGPU::V_COS_F16_e64: + case AMDGPU::V_COS_F16_e32: + case AMDGPU::V_FLOOR_F16_e64: + case AMDGPU::V_FLOOR_F16_e32: + case AMDGPU::V_CEIL_F16_e64: + case AMDGPU::V_CEIL_F16_e32: + case AMDGPU::V_TRUNC_F16_e64: + case AMDGPU::V_TRUNC_F16_e32: + case AMDGPU::V_RNDNE_F16_e64: + case AMDGPU::V_RNDNE_F16_e32: + case AMDGPU::V_FRACT_F16_e64: + case AMDGPU::V_FRACT_F16_e32: + case AMDGPU::V_FREXP_MANT_F16_e64: + case AMDGPU::V_FREXP_MANT_F16_e32: + case AMDGPU::V_FREXP_EXP_I16_F16_e64: + case AMDGPU::V_FREXP_EXP_I16_F16_e32: + case AMDGPU::V_LDEXP_F16_e64: + case AMDGPU::V_LDEXP_F16_e32: + case AMDGPU::V_LSHLREV_B16_e64: + case AMDGPU::V_LSHLREV_B16_e32: + case AMDGPU::V_LSHRREV_B16_e64: + case AMDGPU::V_LSHRREV_B16_e32: + case AMDGPU::V_ASHRREV_I16_e64: + case AMDGPU::V_ASHRREV_I16_e32: + case AMDGPU::V_ADD_U16_e64: + case AMDGPU::V_ADD_U16_e32: + case 
AMDGPU::V_SUB_U16_e64: + case AMDGPU::V_SUB_U16_e32: + case AMDGPU::V_SUBREV_U16_e64: + case AMDGPU::V_SUBREV_U16_e32: + case AMDGPU::V_MUL_LO_U16_e64: + case AMDGPU::V_MUL_LO_U16_e32: + case AMDGPU::V_ADD_F16_e64: + case AMDGPU::V_ADD_F16_e32: + case AMDGPU::V_SUB_F16_e64: + case AMDGPU::V_SUB_F16_e32: + case AMDGPU::V_SUBREV_F16_e64: + case AMDGPU::V_SUBREV_F16_e32: + case AMDGPU::V_MUL_F16_e64: + case AMDGPU::V_MUL_F16_e32: + case AMDGPU::V_MAX_F16_e64: + case AMDGPU::V_MAX_F16_e32: + case AMDGPU::V_MIN_F16_e64: + case AMDGPU::V_MIN_F16_e32: + case AMDGPU::V_MAX_U16_e64: + case AMDGPU::V_MAX_U16_e32: + case AMDGPU::V_MIN_U16_e64: + case AMDGPU::V_MIN_U16_e32: + case AMDGPU::V_MAX_I16_e64: + case AMDGPU::V_MAX_I16_e32: + case AMDGPU::V_MIN_I16_e64: + case AMDGPU::V_MIN_I16_e32: + case AMDGPU::V_MAD_F16_e64: + case AMDGPU::V_MAD_U16_e64: + case AMDGPU::V_MAD_I16_e64: + case AMDGPU::V_FMA_F16_e64: + case AMDGPU::V_DIV_FIXUP_F16_e64: + // On gfx10, all 16-bit instructions preserve the high bits. + return getGeneration() <= AMDGPUSubtarget::GFX9; + case AMDGPU::V_MADAK_F16: + case AMDGPU::V_MADMK_F16: + case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_MAC_F16_e32: + case AMDGPU::V_FMAMK_F16: + case AMDGPU::V_FMAAK_F16: + case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_e32: + // In gfx9, the preferred handling of the unused high 16-bits changed. Most + // instructions maintain the legacy behavior of 0ing. Some instructions + // changed to preserving the high bits. 
+ return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS; + case AMDGPU::V_MAD_MIXLO_F16: + case AMDGPU::V_MAD_MIXHI_F16: + default: + return false; + } +} + +void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + unsigned NumRegionInstrs) const { + // Track register pressure so the scheduler can try to decrease + // pressure once register usage is above the threshold defined by + // SIRegisterInfo::getRegPressureSetLimit() + Policy.ShouldTrackPressure = true; + + // Enabling both top down and bottom up scheduling seems to give us less + // register spills than just using one of these approaches on its own. + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; + + // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. + if (!enableSIScheduler()) + Policy.ShouldTrackLaneMasks = true; +} + +void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const { + if (isWave32()) { + // Fix implicit $vcc operands after MIParser has verified that they match + // the instruction definitions. + for (auto &MBB : MF) { + for (auto &MI : MBB) + InstrInfo.fixImplicitOperands(MI); + } + } +} + +bool GCNSubtarget::hasMadF16() const { + return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1; +} + +bool GCNSubtarget::useVGPRIndexMode() const { + return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode()); +} + +bool GCNSubtarget::useAA() const { return UseAA; } + +unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { + return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(), + getGeneration()); +} + +unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const { + return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs); +} + +unsigned +GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const { + if (getGeneration() >= AMDGPUSubtarget::GFX10) + return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs. 
+ + if (HasFlatScratch || HasArchitectedFlatScratch) { + if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). + if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) + return 4; // FLAT_SCRATCH, VCC (in that order). + } + + if (isXNACKEnabled()) + return 4; // XNACK, VCC (in that order). + return 2; // VCC. +} + +unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { + const SIMachineFunctionInfo &MFI = *MF.getInfo(); + return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit()); +} + +unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const { + // In principle we do not need to reserve SGPR pair used for flat_scratch if + // we know flat instructions do not access the stack anywhere in the + // program. For now assume it's needed if we have flat instructions. + const bool KernelUsesFlatScratch = hasFlatAddressSpace(); + return getBaseReservedNumSGPRs(KernelUsesFlatScratch); +} + +unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, + unsigned NumSGPRs, + unsigned NumVGPRs) const { + unsigned Occupancy = + std::min(getMaxWavesPerEU(), getOccupancyWithLocalMemSize(LDSSize, F)); + if (NumSGPRs) + Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs)); + if (NumVGPRs) + Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs)); + return Occupancy; +} + +unsigned GCNSubtarget::getBaseMaxNumSGPRs( + const Function &F, std::pair WavesPerEU, + unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const { + // Compute maximum number of SGPRs function can use using default/requested + // minimum number of waves per execution unit. + unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false); + unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true); + + // Check if maximum number of SGPRs was explicitly requested using + // "amdgpu-num-sgpr" attribute. 
+ if (F.hasFnAttribute("amdgpu-num-sgpr")) { + unsigned Requested = + F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs); + + // Make sure requested value does not violate subtarget's specifications. + if (Requested && (Requested <= ReservedNumSGPRs)) + Requested = 0; + + // If more SGPRs are required to support the input user/system SGPRs, + // increase to accommodate them. + // + // FIXME: This really ends up using the requested number of SGPRs + number + // of reserved special registers in total. Theoretically you could re-use + // the last input registers for these special registers, but this would + // require a lot of complexity to deal with the weird aliasing. + unsigned InputNumSGPRs = PreloadedSGPRs; + if (Requested && Requested < InputNumSGPRs) + Requested = InputNumSGPRs; + + // Make sure requested value is compatible with values implied by + // default/requested minimum/maximum number of waves per execution unit. + if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false)) + Requested = 0; + if (WavesPerEU.second && Requested && + Requested < getMinNumSGPRs(WavesPerEU.second)) + Requested = 0; + + if (Requested) + MaxNumSGPRs = Requested; + } + + if (hasSGPRInitBug()) + MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; + + return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs); +} + +unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { + const Function &F = MF.getFunction(); + const SIMachineFunctionInfo &MFI = *MF.getInfo(); + return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(), + getReservedNumSGPRs(MF)); +} + +static unsigned getMaxNumPreloadedSGPRs() { + using USI = GCNUserSGPRUsageInfo; + // Max number of user SGPRs + const unsigned MaxUserSGPRs = + USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) + + USI::getNumUserSGPRForField(USI::DispatchPtrID) + + USI::getNumUserSGPRForField(USI::QueuePtrID) + + 
USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) + + USI::getNumUserSGPRForField(USI::DispatchIdID) + + USI::getNumUserSGPRForField(USI::FlatScratchInitID) + + USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID); + + // Max number of system SGPRs + const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX + 1 + // WorkGroupIDY + 1 + // WorkGroupIDZ + 1 + // WorkGroupInfo + 1; // private segment wave byte offset + + // Max number of synthetic SGPRs + const unsigned SyntheticSGPRs = 1; // LDSKernelId + + return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs; +} + +unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const { + return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(), + getReservedNumSGPRs(F)); +} + +unsigned GCNSubtarget::getBaseMaxNumVGPRs( + const Function &F, std::pair WavesPerEU) const { + // Compute maximum number of VGPRs function can use using default/requested + // minimum number of waves per execution unit. + unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); + + // Check if maximum number of VGPRs was explicitly requested using + // "amdgpu-num-vgpr" attribute. + if (F.hasFnAttribute("amdgpu-num-vgpr")) { + unsigned Requested = + F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs); + + if (hasGFX90AInsts()) + Requested *= 2; + + // Make sure requested value is compatible with values implied by + // default/requested minimum/maximum number of waves per execution unit. 
+ if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) + Requested = 0; + if (WavesPerEU.second && Requested && + Requested < getMinNumVGPRs(WavesPerEU.second)) + Requested = 0; + + if (Requested) + MaxNumVGPRs = Requested; + } + + return MaxNumVGPRs; +} + +unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const { + return getBaseMaxNumVGPRs(F, getWavesPerEU(F)); +} + +unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { + const Function &F = MF.getFunction(); + const SIMachineFunctionInfo &MFI = *MF.getInfo(); + return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU()); +} + +void GCNSubtarget::adjustSchedDependency( + SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, + const TargetSchedModel *SchedModel) const { + if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() || + !Use->isInstr()) + return; + + MachineInstr *DefI = Def->getInstr(); + MachineInstr *UseI = Use->getInstr(); + + if (DefI->isBundle()) { + const SIRegisterInfo *TRI = getRegisterInfo(); + auto Reg = Dep.getReg(); + MachineBasicBlock::const_instr_iterator I(DefI->getIterator()); + MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end()); + unsigned Lat = 0; + for (++I; I != E && I->isBundledWithPred(); ++I) { + if (I->modifiesRegister(Reg, TRI)) + Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I); + else if (Lat) + --Lat; + } + Dep.setLatency(Lat); + } else if (UseI->isBundle()) { + const SIRegisterInfo *TRI = getRegisterInfo(); + auto Reg = Dep.getReg(); + MachineBasicBlock::const_instr_iterator I(UseI->getIterator()); + MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end()); + unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI); + for (++I; I != E && I->isBundledWithPred() && Lat; ++I) { + if (I->readsRegister(Reg, TRI)) + break; + --Lat; + } + Dep.setLatency(Lat); + } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) { + // Work around the fact that 
SIInstrInfo::fixImplicitOperands modifies + // implicit operands which come from the MCInstrDesc, which can fool + // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit + // pseudo operands. + Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency( + DefI, DefOpIdx, UseI, UseOpIdx)); + } +} + +namespace { +struct FillMFMAShadowMutation : ScheduleDAGMutation { + const SIInstrInfo *TII; + + ScheduleDAGMI *DAG; + + FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {} + + bool isSALU(const SUnit *SU) const { + const MachineInstr *MI = SU->getInstr(); + return MI && TII->isSALU(*MI) && !MI->isTerminator(); + } + + bool isVALU(const SUnit *SU) const { + const MachineInstr *MI = SU->getInstr(); + return MI && TII->isVALU(*MI); + } + + // Link as many SALU instructions in chain as possible. Return the size + // of the chain. Links up to MaxChain instructions. + unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain, + SmallPtrSetImpl &Visited) const { + SmallVector Worklist({To}); + unsigned Linked = 0; + + while (!Worklist.empty() && MaxChain-- > 0) { + SUnit *SU = Worklist.pop_back_val(); + if (!Visited.insert(SU).second) + continue; + + LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From); + dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n'); + + if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From)) + if (DAG->addEdge(SU, SDep(From, SDep::Artificial))) + ++Linked; + + for (SDep &SI : From->Succs) { + SUnit *SUv = SI.getSUnit(); + if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) && + DAG->canAddEdge(SUv, SU)) + DAG->addEdge(SUv, SDep(SU, SDep::Artificial)); + } + + for (SDep &SI : SU->Succs) { + SUnit *Succ = SI.getSUnit(); + if (Succ != SU && isSALU(Succ)) + Worklist.push_back(Succ); + } + } + + return Linked; + } + + void apply(ScheduleDAGInstrs *DAGInstrs) override { + const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget(); + if (!ST.hasMAIInsts()) + return; + DAG = static_cast(DAGInstrs); 
+ const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); + if (!TSchedModel || DAG->SUnits.empty()) + return; + + // Scan for MFMA long latency instructions and try to add a dependency + // of available SALU instructions to give them a chance to fill MFMA + // shadow. That is desirable to fill MFMA shadow with SALU instructions + // rather than VALU to prevent power consumption bursts and throttle. + auto LastSALU = DAG->SUnits.begin(); + auto E = DAG->SUnits.end(); + SmallPtrSet Visited; + for (SUnit &SU : DAG->SUnits) { + MachineInstr &MAI = *SU.getInstr(); + if (!TII->isMAI(MAI) || + MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || + MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64) + continue; + + unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1; + + LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU); + dbgs() << "Need " << Lat + << " instructions to cover latency.\n"); + + // Find up to Lat independent scalar instructions as early as + // possible such that they can be scheduled after this MFMA. + for (; Lat && LastSALU != E; ++LastSALU) { + if (Visited.count(&*LastSALU)) + continue; + + if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) || + !DAG->canAddEdge(&*LastSALU, &SU)) + continue; + + Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited); + } + } + } +}; +} // namespace + +void GCNSubtarget::getPostRAMutations( + std::vector> &Mutations) const { + Mutations.push_back(std::make_unique(&InstrInfo)); +} + +std::unique_ptr +GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const { + return EnablePowerSched ? std::make_unique(&InstrInfo) + : nullptr; +} + +unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const { + if (getGeneration() >= AMDGPUSubtarget::GFX12) + return 0; // Not MIMG encoding. 
+ + if (NSAThreshold.getNumOccurrences() > 0) + return std::max(NSAThreshold.getValue(), 2u); + + int Value = MF.getFunction().getFnAttributeAsParsedInteger( + "amdgpu-nsa-threshold", -1); + if (Value > 0) + return std::max(Value, 2); + + return 3; +} + +GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, + const GCNSubtarget &ST) + : ST(ST) { + const CallingConv::ID CC = F.getCallingConv(); + const bool IsKernel = + CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; + // FIXME: Should have analysis or something rather than attribute to detect + // calls. + const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); + // FIXME: This attribute is a hack, we just need an analysis on the function + // to look for allocas. + const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); + + if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)) + KernargSegmentPtr = true; + + bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); + if (IsAmdHsaOrMesa && !ST.enableFlatScratch()) + PrivateSegmentBuffer = true; + else if (ST.isMesaGfxShader(F)) + ImplicitBufferPtr = true; + + if (!AMDGPU::isGraphics(CC)) { + if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr")) + DispatchPtr = true; + + // FIXME: Can this always be disabled with < COv5? + if (!F.hasFnAttribute("amdgpu-no-queue-ptr")) + QueuePtr = true; + + if (!F.hasFnAttribute("amdgpu-no-dispatch-id")) + DispatchID = true; + } + + // TODO: This could be refined a lot. The attribute is a poor way of + // detecting calls or stack objects that may require it before argument + // lowering. 
+ if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) && + (IsAmdHsaOrMesa || ST.enableFlatScratch()) && + (HasCalls || HasStackObjects || ST.enableFlatScratch()) && + !ST.flatScratchIsArchitected()) { + FlatScratchInit = true; + } + + if (hasImplicitBufferPtr()) + NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID); + + if (hasPrivateSegmentBuffer()) + NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID); + + if (hasDispatchPtr()) + NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID); + + if (hasQueuePtr()) + NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID); + + if (hasKernargSegmentPtr()) + NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID); + + if (hasDispatchID()) + NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID); + + if (hasFlatScratchInit()) + NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID); + + if (hasPrivateSegmentSize()) + NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID); +} + +void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) { + assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST)); + NumKernargPreloadSGPRs += NumSGPRs; + NumUsedUserSGPRs += NumSGPRs; +} + +unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() { + return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs; +} diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll index 331518c0c9d339..a3fed314fed243 100644 --- a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll +++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=amdgpu-subtarget -o - %s 2>&1 | 
FileCheck --check-prefix=ANY %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s ; REQUIRES: asserts diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll index 1e4e9f3e13fe2b..65b289bcd29d9a 100644 --- a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll +++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s ; REQUIRES: asserts diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll index 713b276ddedb3c..bd665eb432f481 100644 --- a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll +++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | 
FileCheck --check-prefix=ON %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s ; REQUIRES: asserts diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll index b7da3b77c96371..5aaf81d0e10e2e 100644 --- a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll @@ -1,10 +1,10 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx902 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -debug-only=gcn-subtarget -o - %s 2>&1 
| FileCheck --check-prefix=ANY %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx902 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s ; REQUIRES: asserts diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll index 23baeabc6a1bb4..4ced763abc2ac3 100644 --- a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll @@ -1,10 +1,10 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -debug-only=gcn-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=gcn-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx801 
-debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s ; REQUIRES: asserts diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll index a52c842afb291f..20354f6828f9c9 100644 --- a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll @@ -1,10 +1,10 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -debug-only=gcn-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=gcn-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s +; RUN: llc -mtriple=amdgcn 
-mcpu=gfx801 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s ; REQUIRES: asserts From 47e0212f00f707a4bb92714afe9c748116887d62 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 21 Aug 2024 18:11:13 +0000 Subject: [PATCH 100/426] [gn build] Port a6bae5cb3791 --- llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index edd5be27900cc9..006e1ed700b821 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -206,6 +206,7 @@ static_library("LLVMAMDGPUCodeGen") { "GCNRegPressure.cpp", "GCNRewritePartialRegUses.cpp", "GCNSchedStrategy.cpp", + "GCNSubtarget.cpp", "GCNVOPDUtils.cpp", "R600AsmPrinter.cpp", "R600ClauseMergePass.cpp", From c09fdac0b577ca0bfef141765d0a9ae1b6040893 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Wed, 21 Aug 2024 20:21:04 +0200 Subject: [PATCH 101/426] [Docs] Update Loop Optimization WG call. The WebEx link will become invalid soon, we are switching to Google Meet. Also, changing the cadence from biweekly to monthly. 
--- llvm/docs/GettingInvolved.rst | 2 +- llvm/docs/_static/LoopOptWG_invite.ics | 126 ++++++++++++++++--------- 2 files changed, 81 insertions(+), 47 deletions(-) diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index 646f1d09dfab0b..32d3a83738a8eb 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -150,7 +150,7 @@ what to add to your calendar invite. - Calendar link - Minutes/docs link * - Loop Optimization Working Group - - Every 2 weeks on Wednesday + - Every first Wednesday of the month - `ics <./_static/LoopOptWG_invite.ics>`__ - `Minutes/docs `__ * - RISC-V diff --git a/llvm/docs/_static/LoopOptWG_invite.ics b/llvm/docs/_static/LoopOptWG_invite.ics index 3ec76e577ab746..65597d90a9c852 100644 --- a/llvm/docs/_static/LoopOptWG_invite.ics +++ b/llvm/docs/_static/LoopOptWG_invite.ics @@ -1,46 +1,80 @@ -BEGIN:VCALENDAR -PRODID:-//Microsoft Corporation//Outlook 10.0 MIMEDIR//EN -VERSION:2.0 -METHOD:PUBLISH -BEGIN:VTIMEZONE -TZID:America/New_York -LAST-MODIFIED:20201011T015911Z -TZURL:http://tzurl.org/zoneinfo-outlook/America/New_York -X-LIC-LOCATION:America/New_York -BEGIN:DAYLIGHT -TZNAME:EDT -TZOFFSETFROM:-0500 -TZOFFSETTO:-0400 -DTSTART:19700308T020000 -RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=2SU -END:DAYLIGHT -BEGIN:STANDARD -TZNAME:EST -TZOFFSETFROM:-0400 -TZOFFSETTO:-0500 -DTSTART:19701101T020000 -RRULE:FREQ=YEARLY;BYMONTH=11;BYDAY=1SU -END:STANDARD -END:VTIMEZONE -BEGIN:VEVENT -DTSTAMP:20210908T145817Z -ORGANIZER;CN="Bardia Mahjour":MAILTO:bmahjour@ca.ibm.com -DTSTART;TZID=America/New_York:20210908T110000 -DTEND;TZID=America/New_York:20210908T120000 -LOCATION:https://ibm.webex.com/ibm/j.php?MTID=m450e0c4009445e16df43ff82ea58f7a6 -TRANSP:OPAQUE -SEQUENCE:1631113097 -UID:862486b7-998c-41f8-b7ca-0906ac06f113 -DESCRIPTION:\n\n\n\n\nJOIN WEBEX MEETING\nhttps://ibm.webex.com/ibm/j.php?MTID=m450e0c4009445e16df43ff82ea58f7a6\nMeeting number (access code): 145 067 2790\n\nMeeting password: PQduM8RxN52 (77386879 from 
phones and video systems)\n\n\n\nTAP TO JOIN FROM A MOBILE DEVICE (ATTENDEES ONLY)\n1-844-531-0958,,1450672790#77386879# tel:1-844-531-0958,,*01*1450672790%2377386879%23*01* United States Toll Free\n+1-669-234-1178,,1450672790#77386879# tel:%2B1-669-234-1178,,*01*1450672790%2377386879%23*01* United States Toll\nSome mobile devices may ask attendees to enter a numeric password.\n\n\nJOIN BY PHONE\n1-844-531-0958 United States Toll Free\n1-669-234-1178 United States Toll\n\nGlobal call-in numbers\nhttps://ibm.webex.com/ibm/globalcallin.php?MTID=mb6e6082af7e7e7fe3948dbe9ab0025cf\n\nToll-free calling restrictions\nhttps://ibm.webex.com/ibm/customer_tollfree_restrictions.pdf\n\n\nJOIN FROM A VIDEO SYSTEM OR APPLICATION\nDial sip:1450672790@ibm.webex.com\nYou can also dial 173.243.2.68 and enter your meeting number.\n\n\nJoin using Microsoft Lync or Microsoft Skype for Business\nDial sip:1450672790.ibm@lync.webex.com\n\n\n\n\n\nCan't join the meeting?\nhttps://collaborationhelp.cisco.com/article/WBX000029055\n\n\nIMPORTANT NOTICE: Please note that this Webex service allows audio and other information sent during the session to be recorded, which may be discoverable in a legal matter. By joining this session, you automatically consent to such recordings. If you do not consent to being recorded, discuss your concerns with the host or do not join the session.\n -X-ALT-DESC;FMTTYPE=text/html:\n\n\n \n \n \n \n
 
\n \n\n\n\n\n\n \n \n \n \n
\n When it's time, join the Webex meeting here.\n
\n \n \n \n \n \n
 
\n \n \n \n \n
Join meeting
\n
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 
\n More ways to join:\n
 
\n Join from the meeting link\n
\n https://ibm.webex.com/ibm/j.php?MTID=m450e0c4009445e16df43ff82ea58f7a6\n
 
\n Join by meeting number\n
\n Meeting number (access code): 145 067 2790\n
\n
Meeting password:PQduM8RxN52 (77386879 from phones and video systems)
\n\n \n\n 
Tap to join from a mobile device (attendees only)  
1-844-531-0958,,1450672790#77386879# United States Toll Free  
+1-669-234-1178,,1450672790#77386879# United States Toll  
Some mobile devices may ask attendees to enter a numeric password. 

Join by phone  
1-844-531-0958 United States Toll Free  
1-669-234-1178 United States Toll  
Global call-in numbers  |  Toll-free calling restrictions 


\n\n
 
\n\nJoin from a video system or application
Dial 1450672790@ibm.webex.com 
You can also dial 173.243.2.68 and enter your meeting number.  
 
\n\n
 
Join using Microsoft Lync or Microsoft Skype for Business
Dial 1450672790.ibm@lync.webex.com
\n\n
 
\n \n\n \n \n \n \n \n \n
 
Need help? Go to https://help.webex.com\n
 
\n
\n -SUMMARY:Loop Opt WG -PRIORITY:5 -CLASS:PUBLIC -RRULE:FREQ=WEEKLY;WKST=SU;INTERVAL=2;BYDAY=WE -BEGIN:VALARM -TRIGGER:-PT5M -ACTION:DISPLAY -DESCRIPTION:Reminder -END:VALARM -END:VEVENT -END:VCALENDAR +BEGIN:VCALENDAR +PRODID:-//Google Inc//Google Calendar 70.9054//EN +VERSION:2.0 +CALSCALE:GREGORIAN +METHOD:PUBLISH +X-WR-CALNAME:LLVM Loop Optimization Discussion +X-WR-TIMEZONE:Europe/Berlin +BEGIN:VTIMEZONE +TZID:America/New_York +X-LIC-LOCATION:America/New_York +BEGIN:DAYLIGHT +TZOFFSETFROM:-0500 +TZOFFSETTO:-0400 +TZNAME:EDT +DTSTART:19700308T020000 +RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=2SU +END:DAYLIGHT +BEGIN:STANDARD +TZOFFSETFROM:-0400 +TZOFFSETTO:-0500 +TZNAME:EST +DTSTART:19701101T020000 +RRULE:FREQ=YEARLY;BYMONTH=11;BYDAY=1SU +END:STANDARD +END:VTIMEZONE +BEGIN:VEVENT +DTSTART;TZID=America/New_York:20240904T110000 +DTEND;TZID=America/New_York:20240904T120000 +RRULE:FREQ=MONTHLY;BYDAY=1WE +DTSTAMP:20240821T160951Z +UID:58h3f0kd3aooohmeii0johh23c@google.com +X-GOOGLE-CONFERENCE:https://meet.google.com/fmz-gspu-odg +CREATED:20240821T151507Z +DESCRIPTION:LLVM Loop Optimization Discussion
Video call link:
https://meet.google.c + om/fmz-gspu-odg
Agenda/Minutes/Discussion: https://docs.google.com/document/d/1sdzoyB + 11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit?usp=sharing\n\n-::~:~::~:~:~ + :~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~::~:~::-\ + nJoin with Google Meet: https://meet.google.com/fmz-gspu-odg\nOr dial: (DE) + +49 40 8081617343 PIN: 948106286#\nMore phone numbers: https://tel.meet/fm + z-gspu-odg?pin=6273693382184&hs=7\n\nLearn more about Meet at: https://supp + ort.google.com/a/users/answer/9282720\n\nPlease do not edit this section.\n + -::~:~::~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~ + :~:~::~:~::- +LAST-MODIFIED:20240821T160941Z +SEQUENCE:0 +STATUS:CONFIRMED +SUMMARY:LLVM Loop Optimization Discussion +TRANSP:OPAQUE +END:VEVENT +BEGIN:VEVENT +DTSTART;TZID=America/New_York:20240904T110000 +DTEND;TZID=America/New_York:20240904T120000 +DTSTAMP:20240821T160951Z +UID:58h3f0kd3aooohmeii0johh23c@google.com +X-GOOGLE-CONFERENCE:https://meet.google.com/fmz-gspu-odg +RECURRENCE-ID;TZID=America/New_York:20240904T110000 +CREATED:20240821T151507Z +DESCRIPTION:LLVM Loop Optimization Discussion
Video call link: https://meet.google.c + om/fmz-gspu-odg
Agenda/Minutes/Discussion: https://docs.google.com/document/d/1sdzoyB + 11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit?usp=sharing\n\n-::~:~::~:~:~ + :~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~::~:~::-\ + nJoin with Google Meet: https://meet.google.com/fmz-gspu-odg\nOr dial: (DE) + +49 40 8081617343 PIN: 948106286#\nMore phone numbers: https://tel.meet/fm + z-gspu-odg?pin=6273693382184&hs=7\n\nLearn more about Meet at: https://supp + ort.google.com/a/users/answer/9282720\n\nPlease do not edit this section.\n + -::~:~::~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~ + :~:~::~:~::- +LAST-MODIFIED:20240821T160941Z +SEQUENCE:0 +STATUS:CONFIRMED +SUMMARY:LLVM Loop Optimization Discussion +TRANSP:OPAQUE +END:VEVENT +END:VCALENDAR From 6257a98b258a3f17b78af31bf43009a559c5dd1d Mon Sep 17 00:00:00 2001 From: Adrian Vogelsgesang Date: Wed, 21 Aug 2024 20:30:10 +0200 Subject: [PATCH 102/426] [lldb-dap] Implement `StepGranularity` for "next" and "step-in" (#105464) VS Code requests the `instruction` stepping granularity if the assembly view is currently focused. By implementing `StepGranularity`, we can hence properly single-step through assembly code. 
--- .../test/tools/lldb-dap/dap_server.py | 12 ++++--- .../test/tools/lldb-dap/lldbdap_testcase.py | 12 ++++--- .../API/tools/lldb-dap/step/TestDAP_step.py | 13 ++++++++ lldb/tools/lldb-dap/lldb-dap.cpp | 33 +++++++++++++++++-- 4 files changed, 60 insertions(+), 10 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index a324af57b61df3..874383a13e2bb6 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -816,17 +816,21 @@ def request_launch( self.wait_for_event(filter=["process", "initialized"]) return response - def request_next(self, threadId): + def request_next(self, threadId, granularity="statement"): if self.exit_status is not None: raise ValueError("request_continue called after process exited") - args_dict = {"threadId": threadId} + args_dict = {"threadId": threadId, "granularity": granularity} command_dict = {"command": "next", "type": "request", "arguments": args_dict} return self.send_recv(command_dict) - def request_stepIn(self, threadId, targetId): + def request_stepIn(self, threadId, targetId, granularity="statement"): if self.exit_status is not None: raise ValueError("request_stepIn called after process exited") - args_dict = {"threadId": threadId, "targetId": targetId} + args_dict = { + "threadId": threadId, + "targetId": targetId, + "granularity": granularity, + } command_dict = {"command": "stepIn", "type": "request", "arguments": args_dict} return self.send_recv(command_dict) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index a312a88ebd7e58..27545816f20707 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -222,14 +222,18 
@@ def set_global(self, name, value, id=None): """Set a top level global variable only.""" return self.dap_server.request_setVariable(2, name, str(value), id=id) - def stepIn(self, threadId=None, targetId=None, waitForStop=True): - self.dap_server.request_stepIn(threadId=threadId, targetId=targetId) + def stepIn( + self, threadId=None, targetId=None, waitForStop=True, granularity="statement" + ): + self.dap_server.request_stepIn( + threadId=threadId, targetId=targetId, granularity=granularity + ) if waitForStop: return self.dap_server.wait_for_stopped() return None - def stepOver(self, threadId=None, waitForStop=True): - self.dap_server.request_next(threadId=threadId) + def stepOver(self, threadId=None, waitForStop=True, granularity="statement"): + self.dap_server.request_next(threadId=threadId, granularity=granularity) if waitForStop: return self.dap_server.wait_for_stopped() return None diff --git a/lldb/test/API/tools/lldb-dap/step/TestDAP_step.py b/lldb/test/API/tools/lldb-dap/step/TestDAP_step.py index 8a1bb76340be73..42a39e3c8c080b 100644 --- a/lldb/test/API/tools/lldb-dap/step/TestDAP_step.py +++ b/lldb/test/API/tools/lldb-dap/step/TestDAP_step.py @@ -68,5 +68,18 @@ def test_step(self): self.assertEqual(x4, x3, "verify step over variable") self.assertGreater(line4, line3, "verify step over line") self.assertEqual(src1, src4, "verify step over source") + + # Step a single assembly instruction. + # Unfortunately, there is no portable way to verify the correct + # stepping behavior here, because the generated assembly code + # depends highly on the compiler, its version, the operating + # system, and many more factors. 
+ self.stepOver( + threadId=tid, waitForStop=True, granularity="instruction" + ) + self.stepIn( + threadId=tid, waitForStop=True, granularity="instruction" + ) + # only step one thread that is at the breakpoint and stop break diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index f50a6c17310739..b534a48660a5f8 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -1677,6 +1677,9 @@ void request_initialize(const llvm::json::Object &request) { body.try_emplace("supportsCompletionsRequest", true); // The debug adapter supports the disassembly request. body.try_emplace("supportsDisassembleRequest", true); + // The debug adapter supports stepping granularities (argument `granularity`) + // for the stepping requests. + body.try_emplace("supportsSteppingGranularity", true); llvm::json::Array completion_characters; completion_characters.emplace_back("."); @@ -1985,6 +1988,14 @@ void request_launch(const llvm::json::Object &request) { g_dap.SendJSON(CreateEventObject("initialized")); } +// Check if the step-granularity is `instruction` +static bool hasInstructionGranularity(const llvm::json::Object &requestArgs) { + if (std::optional value = + requestArgs.getString("granularity")) + return value == "instruction"; + return false; +} + // "NextRequest": { // "allOf": [ { "$ref": "#/definitions/Request" }, { // "type": "object", @@ -2012,6 +2023,11 @@ void request_launch(const llvm::json::Object &request) { // "threadId": { // "type": "integer", // "description": "Execute 'next' for this thread." +// }, +// "granularity": { +// "$ref": "#/definitions/SteppingGranularity", +// "description": "Stepping granularity. If no granularity is specified, a +// granularity of `statement` is assumed." 
// } // }, // "required": [ "threadId" ] @@ -2032,7 +2048,11 @@ void request_next(const llvm::json::Object &request) { // Remember the thread ID that caused the resume so we can set the // "threadCausedFocus" boolean value in the "stopped" events. g_dap.focus_tid = thread.GetThreadID(); - thread.StepOver(); + if (hasInstructionGranularity(*arguments)) { + thread.StepInstruction(/*step_over=*/true); + } else { + thread.StepOver(); + } } else { response["success"] = llvm::json::Value(false); } @@ -3193,6 +3213,11 @@ void request_stackTrace(const llvm::json::Object &request) { // "targetId": { // "type": "integer", // "description": "Optional id of the target to step into." +// }, +// "granularity": { +// "$ref": "#/definitions/SteppingGranularity", +// "description": "Stepping granularity. If no granularity is specified, a +// granularity of `statement` is assumed." // } // }, // "required": [ "threadId" ] @@ -3223,7 +3248,11 @@ void request_stepIn(const llvm::json::Object &request) { // Remember the thread ID that caused the resume so we can set the // "threadCausedFocus" boolean value in the "stopped" events. g_dap.focus_tid = thread.GetThreadID(); - thread.StepInto(step_in_target.c_str(), run_mode); + if (hasInstructionGranularity(*arguments)) { + thread.StepInstruction(/*step_over=*/false); + } else { + thread.StepInto(step_in_target.c_str(), run_mode); + } } else { response["success"] = llvm::json::Value(false); } From 8b4d4bee2a45f637fb4dcda49b592374e93a6480 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 21 Aug 2024 11:38:13 -0700 Subject: [PATCH 103/426] [NFC][ADT] Remove << operators from StringRefTest (#105500) - Remove ostream << operators for StringRef and StringRef pair from StringTest. Both of these are natively supported by googletest framework. 
--- llvm/unittests/ADT/StringRefTest.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/llvm/unittests/ADT/StringRefTest.cpp b/llvm/unittests/ADT/StringRefTest.cpp index 40351c99d0185c..a0529b03ae8c22 100644 --- a/llvm/unittests/ADT/StringRefTest.cpp +++ b/llvm/unittests/ADT/StringRefTest.cpp @@ -16,21 +16,6 @@ #include "gtest/gtest.h" using namespace llvm; -namespace llvm { - -std::ostream &operator<<(std::ostream &OS, const StringRef &S) { - OS << S.str(); - return OS; -} - -std::ostream &operator<<(std::ostream &OS, - const std::pair &P) { - OS << "(" << P.first << ", " << P.second << ")"; - return OS; -} - -} - // Check that we can't accidentally assign a temporary std::string to a // StringRef. (Unfortunately we can't make use of the same thing with // constructors.) From 89c556cfda4de346774c9fe547da6af9121dfa97 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 21 Aug 2024 13:48:50 -0500 Subject: [PATCH 104/426] [flang][OpenMP] Follow-up to build-breakage fix (#102028) Adjust the handling of a few of the new clauses. 
--- flang/lib/Lower/OpenMP/Clauses.cpp | 30 ++++++------------------------ flang/lib/Lower/OpenMP/Clauses.h | 2 +- 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index 75054204bb19db..efac7757ca5855 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -218,9 +218,9 @@ MAKE_EMPTY_CLASS(Full, Full); MAKE_EMPTY_CLASS(Inbranch, Inbranch); MAKE_EMPTY_CLASS(Mergeable, Mergeable); MAKE_EMPTY_CLASS(Nogroup, Nogroup); -// MAKE_EMPTY_CLASS(NoOpenmp, ); // missing-in-parser -// MAKE_EMPTY_CLASS(NoOpenmpRoutines, ); // missing-in-parser -// MAKE_EMPTY_CLASS(NoParallelism, ); // missing-in-parser +MAKE_EMPTY_CLASS(NoOpenmp, NoOpenmp); +MAKE_EMPTY_CLASS(NoOpenmpRoutines, NoOpenmpRoutines); +MAKE_EMPTY_CLASS(NoParallelism, NoParallelism); MAKE_EMPTY_CLASS(Notinbranch, Notinbranch); MAKE_EMPTY_CLASS(Nowait, Nowait); MAKE_EMPTY_CLASS(OmpxAttribute, OmpxAttribute); @@ -321,7 +321,6 @@ ReductionOperator makeReductionOperator(const parser::OmpReductionOperator &inp, // -------------------------------------------------------------------- // Actual clauses. Each T (where tomp::T exists in ClauseT) has its "make". 
-// Absent: missing-in-parser Absent make(const parser::OmpClause::Absent &inp, semantics::SemanticsContext &semaCtx) { llvm_unreachable("Unimplemented: absent"); @@ -450,7 +449,6 @@ Collapse make(const parser::OmpClause::Collapse &inp, // Compare: empty -// Contains: missing-in-parser Contains make(const parser::OmpClause::Contains &inp, semantics::SemanticsContext &semaCtx) { llvm_unreachable("Unimplemented: contains"); @@ -714,7 +712,6 @@ Hint make(const parser::OmpClause::Hint &inp, return Hint{/*HintExpr=*/makeExpr(inp.v, semaCtx)}; } -// Holds: missing-in-parser Holds make(const parser::OmpClause::Holds &inp, semantics::SemanticsContext &semaCtx) { llvm_unreachable("Unimplemented: holds"); @@ -897,24 +894,9 @@ Nontemporal make(const parser::OmpClause::Nontemporal &inp, return Nontemporal{/*List=*/makeList(inp.v, makeObjectFn(semaCtx))}; } -// NoOpenmp: missing-in-parser -NoOpenmp make(const parser::OmpClause::NoOpenmp &inp, - semantics::SemanticsContext &semaCtx) { - llvm_unreachable("Unimplemented: no_openmp"); -} - -// NoOpenmpRoutines: missing-in-parser -NoOpenmpRoutines make(const parser::OmpClause::NoOpenmpRoutines &inp, - semantics::SemanticsContext &semaCtx) { - llvm_unreachable("Unimplemented: no_openmp_routines"); -} - -// NoParallelism: missing-in-parser -NoParallelism make(const parser::OmpClause::NoParallelism &inp, - semantics::SemanticsContext &semaCtx) { - llvm_unreachable("Unimplemented: no_parallelism"); -} - +// NoOpenmp: empty +// NoOpenmpRoutines: empty +// NoParallelism: empty // Notinbranch: empty Novariants make(const parser::OmpClause::Novariants &inp, diff --git a/flang/lib/Lower/OpenMP/Clauses.h b/flang/lib/Lower/OpenMP/Clauses.h index c7874935d8605a..51bf0eab0f8d07 100644 --- a/flang/lib/Lower/OpenMP/Clauses.h +++ b/flang/lib/Lower/OpenMP/Clauses.h @@ -175,8 +175,8 @@ using At = tomp::clause::AtT; using Bind = tomp::clause::BindT; using Capture = tomp::clause::CaptureT; using Collapse = tomp::clause::CollapseT; -using Contains = 
tomp::clause::ContainsT; using Compare = tomp::clause::CompareT; +using Contains = tomp::clause::ContainsT; using Copyin = tomp::clause::CopyinT; using Copyprivate = tomp::clause::CopyprivateT; using Defaultmap = tomp::clause::DefaultmapT; From 6ec3130a38e6982a61e7fa74bd5223c95c0bb918 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee Date: Wed, 21 Aug 2024 12:21:43 -0700 Subject: [PATCH 105/426] [CGData] Fix tests for sed without using options (#105546) This fixes a build issue for AIX -- https://github.com/llvm/llvm-project/pull/101461. --- llvm/test/tools/llvm-cgdata/merge-archive.test | 8 ++++---- llvm/test/tools/llvm-cgdata/merge-concat.test | 6 +++--- llvm/test/tools/llvm-cgdata/merge-double.test | 8 ++++---- llvm/test/tools/llvm-cgdata/merge-single.test | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/test/tools/llvm-cgdata/merge-archive.test b/llvm/test/tools/llvm-cgdata/merge-archive.test index d70ac7c3c938d8..03eb9106b54562 100644 --- a/llvm/test/tools/llvm-cgdata/merge-archive.test +++ b/llvm/test/tools/llvm-cgdata/merge-archive.test @@ -8,13 +8,13 @@ RUN: split-file %s %t # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata. RUN: llvm-cgdata --convert --format binary %t/raw-1.cgtext -o %t/raw-1.cgdata RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt -RUN: sed -ie "s//$(cat %t/raw-1-bytes.txt)/g" %t/merge-1.ll +RUN: sed "s//$(cat %t/raw-1-bytes.txt)/g" %t/merge-1-template.ll > %t/merge-1.ll RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata. 
RUN: llvm-cgdata --convert --format binary %t/raw-2.cgtext -o %t/raw-2.cgdata RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt -RUN: sed -ie "s//$(cat %t/raw-2-bytes.txt)/g" %t/merge-2.ll +RUN: sed "s//$(cat %t/raw-2-bytes.txt)/g" %t/merge-2-template.ll > %t/merge-2.ll RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o # Make an archive from two object files @@ -66,7 +66,7 @@ TREE-NEXT: ... SuccessorIds: [ ] ... -;--- merge-1.ll +;--- merge-1-template.ll @.data = private unnamed_addr constant [72 x i8] c"", section "__DATA,__llvm_outline" @@ -86,5 +86,5 @@ TREE-NEXT: ... SuccessorIds: [ ] ... -;--- merge-2.ll +;--- merge-2-template.ll @.data = private unnamed_addr constant [72 x i8] c"", section "__DATA,__llvm_outline" diff --git a/llvm/test/tools/llvm-cgdata/merge-concat.test b/llvm/test/tools/llvm-cgdata/merge-concat.test index cc39c673cf9a5e..ac0e7a6e29e878 100644 --- a/llvm/test/tools/llvm-cgdata/merge-concat.test +++ b/llvm/test/tools/llvm-cgdata/merge-concat.test @@ -9,10 +9,10 @@ RUN: split-file %s %t # Concatenate them in merge-concat.ll RUN: llvm-cgdata --convert --format binary %t/raw-1.cgtext -o %t/raw-1.cgdata RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt -RUN: sed -ie "s//$(cat %t/raw-1-bytes.txt)/g" %t/merge-concat.ll +RUN: sed "s//$(cat %t/raw-1-bytes.txt)/g" %t/merge-concat-template.ll > %t/merge-concat-template-2.ll RUN: llvm-cgdata --convert --format binary %t/raw-2.cgtext -o %t/raw-2.cgdata RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt -RUN: sed -ie "s//$(cat %t/raw-2-bytes.txt)/g" %t/merge-concat.ll +RUN: sed "s//$(cat %t/raw-2-bytes.txt)/g" %t/merge-concat-template-2.ll > %t/merge-concat.ll RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-concat.ll -o %t/merge-concat.o RUN: llvm-cgdata 
--merge %t/merge-concat.o -o %t/merge-concat.cgdata @@ -76,7 +76,7 @@ TREE-NEXT: ... SuccessorIds: [ ] ... -;--- merge-concat.ll +;--- merge-concat-template.ll ; In an linked executable (as opposed to an object file), cgdata in __llvm_outline might be concatenated. Although this is not a typical workflow, we simply support this case to parse cgdata that is concatenated. In other words, the following two trees are encoded back-to-back in a binary format. @.data1 = private unnamed_addr constant [72 x i8] c"", section "__DATA,__llvm_outline" diff --git a/llvm/test/tools/llvm-cgdata/merge-double.test b/llvm/test/tools/llvm-cgdata/merge-double.test index 950a88c66f7bb4..1ae8064291019e 100644 --- a/llvm/test/tools/llvm-cgdata/merge-double.test +++ b/llvm/test/tools/llvm-cgdata/merge-double.test @@ -8,13 +8,13 @@ RUN: split-file %s %t # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata. RUN: llvm-cgdata --convert --format binary %t/raw-1.cgtext -o %t/raw-1.cgdata RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt -RUN: sed -ie "s//$(cat %t/raw-1-bytes.txt)/g" %t/merge-1.ll +RUN: sed "s//$(cat %t/raw-1-bytes.txt)/g" %t/merge-1-template.ll > %t/merge-1.ll RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata. RUN: llvm-cgdata --convert --format binary %t/raw-2.cgtext -o %t/raw-2.cgdata RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt -RUN: sed -ie "s//$(cat %t/raw-2-bytes.txt)/g" %t/merge-2.ll +RUN: sed "s//$(cat %t/raw-2-bytes.txt)/g" %t/merge-2-template.ll > %t/merge-2.ll RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o # Merge two object files into the codegen data file. @@ -64,7 +64,7 @@ TREE-NEXT: ... SuccessorIds: [ ] ... 
-;--- merge-1.ll +;--- merge-1-template.ll @.data = private unnamed_addr constant [72 x i8] c"", section "__DATA,__llvm_outline" ;--- raw-2.cgtext @@ -83,5 +83,5 @@ TREE-NEXT: ... SuccessorIds: [ ] ... -;--- merge-2.ll +;--- merge-2-template.ll @.data = private unnamed_addr constant [72 x i8] c"", section "__DATA,__llvm_outline" diff --git a/llvm/test/tools/llvm-cgdata/merge-single.test b/llvm/test/tools/llvm-cgdata/merge-single.test index 783c7b979f541e..47e3cb3f4f50fb 100644 --- a/llvm/test/tools/llvm-cgdata/merge-single.test +++ b/llvm/test/tools/llvm-cgdata/merge-single.test @@ -15,7 +15,7 @@ RUN: llvm-cgdata --show %t/merge-empty.cgdata | count 0 RUN: llvm-cgdata --convert --format binary %t/raw-single.cgtext -o %t/raw-single.cgdata RUN: od -t x1 -j 24 -An %t/raw-single.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-single-bytes.txt -RUN: sed -ie "s//$(cat %t/raw-single-bytes.txt)/g" %t/merge-single.ll +RUN: sed "s//$(cat %t/raw-single-bytes.txt)/g" %t/merge-single-template.ll > %t/merge-single.ll RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-single.ll -o %t/merge-single.o # Merge an object file having cgdata (__llvm_outline) @@ -45,5 +45,5 @@ CHECK-NEXT: Depth: 2 SuccessorIds: [ ] ... -;--- merge-single.ll +;--- merge-single-template.ll @.data = private unnamed_addr constant [72 x i8] c"", section "__DATA,__llvm_outline" From e31252bf54dedadfe78b36d07ea6084156faa38a Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 21 Aug 2024 11:47:00 -0700 Subject: [PATCH 106/426] [SLP]Fix PR105120: fix the order of phi nodes vectorization. The operands of the phi nodes should be vectorized in the same order, in which they were created, otherwise the compiler may crash when trying to correctly build dependency for nodes with non-schedulable instructions for gather/buildvector nodes. 
Fixes https://github.com/llvm/llvm-project/issues/105120 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 24 ++++++++++-- .../X86/phi-nodes-as-operand-reorder.ll | 38 +++++++++++++++++++ 2 files changed, 59 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-as-operand-reorder.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index dee6d688b1b903..848e0de20e7b6c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7227,6 +7227,22 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, unsigned ShuffleOrOp = S.isAltShuffle() ? (unsigned) Instruction::ShuffleVector : S.getOpcode(); + auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) { + // Postpone PHI nodes creation + SmallVector PHIOps; + for (unsigned I : seq(Operands.size())) { + ArrayRef Op = Operands[I]; + if (Op.empty()) + continue; + InstructionsState S = getSameOpcode(Op, *TLI); + if (S.getOpcode() != Instruction::PHI || S.isAltShuffle()) + buildTree_rec(Op, Depth + 1, {TE, I}); + else + PHIOps.push_back(I); + } + for (unsigned I : PHIOps) + buildTree_rec(Operands[I], Depth + 1, {TE, I}); + }; switch (ShuffleOrOp) { case Instruction::PHI: { auto *PH = cast(VL0); @@ -7238,10 +7254,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Keeps the reordered operands to avoid code duplication. 
PHIHandler Handler(*DT, PH, VL); Handler.buildOperands(); - for (unsigned I : seq(0, PH->getNumOperands())) + for (unsigned I : seq(PH->getNumOperands())) TE->setOperand(I, Handler.getOperands(I)); - for (unsigned I : seq(0, PH->getNumOperands())) - buildTree_rec(Handler.getOperands(I), Depth + 1, {TE, I}); + SmallVector> Operands(PH->getNumOperands()); + for (unsigned I : seq(PH->getNumOperands())) + Operands[I] = Handler.getOperands(I); + CreateOperandNodes(TE, Operands); return; } case Instruction::ExtractValue: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-as-operand-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-as-operand-reorder.ll new file mode 100644 index 00000000000000..51ce970bf06bc8 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-as-operand-reorder.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-99999 < %s | FileCheck %s + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP3:%.*]], %[[BB3:.*]] ] +; CHECK-NEXT: br i1 false, label %[[BB6:.*]], label %[[BB3]] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> , <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i32> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP3]] = add <2 x i32> zeroinitializer, [[TMP1]] +; CHECK-NEXT: br i1 false, label %[[BB6]], label %[[BB1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ [[TMP0]], %[[BB1]] ], [ [[TMP2]], %[[BB3]] ] +; CHECK-NEXT: ret void +; +bb: + br label %bb1 + +bb1: + %phi = phi i32 [ 0, %bb ], [ %add5, %bb3 ] + %phi2 = phi i32 [ 0, %bb ], [ %add, %bb3 ] + br i1 false, label %bb6, label %bb3 + +bb3: + %add = add i32 0, 0 + %add4 = 
add i32 0, 0 + %add5 = add i32 %phi, 0 + br i1 false, label %bb6, label %bb1 + +bb6: + %phi7 = phi i32 [ %phi2, %bb1 ], [ %add4, %bb3 ] + %phi8 = phi i32 [ %phi, %bb1 ], [ %add5, %bb3 ] + ret void +} From b765fdd997be9ff0afb6de87077cd53d5f3d349c Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 21 Aug 2024 15:23:47 -0400 Subject: [PATCH 107/426] [SLP]Try to keep scalars, used in phi nodes, if phi nodes from same block are vectorized. Before doing the vectorization of the PHI nodes, the compiler sorts them by the opcodes of the operands. If the scalar is replaced during the vectorization by extractelement, it breaks this sorting and prevent some further vectorization attempts. Patch tries to improve this by doing extra analysis of the scalars and tries to keep them, if it is found that this scalar is used in other (external) PHI node in the same block. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/103923 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 27 +++++++++- llvm/test/Transforms/SLPVectorizer/X86/phi.ll | 53 +++++++++---------- 2 files changed, 51 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 848e0de20e7b6c..8f70a43465b8ac 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10930,8 +10930,31 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { if (CanBeUsedAsScalar) { InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind); bool KeepScalar = ScalarCost <= ExtraCost; - if (KeepScalar && ScalarCost != TTI::TCC_Free && - ExtraCost - ScalarCost <= TTI::TCC_Basic) { + // Try to keep original scalar if the user is the phi node from the same + // block as the root phis, currently vectorized. It allows to keep + // better ordering info of PHIs, being vectorized currently. 
+ bool IsProfitablePHIUser = + (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic && + VectorizableTree.front()->Scalars.size() > 2)) && + VectorizableTree.front()->getOpcode() == Instruction::PHI && + !Inst->hasNUsesOrMore(UsesLimit) && + none_of(Inst->users(), + [&](User *U) { + auto *PHIUser = dyn_cast(U); + return (!PHIUser || + PHIUser->getParent() != + cast( + VectorizableTree.front()->getMainOp()) + ->getParent()) && + !getTreeEntry(U); + }) && + count_if(Entry->Scalars, [&](Value *V) { + return ValueToExtUses->contains(V); + }) <= 2; + if (IsProfitablePHIUser) { + KeepScalar = true; + } else if (KeepScalar && ScalarCost != TTI::TCC_Free && + ExtraCost - ScalarCost <= TTI::TCC_Basic) { unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) { return ValueToExtUses->contains(V); }); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll index 495a503311ab9e..96151e0bd6c418 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -136,42 +136,41 @@ for.end: ; preds = %for.body define float @foo3(ptr nocapture readonly %A) #0 { ; CHECK-LABEL: @foo3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], 
[[FOR_BODY]] ] -; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP3]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 7.000000e+00 +; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00 ; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] -; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP9]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = fmul <4 x float> [[TMP11]], -; CHECK-NEXT: [[TMP13]] = fadd <4 x float> [[TMP4]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 
[[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121 +; CHECK-NEXT: [[TMP8]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP10]], +; CHECK-NEXT: [[TMP12]] = fadd <4 x float> [[TMP3]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP13]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP13]], i32 0 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP13]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP13]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP13]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP18]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP12]], i32 0 +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP12]], i32 1 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP12]], i32 2 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP12]], i32 3 +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP17]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: From 4b35624ce0ac5b487d39880e75b5d85f4d49eec0 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 21 Aug 2024 20:34:03 +0100 Subject: [PATCH 108/426] [AArch64] Add SVE 
lowering of fixed-length UABD/SABD (#104991) --- .../Target/AArch64/AArch64ISelLowering.cpp | 2 + .../AArch64/sve-fixed-length-int-abd.ll | 183 +++++++++++ ...sve-streaming-mode-fixed-length-int-abd.ll | 292 ++++++++++++++++++ 3 files changed, 477 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/sve-fixed-length-int-abd.ll create mode 100644 llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-abd.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index dbe9413f05d013..e98b430e62389b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2055,6 +2055,8 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable(); // Lower fixed length vector operations to scalable equivalents. + setOperationAction(ISD::ABDS, VT, Default); + setOperationAction(ISD::ABDU, VT, Default); setOperationAction(ISD::ABS, VT, Default); setOperationAction(ISD::ADD, VT, Default); setOperationAction(ISD::AND, VT, Default); diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-abd.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-abd.ll new file mode 100644 index 00000000000000..08a974fa2d9f40 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-abd.ll @@ -0,0 +1,183 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 + +target triple = "aarch64-unknown-linux-gnu" + +; Don't use SVE for 128-bit vectors. 
+define void @sabd_v16i8_v16i16(ptr %a, ptr %b) #0 { +; CHECK-LABEL: sabd_v16i8_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: sabd v0.16b, v0.16b, v1.16b +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %a.ld = load <16 x i8>, ptr %a + %b.ld = load <16 x i8>, ptr %b + %a.sext = sext <16 x i8> %a.ld to <16 x i16> + %b.sext = sext <16 x i8> %b.ld to <16 x i16> + %sub = sub <16 x i16> %a.sext, %b.sext + %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %sub, i1 true) + %trunc = trunc <16 x i16> %abs to <16 x i8> + store <16 x i8> %trunc, ptr %a + ret void +} + +; Don't use SVE for 128-bit vectors. +define void @sabd_v16i8_v16i32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: sabd_v16i8_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: sabd v0.16b, v0.16b, v1.16b +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %a.ld = load <16 x i8>, ptr %a + %b.ld = load <16 x i8>, ptr %b + %a.sext = sext <16 x i8> %a.ld to <16 x i32> + %b.sext = sext <16 x i8> %b.ld to <16 x i32> + %sub = sub <16 x i32> %a.sext, %b.sext + %abs = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %sub, i1 true) + %trunc = trunc <16 x i32> %abs to <16 x i8> + store <16 x i8> %trunc, ptr %a + ret void +} + +; Don't use SVE for 128-bit vectors. 
+define void @sabd_v16i8_v16i64(ptr %a, ptr %b) #0 { +; CHECK-LABEL: sabd_v16i8_v16i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: sabd v0.16b, v0.16b, v1.16b +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %a.ld = load <16 x i8>, ptr %a + %b.ld = load <16 x i8>, ptr %b + %a.sext = sext <16 x i8> %a.ld to <16 x i64> + %b.sext = sext <16 x i8> %b.ld to <16 x i64> + %sub = sub <16 x i64> %a.sext, %b.sext + %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 true) + %trunc = trunc <16 x i64> %abs to <16 x i8> + store <16 x i8> %trunc, ptr %a + ret void +} + +define void @sabd_v32i8_v32i16(ptr %a, ptr %b) #0 { +; CHECK-LABEL: sabd_v32i8_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: sabd z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ret + %a.ld = load <32 x i8>, ptr %a + %b.ld = load <32 x i8>, ptr %b + %a.sext = sext <32 x i8> %a.ld to <32 x i16> + %b.sext = sext <32 x i8> %b.ld to <32 x i16> + %sub = sub <32 x i16> %a.sext, %b.sext + %abs = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %sub, i1 true) + %trunc = trunc <32 x i16> %abs to <32 x i8> + store <32 x i8> %trunc, ptr %a + ret void +} + +define void @uabd_v32i8_v32i16(ptr %a, ptr %b) #0 { +; CHECK-LABEL: uabd_v32i8_v32i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: uabd z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ret + %a.ld = load <32 x i8>, ptr %a + %b.ld = load <32 x i8>, ptr %b + %a.zext = zext <32 x i8> %a.ld to <32 x i16> + %b.zext = zext <32 x i8> %b.ld to <32 x i16> + %sub = sub <32 x i16> %a.zext, %b.zext + %abs = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %sub, i1 true) + %trunc = trunc <32 x i16> %abs to <32 x i8> + store <32 x i8> %trunc, ptr %a + ret void +} + +define 
void @sabd_v32i8_v32i32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: sabd_v32i8_v32i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: sabd z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ret + %a.ld = load <32 x i8>, ptr %a + %b.ld = load <32 x i8>, ptr %b + %a.sext = sext <32 x i8> %a.ld to <32 x i32> + %b.sext = sext <32 x i8> %b.ld to <32 x i32> + %sub = sub <32 x i32> %a.sext, %b.sext + %abs = call <32 x i32> @llvm.abs.v32i32(<32 x i32> %sub, i1 true) + %trunc = trunc <32 x i32> %abs to <32 x i8> + store <32 x i8> %trunc, ptr %a + ret void +} + +define void @sabd_v32i8_v32i64(ptr %a, ptr %b) #0 { +; CHECK-LABEL: sabd_v32i8_v32i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: sabd z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ret + %a.ld = load <32 x i8>, ptr %a + %b.ld = load <32 x i8>, ptr %b + %a.sext = sext <32 x i8> %a.ld to <32 x i64> + %b.sext = sext <32 x i8> %b.ld to <32 x i64> + %sub = sub <32 x i64> %a.sext, %b.sext + %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 true) + %trunc = trunc <32 x i64> %abs to <32 x i8> + store <32 x i8> %trunc, ptr %a + ret void +} + +define void @sabd_v64i8_v64i64(ptr %a, ptr %b) #0 { +; VBITS_GE_256-LABEL: sabd_v64i8_v64i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: sabd z0.b, p0/m, z0.b, z1.b +; VBITS_GE_256-NEXT: movprfx z1, z2 +; VBITS_GE_256-NEXT: sabd z1.b, p0/m, z1.b, z3.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, 
[x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: sabd_v64i8_v64i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: sabd z0.b, p0/m, z0.b, z1.b +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %a.ld = load <64 x i8>, ptr %a + %b.ld = load <64 x i8>, ptr %b + %a.sext = sext <64 x i8> %a.ld to <64 x i64> + %b.sext = sext <64 x i8> %b.ld to <64 x i64> + %sub = sub <64 x i64> %a.sext, %b.sext + %abs = call <64 x i64> @llvm.abs.v64i64(<64 x i64> %sub, i1 true) + %trunc = trunc <64 x i64> %abs to <64 x i8> + store <64 x i8> %trunc, ptr %a + ret void +} + +attributes #0 = { "target-features"="+neon,+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-abd.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-abd.ll new file mode 100644 index 00000000000000..2dd64bc7df189a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-abd.ll @@ -0,0 +1,292 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s +; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE + +target triple = "aarch64-unknown-linux-gnu" + +define void @uabd_v16i8_v16i16(ptr %a, ptr %b) { +; CHECK-LABEL: uabd_v16i8_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: uabd z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: uabd_v16i8_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: 
ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: 
eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, hi +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret + %a.ld = load <16 x i8>, ptr %a + %b.ld = load <16 x i8>, ptr %b + %a.sext = zext <16 x i8> %a.ld to <16 x i16> + %b.sext = zext <16 x i8> %b.ld to <16 x i16> + %sub = sub <16 x i16> %a.sext, %b.sext + %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %sub, i1 true) + %trunc = trunc <16 x i16> %abs to <16 x i8> + store <16 x i8> %trunc, ptr %a + ret void +} + +define void @sabd_v16i8_v16i16(ptr %a, ptr %b) { +; CHECK-LABEL: sabd_v16i8_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: sabd z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: sabd_v16i8_v16i16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! 
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; 
NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: 
csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: subs w8, w9, w8 +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ret + %a.ld = load <16 x i8>, ptr %a + %b.ld = load <16 x i8>, ptr %b + %a.sext = sext <16 x i8> %a.ld to <16 x i16> + %b.sext = sext <16 x i8> %b.ld to <16 x i16> + %sub = sub <16 x i16> %a.sext, %b.sext + %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %sub, i1 true) + %trunc = trunc <16 x i16> %abs to <16 x i8> + store <16 x i8> %trunc, ptr %a + ret void +} From 716594da176b4cbc956e7c7ab90988db6f907686 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 21 Aug 2024 12:42:44 -0700 Subject: [PATCH 109/426] [SandboxIR] Add ShuffleVectorInst (#104891) This is missing tracking for `setShuffleMask`. I'll add it in a follow-up. --- llvm/include/llvm/SandboxIR/SandboxIR.h | 503 +++++++++++++++++- .../llvm/SandboxIR/SandboxIRValues.def | 77 +-- llvm/lib/SandboxIR/SandboxIR.cpp | 74 +++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 417 +++++++++++++++ 4 files changed, 1008 insertions(+), 63 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index ca71566091bf82..01ef8013ea42a0 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -114,6 +114,7 @@ class Instruction; class SelectInst; class ExtractElementInst; class InsertElementInst; +class ShuffleVectorInst; class BranchInst; class UnaryInstruction; class LoadInst; @@ -240,31 +241,32 @@ class Value { /// order. llvm::Value *Val = nullptr; - friend class Context; // For getting `Val`. 
- friend class User; // For getting `Val`. - friend class Use; // For getting `Val`. - friend class SelectInst; // For getting `Val`. - friend class ExtractElementInst; // For getting `Val`. - friend class InsertElementInst; // For getting `Val`. - friend class BranchInst; // For getting `Val`. - friend class LoadInst; // For getting `Val`. - friend class StoreInst; // For getting `Val`. - friend class ReturnInst; // For getting `Val`. - friend class CallBase; // For getting `Val`. - friend class CallInst; // For getting `Val`. - friend class InvokeInst; // For getting `Val`. - friend class CallBrInst; // For getting `Val`. - friend class GetElementPtrInst; // For getting `Val`. - friend class CatchSwitchInst; // For getting `Val`. - friend class SwitchInst; // For getting `Val`. - friend class UnaryOperator; // For getting `Val`. - friend class BinaryOperator; // For getting `Val`. - friend class AtomicRMWInst; // For getting `Val`. - friend class AtomicCmpXchgInst; // For getting `Val`. - friend class AllocaInst; // For getting `Val`. - friend class CastInst; // For getting `Val`. - friend class PHINode; // For getting `Val`. - friend class UnreachableInst; // For getting `Val`. + friend class Context; // For getting `Val`. + friend class User; // For getting `Val`. + friend class Use; // For getting `Val`. + friend class SelectInst; // For getting `Val`. + friend class ExtractElementInst; // For getting `Val`. + friend class InsertElementInst; // For getting `Val`. + friend class ShuffleVectorInst; // For getting `Val`. + friend class BranchInst; // For getting `Val`. + friend class LoadInst; // For getting `Val`. + friend class StoreInst; // For getting `Val`. + friend class ReturnInst; // For getting `Val`. + friend class CallBase; // For getting `Val`. + friend class CallInst; // For getting `Val`. + friend class InvokeInst; // For getting `Val`. + friend class CallBrInst; // For getting `Val`. + friend class GetElementPtrInst; // For getting `Val`. 
+ friend class CatchSwitchInst; // For getting `Val`. + friend class SwitchInst; // For getting `Val`. + friend class UnaryOperator; // For getting `Val`. + friend class BinaryOperator; // For getting `Val`. + friend class AtomicRMWInst; // For getting `Val`. + friend class AtomicCmpXchgInst; // For getting `Val`. + friend class AllocaInst; // For getting `Val`. + friend class CastInst; // For getting `Val`. + friend class PHINode; // For getting `Val`. + friend class UnreachableInst; // For getting `Val`. friend class CatchSwitchAddHandler; // For `Val`. /// All values point to the context. @@ -669,6 +671,7 @@ class Instruction : public sandboxir::User { friend class SelectInst; // For getTopmostLLVMInstruction(). friend class ExtractElementInst; // For getTopmostLLVMInstruction(). friend class InsertElementInst; // For getTopmostLLVMInstruction(). + friend class ShuffleVectorInst; // For getTopmostLLVMInstruction(). friend class BranchInst; // For getTopmostLLVMInstruction(). friend class LoadInst; // For getTopmostLLVMInstruction(). friend class StoreInst; // For getTopmostLLVMInstruction(). @@ -949,6 +952,454 @@ class ExtractElementInst final } }; +class ShuffleVectorInst final + : public SingleLLVMInstructionImpl { + /// Use Context::createShuffleVectorInst() instead. 
+ ShuffleVectorInst(llvm::Instruction *I, Context &Ctx) + : SingleLLVMInstructionImpl(ClassID::ShuffleVector, Opcode::ShuffleVector, + I, Ctx) {} + friend class Context; // For accessing the constructor in create*() + +public: + static Value *create(Value *V1, Value *V2, Value *Mask, + Instruction *InsertBefore, Context &Ctx, + const Twine &Name = ""); + static Value *create(Value *V1, Value *V2, Value *Mask, + BasicBlock *InsertAtEnd, Context &Ctx, + const Twine &Name = ""); + static Value *create(Value *V1, Value *V2, ArrayRef Mask, + Instruction *InsertBefore, Context &Ctx, + const Twine &Name = ""); + static Value *create(Value *V1, Value *V2, ArrayRef Mask, + BasicBlock *InsertAtEnd, Context &Ctx, + const Twine &Name = ""); + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::ShuffleVector; + } + + /// Swap the operands and adjust the mask to preserve the semantics of the + /// instruction. + void commute() { cast(Val)->commute(); } + + /// Return true if a shufflevector instruction can be formed with the + /// specified operands. + static bool isValidOperands(const Value *V1, const Value *V2, + const Value *Mask) { + return llvm::ShuffleVectorInst::isValidOperands(V1->Val, V2->Val, + Mask->Val); + } + static bool isValidOperands(const Value *V1, const Value *V2, + ArrayRef Mask) { + return llvm::ShuffleVectorInst::isValidOperands(V1->Val, V2->Val, Mask); + } + + /// Overload to return most specific vector type. + VectorType *getType() const { + return cast(Val)->getType(); + } + + /// Return the shuffle mask value of this instruction for the given element + /// index. Return PoisonMaskElem if the element is undef. + int getMaskValue(unsigned Elt) const { + return cast(Val)->getMaskValue(Elt); + } + + /// Convert the input shuffle mask operand to a vector of integers. Undefined + /// elements of the mask are returned as PoisonMaskElem. 
+ static void getShuffleMask(const Constant *Mask, + SmallVectorImpl &Result) { + llvm::ShuffleVectorInst::getShuffleMask(cast(Mask->Val), + Result); + } + + /// Return the mask for this instruction as a vector of integers. Undefined + /// elements of the mask are returned as PoisonMaskElem. + void getShuffleMask(SmallVectorImpl &Result) const { + cast(Val)->getShuffleMask(Result); + } + + /// Return the mask for this instruction, for use in bitcode. + Constant *getShuffleMaskForBitcode() const; + + static Constant *convertShuffleMaskForBitcode(ArrayRef Mask, + Type *ResultTy, Context &Ctx); + + void setShuffleMask(ArrayRef Mask) { + cast(Val)->setShuffleMask(Mask); + } + + ArrayRef getShuffleMask() const { + return cast(Val)->getShuffleMask(); + } + + /// Return true if this shuffle returns a vector with a different number of + /// elements than its source vectors. + /// Examples: shufflevector <4 x n> A, <4 x n> B, <1,2,3> + /// shufflevector <4 x n> A, <4 x n> B, <1,2,3,4,5> + bool changesLength() const { + return cast(Val)->changesLength(); + } + + /// Return true if this shuffle returns a vector with a greater number of + /// elements than its source vectors. + /// Example: shufflevector <2 x n> A, <2 x n> B, <1,2,3> + bool increasesLength() const { + return cast(Val)->increasesLength(); + } + + /// Return true if this shuffle mask chooses elements from exactly one source + /// vector. + /// Example: <7,5,undef,7> + /// This assumes that vector operands (of length \p NumSrcElts) are the same + /// length as the mask. + static bool isSingleSourceMask(ArrayRef Mask, int NumSrcElts) { + return llvm::ShuffleVectorInst::isSingleSourceMask(Mask, NumSrcElts); + } + static bool isSingleSourceMask(const Constant *Mask, int NumSrcElts) { + return llvm::ShuffleVectorInst::isSingleSourceMask( + cast(Mask->Val), NumSrcElts); + } + + /// Return true if this shuffle chooses elements from exactly one source + /// vector without changing the length of that vector. 
+ /// Example: shufflevector <4 x n> A, <4 x n> B, <3,0,undef,3> + bool isSingleSource() const { + return cast(Val)->isSingleSource(); + } + + /// Return true if this shuffle mask chooses elements from exactly one source + /// vector without lane crossings. A shuffle using this mask is not + /// necessarily a no-op because it may change the number of elements from its + /// input vectors or it may provide demanded bits knowledge via undef lanes. + /// Example: + static bool isIdentityMask(ArrayRef Mask, int NumSrcElts) { + return llvm::ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts); + } + static bool isIdentityMask(const Constant *Mask, int NumSrcElts) { + return llvm::ShuffleVectorInst::isIdentityMask( + cast(Mask->Val), NumSrcElts); + } + + /// Return true if this shuffle chooses elements from exactly one source + /// vector without lane crossings and does not change the number of elements + /// from its input vectors. + /// Example: shufflevector <4 x n> A, <4 x n> B, <4,undef,6,undef> + bool isIdentity() const { + return cast(Val)->isIdentity(); + } + + /// Return true if this shuffle lengthens exactly one source vector with + /// undefs in the high elements. + bool isIdentityWithPadding() const { + return cast(Val)->isIdentityWithPadding(); + } + + /// Return true if this shuffle extracts the first N elements of exactly one + /// source vector. + bool isIdentityWithExtract() const { + return cast(Val)->isIdentityWithExtract(); + } + + /// Return true if this shuffle concatenates its 2 source vectors. This + /// returns false if either input is undefined. In that case, the shuffle is + /// is better classified as an identity with padding operation. + bool isConcat() const { + return cast(Val)->isConcat(); + } + + /// Return true if this shuffle mask chooses elements from its source vectors + /// without lane crossings. A shuffle using this mask would be + /// equivalent to a vector select with a constant condition operand. 
+ /// Example: <4,1,6,undef> + /// This returns false if the mask does not choose from both input vectors. + /// In that case, the shuffle is better classified as an identity shuffle. + /// This assumes that vector operands are the same length as the mask + /// (a length-changing shuffle can never be equivalent to a vector select). + static bool isSelectMask(ArrayRef Mask, int NumSrcElts) { + return llvm::ShuffleVectorInst::isSelectMask(Mask, NumSrcElts); + } + static bool isSelectMask(const Constant *Mask, int NumSrcElts) { + return llvm::ShuffleVectorInst::isSelectMask( + cast(Mask->Val), NumSrcElts); + } + + /// Return true if this shuffle chooses elements from its source vectors + /// without lane crossings and all operands have the same number of elements. + /// In other words, this shuffle is equivalent to a vector select with a + /// constant condition operand. + /// Example: shufflevector <4 x n> A, <4 x n> B, + /// This returns false if the mask does not choose from both input vectors. + /// In that case, the shuffle is better classified as an identity shuffle. + bool isSelect() const { + return cast(Val)->isSelect(); + } + + /// Return true if this shuffle mask swaps the order of elements from exactly + /// one source vector. + /// Example: <7,6,undef,4> + /// This assumes that vector operands (of length \p NumSrcElts) are the same + /// length as the mask. + static bool isReverseMask(ArrayRef Mask, int NumSrcElts) { + return llvm::ShuffleVectorInst::isReverseMask(Mask, NumSrcElts); + } + static bool isReverseMask(const Constant *Mask, int NumSrcElts) { + return llvm::ShuffleVectorInst::isReverseMask( + cast(Mask->Val), NumSrcElts); + } + + /// Return true if this shuffle swaps the order of elements from exactly + /// one source vector. 
+ /// Example: shufflevector <4 x n> A, <4 x n> B, <3,undef,1,undef> + bool isReverse() const { + return cast(Val)->isReverse(); + } + + /// Return true if this shuffle mask chooses all elements with the same value + /// as the first element of exactly one source vector. + /// Example: <4,undef,undef,4> + /// This assumes that vector operands (of length \p NumSrcElts) are the same + /// length as the mask. + static bool isZeroEltSplatMask(ArrayRef Mask, int NumSrcElts) { + return llvm::ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts); + } + static bool isZeroEltSplatMask(const Constant *Mask, int NumSrcElts) { + return llvm::ShuffleVectorInst::isZeroEltSplatMask( + cast(Mask->Val), NumSrcElts); + } + + /// Return true if all elements of this shuffle are the same value as the + /// first element of exactly one source vector without changing the length + /// of that vector. + /// Example: shufflevector <4 x n> A, <4 x n> B, + bool isZeroEltSplat() const { + return cast(Val)->isZeroEltSplat(); + } + + /// Return true if this shuffle mask is a transpose mask. + /// Transpose vector masks transpose a 2xn matrix. They read corresponding + /// even- or odd-numbered vector elements from two n-dimensional source + /// vectors and write each result into consecutive elements of an + /// n-dimensional destination vector. Two shuffles are necessary to complete + /// the transpose, one for the even elements and another for the odd elements. + /// This description closely follows how the TRN1 and TRN2 AArch64 + /// instructions operate. 
+ /// + /// For example, a simple 2x2 matrix can be transposed with: + /// + /// ; Original matrix + /// m0 = < a, b > + /// m1 = < c, d > + /// + /// ; Transposed matrix + /// t0 = < a, c > = shufflevector m0, m1, < 0, 2 > + /// t1 = < b, d > = shufflevector m0, m1, < 1, 3 > + /// + /// For matrices having greater than n columns, the resulting nx2 transposed + /// matrix is stored in two result vectors such that one vector contains + /// interleaved elements from all the even-numbered rows and the other vector + /// contains interleaved elements from all the odd-numbered rows. For example, + /// a 2x4 matrix can be transposed with: + /// + /// ; Original matrix + /// m0 = < a, b, c, d > + /// m1 = < e, f, g, h > + /// + /// ; Transposed matrix + /// t0 = < a, e, c, g > = shufflevector m0, m1 < 0, 4, 2, 6 > + /// t1 = < b, f, d, h > = shufflevector m0, m1 < 1, 5, 3, 7 > + static bool isTransposeMask(ArrayRef Mask, int NumSrcElts) { + return llvm::ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts); + } + static bool isTransposeMask(const Constant *Mask, int NumSrcElts) { + return llvm::ShuffleVectorInst::isTransposeMask( + cast(Mask->Val), NumSrcElts); + } + + /// Return true if this shuffle transposes the elements of its inputs without + /// changing the length of the vectors. This operation may also be known as a + /// merge or interleave. See the description for isTransposeMask() for the + /// exact specification. + /// Example: shufflevector <4 x n> A, <4 x n> B, <0,4,2,6> + bool isTranspose() const { + return cast(Val)->isTranspose(); + } + + /// Return true if this shuffle mask is a splice mask, concatenating the two + /// inputs together and then extracts an original width vector starting from + /// the splice index. + /// Example: shufflevector <4 x n> A, <4 x n> B, <1,2,3,4> + /// This assumes that vector operands (of length \p NumSrcElts) are the same + /// length as the mask. 
+ static bool isSpliceMask(ArrayRef Mask, int NumSrcElts, int &Index) { + return llvm::ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index); + } + static bool isSpliceMask(const Constant *Mask, int NumSrcElts, int &Index) { + return llvm::ShuffleVectorInst::isSpliceMask( + cast(Mask->Val), NumSrcElts, Index); + } + + /// Return true if this shuffle splices two inputs without changing the length + /// of the vectors. This operation concatenates the two inputs together and + /// then extracts an original width vector starting from the splice index. + /// Example: shufflevector <4 x n> A, <4 x n> B, <1,2,3,4> + bool isSplice(int &Index) const { + return cast(Val)->isSplice(Index); + } + + /// Return true if this shuffle mask is an extract subvector mask. + /// A valid extract subvector mask returns a smaller vector from a single + /// source operand. The base extraction index is returned as well. + static bool isExtractSubvectorMask(ArrayRef Mask, int NumSrcElts, + int &Index) { + return llvm::ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, + Index); + } + static bool isExtractSubvectorMask(const Constant *Mask, int NumSrcElts, + int &Index) { + return llvm::ShuffleVectorInst::isExtractSubvectorMask( + cast(Mask->Val), NumSrcElts, Index); + } + + /// Return true if this shuffle mask is an extract subvector mask. + bool isExtractSubvectorMask(int &Index) const { + return cast(Val)->isExtractSubvectorMask(Index); + } + + /// Return true if this shuffle mask is an insert subvector mask. + /// A valid insert subvector mask inserts the lowest elements of a second + /// source operand into an in-place first source operand. + /// Both the sub vector width and the insertion index is returned. 
+ static bool isInsertSubvectorMask(ArrayRef Mask, int NumSrcElts, + int &NumSubElts, int &Index) { + return llvm::ShuffleVectorInst::isInsertSubvectorMask(Mask, NumSrcElts, + NumSubElts, Index); + } + static bool isInsertSubvectorMask(const Constant *Mask, int NumSrcElts, + int &NumSubElts, int &Index) { + return llvm::ShuffleVectorInst::isInsertSubvectorMask( + cast(Mask->Val), NumSrcElts, NumSubElts, Index); + } + + /// Return true if this shuffle mask is an insert subvector mask. + bool isInsertSubvectorMask(int &NumSubElts, int &Index) const { + return cast(Val)->isInsertSubvectorMask(NumSubElts, + Index); + } + + /// Return true if this shuffle mask replicates each of the \p VF elements + /// in a vector \p ReplicationFactor times. + /// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is: + /// <0,0,0,1,1,1,2,2,2,3,3,3> + static bool isReplicationMask(ArrayRef Mask, int &ReplicationFactor, + int &VF) { + return llvm::ShuffleVectorInst::isReplicationMask(Mask, ReplicationFactor, + VF); + } + static bool isReplicationMask(const Constant *Mask, int &ReplicationFactor, + int &VF) { + return llvm::ShuffleVectorInst::isReplicationMask( + cast(Mask->Val), ReplicationFactor, VF); + } + + /// Return true if this shuffle mask is a replication mask. + bool isReplicationMask(int &ReplicationFactor, int &VF) const { + return cast(Val)->isReplicationMask( + ReplicationFactor, VF); + } + + /// Return true if this shuffle mask represents "clustered" mask of size VF, + /// i.e. each index between [0..VF) is used exactly once in each submask of + /// size VF. + /// For example, the mask for \p VF=4 is: + /// 0, 1, 2, 3, 3, 2, 0, 1 - "clustered", because each submask of size 4 + /// (0,1,2,3 and 3,2,0,1) uses indices [0..VF) exactly one time. + /// 0, 1, 2, 3, 3, 3, 1, 0 - not "clustered", because + /// element 3 is used twice in the second submask + /// (3,3,1,0) and index 2 is not used at all. 
+ static bool isOneUseSingleSourceMask(ArrayRef Mask, int VF) { + return llvm::ShuffleVectorInst::isOneUseSingleSourceMask(Mask, VF); + } + + /// Return true if this shuffle mask is a one-use-single-source("clustered") + /// mask. + bool isOneUseSingleSourceMask(int VF) const { + return cast(Val)->isOneUseSingleSourceMask(VF); + } + + /// Change values in a shuffle permute mask assuming the two vector operands + /// of length InVecNumElts have swapped position. + static void commuteShuffleMask(MutableArrayRef Mask, + unsigned InVecNumElts) { + llvm::ShuffleVectorInst::commuteShuffleMask(Mask, InVecNumElts); + } + + /// Return if this shuffle interleaves its two input vectors together. + bool isInterleave(unsigned Factor) const { + return cast(Val)->isInterleave(Factor); + } + + /// Return true if the mask interleaves one or more input vectors together. + /// + /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...> + /// E.g. For a Factor of 2 (LaneLen=4): + /// <0, 4, 1, 5, 2, 6, 3, 7> + /// E.g. For a Factor of 3 (LaneLen=4): + /// <4, 0, 9, 5, 1, 10, 6, 2, 11, 7, 3, 12> + /// E.g. For a Factor of 4 (LaneLen=2): + /// <0, 2, 6, 4, 1, 3, 7, 5> + /// + /// NumInputElts is the total number of elements in the input vectors. + /// + /// StartIndexes are the first indexes of each vector being interleaved, + /// substituting any indexes that were undef + /// E.g. 
<4, -1, 2, 5, 1, 3> (Factor=3): StartIndexes=<4, 0, 2> + /// + /// Note that this does not check if the input vectors are consecutive: + /// It will return true for masks such as + /// <0, 4, 6, 1, 5, 7> (Factor=3, LaneLen=2) + static bool isInterleaveMask(ArrayRef Mask, unsigned Factor, + unsigned NumInputElts, + SmallVectorImpl &StartIndexes) { + return llvm::ShuffleVectorInst::isInterleaveMask(Mask, Factor, NumInputElts, + StartIndexes); + } + static bool isInterleaveMask(ArrayRef Mask, unsigned Factor, + unsigned NumInputElts) { + return llvm::ShuffleVectorInst::isInterleaveMask(Mask, Factor, + NumInputElts); + } + + /// Check if the mask is a DE-interleave mask of the given factor + /// \p Factor like: + /// + static bool isDeInterleaveMaskOfFactor(ArrayRef Mask, unsigned Factor, + unsigned &Index) { + return llvm::ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor, + Index); + } + static bool isDeInterleaveMaskOfFactor(ArrayRef Mask, unsigned Factor) { + return llvm::ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor); + } + + /// Checks if the shuffle is a bit rotation of the first operand across + /// multiple subelements, e.g: + /// + /// shuffle <8 x i8> %a, <8 x i8> poison, <8 x i32> <1, 0, 3, 2, 5, 4, 7, 6> + /// + /// could be expressed as + /// + /// rotl <4 x i16> %a, 8 + /// + /// If it can be expressed as a rotation, returns the number of subelements to + /// group by in NumSubElts and the number of bits to rotate left in RotateAmt. + static bool isBitRotateMask(ArrayRef Mask, unsigned EltSizeInBits, + unsigned MinSubElts, unsigned MaxSubElts, + unsigned &NumSubElts, unsigned &RotateAmt) { + return llvm::ShuffleVectorInst::isBitRotateMask( + Mask, EltSizeInBits, MinSubElts, MaxSubElts, NumSubElts, RotateAmt); + } +}; + class BranchInst : public SingleLLVMInstructionImpl { /// Use Context::createBranchInst(). Don't call the constructor directly. 
BranchInst(llvm::BranchInst *BI, Context &Ctx) @@ -2280,6 +2731,8 @@ class Context { friend InsertElementInst; // For createInsertElementInst() ExtractElementInst *createExtractElementInst(llvm::ExtractElementInst *EEI); friend ExtractElementInst; // For createExtractElementInst() + ShuffleVectorInst *createShuffleVectorInst(llvm::ShuffleVectorInst *SVI); + friend ShuffleVectorInst; // For createShuffleVectorInst() BranchInst *createBranchInst(llvm::BranchInst *I); friend BranchInst; // For createBranchInst() LoadInst *createLoadInst(llvm::LoadInst *LI); diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index 402b6f3324a222..56720f564a7cae 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -33,46 +33,47 @@ DEF_USER(ConstantInt, ConstantInt) #define OPCODES(...) #endif // clang-format off -// ClassID, Opcode(s), Class -DEF_INSTR(Opaque, OP(Opaque), OpaqueInst) +// ClassID, Opcode(s), Class +DEF_INSTR(Opaque, OP(Opaque), OpaqueInst) DEF_INSTR(ExtractElement, OP(ExtractElement), ExtractElementInst) -DEF_INSTR(InsertElement, OP(InsertElement), InsertElementInst) -DEF_INSTR(Select, OP(Select), SelectInst) -DEF_INSTR(Br, OP(Br), BranchInst) -DEF_INSTR(Load, OP(Load), LoadInst) -DEF_INSTR(Store, OP(Store), StoreInst) -DEF_INSTR(Ret, OP(Ret), ReturnInst) -DEF_INSTR(Call, OP(Call), CallInst) -DEF_INSTR(Invoke, OP(Invoke), InvokeInst) -DEF_INSTR(CallBr, OP(CallBr), CallBrInst) -DEF_INSTR(GetElementPtr, OP(GetElementPtr), GetElementPtrInst) -DEF_INSTR(CatchSwitch, OP(CatchSwitch), CatchSwitchInst) -DEF_INSTR(Switch, OP(Switch), SwitchInst) -DEF_INSTR(UnOp, OPCODES( \ - OP(FNeg) \ - ), UnaryOperator) +DEF_INSTR(InsertElement, OP(InsertElement), InsertElementInst) +DEF_INSTR(ShuffleVector, OP(ShuffleVector), ShuffleVectorInst) +DEF_INSTR(Select, OP(Select), SelectInst) +DEF_INSTR(Br, OP(Br), BranchInst) +DEF_INSTR(Load, OP(Load), LoadInst) 
+DEF_INSTR(Store, OP(Store), StoreInst) +DEF_INSTR(Ret, OP(Ret), ReturnInst) +DEF_INSTR(Call, OP(Call), CallInst) +DEF_INSTR(Invoke, OP(Invoke), InvokeInst) +DEF_INSTR(CallBr, OP(CallBr), CallBrInst) +DEF_INSTR(GetElementPtr, OP(GetElementPtr), GetElementPtrInst) +DEF_INSTR(CatchSwitch, OP(CatchSwitch), CatchSwitchInst) +DEF_INSTR(Switch, OP(Switch), SwitchInst) +DEF_INSTR(UnOp, OPCODES( \ + OP(FNeg) \ + ), UnaryOperator) DEF_INSTR(BinaryOperator, OPCODES(\ - OP(Add) \ - OP(FAdd) \ - OP(Sub) \ - OP(FSub) \ - OP(Mul) \ - OP(FMul) \ - OP(UDiv) \ - OP(SDiv) \ - OP(FDiv) \ - OP(URem) \ - OP(SRem) \ - OP(FRem) \ - OP(Shl) \ - OP(LShr) \ - OP(AShr) \ - OP(And) \ - OP(Or) \ - OP(Xor) \ - ), BinaryOperator) -DEF_INSTR(AtomicRMW, OP(AtomicRMW), AtomicRMWInst) -DEF_INSTR(AtomicCmpXchg, OP(AtomicCmpXchg), AtomicCmpXchgInst) + OP(Add) \ + OP(FAdd) \ + OP(Sub) \ + OP(FSub) \ + OP(Mul) \ + OP(FMul) \ + OP(UDiv) \ + OP(SDiv) \ + OP(FDiv) \ + OP(URem) \ + OP(SRem) \ + OP(FRem) \ + OP(Shl) \ + OP(LShr) \ + OP(AShr) \ + OP(And) \ + OP(Or) \ + OP(Xor) \ + ), BinaryOperator) +DEF_INSTR(AtomicRMW, OP(AtomicRMW), AtomicRMWInst) +DEF_INSTR(AtomicCmpXchg, OP(AtomicCmpXchg), AtomicCmpXchgInst) DEF_INSTR(Alloca, OP(Alloca), AllocaInst) DEF_INSTR(Cast, OPCODES(\ OP(ZExt) \ diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 5b170cee20c940..a62c879b91e8b9 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -1818,6 +1818,67 @@ Value *ExtractElementInst::create(Value *Vec, Value *Idx, return Ctx.getOrCreateConstant(cast(NewV)); } +Value *ShuffleVectorInst::create(Value *V1, Value *V2, Value *Mask, + Instruction *InsertBefore, Context &Ctx, + const Twine &Name) { + auto &Builder = Ctx.getLLVMIRBuilder(); + Builder.SetInsertPoint(InsertBefore->getTopmostLLVMInstruction()); + llvm::Value *NewV = + Builder.CreateShuffleVector(V1->Val, V2->Val, Mask->Val, Name); + if (auto *NewShuffle = dyn_cast(NewV)) + return 
Ctx.createShuffleVectorInst(NewShuffle); + assert(isa(NewV) && "Expected constant"); + return Ctx.getOrCreateConstant(cast(NewV)); +} + +Value *ShuffleVectorInst::create(Value *V1, Value *V2, Value *Mask, + BasicBlock *InsertAtEnd, Context &Ctx, + const Twine &Name) { + auto &Builder = Ctx.getLLVMIRBuilder(); + Builder.SetInsertPoint(cast(InsertAtEnd->Val)); + llvm::Value *NewV = + Builder.CreateShuffleVector(V1->Val, V2->Val, Mask->Val, Name); + if (auto *NewShuffle = dyn_cast(NewV)) + return Ctx.createShuffleVectorInst(NewShuffle); + assert(isa(NewV) && "Expected constant"); + return Ctx.getOrCreateConstant(cast(NewV)); +} + +Value *ShuffleVectorInst::create(Value *V1, Value *V2, ArrayRef Mask, + Instruction *InsertBefore, Context &Ctx, + const Twine &Name) { + auto &Builder = Ctx.getLLVMIRBuilder(); + Builder.SetInsertPoint(InsertBefore->getTopmostLLVMInstruction()); + llvm::Value *NewV = Builder.CreateShuffleVector(V1->Val, V2->Val, Mask, Name); + if (auto *NewShuffle = dyn_cast(NewV)) + return Ctx.createShuffleVectorInst(NewShuffle); + assert(isa(NewV) && "Expected constant"); + return Ctx.getOrCreateConstant(cast(NewV)); +} + +Value *ShuffleVectorInst::create(Value *V1, Value *V2, ArrayRef Mask, + BasicBlock *InsertAtEnd, Context &Ctx, + const Twine &Name) { + auto &Builder = Ctx.getLLVMIRBuilder(); + Builder.SetInsertPoint(cast(InsertAtEnd->Val)); + llvm::Value *NewV = Builder.CreateShuffleVector(V1->Val, V2->Val, Mask, Name); + if (auto *NewShuffle = dyn_cast(NewV)) + return Ctx.createShuffleVectorInst(NewShuffle); + assert(isa(NewV) && "Expected constant"); + return Ctx.getOrCreateConstant(cast(NewV)); +} + +Constant *ShuffleVectorInst::getShuffleMaskForBitcode() const { + return Ctx.getOrCreateConstant( + cast(Val)->getShuffleMaskForBitcode()); +} + +Constant *ShuffleVectorInst::convertShuffleMaskForBitcode( + llvm::ArrayRef Mask, llvm::Type *ResultTy, Context &Ctx) { + return Ctx.getOrCreateConstant( + 
llvm::ShuffleVectorInst::convertShuffleMaskForBitcode(Mask, ResultTy)); +} + #ifndef NDEBUG void Constant::dumpOS(raw_ostream &OS) const { dumpCommonPrefix(OS); @@ -1957,6 +2018,12 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { new InsertElementInst(LLVMIns, *this)); return It->second.get(); } + case llvm::Instruction::ShuffleVector: { + auto *LLVMIns = cast(LLVMV); + It->second = std::unique_ptr( + new ShuffleVectorInst(LLVMIns, *this)); + return It->second.get(); + } case llvm::Instruction::Br: { auto *LLVMBr = cast(LLVMV); It->second = std::unique_ptr(new BranchInst(LLVMBr, *this)); @@ -2121,6 +2188,13 @@ Context::createInsertElementInst(llvm::InsertElementInst *IEI) { return cast(registerValue(std::move(NewPtr))); } +ShuffleVectorInst * +Context::createShuffleVectorInst(llvm::ShuffleVectorInst *SVI) { + auto NewPtr = + std::unique_ptr(new ShuffleVectorInst(SVI, *this)); + return cast(registerValue(std::move(NewPtr))); +} + BranchInst *Context::createBranchInst(llvm::BranchInst *BI) { auto NewPtr = std::unique_ptr(new BranchInst(BI, *this)); return cast(registerValue(std::move(NewPtr))); diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 712865fd07cd7b..94d8ac27be3bc8 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -15,6 +15,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Module.h" #include "llvm/Support/SourceMgr.h" +#include "gmock/gmock-matchers.h" #include "gtest/gtest.h" using namespace llvm; @@ -739,6 +740,422 @@ define void @foo(i8 %v0, i8 %v1, <2 x i8> %vec) { llvm::InsertElementInst::isValidOperands(LLVMArg0, LLVMArgVec, LLVMZero)); } +TEST_F(SandboxIRTest, ShuffleVectorInst) { + parseIR(C, R"IR( +define void @foo(<2 x i8> %v1, <2 x i8> %v2) { + %shuf = shufflevector <2 x i8> %v1, <2 x i8> %v2, <2 x i32> + %extr = extractelement <2 x i8> , i32 0 + ret void +} +)IR"); + Function &LLVMF = 
*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto &F = *Ctx.createFunction(&LLVMF); + auto *ArgV1 = F.getArg(0); + auto *ArgV2 = F.getArg(1); + auto *BB = &*F.begin(); + auto It = BB->begin(); + auto *SVI = cast(&*It++); + auto *EEI = cast(&*It++); + auto *Ret = &*It++; + + EXPECT_EQ(SVI->getOpcode(), sandboxir::Instruction::Opcode::ShuffleVector); + EXPECT_EQ(SVI->getOperand(0), ArgV1); + EXPECT_EQ(SVI->getOperand(1), ArgV2); + + // In order to test all the methods we need masks of different lengths, so we + // can't simply reuse one of the instructions created above. This helper + // creates a new `shufflevector %v1, %2, ` with the given mask indices. + auto CreateShuffleWithMask = [&](auto &&...Indices) { + SmallVector Mask = {Indices...}; + return cast( + sandboxir::ShuffleVectorInst::create(ArgV1, ArgV2, Mask, Ret, Ctx)); + }; + + // create (InsertBefore) + auto *NewI1 = + cast(sandboxir::ShuffleVectorInst::create( + ArgV1, ArgV2, ArrayRef({0, 2, 1, 3}), Ret, Ctx, + "NewShuffleBeforeRet")); + EXPECT_EQ(NewI1->getOperand(0), ArgV1); + EXPECT_EQ(NewI1->getOperand(1), ArgV2); + EXPECT_EQ(NewI1->getNextNode(), Ret); +#ifndef NDEBUG + EXPECT_EQ(NewI1->getName(), "NewShuffleBeforeRet"); +#endif + + // create (InsertAtEnd) + auto *NewI2 = + cast(sandboxir::ShuffleVectorInst::create( + ArgV1, ArgV2, ArrayRef({0, 1}), BB, Ctx, "NewShuffleAtEndOfBB")); + EXPECT_EQ(NewI2->getPrevNode(), Ret); + + // Test the path that creates a folded constant. We're currently using an + // extractelement instruction with a constant operand in the textual IR above + // to obtain a constant vector to work with. + // TODO: Refactor this once sandboxir::ConstantVector lands. 
+ auto *ShouldBeConstant = sandboxir::ShuffleVectorInst::create( + EEI->getOperand(0), EEI->getOperand(0), ArrayRef({0, 3}), BB, Ctx); + EXPECT_TRUE(isa(ShouldBeConstant)); + + // isValidOperands + auto *LLVMArgV1 = LLVMF.getArg(0); + auto *LLVMArgV2 = LLVMF.getArg(1); + ArrayRef Mask({1, 2}); + EXPECT_EQ( + sandboxir::ShuffleVectorInst::isValidOperands(ArgV1, ArgV2, Mask), + llvm::ShuffleVectorInst::isValidOperands(LLVMArgV1, LLVMArgV2, Mask)); + EXPECT_EQ(sandboxir::ShuffleVectorInst::isValidOperands(ArgV1, ArgV1, ArgV1), + llvm::ShuffleVectorInst::isValidOperands(LLVMArgV1, LLVMArgV1, + LLVMArgV1)); + + // commute + { + auto *I = CreateShuffleWithMask(0, 2); + I->commute(); + EXPECT_EQ(I->getOperand(0), ArgV2); + EXPECT_EQ(I->getOperand(1), ArgV1); + EXPECT_THAT(I->getShuffleMask(), + testing::ContainerEq(ArrayRef({2, 0}))); + } + + // getType + EXPECT_EQ(SVI->getType(), ArgV1->getType()); + + // getMaskValue + EXPECT_EQ(SVI->getMaskValue(0), 0); + EXPECT_EQ(SVI->getMaskValue(1), 2); + + // getShuffleMask / getShuffleMaskForBitcode + { + EXPECT_THAT(SVI->getShuffleMask(), + testing::ContainerEq(ArrayRef({0, 2}))); + + SmallVector Result; + SVI->getShuffleMask(Result); + EXPECT_THAT(Result, testing::ContainerEq(ArrayRef({0, 2}))); + + Result.clear(); + sandboxir::ShuffleVectorInst::getShuffleMask( + SVI->getShuffleMaskForBitcode(), Result); + EXPECT_THAT(Result, testing::ContainerEq(ArrayRef({0, 2}))); + } + + // convertShuffleMaskForBitcode + { + auto *C = sandboxir::ShuffleVectorInst::convertShuffleMaskForBitcode( + ArrayRef({2, 3}), ArgV1->getType(), Ctx); + SmallVector Result; + sandboxir::ShuffleVectorInst::getShuffleMask(C, Result); + EXPECT_THAT(Result, testing::ContainerEq(ArrayRef({2, 3}))); + } + + // setShuffleMask + { + auto *I = CreateShuffleWithMask(0, 1); + I->setShuffleMask(ArrayRef({2, 3})); + EXPECT_THAT(I->getShuffleMask(), + testing::ContainerEq(ArrayRef({2, 3}))); + } + + // The following functions check different mask properties. 
Note that most + // of these come in three different flavors: a method that checks the mask + // in the current instructions and two static member functions that check + // a mask given as an ArrayRef or Constant*, so there's quite a bit of + // repetition in order to check all of them. + + // changesLength / increasesLength + { + auto *I = CreateShuffleWithMask(1); + EXPECT_TRUE(I->changesLength()); + EXPECT_FALSE(I->increasesLength()); + } + { + auto *I = CreateShuffleWithMask(1, 1); + EXPECT_FALSE(I->changesLength()); + EXPECT_FALSE(I->increasesLength()); + } + { + auto *I = CreateShuffleWithMask(1, 1, 1); + EXPECT_TRUE(I->changesLength()); + EXPECT_TRUE(I->increasesLength()); + } + + // isSingleSource / isSingleSourceMask + { + auto *I = CreateShuffleWithMask(0, 1); + EXPECT_TRUE(I->isSingleSource()); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isSingleSourceMask( + I->getShuffleMaskForBitcode(), 2)); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isSingleSourceMask( + I->getShuffleMask(), 2)); + } + { + auto *I = CreateShuffleWithMask(0, 2); + EXPECT_FALSE(I->isSingleSource()); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isSingleSourceMask( + I->getShuffleMaskForBitcode(), 2)); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isSingleSourceMask( + I->getShuffleMask(), 2)); + } + + // isIdentity / isIdentityMask + { + auto *I = CreateShuffleWithMask(0, 1); + EXPECT_TRUE(I->isIdentity()); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isIdentityMask( + I->getShuffleMaskForBitcode(), 2)); + EXPECT_TRUE( + sandboxir::ShuffleVectorInst::isIdentityMask(I->getShuffleMask(), 2)); + } + { + auto *I = CreateShuffleWithMask(1, 0); + EXPECT_FALSE(I->isIdentity()); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isIdentityMask( + I->getShuffleMaskForBitcode(), 2)); + EXPECT_FALSE( + sandboxir::ShuffleVectorInst::isIdentityMask(I->getShuffleMask(), 2)); + } + + // isIdentityWithPadding + EXPECT_TRUE(CreateShuffleWithMask(0, 1, -1, -1)->isIdentityWithPadding()); + 
EXPECT_FALSE(CreateShuffleWithMask(0, 1)->isIdentityWithPadding()); + + // isIdentityWithExtract + EXPECT_TRUE(CreateShuffleWithMask(0)->isIdentityWithExtract()); + EXPECT_FALSE(CreateShuffleWithMask(0, 1)->isIdentityWithExtract()); + EXPECT_FALSE(CreateShuffleWithMask(0, 1, 2)->isIdentityWithExtract()); + EXPECT_FALSE(CreateShuffleWithMask(1)->isIdentityWithExtract()); + + // isConcat + EXPECT_TRUE(CreateShuffleWithMask(0, 1, 2, 3)->isConcat()); + EXPECT_FALSE(CreateShuffleWithMask(0, 3)->isConcat()); + + // isSelect / isSelectMask + { + auto *I = CreateShuffleWithMask(0, 3); + EXPECT_TRUE(I->isSelect()); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isSelectMask( + I->getShuffleMaskForBitcode(), 2)); + EXPECT_TRUE( + sandboxir::ShuffleVectorInst::isSelectMask(I->getShuffleMask(), 2)); + } + { + auto *I = CreateShuffleWithMask(0, 2); + EXPECT_FALSE(I->isSelect()); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isSelectMask( + I->getShuffleMaskForBitcode(), 2)); + EXPECT_FALSE( + sandboxir::ShuffleVectorInst::isSelectMask(I->getShuffleMask(), 2)); + } + + // isReverse / isReverseMask + { + auto *I = CreateShuffleWithMask(1, 0); + EXPECT_TRUE(I->isReverse()); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isReverseMask( + I->getShuffleMaskForBitcode(), 2)); + EXPECT_TRUE( + sandboxir::ShuffleVectorInst::isReverseMask(I->getShuffleMask(), 2)); + } + { + auto *I = CreateShuffleWithMask(1, 2); + EXPECT_FALSE(I->isReverse()); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isReverseMask( + I->getShuffleMaskForBitcode(), 2)); + EXPECT_FALSE( + sandboxir::ShuffleVectorInst::isReverseMask(I->getShuffleMask(), 2)); + } + + // isZeroEltSplat / isZeroEltSplatMask + { + auto *I = CreateShuffleWithMask(0, 0); + EXPECT_TRUE(I->isZeroEltSplat()); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isZeroEltSplatMask( + I->getShuffleMaskForBitcode(), 2)); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isZeroEltSplatMask( + I->getShuffleMask(), 2)); + } + { + auto *I = CreateShuffleWithMask(1, 1); + 
EXPECT_FALSE(I->isZeroEltSplat()); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isZeroEltSplatMask( + I->getShuffleMaskForBitcode(), 2)); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isZeroEltSplatMask( + I->getShuffleMask(), 2)); + } + + // isTranspose / isTransposeMask + { + auto *I = CreateShuffleWithMask(0, 2); + EXPECT_TRUE(I->isTranspose()); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isTransposeMask( + I->getShuffleMaskForBitcode(), 2)); + EXPECT_TRUE( + sandboxir::ShuffleVectorInst::isTransposeMask(I->getShuffleMask(), 2)); + } + { + auto *I = CreateShuffleWithMask(1, 1); + EXPECT_FALSE(I->isTranspose()); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isTransposeMask( + I->getShuffleMaskForBitcode(), 2)); + EXPECT_FALSE( + sandboxir::ShuffleVectorInst::isTransposeMask(I->getShuffleMask(), 2)); + } + + // isSplice / isSpliceMask + { + auto *I = CreateShuffleWithMask(1, 2); + int Index; + EXPECT_TRUE(I->isSplice(Index)); + EXPECT_EQ(Index, 1); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isSpliceMask( + I->getShuffleMaskForBitcode(), 2, Index)); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isSpliceMask(I->getShuffleMask(), + 2, Index)); + } + { + auto *I = CreateShuffleWithMask(2, 1); + int Index; + EXPECT_FALSE(I->isSplice(Index)); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isSpliceMask( + I->getShuffleMaskForBitcode(), 2, Index)); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isSpliceMask(I->getShuffleMask(), + 2, Index)); + } + + // isExtractSubvectorMask + { + auto *I = CreateShuffleWithMask(1); + int Index; + EXPECT_TRUE(I->isExtractSubvectorMask(Index)); + EXPECT_EQ(Index, 1); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isExtractSubvectorMask( + I->getShuffleMaskForBitcode(), 2, Index)); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isExtractSubvectorMask( + I->getShuffleMask(), 2, Index)); + } + { + auto *I = CreateShuffleWithMask(1, 2); + int Index; + EXPECT_FALSE(I->isExtractSubvectorMask(Index)); + 
EXPECT_FALSE(sandboxir::ShuffleVectorInst::isExtractSubvectorMask( + I->getShuffleMaskForBitcode(), 2, Index)); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isExtractSubvectorMask( + I->getShuffleMask(), 2, Index)); + } + + // isInsertSubvectorMask + { + auto *I = CreateShuffleWithMask(0, 2); + int NumSubElts, Index; + EXPECT_TRUE(I->isInsertSubvectorMask(NumSubElts, Index)); + EXPECT_EQ(Index, 1); + EXPECT_EQ(NumSubElts, 1); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isInsertSubvectorMask( + I->getShuffleMaskForBitcode(), 2, NumSubElts, Index)); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isInsertSubvectorMask( + I->getShuffleMask(), 2, NumSubElts, Index)); + } + { + auto *I = CreateShuffleWithMask(0, 1); + int NumSubElts, Index; + EXPECT_FALSE(I->isInsertSubvectorMask(NumSubElts, Index)); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isInsertSubvectorMask( + I->getShuffleMaskForBitcode(), 2, NumSubElts, Index)); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isInsertSubvectorMask( + I->getShuffleMask(), 2, NumSubElts, Index)); + } + + // isReplicationMask + { + auto *I = CreateShuffleWithMask(0, 0, 0, 1, 1, 1); + int ReplicationFactor, VF; + EXPECT_TRUE(I->isReplicationMask(ReplicationFactor, VF)); + EXPECT_EQ(ReplicationFactor, 3); + EXPECT_EQ(VF, 2); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isReplicationMask( + I->getShuffleMaskForBitcode(), ReplicationFactor, VF)); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isReplicationMask( + I->getShuffleMask(), ReplicationFactor, VF)); + } + { + auto *I = CreateShuffleWithMask(1, 2); + int ReplicationFactor, VF; + EXPECT_FALSE(I->isReplicationMask(ReplicationFactor, VF)); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isReplicationMask( + I->getShuffleMaskForBitcode(), ReplicationFactor, VF)); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isReplicationMask( + I->getShuffleMask(), ReplicationFactor, VF)); + } + + // isOneUseSingleSourceMask + { + auto *I = CreateShuffleWithMask(0, 1, 1, 0); + 
EXPECT_TRUE(I->isOneUseSingleSourceMask(2)); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isOneUseSingleSourceMask( + I->getShuffleMask(), 2)); + } + { + auto *I = CreateShuffleWithMask(0, 1, 0, 0); + EXPECT_FALSE(I->isOneUseSingleSourceMask(2)); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isOneUseSingleSourceMask( + I->getShuffleMask(), 2)); + } + + // commuteShuffleMask + { + SmallVector M = {0, 2, 1, 3}; + ShuffleVectorInst::commuteShuffleMask(M, 2); + EXPECT_THAT(M, testing::ContainerEq(ArrayRef({2, 0, 3, 1}))); + } + + // isInterleave / isInterleaveMask + { + auto *I = CreateShuffleWithMask(0, 2, 1, 3); + EXPECT_TRUE(I->isInterleave(2)); + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isInterleaveMask( + I->getShuffleMask(), 2, 4)); + SmallVector StartIndexes; + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isInterleaveMask( + I->getShuffleMask(), 2, 4, StartIndexes)); + EXPECT_THAT(StartIndexes, testing::ContainerEq(ArrayRef({0, 2}))); + } + { + auto *I = CreateShuffleWithMask(0, 3, 1, 2); + EXPECT_FALSE(I->isInterleave(2)); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isInterleaveMask( + I->getShuffleMask(), 2, 4)); + } + + // isDeInterleaveMaskOfFactor + { + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isDeInterleaveMaskOfFactor( + ArrayRef({0, 2}), 2)); + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isDeInterleaveMaskOfFactor( + ArrayRef({0, 1}), 2)); + + unsigned Index; + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isDeInterleaveMaskOfFactor( + ArrayRef({1, 3}), 2, Index)); + EXPECT_EQ(Index, 1u); + } + + // isBitRotateMask + { + unsigned NumSubElts, RotateAmt; + EXPECT_TRUE(sandboxir::ShuffleVectorInst::isBitRotateMask( + ArrayRef({1, 0, 3, 2, 5, 4, 7, 6}), 8, 2, 2, NumSubElts, + RotateAmt)); + EXPECT_EQ(NumSubElts, 2u); + EXPECT_EQ(RotateAmt, 8u); + + EXPECT_FALSE(sandboxir::ShuffleVectorInst::isBitRotateMask( + ArrayRef({0, 7, 1, 6, 2, 5, 3, 4}), 8, 2, 2, NumSubElts, + RotateAmt)); + } +} + TEST_F(SandboxIRTest, BranchInst) { parseIR(C, R"IR( define void @foo(i1 
%cond0, i1 %cond2) { From b03b170dd39799b4fb25ffe70b81d0cf0c7d7346 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 21 Aug 2024 13:28:28 -0700 Subject: [PATCH 110/426] [ADT] Add `isPunct` to StringExtras (#105461) - Add `isPunct` to StringExtras.h. - Add unit test for `isPunct` to StringExtrasTest. --- llvm/include/llvm/ADT/StringExtras.h | 11 +++++++++++ llvm/unittests/ADT/StringExtrasTest.cpp | 12 ++++++++++++ 2 files changed, 23 insertions(+) diff --git a/llvm/include/llvm/ADT/StringExtras.h b/llvm/include/llvm/ADT/StringExtras.h index 20e6ad1f68f996..1317d521d4c191 100644 --- a/llvm/include/llvm/ADT/StringExtras.h +++ b/llvm/include/llvm/ADT/StringExtras.h @@ -140,6 +140,17 @@ inline bool isPrint(char C) { return (0x20 <= UC) && (UC <= 0x7E); } +/// Checks whether character \p C is a punctuation character. +/// +/// Locale-independent version of the C standard library ispunct. The list of +/// punctuation characters can be found in the documentation of std::ispunct: +/// https://en.cppreference.com/w/cpp/string/byte/ispunct. +inline bool isPunct(char C) { + static constexpr StringLiteral Punctuations = + R"(!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)"; + return Punctuations.contains(C); +} + /// Checks whether character \p C is whitespace in the "C" locale. /// /// Locale-independent version of the C standard library isspace. 
diff --git a/llvm/unittests/ADT/StringExtrasTest.cpp b/llvm/unittests/ADT/StringExtrasTest.cpp index 1fb1fea6577911..51f7c3948a3146 100644 --- a/llvm/unittests/ADT/StringExtrasTest.cpp +++ b/llvm/unittests/ADT/StringExtrasTest.cpp @@ -59,6 +59,18 @@ TEST(StringExtrasTest, isUpper) { EXPECT_FALSE(isUpper('\?')); } +TEST(StringExtrasTest, isPunct) { + EXPECT_FALSE(isPunct('a')); + EXPECT_FALSE(isPunct('b')); + EXPECT_FALSE(isPunct('z')); + EXPECT_TRUE(isPunct('-')); + EXPECT_TRUE(isPunct(';')); + EXPECT_TRUE(isPunct('@')); + EXPECT_FALSE(isPunct('0')); + EXPECT_FALSE(isPunct('1')); + EXPECT_FALSE(isPunct('x')); +} + template void testJoin() { ContainerT Items; EXPECT_EQ("", join(Items.begin(), Items.end(), " ")); From 84fa7b438e1fba0c88b21784e716926017b9fe49 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 21 Aug 2024 12:54:05 -0400 Subject: [PATCH 111/426] [libc++] Improve the granularity of status tracking from Github issues This enhances the Github - CSV synchronization script to understand some of the idioms we use in the CSV status files, like |Nothing To Do| and others. --- libcxx/utils/synchronize_csv_status_files.py | 173 ++++++++++++++----- 1 file changed, 129 insertions(+), 44 deletions(-) diff --git a/libcxx/utils/synchronize_csv_status_files.py b/libcxx/utils/synchronize_csv_status_files.py index 68df5756e884d6..5ff718e5a8f916 100755 --- a/libcxx/utils/synchronize_csv_status_files.py +++ b/libcxx/utils/synchronize_csv_status_files.py @@ -19,6 +19,101 @@ # Number of the 'Libc++ Standards Conformance' project on Github LIBCXX_CONFORMANCE_PROJECT = '31' +class PaperStatus: + TODO = 1 + IN_PROGRESS = 2 + PARTIAL = 3 + DONE = 4 + NOTHING_TO_DO = 5 + + _status: int + + _original: Optional[str] + """ + Optional string from which the paper status was created. This is used to carry additional + information from CSV rows, like any notes associated to the status. 
+ """ + + def __init__(self, status: int, original: Optional[str] = None): + self._status = status + self._original = original + + def __eq__(self, other) -> bool: + return self._status == other._status + + def __lt__(self, other) -> bool: + relative_order = { + PaperStatus.TODO: 0, + PaperStatus.IN_PROGRESS: 1, + PaperStatus.PARTIAL: 2, + PaperStatus.DONE: 3, + PaperStatus.NOTHING_TO_DO: 3, + } + return relative_order[self._status] < relative_order[other._status] + + @staticmethod + def from_csv_entry(entry: str): + """ + Parse a paper status out of a CSV row entry. Entries can look like: + - '' (an empty string, which means the paper is not done yet) + - '|In Progress|' + - '|Partial|' + - '|Complete|' + - '|Nothing To Do|' + + Note that since we sometimes add additional notes after the status, we only check that the entry + starts with the above patterns. + """ + if entry == '': + return PaperStatus(PaperStatus.TODO, entry) + elif entry.startswith('|In Progress|'): + return PaperStatus(PaperStatus.IN_PROGRESS, entry) + elif entry.startswith('|Partial|'): + return PaperStatus(PaperStatus.PARTIAL, entry) + elif entry.startswith('|Complete|'): + return PaperStatus(PaperStatus.DONE, entry) + elif entry.startswith('|Nothing To Do|'): + return PaperStatus(PaperStatus.NOTHING_TO_DO, entry) + else: + raise RuntimeError(f'Unexpected CSV entry for status: {entry}') + + @staticmethod + def from_github_issue(issue: Dict): + """ + Parse a paper status out of a Github issue obtained from querying a Github project. 
+ """ + if 'status' not in issue: + return PaperStatus(PaperStatus.TODO) + elif issue['status'] == 'Todo': + return PaperStatus(PaperStatus.TODO) + elif issue['status'] == 'In Progress': + return PaperStatus(PaperStatus.IN_PROGRESS) + elif issue['status'] == 'Partial': + return PaperStatus(PaperStatus.PARTIAL) + elif issue['status'] == 'Done': + return PaperStatus(PaperStatus.DONE) + elif issue['status'] == 'Nothing To Do': + return PaperStatus(PaperStatus.NOTHING_TO_DO) + else: + raise RuntimeError(f"Received unrecognizable Github issue status: {issue['status']}") + + def to_csv_entry(self) -> str: + """ + Return the issue state formatted for a CSV entry. The status is formatted as '|Complete|', + '|In Progress|', etc. + """ + mapping = { + PaperStatus.TODO: '', + PaperStatus.IN_PROGRESS: '|In Progress|', + PaperStatus.PARTIAL: '|Partial|', + PaperStatus.DONE: '|Complete|', + PaperStatus.NOTHING_TO_DO: '|Nothing To Do|', + } + return self._original if self._original is not None else mapping[self._status] + + def is_done(self) -> bool: + return self._status == PaperStatus.DONE or self._status == PaperStatus.NOTHING_TO_DO + class PaperInfo: paper_number: str """ @@ -30,15 +125,14 @@ class PaperInfo: Plain text string representing the name of the paper. """ - meeting: Optional[str] + status: PaperStatus """ - Plain text string representing the meeting at which the paper/issue was voted. + Status of the paper/issue. This can be complete, in progress, partial, or done. """ - status: Optional[str] + meeting: Optional[str] """ - Status of the paper/issue. This must be '|Complete|', '|Nothing To Do|', '|In Progress|', - '|Partial|' or 'Resolved by '. + Plain text string representing the meeting at which the paper/issue was voted. 
""" first_released_version: Optional[str] @@ -59,15 +153,15 @@ class PaperInfo: """ def __init__(self, paper_number: str, paper_name: str, + status: PaperStatus, meeting: Optional[str] = None, - status: Optional[str] = None, first_released_version: Optional[str] = None, labels: Optional[List[str]] = None, original: Optional[object] = None): self.paper_number = paper_number self.paper_name = paper_name - self.meeting = meeting self.status = status + self.meeting = meeting self.first_released_version = first_released_version self.labels = labels self.original = original @@ -77,7 +171,7 @@ def for_printing(self) -> Tuple[str, str, str, str, str, str]: f'`{self.paper_number} `__', self.paper_name, self.meeting if self.meeting is not None else '', - self.status if self.status is not None else '', + self.status.to_csv_entry(), self.first_released_version if self.first_released_version is not None else '', ' '.join(f'|{label}|' for label in self.labels) if self.labels is not None else '', ) @@ -85,13 +179,6 @@ def for_printing(self) -> Tuple[str, str, str, str, str, str]: def __repr__(self) -> str: return repr(self.original) if self.original is not None else repr(self.for_printing()) - def is_implemented(self) -> bool: - if self.status is None: - return False - if re.search(r'(in progress|partial)', self.status.lower()): - return False - return True - @staticmethod def from_csv_row(row: Tuple[str, str, str, str, str, str]):# -> PaperInfo: """ @@ -105,8 +192,8 @@ def from_csv_row(row: Tuple[str, str, str, str, str, str]):# -> PaperInfo: return PaperInfo( paper_number=match.group(1), paper_name=row[1], + status=PaperStatus.from_csv_entry(row[3]), meeting=row[2] or None, - status=row[3] or None, first_released_version=row[4] or None, labels=[l.strip('|') for l in row[5].split(' ') if l] or None, original=row, @@ -123,12 +210,6 @@ def from_github_issue(issue: Dict):# -> PaperInfo: raise RuntimeError(f"Issue doesn't have a title that we know how to parse: {issue}") paper = 
match.group(1) - # Figure out the status of the paper according to the Github project information. - # - # Sadly, we can't make a finer-grained distiction about *how* the issue - # was closed (such as Nothing To Do or similar). - status = '|Complete|' if 'status' in issue and issue['status'] == 'Done' else None - # Handle labels valid_labels = ('format', 'ranges', 'spaceship', 'flat_containers', 'concurrency TS', 'DR') labels = [label for label in issue['labels'] if label in valid_labels] @@ -136,8 +217,8 @@ def from_github_issue(issue: Dict):# -> PaperInfo: return PaperInfo( paper_number=paper, paper_name=issue['title'], + status=PaperStatus.from_github_issue(issue), meeting=issue.get('meeting Voted', None), - status=status, first_released_version=None, # TODO labels=labels if labels else None, original=issue, @@ -177,30 +258,34 @@ def sync_csv(rows: List[Tuple], from_github: List[PaperInfo]) -> List[Tuple]: paper = PaperInfo.from_csv_row(row) - # If the row is already implemented, basically keep it unchanged but also validate that we're not - # out-of-sync with any still-open Github issue tracking the same paper. - if paper.is_implemented(): - dangling = [gh for gh in from_github if gh.paper_number == paper.paper_number and not gh.is_implemented()] - if dangling: - print(f"We found the following open tracking issues for a row which is already marked as implemented:\nrow: {row}\ntracking issues: {dangling}") - print("The Github issue should be closed if the work has indeed been done.") - results.append(paper.for_printing()) - else: - # Find any Github issues tracking this paper - tracking = [gh for gh in from_github if paper.paper_number == gh.paper_number] + # Find any Github issues tracking this paper. Each row must have one and exactly one Github + # issue tracking it, which we validate below. 
+ tracking = [gh for gh in from_github if paper.paper_number == gh.paper_number] - # If there is no tracking issue for that row in the CSV, this is an error since we're - # missing a Github issue. - if not tracking: - raise RuntimeError(f"Can't find any Github issue for CSV row which isn't marked as done yet: {row}") + # If there is no tracking issue for that row in the CSV, this is an error since we're + # missing a Github issue. + if len(tracking) == 0: + print(f"Can't find any Github issue for CSV row: {row}") + results.append(row) + continue - # If there's more than one tracking issue, something is weird too. - if len(tracking) > 1: - raise RuntimeError(f"Found a row with more than one tracking issue: {row}\ntracked by: {tracking}") + # If there's more than one tracking issue, something is weird too. + if len(tracking) > 1: + print(f"Found a row with more than one tracking issue: {row}\ntracked by: {tracking}") + results.append(row) + continue - # If the issue is closed, synchronize the row based on the Github issue. Otherwise, use the - # existing CSV row as-is. - results.append(tracking[0].for_printing() if tracking[0].is_implemented() else row) + gh = tracking[0] + + # If the CSV row has a status that is "less advanced" than the Github issue, simply update the CSV + # row with the newer status. Otherwise, report an error if they have a different status because + # something must be wrong. + if paper.status < gh.status: + results.append(gh.for_printing()) + continue + elif paper.status != gh.status: + print(f"We found a CSV row and a Github issue with different statuses:\nrow: {row}\nGithub issue: {gh}") + results.append(row) return results From cfd4c1805ead139f84a4465719c49cca53f07f27 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Wed, 21 Aug 2024 13:37:03 -0700 Subject: [PATCH 112/426] [RFC][flang] Replace special symbols in uniqued global names. (#104859) This change addresses more "issues" as the one resolved in #71338. Some targets (e.g.
NVPTX) do not accept global names containing `.`. In particular, the global variables created to represent the runtime information of derived types use `.` in their names. A derived type's descriptor object may be used in the device code, e.g. to initialize a descriptor of a variable of this type. Thus, the runtime type info objects may need to be compiled for the device. Moreover, at least the derived types' descriptor objects may need to be registered (think of `omp declare target`) for the host-device association so that the addendum pointer can be properly mapped to the device for descriptors using a derived type's descriptor as their addendum pointer. The registration implies knowing the name of the global variable in the device image so that proper host code can be created. So it is better to name the globals the same way for the host and the device. CompilerGeneratedNamesConversion pass renames all uniqued globals such that the special symbols (currently `.`) are replaced with `X`. The pass is supposed to be run for the host and the device. An option is added to FIR-to-LLVM conversion pass to indicate whether the new pass has been run before or not. This setting affects how the codegen computes the names of the derived types' descriptors for FIR derived types. fir::NameUniquer now allows `X` to be part of a name, because the name deconstruction may be applied to the mangled names after CompilerGeneratedNamesConversion pass. 
--- .../flang/Optimizer/CodeGen/CGPasses.td | 6 +- .../include/flang/Optimizer/CodeGen/CodeGen.h | 10 +++ .../flang/Optimizer/Support/InternalNames.h | 25 ++++-- .../flang/Optimizer/Transforms/Passes.h | 1 + .../flang/Optimizer/Transforms/Passes.td | 18 +++++ flang/include/flang/Tools/CLOptions.inc | 10 +++ flang/lib/Optimizer/CodeGen/CodeGen.cpp | 13 ++- flang/lib/Optimizer/Support/InternalNames.cpp | 31 +++++-- flang/lib/Optimizer/Transforms/CMakeLists.txt | 1 + .../Transforms/CompilerGeneratedNames.cpp | 80 +++++++++++++++++++ flang/lib/Semantics/runtime-type-info.cpp | 69 ++++++++++------ .../test/Driver/mlir-debug-pass-pipeline.f90 | 1 + flang/test/Driver/mlir-pass-pipeline.f90 | 1 + flang/test/Fir/basic-program.fir | 1 + flang/test/Fir/convert-to-llvm.fir | 2 +- flang/test/Fir/convert-type-desc-to-llvm.fir | 29 +++++++ flang/test/Fir/polymorphic.fir | 4 +- flang/test/Fir/type-descriptor.fir | 4 +- flang/test/Lower/allocatable-polymorphic.f90 | 14 ++-- flang/test/Lower/dense-array-any-rank.f90 | 6 +- 20 files changed, 270 insertions(+), 56 deletions(-) create mode 100644 flang/lib/Optimizer/Transforms/CompilerGeneratedNames.cpp create mode 100644 flang/test/Fir/convert-type-desc-to-llvm.fir diff --git a/flang/include/flang/Optimizer/CodeGen/CGPasses.td b/flang/include/flang/Optimizer/CodeGen/CGPasses.td index 989e3943882a19..e9e303df09eeba 100644 --- a/flang/include/flang/Optimizer/CodeGen/CGPasses.td +++ b/flang/include/flang/Optimizer/CodeGen/CGPasses.td @@ -36,7 +36,11 @@ def FIRToLLVMLowering : Pass<"fir-to-llvm-ir", "mlir::ModuleOp"> { Option<"forcedTargetFeatures", "target-features", "std::string", /*default=*/"", "Override module's target features.">, Option<"applyTBAA", "apply-tbaa", "bool", /*default=*/"false", - "Attach TBAA tags to memory accessing operations."> + "Attach TBAA tags to memory accessing operations.">, + Option<"typeDescriptorsRenamedForAssembly", + "type-descriptors-renamed-for-assembly", "bool", /*default=*/"false", + "Global 
variables created to describe derived types " + "have been renamed to avoid special symbols in their names."> ]; } diff --git a/flang/include/flang/Optimizer/CodeGen/CodeGen.h b/flang/include/flang/Optimizer/CodeGen/CodeGen.h index 06961819bb19c8..390f00e1ac77c2 100644 --- a/flang/include/flang/Optimizer/CodeGen/CodeGen.h +++ b/flang/include/flang/Optimizer/CodeGen/CodeGen.h @@ -44,6 +44,16 @@ struct FIRToLLVMPassOptions { // Force the usage of a unified tbaa tree in TBAABuilder. bool forceUnifiedTBAATree = false; + + // If set to true, then the global variables created + // for the derived types have been renamed to avoid usage + // of special symbols that may not be supported by all targets. + // The renaming is done by the CompilerGeneratedNamesConversion pass. + // If it is true, FIR-to-LLVM pass has to use + // fir::NameUniquer::getTypeDescriptorAssemblyName() to take + // the name of the global variable corresponding to a derived + // type's descriptor. + bool typeDescriptorsRenamedForAssembly = false; }; /// Convert FIR to the LLVM IR dialect with default options. 
diff --git a/flang/include/flang/Optimizer/Support/InternalNames.h b/flang/include/flang/Optimizer/Support/InternalNames.h index 9e13b4a7668b7a..67ab36cf8da7ff 100644 --- a/flang/include/flang/Optimizer/Support/InternalNames.h +++ b/flang/include/flang/Optimizer/Support/InternalNames.h @@ -14,13 +14,23 @@ #include #include -static constexpr llvm::StringRef typeDescriptorSeparator = ".dt."; -static constexpr llvm::StringRef componentInitSeparator = ".di."; -static constexpr llvm::StringRef bindingTableSeparator = ".v."; -static constexpr llvm::StringRef boxprocSuffix = "UnboxProc"; - namespace fir { +static constexpr llvm::StringRef kNameSeparator = "."; +static constexpr llvm::StringRef kBoundsSeparator = ".b."; +static constexpr llvm::StringRef kComponentSeparator = ".c."; +static constexpr llvm::StringRef kComponentInitSeparator = ".di."; +static constexpr llvm::StringRef kDataPtrInitSeparator = ".dp."; +static constexpr llvm::StringRef kTypeDescriptorSeparator = ".dt."; +static constexpr llvm::StringRef kKindParameterSeparator = ".kp."; +static constexpr llvm::StringRef kLenKindSeparator = ".lpk."; +static constexpr llvm::StringRef kLenParameterSeparator = ".lv."; +static constexpr llvm::StringRef kNameStringSeparator = ".n."; +static constexpr llvm::StringRef kProcPtrSeparator = ".p."; +static constexpr llvm::StringRef kSpecialBindingSeparator = ".s."; +static constexpr llvm::StringRef kBindingTableSeparator = ".v."; +static constexpr llvm::StringRef boxprocSuffix = "UnboxProc"; + /// Internal name mangling of identifiers /// /// In order to generate symbolically referencable artifacts in a ModuleOp, @@ -150,6 +160,9 @@ struct NameUniquer { /// not a valid mangled derived type name. static std::string getTypeDescriptorName(llvm::StringRef mangledTypeName); + static std::string + getTypeDescriptorAssemblyName(llvm::StringRef mangledTypeName); + /// Given a mangled derived type name, get the name of the related binding /// table object. 
Returns an empty string if \p mangledTypeName is not a valid /// mangled derived type name. @@ -169,6 +182,8 @@ struct NameUniquer { static llvm::StringRef dropTypeConversionMarkers(llvm::StringRef mangledTypeName); + static std::string replaceSpecialSymbols(const std::string &name); + private: static std::string intAsString(std::int64_t i); static std::string doKind(std::int64_t kind); diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h index 96b0e9714b95af..6f98e3a25ec125 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.h +++ b/flang/include/flang/Optimizer/Transforms/Passes.h @@ -59,6 +59,7 @@ namespace fir { #define GEN_PASS_DECL_VSCALEATTR #define GEN_PASS_DECL_FUNCTIONATTR #define GEN_PASS_DECL_CONSTANTARGUMENTGLOBALISATIONOPT +#define GEN_PASS_DECL_COMPILERGENERATEDNAMESCONVERSION #include "flang/Optimizer/Transforms/Passes.h.inc" diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index c703a62c03b7d9..a0211384667ed1 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -170,6 +170,24 @@ def ExternalNameConversion : Pass<"external-name-interop", "mlir::ModuleOp"> { ]; } +def CompilerGeneratedNamesConversion : Pass<"compiler-generated-names", + "mlir::ModuleOp"> { + let summary = "Convert names of compiler generated globals"; + let description = [{ + Transforms names of compiler generated globals to avoid + characters that might be unsupported by some target toolchains. + All special symbols are replaced with a predefined 'X' character. + This is only done for uniqued names that are not externally facing. + The uniqued names always use '_Q' prefix, and the user entity names + are always lower cased, so using 'X' instead of the special symbols + will guarantee that the converted name will not conflict with the user + space. 
This pass does not affect the externally facing names, + because the expectation is that the compiler will not generate + externally facing names on its own, and these names cannot use + special symbols. + }]; +} + def MemRefDataFlowOpt : Pass<"fir-memref-dataflow-opt", "::mlir::func::FuncOp"> { let summary = "Perform store/load forwarding and potentially removing dead stores."; diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index 7df50449494631..57b90017d052e4 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -93,6 +93,8 @@ DisableOption(ExternalNameConversion, "external-name-interop", "convert names with external convention"); EnableOption(ConstantArgumentGlobalisation, "constant-argument-globalisation", "the local constant argument to global constant conversion"); +DisableOption(CompilerGeneratedNamesConversion, "compiler-generated-names", + "replace special symbols in compiler generated names"); using PassConstructor = std::unique_ptr(); @@ -222,6 +224,8 @@ inline void addFIRToLLVMPass( options.ignoreMissingTypeDescriptors = ignoreMissingTypeDescriptors; options.applyTBAA = config.AliasAnalysis; options.forceUnifiedTBAATree = useOldAliasTags; + options.typeDescriptorsRenamedForAssembly = + !disableCompilerGeneratedNamesConversion; addPassConditionally(pm, disableFirToLlvmIr, [&]() { return fir::createFIRToLLVMPass(options); }); // The dialect conversion framework may leave dead unrealized_conversion_cast @@ -248,6 +252,11 @@ inline void addExternalNameConversionPass( [&]() { return fir::createExternalNameConversion({appendUnderscore}); }); } +inline void addCompilerGeneratedNamesConversionPass(mlir::PassManager &pm) { + addPassConditionally(pm, disableCompilerGeneratedNamesConversion, + [&]() { return fir::createCompilerGeneratedNamesConversion(); }); +} + // Use inliner extension point callback to register the default inliner pass. 
inline void registerDefaultInlinerPass(MLIRToLLVMPassPipelineConfig &config) { config.registerFIRInlinerCallback( @@ -379,6 +388,7 @@ inline void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, fir::addCodeGenRewritePass( pm, (config.DebugInfo != llvm::codegenoptions::NoDebugInfo)); fir::addTargetRewritePass(pm); + fir::addCompilerGeneratedNamesConversionPass(pm); fir::addExternalNameConversionPass(pm, config.Underscoring); fir::createDebugPasses(pm, config.DebugInfo, config.OptLevel, inputFilename); diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 1713cf98a8b961..e419b261252995 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -1201,7 +1201,9 @@ struct EmboxCommonConversion : public fir::FIROpConversion { mlir::Location loc, fir::RecordType recType) const { std::string name = - fir::NameUniquer::getTypeDescriptorName(recType.getName()); + this->options.typeDescriptorsRenamedForAssembly + ? fir::NameUniquer::getTypeDescriptorAssemblyName(recType.getName()) + : fir::NameUniquer::getTypeDescriptorName(recType.getName()); mlir::Type llvmPtrTy = ::getLlvmPtrType(mod.getContext()); if (auto global = mod.template lookupSymbol(name)) { return rewriter.create(loc, llvmPtrTy, @@ -2704,7 +2706,10 @@ struct TypeDescOpConversion : public fir::FIROpConversion { auto recordType = mlir::dyn_cast(inTy); auto module = typeDescOp.getOperation()->getParentOfType(); std::string typeDescName = - fir::NameUniquer::getTypeDescriptorName(recordType.getName()); + this->options.typeDescriptorsRenamedForAssembly + ? 
fir::NameUniquer::getTypeDescriptorAssemblyName( + recordType.getName()) + : fir::NameUniquer::getTypeDescriptorName(recordType.getName()); auto llvmPtrTy = ::getLlvmPtrType(typeDescOp.getContext()); if (auto global = module.lookupSymbol(typeDescName)) { rewriter.replaceOpWithNewOp( @@ -3653,6 +3658,10 @@ class FIRToLLVMLowering if (!forcedTargetFeatures.empty()) fir::setTargetFeatures(mod, forcedTargetFeatures); + if (typeDescriptorsRenamedForAssembly) + options.typeDescriptorsRenamedForAssembly = + typeDescriptorsRenamedForAssembly; + // Run dynamic pass pipeline for converting Math dialect // operations into other dialects (llvm, func, etc.). // Some conversions of Math operations cannot be done diff --git a/flang/lib/Optimizer/Support/InternalNames.cpp b/flang/lib/Optimizer/Support/InternalNames.cpp index b2e2cd38f48e60..58a5da5de79720 100644 --- a/flang/lib/Optimizer/Support/InternalNames.cpp +++ b/flang/lib/Optimizer/Support/InternalNames.cpp @@ -16,6 +16,7 @@ #include "mlir/IR/Diagnostics.h" #include "llvm/Support/CommandLine.h" #include +#include static llvm::cl::opt mainEntryName( "main-entry-name", @@ -59,7 +60,11 @@ convertToStringRef(const std::optional &from) { static std::string readName(llvm::StringRef uniq, std::size_t &i, std::size_t init, std::size_t end) { - for (i = init; i < end && (uniq[i] < 'A' || uniq[i] > 'Z'); ++i) { + // Allow 'X' to be part of the mangled name, which + // can happen after the special symbols are replaced + // in the mangled names by CompilerGeneratedNamesConversionPass. + for (i = init; i < end && (uniq[i] < 'A' || uniq[i] > 'Z' || uniq[i] == 'X'); + ++i) { // do nothing } return uniq.substr(init, i - init).str(); @@ -348,7 +353,7 @@ mangleTypeDescriptorKinds(llvm::ArrayRef kinds) { return ""; std::string result; for (std::int64_t kind : kinds) - result += "." 
+ std::to_string(kind); + result += (fir::kNameSeparator + std::to_string(kind)).str(); return result; } @@ -373,12 +378,18 @@ static std::string getDerivedTypeObjectName(llvm::StringRef mangledTypeName, std::string fir::NameUniquer::getTypeDescriptorName(llvm::StringRef mangledTypeName) { - return getDerivedTypeObjectName(mangledTypeName, typeDescriptorSeparator); + return getDerivedTypeObjectName(mangledTypeName, + fir::kTypeDescriptorSeparator); +} + +std::string fir::NameUniquer::getTypeDescriptorAssemblyName( + llvm::StringRef mangledTypeName) { + return replaceSpecialSymbols(getTypeDescriptorName(mangledTypeName)); } std::string fir::NameUniquer::getTypeDescriptorBindingTableName( llvm::StringRef mangledTypeName) { - return getDerivedTypeObjectName(mangledTypeName, bindingTableSeparator); + return getDerivedTypeObjectName(mangledTypeName, fir::kBindingTableSeparator); } std::string @@ -386,13 +397,17 @@ fir::NameUniquer::getComponentInitName(llvm::StringRef mangledTypeName, llvm::StringRef componentName) { std::string prefix = - getDerivedTypeObjectName(mangledTypeName, componentInitSeparator); - return prefix + "." 
+ componentName.str(); + getDerivedTypeObjectName(mangledTypeName, fir::kComponentInitSeparator); + return (prefix + fir::kNameSeparator + componentName).str(); } llvm::StringRef fir::NameUniquer::dropTypeConversionMarkers(llvm::StringRef mangledTypeName) { - if (mangledTypeName.ends_with(boxprocSuffix)) - return mangledTypeName.drop_back(boxprocSuffix.size()); + if (mangledTypeName.ends_with(fir::boxprocSuffix)) + return mangledTypeName.drop_back(fir::boxprocSuffix.size()); return mangledTypeName; } + +std::string fir::NameUniquer::replaceSpecialSymbols(const std::string &name) { + return std::regex_replace(name, std::regex{"\\."}, "X"); +} diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 3869633bd98e02..bf0a8d14d95df6 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -6,6 +6,7 @@ add_flang_library(FIRTransforms AnnotateConstant.cpp AssumedRankOpConversion.cpp CharacterConversion.cpp + CompilerGeneratedNames.cpp ConstantArgumentGlobalisation.cpp ControlFlowConverter.cpp CufOpConversion.cpp diff --git a/flang/lib/Optimizer/Transforms/CompilerGeneratedNames.cpp b/flang/lib/Optimizer/Transforms/CompilerGeneratedNames.cpp new file mode 100644 index 00000000000000..7f2cc41275e593 --- /dev/null +++ b/flang/lib/Optimizer/Transforms/CompilerGeneratedNames.cpp @@ -0,0 +1,80 @@ +//=== CompilerGeneratedNames.cpp - convert special symbols in global names ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/Dialect/FIROpsSupport.h" +#include "flang/Optimizer/Support/InternalNames.h" +#include "flang/Optimizer/Transforms/Passes.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/SymbolTable.h" +#include "mlir/Pass/Pass.h" + +namespace fir { +#define GEN_PASS_DEF_COMPILERGENERATEDNAMESCONVERSION +#include "flang/Optimizer/Transforms/Passes.h.inc" +} // namespace fir + +using namespace mlir; + +namespace { + +class CompilerGeneratedNamesConversionPass + : public fir::impl::CompilerGeneratedNamesConversionBase< + CompilerGeneratedNamesConversionPass> { +public: + using CompilerGeneratedNamesConversionBase< + CompilerGeneratedNamesConversionPass>:: + CompilerGeneratedNamesConversionBase; + + mlir::ModuleOp getModule() { return getOperation(); } + void runOnOperation() override; +}; +} // namespace + +void CompilerGeneratedNamesConversionPass::runOnOperation() { + auto op = getOperation(); + auto *context = &getContext(); + + llvm::DenseMap remappings; + for (auto &funcOrGlobal : op->getRegion(0).front()) { + if (llvm::isa(funcOrGlobal) || + llvm::isa(funcOrGlobal)) { + auto symName = funcOrGlobal.getAttrOfType( + mlir::SymbolTable::getSymbolAttrName()); + auto deconstructedName = fir::NameUniquer::deconstruct(symName); + if (deconstructedName.first != fir::NameUniquer::NameKind::NOT_UNIQUED && + !fir::NameUniquer::isExternalFacingUniquedName(deconstructedName)) { + std::string newName = + fir::NameUniquer::replaceSpecialSymbols(symName.getValue().str()); + if (newName != symName) { + auto newAttr = mlir::StringAttr::get(context, newName); + mlir::SymbolTable::setSymbolName(&funcOrGlobal, newAttr); + auto newSymRef = mlir::FlatSymbolRefAttr::get(newAttr); + remappings.try_emplace(symName, newSymRef); + } + 
} + } + } + + if (remappings.empty()) + return; + + // Update all uses of the functions and globals that have been renamed. + op.walk([&remappings](mlir::Operation *nestedOp) { + llvm::SmallVector> updates; + for (const mlir::NamedAttribute &attr : nestedOp->getAttrDictionary()) + if (auto symRef = llvm::dyn_cast(attr.getValue())) + if (auto remap = remappings.find(symRef.getRootReference()); + remap != remappings.end()) + updates.emplace_back(std::pair{ + attr.getName(), mlir::SymbolRefAttr(remap->second)}); + for (auto update : updates) + nestedOp->setAttr(update.first, update.second); + }); +} diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index 66909241966735..9f3eb5fbe11a15 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -12,6 +12,7 @@ #include "flang/Evaluate/fold.h" #include "flang/Evaluate/tools.h" #include "flang/Evaluate/type.h" +#include "flang/Optimizer/Support/InternalNames.h" #include "flang/Semantics/scope.h" #include "flang/Semantics/tools.h" #include @@ -377,9 +378,12 @@ static std::optional GetSuffixIfTypeKindParameters( if (pv->GetExplicit()) { if (auto instantiatedValue{evaluate::ToInt64(*pv->GetExplicit())}) { if (suffix.has_value()) { - *suffix += "."s + std::to_string(*instantiatedValue); + *suffix += + (fir::kNameSeparator + llvm::Twine(*instantiatedValue)) + .str(); } else { - suffix = "."s + std::to_string(*instantiatedValue); + suffix = (fir::kNameSeparator + llvm::Twine(*instantiatedValue)) + .str(); } } } @@ -448,7 +452,7 @@ const Symbol *RuntimeTableBuilder::DescribeType(Scope &dtScope) { } else if (isPDTDefinitionWithKindParameters) { return nullptr; } - std::string dtDescName{".dt."s + distinctName}; + std::string dtDescName{(fir::kTypeDescriptorSeparator + distinctName).str()}; Scope *dtSymbolScope{const_cast(dtSymbol->scope())}; Scope &scope{ GetContainingNonDerivedScope(dtSymbolScope ? 
*dtSymbolScope : dtScope)}; @@ -518,11 +522,13 @@ const Symbol *RuntimeTableBuilder::DescribeType(Scope &dtScope) { } } AddValue(dtValues, derivedTypeSchema_, "kindparameter"s, - SaveNumericPointerTarget( - scope, SaveObjectName(".kp."s + distinctName), std::move(kinds))); + SaveNumericPointerTarget(scope, + SaveObjectName((fir::kKindParameterSeparator + distinctName).str()), + std::move(kinds))); AddValue(dtValues, derivedTypeSchema_, "lenparameterkind"s, - SaveNumericPointerTarget( - scope, SaveObjectName(".lpk."s + distinctName), std::move(lenKinds))); + SaveNumericPointerTarget(scope, + SaveObjectName((fir::kLenKindSeparator + distinctName).str()), + std::move(lenKinds))); // Traverse the components of the derived type if (!isPDTDefinitionWithKindParameters) { std::vector dataComponentSymbols; @@ -570,13 +576,15 @@ const Symbol *RuntimeTableBuilder::DescribeType(Scope &dtScope) { dtScope, distinctName, parameters)); } AddValue(dtValues, derivedTypeSchema_, "component"s, - SaveDerivedPointerTarget(scope, SaveObjectName(".c."s + distinctName), + SaveDerivedPointerTarget(scope, + SaveObjectName((fir::kComponentSeparator + distinctName).str()), std::move(dataComponents), evaluate::ConstantSubscripts{ static_cast( dataComponents.size())})); AddValue(dtValues, derivedTypeSchema_, "procptr"s, - SaveDerivedPointerTarget(scope, SaveObjectName(".p."s + distinctName), + SaveDerivedPointerTarget(scope, + SaveObjectName((fir::kProcPtrSeparator + distinctName).str()), std::move(procPtrComponents), evaluate::ConstantSubscripts{ static_cast( @@ -587,7 +595,9 @@ const Symbol *RuntimeTableBuilder::DescribeType(Scope &dtScope) { std::vector bindings{ DescribeBindings(dtScope, scope)}; AddValue(dtValues, derivedTypeSchema_, bindingDescCompName, - SaveDerivedPointerTarget(scope, SaveObjectName(".v."s + distinctName), + SaveDerivedPointerTarget(scope, + SaveObjectName( + (fir::kBindingTableSeparator + distinctName).str()), std::move(bindings), evaluate::ConstantSubscripts{ 
static_cast(bindings.size())})); @@ -623,7 +633,9 @@ const Symbol *RuntimeTableBuilder::DescribeType(Scope &dtScope) { sortedSpecials.emplace_back(std::move(pair.second)); } AddValue(dtValues, derivedTypeSchema_, "special"s, - SaveDerivedPointerTarget(scope, SaveObjectName(".s."s + distinctName), + SaveDerivedPointerTarget(scope, + SaveObjectName( + (fir::kSpecialBindingSeparator + distinctName).str()), std::move(sortedSpecials), evaluate::ConstantSubscripts{ static_cast(specials.size())})); @@ -730,10 +742,12 @@ SomeExpr RuntimeTableBuilder::SaveNameAsPointerTarget( using evaluate::Ascii; using AsciiExpr = evaluate::Expr; object.set_init(evaluate::AsGenericExpr(AsciiExpr{name})); - Symbol &symbol{*scope - .try_emplace(SaveObjectName(".n."s + name), - Attrs{Attr::TARGET, Attr::SAVE}, std::move(object)) - .first->second}; + Symbol &symbol{ + *scope + .try_emplace( + SaveObjectName((fir::kNameStringSeparator + name).str()), + Attrs{Attr::TARGET, Attr::SAVE}, std::move(object)) + .first->second}; SetReadOnlyCompilerCreatedFlags(symbol); return evaluate::AsGenericExpr( AsciiExpr{evaluate::Designator{symbol}}); @@ -821,8 +835,9 @@ evaluate::StructureConstructor RuntimeTableBuilder::DescribeComponent( if (!lenParams.empty()) { AddValue(values, componentSchema_, "lenvalue"s, SaveDerivedPointerTarget(scope, - SaveObjectName( - ".lv."s + distinctName + "."s + symbol.name().ToString()), + SaveObjectName((fir::kLenParameterSeparator + distinctName + + fir::kNameSeparator + symbol.name().ToString()) + .str()), std::move(lenParams), evaluate::ConstantSubscripts{ static_cast(lenParams.size())})); @@ -845,8 +860,9 @@ evaluate::StructureConstructor RuntimeTableBuilder::DescribeComponent( } AddValue(values, componentSchema_, "bounds"s, SaveDerivedPointerTarget(scope, - SaveObjectName( - ".b."s + distinctName + "."s + symbol.name().ToString()), + SaveObjectName((fir::kBoundsSeparator + distinctName + + fir::kNameSeparator + symbol.name().ToString()) + .str()), std::move(bounds), 
evaluate::ConstantSubscripts{2, rank})); } else { AddValue( @@ -868,8 +884,9 @@ evaluate::StructureConstructor RuntimeTableBuilder::DescribeComponent( if (hasDataInit) { AddValue(values, componentSchema_, "initialization"s, SaveObjectInit(scope, - SaveObjectName( - ".di."s + distinctName + "."s + symbol.name().ToString()), + SaveObjectName((fir::kComponentInitSeparator + distinctName + + fir::kNameSeparator + symbol.name().ToString()) + .str()), object)); } } @@ -918,8 +935,9 @@ bool RuntimeTableBuilder::InitializeDataPointer( const ObjectEntityDetails &object, Scope &scope, Scope &dtScope, const std::string &distinctName) { if (object.init().has_value()) { - SourceName ptrDtName{SaveObjectName( - ".dp."s + distinctName + "."s + symbol.name().ToString())}; + SourceName ptrDtName{SaveObjectName((fir::kDataPtrInitSeparator + + distinctName + fir::kNameSeparator + symbol.name().ToString()) + .str())}; Symbol &ptrDtSym{ *scope.try_emplace(ptrDtName, Attrs{}, UnknownDetails{}).first->second}; SetReadOnlyCompilerCreatedFlags(ptrDtSym); @@ -952,8 +970,9 @@ bool RuntimeTableBuilder::InitializeDataPointer( Structure(ptrDtDeclType, std::move(ptrInitValues)))); AddValue(values, componentSchema_, "initialization"s, SaveObjectInit(scope, - SaveObjectName( - ".di."s + distinctName + "."s + symbol.name().ToString()), + SaveObjectName((fir::kComponentInitSeparator + distinctName + + fir::kNameSeparator + symbol.name().ToString()) + .str()), ptrInitObj)); return true; } else { diff --git a/flang/test/Driver/mlir-debug-pass-pipeline.f90 b/flang/test/Driver/mlir-debug-pass-pipeline.f90 index 6e9846fa422e55..a6316ee7c83123 100644 --- a/flang/test/Driver/mlir-debug-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-debug-pass-pipeline.f90 @@ -109,6 +109,7 @@ ! ALL-NEXT: CodeGenRewrite ! ALL-NEXT: (S) 0 num-dce'd - Number of operations eliminated ! ALL-NEXT: TargetRewrite +! ALL-NEXT: CompilerGeneratedNamesConversion ! ALL-NEXT: ExternalNameConversion ! DEBUG-NEXT: AddDebugInfo ! 
NO-DEBUG-NOT: AddDebugInfo diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90 index db4551e93fe64c..2f35f928e99cfc 100644 --- a/flang/test/Driver/mlir-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-pass-pipeline.f90 @@ -118,6 +118,7 @@ ! ALL-NEXT: CodeGenRewrite ! ALL-NEXT: (S) 0 num-dce'd - Number of operations eliminated ! ALL-NEXT: TargetRewrite +! ALL-NEXT: CompilerGeneratedNamesConversion ! ALL-NEXT: ExternalNameConversion ! ALL-NEXT: FIRToLLVMLowering ! ALL-NOT: LLVMIRLoweringPass diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index dda4f32872fef5..bca454c13ff9cc 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -118,6 +118,7 @@ func.func @_QQmain() { // PASSES-NEXT: CodeGenRewrite // PASSES-NEXT: (S) 0 num-dce'd - Number of operations eliminated // PASSES-NEXT: TargetRewrite +// PASSES-NEXT: CompilerGeneratedNamesConversion // PASSES-NEXT: FIRToLLVMLowering // PASSES-NEXT: ReconcileUnrealizedCasts // PASSES-NEXT: LLVMIRLoweringPass diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir index 194a11456f2569..a4e8170af036c9 100644 --- a/flang/test/Fir/convert-to-llvm.fir +++ b/flang/test/Fir/convert-to-llvm.fir @@ -1,7 +1,7 @@ // RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT,GENERIC // RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=aarch64-unknown-linux-gnu" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT,GENERIC // RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=i386-unknown-linux-gnu" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT,GENERIC -// RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=powerpc64le-unknown-linux-gn" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT,GENERIC +// RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=powerpc64le-unknown-linux-gnu" %s | 
FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT,GENERIC // RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=x86_64-pc-win32" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT,GENERIC // RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=aarch64-apple-darwin" %s | FileCheck %s --check-prefixes=CHECK,CHECK-NO-COMDAT,GENERIC // RUN: fir-opt --split-input-file --fir-to-llvm-ir="target=amdgcn-amd-amdhsa, datalayout=e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-P0" %s | FileCheck -check-prefixes=CHECK,AMDGPU %s diff --git a/flang/test/Fir/convert-type-desc-to-llvm.fir b/flang/test/Fir/convert-type-desc-to-llvm.fir new file mode 100644 index 00000000000000..251c95d9c84216 --- /dev/null +++ b/flang/test/Fir/convert-type-desc-to-llvm.fir @@ -0,0 +1,29 @@ +// RUN: fir-opt --split-input-file --compiler-generated-names --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu type-descriptors-renamed-for-assembly=true" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT +// RUN: fir-opt --split-input-file --compiler-generated-names --fir-to-llvm-ir="target=aarch64-unknown-linux-gnu type-descriptors-renamed-for-assembly=true" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT +// RUN: fir-opt --split-input-file --compiler-generated-names --fir-to-llvm-ir="target=i386-unknown-linux-gnu type-descriptors-renamed-for-assembly=true" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT +// RUN: fir-opt --split-input-file --compiler-generated-names --fir-to-llvm-ir="target=powerpc64le-unknown-linux-gnu type-descriptors-renamed-for-assembly=true" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT +// RUN: fir-opt --split-input-file --compiler-generated-names --fir-to-llvm-ir="target=x86_64-pc-win32 type-descriptors-renamed-for-assembly=true" %s | FileCheck %s --check-prefixes=CHECK,CHECK-COMDAT +// RUN: fir-opt --split-input-file 
--compiler-generated-names --fir-to-llvm-ir="target=aarch64-apple-darwin type-descriptors-renamed-for-assembly=true" %s | FileCheck %s --check-prefixes=CHECK,CHECK-NO-COMDAT +// RUN: fir-opt --split-input-file --compiler-generated-names --fir-to-llvm-ir="target=amdgcn-amd-amdhsa type-descriptors-renamed-for-assembly=1 datalayout=e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-P0" %s | FileCheck -check-prefixes=CHECK %s + +// Check descriptor for a derived type. Check that the f18Addendum flag is set +// to 1 meaning the addendum is present (true) and the addendum values are +// inserted. + +fir.global linkonce @_QMtest_dinitE.dt.tseq constant : i8 + +func.func @embox1(%arg0: !fir.ref>) { + %0 = fir.embox %arg0() : (!fir.ref>) -> !fir.box> + return +} + +// CHECK-COMDAT: llvm.mlir.global linkonce constant @_QMtest_dinitEXdtXtseq() comdat(@__llvm_comdat::@_QMtest_dinitEXdtXtseq) {addr_space = 0 : i32} : i8 +// CHECK-NO-COMDAT: llvm.mlir.global linkonce constant @_QMtest_dinitEXdtXtseq() {addr_space = 0 : i32} : i8 +// CHECK-LABEL: llvm.func @embox1 +// CHECK: %[[TYPE_CODE:.*]] = llvm.mlir.constant(42 : i32) : i32 +// CHECK: %[[VERSION:.*]] = llvm.mlir.constant(20240719 : i32) : i32 +// CHECK: %{{.*}} = llvm.insertvalue %[[VERSION]], %{{.*}}[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, ptr, array<1 x i64>)> +// CHECK: %[[TYPE_CODE_I8:.*]] = llvm.trunc %[[TYPE_CODE]] : i32 to i8 +// CHECK: %{{.*}} = llvm.insertvalue %[[TYPE_CODE_I8]], %{{.*}}[4] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)> +// CHECK: %[[TDESC:.*]] = llvm.mlir.addressof @_QMtest_dinitEXdtXtseq : !llvm.ptr +// CHECK: %{{.*}} = llvm.insertvalue %[[TDESC]], %{{.*}}[7] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)> diff --git a/flang/test/Fir/polymorphic.fir 
b/flang/test/Fir/polymorphic.fir index a6b166367a4a1b..40204314e8df79 100644 --- a/flang/test/Fir/polymorphic.fir +++ b/flang/test/Fir/polymorphic.fir @@ -157,7 +157,7 @@ func.func @_QQmain() { // CHECK-LABEL: define void @_QQmain(){{.*}}{ // CHECK: %[[CLASS_NONE:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } // CHECK: %[[DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr @_QMmod1Ea, i64 ptrtoint (ptr getelementptr (%_QMmod1TtK2, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 1, i8 1, ptr @_QMmod1E.dt.t.2, [1 x i64] zeroinitializer }, ptr %[[CLASS_NONE]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr @_QMmod1Ea, i64 ptrtoint (ptr getelementptr (%_QMmod1TtK2, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 1, i8 1, ptr @_QMmod1EXdtXtX2, [1 x i64] zeroinitializer }, ptr %[[CLASS_NONE]], align 8 // CHECK: %[[LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[CLASS_NONE]] // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOAD]], ptr %[[DESC]] // CHECK: call void @_QMmod1Psub1(ptr %[[DESC]]) @@ -197,4 +197,4 @@ func.func @_QQembox_input_type(%arg0 : !fir.ref> { } // CHECK: @_QFfooEx = internal global { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } // CHECK-SAME: { ptr null, i64 ptrtoint (ptr getelementptr (%_QFfooTsometype, ptr null, i32 1) to i64), -// CHECK-SAME: i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QFfooE.dt.sometype, [1 x i64] zeroinitializer } +// CHECK-SAME: i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QFfooEXdtXsometype, [1 x i64] zeroinitializer } !some_pdt_type = !fir.type<_QFfooTsome_pdt_typeK42K43{num:i32,values:!fir.box>>}> fir.global internal @_QFfooE.dt.some_pdt_type.42.43 constant : i8 @@ -26,4 +26,4 @@ fir.global internal @_QFfooEx2 : !fir.box> { } // CHECK: @_QFfooEx2 = internal global { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } // 
CHECK-SAME: { ptr null, i64 ptrtoint (ptr getelementptr (%_QFfooTsome_pdt_typeK42K43, ptr null, i32 1) to i64), -// CHECK-SAME: i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QFfooE.dt.some_pdt_type.42.43, [1 x i64] zeroinitializer } +// CHECK-SAME: i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QFfooEXdtXsome_pdt_typeX42X43, [1 x i64] zeroinitializer } diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90 index 8fe06450d6119e..e23e38ffb4b013 100644 --- a/flang/test/Lower/allocatable-polymorphic.f90 +++ b/flang/test/Lower/allocatable-polymorphic.f90 @@ -591,16 +591,16 @@ program test_alloc ! LLVM-LABEL: define void @_QMpolyPtest_allocatable() -! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyE.dt.p1, i32 0, i32 0) +! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 0, i32 0) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) -! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyE.dt.p1, i32 0, i32 0) +! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 0, i32 0) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) -! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyE.dt.p2, i32 0, i32 0) +! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 0, i32 0) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) -! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyE.dt.p1, i32 1, i32 0) +! 
LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 1, i32 0) ! LLVM: %{{.*}} = call {} @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 10) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) -! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyE.dt.p2, i32 1, i32 0) +! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 1, i32 0) ! LLVM: %{{.*}} = call {} @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 20) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) ! LLVM-COUNT-2: call void %{{.*}}() @@ -685,9 +685,9 @@ program test_alloc ! allocatable. ! LLVM-LABEL: define void @_QMpolyPtest_deallocate() -! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 ptrtoint (ptr getelementptr (%_QMpolyTp1, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QMpolyE.dt.p1, [1 x i64] zeroinitializer }, ptr %[[ALLOCA1:[0-9]*]] +! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 ptrtoint (ptr getelementptr (%_QMpolyTp1, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QMpolyEXdtXp1, [1 x i64] zeroinitializer }, ptr %[[ALLOCA1:[0-9]*]] ! LLVM: %[[LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[ALLOCA1]] ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOAD]], ptr %[[ALLOCA2:[0-9]*]] -! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %[[ALLOCA2]], ptr @_QMpolyE.dt.p1, i32 0, i32 0) +! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %[[ALLOCA2]], ptr @_QMpolyEXdtXp1, i32 0, i32 0) ! 
LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %[[ALLOCA2]], i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableDeallocatePolymorphic(ptr %[[ALLOCA2]], ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) diff --git a/flang/test/Lower/dense-array-any-rank.f90 b/flang/test/Lower/dense-array-any-rank.f90 index 437fdec2da10ec..129adf41de07ff 100644 --- a/flang/test/Lower/dense-array-any-rank.f90 +++ b/flang/test/Lower/dense-array-any-rank.f90 @@ -14,12 +14,12 @@ subroutine test() ! a1 array constructor ! CHECK-FIR: fir.global internal @_QQro.10xi4.{{.*}}(dense<[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]> : tensor<10xi32>) constant : !fir.array<10xi32> -! CHECK-LLVMIR: @_QQro.10xi4.0 = internal constant [10 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10] +! CHECK-LLVMIR: @_QQroX10xi4X0 = internal constant [10 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10] ! a2 array constructor ! CHECK-FIR: fir.global internal @_QQro.3x4xi4.{{.*}}(dense<{{\[\[11, 12, 13], \[21, 22, 23], \[31, 32, 33], \[41, 42, 43]]}}> : tensor<4x3xi32>) constant : !fir.array<3x4xi32> -! CHECK-LLVMIR: @_QQro.3x4xi4.1 = internal constant [4 x [3 x i32]] {{\[\[3 x i32] \[i32 11, i32 12, i32 13], \[3 x i32] \[i32 21, i32 22, i32 23], \[3 x i32] \[i32 31, i32 32, i32 33], \[3 x i32] \[i32 41, i32 42, i32 43]]}} +! CHECK-LLVMIR: @_QQroX3x4xi4X1 = internal constant [4 x [3 x i32]] {{\[\[3 x i32] \[i32 11, i32 12, i32 13], \[3 x i32] \[i32 21, i32 22, i32 23], \[3 x i32] \[i32 31, i32 32, i32 33], \[3 x i32] \[i32 41, i32 42, i32 43]]}} ! a3 array constructor ! CHECK-FIR: fir.global internal @_QQro.2x3x4xi4.{{.*}}(dense<{{\[\[\[111, 112], \[121, 122], \[131, 132]], \[\[211, 212], \[221, 222], \[231, 232]], \[\[311, 312], \[321, 322], \[331, 332]], \[\[411, 412], \[421, 422], \[431, 432]]]}}> : tensor<4x3x2xi32>) constant : !fir.array<2x3x4xi32> -! 
CHECK-LLVMIR: @_QQro.2x3x4xi4.2 = internal constant [4 x [3 x [2 x i32]]] {{\[\[3 x \[2 x i32]] \[\[2 x i32] \[i32 111, i32 112], \[2 x i32] \[i32 121, i32 122], \[2 x i32] \[i32 131, i32 132]], \[3 x \[2 x i32]] \[\[2 x i32] \[i32 211, i32 212], \[2 x i32] \[i32 221, i32 222], \[2 x i32] \[i32 231, i32 232]], \[3 x \[2 x i32]] \[\[2 x i32] \[i32 311, i32 312], \[2 x i32] \[i32 321, i32 322], \[2 x i32] \[i32 331, i32 332]], \[3 x \[2 x i32]] \[\[2 x i32] \[i32 411, i32 412], \[2 x i32] \[i32 421, i32 422], \[2 x i32] \[i32 431, i32 432]]]}} +! CHECK-LLVMIR: @_QQroX2x3x4xi4X2 = internal constant [4 x [3 x [2 x i32]]] {{\[\[3 x \[2 x i32]] \[\[2 x i32] \[i32 111, i32 112], \[2 x i32] \[i32 121, i32 122], \[2 x i32] \[i32 131, i32 132]], \[3 x \[2 x i32]] \[\[2 x i32] \[i32 211, i32 212], \[2 x i32] \[i32 221, i32 222], \[2 x i32] \[i32 231, i32 232]], \[3 x \[2 x i32]] \[\[2 x i32] \[i32 311, i32 312], \[2 x i32] \[i32 321, i32 322], \[2 x i32] \[i32 331, i32 332]], \[3 x \[2 x i32]] \[\[2 x i32] \[i32 411, i32 412], \[2 x i32] \[i32 421, i32 422], \[2 x i32] \[i32 431, i32 432]]]}} From 30ca06c4d0d06f67f10a9e19d4333acc2074811b Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 21 Aug 2024 13:48:29 -0700 Subject: [PATCH 113/426] [lldb-dap] When sending a DAP Output Event break each message into separate lines. (#105456) Previously, when output like `"hello\nworld\n"` was produced by lldb (or the process) the message would be sent as a single Output event. By being a single event this causes VS Code to treat this as a single message in the console when handling displaying and filtering in the Debug Console. Instead, with these changes we send each line as its own event. This results in VS Code representing each line of output from lldb-dap as an individual output message. 
Resolves #105444 --- .../test/tools/lldb-dap/lldbdap_testcase.py | 5 +++ lldb/test/API/tools/lldb-dap/output/Makefile | 3 ++ .../tools/lldb-dap/output/TestDAP_output.py | 31 +++++++++++++++++++ lldb/test/API/tools/lldb-dap/output/main.c | 12 +++++++ lldb/tools/lldb-dap/DAP.cpp | 22 +++++++++---- lldb/tools/lldb-dap/DAP.h | 4 +++ lldb/tools/lldb-dap/OutputRedirector.cpp | 3 +- lldb/tools/lldb-dap/lldb-dap.cpp | 2 +- 8 files changed, 74 insertions(+), 8 deletions(-) create mode 100644 lldb/test/API/tools/lldb-dap/output/Makefile create mode 100644 lldb/test/API/tools/lldb-dap/output/TestDAP_output.py create mode 100644 lldb/test/API/tools/lldb-dap/output/main.c diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index 27545816f20707..86eba355da83db 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -202,6 +202,11 @@ def collect_console(self, timeout_secs, pattern=None): "console", timeout_secs=timeout_secs, pattern=pattern ) + def collect_stdout(self, timeout_secs, pattern=None): + return self.dap_server.collect_output( + "stdout", timeout_secs=timeout_secs, pattern=pattern + ) + def get_local_as_int(self, name, threadId=None): value = self.dap_server.get_local_variable_value(name, threadId=threadId) # 'value' may have the variable value and summary. 
diff --git a/lldb/test/API/tools/lldb-dap/output/Makefile b/lldb/test/API/tools/lldb-dap/output/Makefile new file mode 100644 index 00000000000000..10495940055b63 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/output/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py new file mode 100644 index 00000000000000..0d40ce993dc31c --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py @@ -0,0 +1,31 @@ +""" +Test lldb-dap output events +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +import lldbdap_testcase + + +class TestDAP_output(lldbdap_testcase.DAPTestCaseBase): + def test_output(self): + program = self.getBuildArtifact("a.out") + self.build_and_launch(program) + source = "main.c" + lines = [line_number(source, "// breakpoint 1")] + breakpoint_ids = self.set_source_breakpoints(source, lines) + self.continue_to_breakpoints(breakpoint_ids) + + # Ensure partial messages are still sent. + output = self.collect_stdout(timeout_secs=1.0, pattern="abcdef") + self.assertTrue(output and len(output) > 0, "expect no program output") + + self.continue_to_exit() + + output += self.get_stdout(timeout=lldbdap_testcase.DAPTestCaseBase.timeoutval) + self.assertTrue(output and len(output) > 0, "expect no program output") + self.assertIn( + "abcdefghi\r\nhello world\r\n", + output, + 'full output not found in: ' + output, + ) diff --git a/lldb/test/API/tools/lldb-dap/output/main.c b/lldb/test/API/tools/lldb-dap/output/main.c new file mode 100644 index 00000000000000..0cfcf604aa68f7 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/output/main.c @@ -0,0 +1,12 @@ +#include +#include +#include + +int main() { + // Ensure multiple partial lines are detected and sent. 
+ printf("abc"); + printf("def"); + printf("ghi\n"); + printf("hello world\n"); // breakpoint 1 + return 0; +} diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index c3c70e9d739846..1fd560f21904ab 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -294,8 +294,6 @@ void DAP::SendOutput(OutputType o, const llvm::StringRef output) { if (output.empty()) return; - llvm::json::Object event(CreateEventObject("output")); - llvm::json::Object body; const char *category = nullptr; switch (o) { case OutputType::Console: @@ -311,10 +309,22 @@ void DAP::SendOutput(OutputType o, const llvm::StringRef output) { category = "telemetry"; break; } - body.try_emplace("category", category); - EmplaceSafeString(body, "output", output.str()); - event.try_emplace("body", std::move(body)); - SendJSON(llvm::json::Value(std::move(event))); + + // Send each line of output as an individual event, including the newline if + // present. + ::size_t idx = 0; + do { + ::size_t end = output.find('\n', idx); + if (end == llvm::StringRef::npos) + end = output.size() - 1; + llvm::json::Object event(CreateEventObject("output")); + llvm::json::Object body; + body.try_emplace("category", category); + EmplaceSafeString(body, "output", output.slice(idx, end + 1).str()); + event.try_emplace("body", std::move(body)); + SendJSON(llvm::json::Value(std::move(event))); + idx = end + 1; + } while (idx < output.size()); } // interface ProgressStartEvent extends Event { diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index 7828272aa15a7d..27ea6c7ff8423f 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -68,8 +68,12 @@ namespace lldb_dap { typedef llvm::DenseMap SourceBreakpointMap; typedef llvm::StringMap FunctionBreakpointMap; + enum class OutputType { Console, Stdout, Stderr, Telemetry }; +/// Buffer size for handling output events. 
+constexpr uint64_t OutputBufferSize = (1u << 12); + enum DAPBroadcasterBits { eBroadcastBitStopEventThread = 1u << 0, eBroadcastBitStopProgressThread = 1u << 1 diff --git a/lldb/tools/lldb-dap/OutputRedirector.cpp b/lldb/tools/lldb-dap/OutputRedirector.cpp index 4e6907ce6c7806..2c2f49569869b4 100644 --- a/lldb/tools/lldb-dap/OutputRedirector.cpp +++ b/lldb/tools/lldb-dap/OutputRedirector.cpp @@ -13,6 +13,7 @@ #include #endif +#include "DAP.h" #include "OutputRedirector.h" #include "llvm/ADT/StringRef.h" @@ -42,7 +43,7 @@ Error RedirectFd(int fd, std::function callback) { int read_fd = new_fd[0]; std::thread t([read_fd, callback]() { - char buffer[4096]; + char buffer[OutputBufferSize]; while (true) { ssize_t bytes_count = read(read_fd, &buffer, sizeof(buffer)); if (bytes_count == 0) diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index b534a48660a5f8..7b83767d1afeab 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -399,7 +399,7 @@ void SendProcessEvent(LaunchMethod launch_method) { // Grab any STDOUT and STDERR from the process and send it up to VS Code // via an "output" event to the "stdout" and "stderr" categories. void SendStdOutStdErr(lldb::SBProcess &process) { - char buffer[1024]; + char buffer[OutputBufferSize]; size_t count; while ((count = process.GetSTDOUT(buffer, sizeof(buffer))) > 0) g_dap.SendOutput(OutputType::Stdout, llvm::StringRef(buffer, count)); From 46c94bed5af48f3785c3370a9297ea29d7918cd5 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 21 Aug 2024 16:49:41 -0400 Subject: [PATCH 114/426] [libc++] Mark LWG3404 as implemented LWG3404 was implemented along with subrange. 
Closes #104282 --- libcxx/docs/Status/Cxx20Issues.csv | 2 +- libcxx/docs/Status/Cxx23Issues.csv | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index d72a3682420620..d6fdc813b1f0de 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -218,7 +218,7 @@ "`LWG3269 `__","Parse manipulators do not specify the result of the extraction from stream","2020-02 (Prague)","","","|chrono|" "`LWG3270 `__","Parsing and formatting ``%j``\ with ``duration``\ s","2020-02 (Prague)","|Partial|","","|chrono| |format|" "`LWG3280 `__","View converting constructors can cause constraint recursion and are unneeded","2020-02 (Prague)","|Complete|","15.0","|ranges|" -"`LWG3281 `__","Conversion from ``*pair-like*``\ types to ``subrange``\ is a silent semantic promotion","2020-02 (Prague)","|Complete|","15.0","|ranges|" +"`LWG3281 `__","Conversion from ``*pair-like*``\ types to ``subrange``\ is a silent semantic promotion","2020-02 (Prague)","|Complete|","13.0","|ranges|" "`LWG3282 `__","``subrange``\ converting constructor should disallow derived to base conversions","2020-02 (Prague)","|Complete|","15.0","|ranges|" "`LWG3284 `__","``random_access_iterator``\ semantic constraints accidentally promote difference type using unary negate","2020-02 (Prague)","|Nothing To Do|","","|ranges|" "`LWG3285 `__","The type of a customization point object shall satisfy ``semiregular``\ ","2020-02 (Prague)","|Nothing To Do|","","|ranges|" diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv index a0a9ccdca48c3c..8cb0a46b4dd25e 100644 --- a/libcxx/docs/Status/Cxx23Issues.csv +++ b/libcxx/docs/Status/Cxx23Issues.csv @@ -20,7 +20,7 @@ "`LWG3171 `__","LWG2989 breaks ``directory_entry`` stream insertion","2020-11 (Virtual)","|Complete|","14.0","" "`LWG3306 `__","``ranges::advance`` violates its preconditions","2020-11 
(Virtual)","|Complete|","14.0","|ranges|" "`LWG3403 `__","Domain of ``ranges::ssize(E)`` doesn't ``match ranges::size(E)``","2020-11 (Virtual)","","","|ranges|" -"`LWG3404 `__","Finish removing subrange's conversions from pair-like","2020-11 (Virtual)","","","|ranges|" +"`LWG3404 `__","Finish removing subrange's conversions from pair-like","2020-11 (Virtual)","|Complete|","13.0","|ranges|" "`LWG3405 `__","``common_view``'s converting constructor is bad, too","2020-11 (Virtual)","|Complete|","14.0","|ranges|" "`LWG3406 `__","``elements_view::begin()`` and ``elements_view::end()`` have incompatible constraints","2020-11 (Virtual)","|Complete|","16.0","|ranges|" "`LWG3419 `__","[algorithms.requirements]/15 doesn't reserve as many rights as it intends to","2020-11 (Virtual)","|Nothing To Do|","","" From ab86fc74c04ff508f909b7b6131df1551dd833fc Mon Sep 17 00:00:00 2001 From: Jonas Rickert Date: Wed, 21 Aug 2024 23:18:21 +0200 Subject: [PATCH 115/426] [mlir] Add nodiscard attribute to allowsUnregisteredDialects (#105530) This getter can easily be confused with the similar named allowUnregisteredDialects setter --- mlir/include/mlir/IR/MLIRContext.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/IR/MLIRContext.h b/mlir/include/mlir/IR/MLIRContext.h index 11e5329f43e681..d17bbac81655b5 100644 --- a/mlir/include/mlir/IR/MLIRContext.h +++ b/mlir/include/mlir/IR/MLIRContext.h @@ -133,7 +133,7 @@ class MLIRContext { Dialect *getOrLoadDialect(StringRef name); /// Return true if we allow to create operation for unregistered dialects. - bool allowsUnregisteredDialects(); + [[nodiscard]] bool allowsUnregisteredDialects(); /// Enables creating operations in unregistered dialects. 
/// This option is **heavily discouraged**: it is convenient during testing From f709cd5add0ea36bb14259e9716bd74e5c762128 Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Wed, 21 Aug 2024 23:49:45 +0200 Subject: [PATCH 116/426] Revert "[Coroutines] Salvage the debug information for coroutine frames within optimizations" This reverts commit 522c253f47ea27d8eeb759e06f8749092b1de71e. This series of commits causes Clang crashes. The reproducer is posted on https://github.com/llvm/llvm-project/commit/08a0dece2b2431db8abe650bb43cba01e781e1ce. --- .../test/CodeGenCoroutines/coro-dwarf-O2.cpp | 39 ------------------- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 31 ++++++++------- llvm/lib/Transforms/Coroutines/CoroInternal.h | 8 ++-- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 12 ++++-- .../Transforms/Coroutines/coro-debug-O2.ll | 6 +-- 5 files changed, 31 insertions(+), 65 deletions(-) delete mode 100644 clang/test/CodeGenCoroutines/coro-dwarf-O2.cpp diff --git a/clang/test/CodeGenCoroutines/coro-dwarf-O2.cpp b/clang/test/CodeGenCoroutines/coro-dwarf-O2.cpp deleted file mode 100644 index 53f4a07982e427..00000000000000 --- a/clang/test/CodeGenCoroutines/coro-dwarf-O2.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// Check that we can still observe the value of the coroutine frame -// with optimizations. 
-// -// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 \ -// RUN: -emit-llvm %s -debug-info-kind=limited -dwarf-version=5 \ -// RUN: -O2 -o - | FileCheck %s - -#include "Inputs/coroutine.h" - -template <> -struct std::coroutine_traits { - struct promise_type { - void get_return_object(); - std::suspend_always initial_suspend(); - std::suspend_always final_suspend() noexcept; - void return_void(); - void unhandled_exception(); - }; -}; - -struct ScalarAwaiter { - template void await_suspend(F); - bool await_ready(); - int await_resume(); -}; - -extern "C" void UseScalar(int); - -extern "C" void f() { - UseScalar(co_await ScalarAwaiter{}); - - int Val = co_await ScalarAwaiter{}; - - co_await ScalarAwaiter{}; -} - -// CHECK: define {{.*}}@f.resume({{.*}} %[[ARG:.*]]) -// CHECK: #dbg_value(ptr %[[ARG]], ![[CORO_NUM:[0-9]+]], !DIExpression(DW_OP_deref) -// CHECK: ![[CORO_NUM]] = !DILocalVariable(name: "__coro_frame" diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 00f49b7bdce294..fa04735340406d 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -1914,7 +1914,8 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { } // This dbg.declare is for the main function entry point. It // will be deleted in all coro-split functions. 
- coro::salvageDebugInfo(ArgToAllocaMap, *DDI, false /*UseEntryValue*/); + coro::salvageDebugInfo(ArgToAllocaMap, *DDI, Shape.OptimizeFrame, + false /*UseEntryValue*/); }; for_each(DIs, SalvageOne); for_each(DVRs, SalvageOne); @@ -2851,8 +2852,9 @@ static void collectFrameAlloca(AllocaInst *AI, coro::Shape &Shape, static std::optional> salvageDebugInfoImpl(SmallDenseMap &ArgToAllocaMap, - bool UseEntryValue, Function *F, Value *Storage, - DIExpression *Expr, bool SkipOutermostLoad) { + bool OptimizeFrame, bool UseEntryValue, Function *F, + Value *Storage, DIExpression *Expr, + bool SkipOutermostLoad) { IRBuilder<> Builder(F->getContext()); auto InsertPt = F->getEntryBlock().getFirstInsertionPt(); while (isa(InsertPt)) @@ -2904,9 +2906,10 @@ salvageDebugInfoImpl(SmallDenseMap &ArgToAllocaMap, // If the coroutine frame is an Argument, store it in an alloca to improve // its availability (e.g. registers may be clobbered). - // Avoid this if the value is guaranteed to be available through other means - // (e.g. swift ABI guarantees). - if (StorageAsArg && !IsSwiftAsyncArg) { + // Avoid this if optimizations are enabled (they would remove the alloca) or + // if the value is guaranteed to be available through other means (e.g. swift + // ABI guarantees). 
+ if (StorageAsArg && !OptimizeFrame && !IsSwiftAsyncArg) { auto &Cached = ArgToAllocaMap[StorageAsArg]; if (!Cached) { Cached = Builder.CreateAlloca(Storage->getType(), 0, nullptr, @@ -2929,7 +2932,7 @@ salvageDebugInfoImpl(SmallDenseMap &ArgToAllocaMap, void coro::salvageDebugInfo( SmallDenseMap &ArgToAllocaMap, - DbgVariableIntrinsic &DVI, bool UseEntryValue) { + DbgVariableIntrinsic &DVI, bool OptimizeFrame, bool UseEntryValue) { Function *F = DVI.getFunction(); // Follow the pointer arithmetic all the way to the incoming @@ -2937,9 +2940,9 @@ void coro::salvageDebugInfo( bool SkipOutermostLoad = !isa(DVI); Value *OriginalStorage = DVI.getVariableLocationOp(0); - auto SalvagedInfo = - ::salvageDebugInfoImpl(ArgToAllocaMap, UseEntryValue, F, OriginalStorage, - DVI.getExpression(), SkipOutermostLoad); + auto SalvagedInfo = ::salvageDebugInfoImpl( + ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage, + DVI.getExpression(), SkipOutermostLoad); if (!SalvagedInfo) return; @@ -2971,7 +2974,7 @@ void coro::salvageDebugInfo( void coro::salvageDebugInfo( SmallDenseMap &ArgToAllocaMap, - DbgVariableRecord &DVR, bool UseEntryValue) { + DbgVariableRecord &DVR, bool OptimizeFrame, bool UseEntryValue) { Function *F = DVR.getFunction(); // Follow the pointer arithmetic all the way to the incoming @@ -2979,9 +2982,9 @@ void coro::salvageDebugInfo( bool SkipOutermostLoad = DVR.isDbgDeclare(); Value *OriginalStorage = DVR.getVariableLocationOp(0); - auto SalvagedInfo = - ::salvageDebugInfoImpl(ArgToAllocaMap, UseEntryValue, F, OriginalStorage, - DVR.getExpression(), SkipOutermostLoad); + auto SalvagedInfo = ::salvageDebugInfoImpl( + ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage, + DVR.getExpression(), SkipOutermostLoad); if (!SalvagedInfo) return; diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h index d535ad7f85d74a..5716fd0ea4ab96 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h 
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -29,14 +29,14 @@ void replaceCoroFree(CoroIdInst *CoroId, bool Elide); /// Attempts to rewrite the location operand of debug intrinsics in terms of /// the coroutine frame pointer, folding pointer offsets into the DIExpression /// of the intrinsic. -/// If the frame pointer is an Argument, store it into an alloca to enhance the -/// debugability. +/// If the frame pointer is an Argument, store it into an alloca if +/// OptimizeFrame is false. void salvageDebugInfo( SmallDenseMap &ArgToAllocaMap, - DbgVariableIntrinsic &DVI, bool IsEntryPoint); + DbgVariableIntrinsic &DVI, bool OptimizeFrame, bool IsEntryPoint); void salvageDebugInfo( SmallDenseMap &ArgToAllocaMap, - DbgVariableRecord &DVR, bool UseEntryValue); + DbgVariableRecord &DVR, bool OptimizeFrame, bool UseEntryValue); // Keeps data and helper functions for lowering coroutine intrinsics. struct LowererBase { diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 40bc932c3e0eef..8eceaef59a1e1f 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -735,9 +735,11 @@ void CoroCloner::salvageDebugInfo() { bool UseEntryValue = llvm::Triple(OrigF.getParent()->getTargetTriple()).isArch64Bit(); for (DbgVariableIntrinsic *DVI : Worklist) - coro::salvageDebugInfo(ArgToAllocaMap, *DVI, UseEntryValue); + coro::salvageDebugInfo(ArgToAllocaMap, *DVI, Shape.OptimizeFrame, + UseEntryValue); for (DbgVariableRecord *DVR : DbgVariableRecords) - coro::salvageDebugInfo(ArgToAllocaMap, *DVR, UseEntryValue); + coro::salvageDebugInfo(ArgToAllocaMap, *DVR, Shape.OptimizeFrame, + UseEntryValue); // Remove all salvaged dbg.declare intrinsics that became // either unreachable or stale due to the CoroSplit transformation. 
@@ -1960,9 +1962,11 @@ splitCoroutine(Function &F, SmallVectorImpl &Clones, SmallDenseMap ArgToAllocaMap; auto [DbgInsts, DbgVariableRecords] = collectDbgVariableIntrinsics(F); for (auto *DDI : DbgInsts) - coro::salvageDebugInfo(ArgToAllocaMap, *DDI, false /*UseEntryValue*/); + coro::salvageDebugInfo(ArgToAllocaMap, *DDI, Shape.OptimizeFrame, + false /*UseEntryValue*/); for (DbgVariableRecord *DVR : DbgVariableRecords) - coro::salvageDebugInfo(ArgToAllocaMap, *DVR, false /*UseEntryValue*/); + coro::salvageDebugInfo(ArgToAllocaMap, *DVR, Shape.OptimizeFrame, + false /*UseEntryValue*/); return Shape; } diff --git a/llvm/test/Transforms/Coroutines/coro-debug-O2.ll b/llvm/test/Transforms/Coroutines/coro-debug-O2.ll index 588f47959cc5d5..7ffa2ac153c853 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug-O2.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-O2.ll @@ -1,14 +1,12 @@ ; RUN: opt < %s -passes='module(coro-early),cgscc(coro-split),function(sroa)' -S | FileCheck %s ; RUN: opt --try-experimental-debuginfo-iterators < %s -passes='module(coro-early),cgscc(coro-split),function(sroa)' -S | FileCheck %s -; Checks the dbg informations about promise and coroutine frames under O2. +; Checks whether the dbg.declare for `__promise` remains valid under O2. 
; CHECK-LABEL: define internal fastcc void @f.resume({{.*}}) ; CHECK: entry.resume: -; CHECK: #dbg_value(ptr poison, ![[PROMISEVAR_RESUME:[0-9]+]], !DIExpression(DW_OP_deref, DW_OP_plus_uconst, 16 -; CHECK: #dbg_value(ptr %begin, ![[CORO_FRAME:[0-9]+]], !DIExpression(DW_OP_deref) +; CHECK: #dbg_declare(ptr %begin, ![[PROMISEVAR_RESUME:[0-9]+]], !DIExpression( ; -; CHECK: ![[CORO_FRAME]] = !DILocalVariable(name: "__coro_frame" ; CHECK: ![[PROMISEVAR_RESUME]] = !DILocalVariable(name: "__promise" %promise_type = type { i32, i32, double } From dc12ccd13f98a3f3ec4af07e60f6fe1344965e17 Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Wed, 21 Aug 2024 23:50:19 +0200 Subject: [PATCH 117/426] Revert "[Coroutines] Fix -Wunused-variable in CoroFrame.cpp (NFC)" This reverts commit d48b807aa8abd1cbfe8ac5d1ba27b8b3617fc5e6. This series of commits causes Clang crashes. The reproducer is posted on https://github.com/llvm/llvm-project/commit/08a0dece2b2431db8abe650bb43cba01e781e1ce --- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index fa04735340406d..e0e4edd2800b29 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -1121,7 +1121,8 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, DIBuilder DBuilder(*F.getParent(), /*AllowUnresolved*/ false); - assert(Shape.getPromiseAlloca() && + AllocaInst *PromiseAlloca = Shape.getPromiseAlloca(); + assert(PromiseAlloca && "Coroutine with switch ABI should own Promise alloca"); DIFile *DFile = DIS->getFile(); From 5c7ae42c526b21acf65ab4b017d0a5fd4ac654a1 Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Wed, 21 Aug 2024 23:50:46 +0200 Subject: [PATCH 118/426] Revert "[Coroutines] [NFCI] Don't search the DILocalVariable for __promise when constructing the debug varaible for __coro_frame" This reverts 
commit 08a0dece2b2431db8abe650bb43cba01e781e1ce. This series of commits causes Clang crashes. The reproducer is posted on https://github.com/llvm/llvm-project/commit/08a0dece2b2431db8abe650bb43cba01e781e1ce. --- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 48 ++++++++++++------- .../Coroutines/coro-debug-coro-frame.ll | 16 +++---- .../Coroutines/coro-debug-dbg.values.ll | 2 - .../Coroutines/coro-debug-frame-variable.ll | 2 - 4 files changed, 40 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index e0e4edd2800b29..73e30ea00a0e29 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -1125,8 +1125,26 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, assert(PromiseAlloca && "Coroutine with switch ABI should own Promise alloca"); - DIFile *DFile = DIS->getFile(); - unsigned LineNum = DIS->getLine(); + TinyPtrVector DIs = findDbgDeclares(PromiseAlloca); + TinyPtrVector DVRs = findDVRDeclares(PromiseAlloca); + + DILocalVariable *PromiseDIVariable = nullptr; + DILocation *DILoc = nullptr; + if (!DIs.empty()) { + DbgDeclareInst *PromiseDDI = DIs.front(); + PromiseDIVariable = PromiseDDI->getVariable(); + DILoc = PromiseDDI->getDebugLoc().get(); + } else if (!DVRs.empty()) { + DbgVariableRecord *PromiseDVR = DVRs.front(); + PromiseDIVariable = PromiseDVR->getVariable(); + DILoc = PromiseDVR->getDebugLoc().get(); + } else { + return; + } + + DILocalScope *PromiseDIScope = PromiseDIVariable->getScope(); + DIFile *DFile = PromiseDIScope->getFile(); + unsigned LineNum = PromiseDIVariable->getLine(); DICompositeType *FrameDITy = DBuilder.createStructType( DIS->getUnit(), Twine(F.getName() + ".coro_frame_ty").str(), @@ -1236,9 +1254,10 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, DBuilder.replaceArrays(FrameDITy, DBuilder.getOrCreateArray(Elements)); - auto *FrameDIVar = - 
DBuilder.createAutoVariable(DIS, "__coro_frame", DFile, LineNum, - FrameDITy, true, DINode::FlagArtificial); + auto *FrameDIVar = DBuilder.createAutoVariable(PromiseDIScope, "__coro_frame", + DFile, LineNum, FrameDITy, + true, DINode::FlagArtificial); + assert(FrameDIVar->isValidLocationForIntrinsic(DILoc)); // Subprogram would have ContainedNodes field which records the debug // variables it contained. So we need to add __coro_frame to the @@ -1247,17 +1266,14 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, // If we don't add __coro_frame to the RetainedNodes, user may get // `no symbol __coro_frame in context` rather than `__coro_frame` // is optimized out, which is more precise. - auto RetainedNodes = DIS->getRetainedNodes(); - SmallVector RetainedNodesVec(RetainedNodes.begin(), - RetainedNodes.end()); - RetainedNodesVec.push_back(FrameDIVar); - DIS->replaceOperandWith(7, (MDTuple::get(F.getContext(), RetainedNodesVec))); - - // Construct the location for the frame debug variable. The column number - // is fake but it should be fine. 
- DILocation *DILoc = - DILocation::get(DIS->getContext(), LineNum, /*Column=*/1, DIS); - assert(FrameDIVar->isValidLocationForIntrinsic(DILoc)); + if (auto *SubProgram = dyn_cast(PromiseDIScope)) { + auto RetainedNodes = SubProgram->getRetainedNodes(); + SmallVector RetainedNodesVec(RetainedNodes.begin(), + RetainedNodes.end()); + RetainedNodesVec.push_back(FrameDIVar); + SubProgram->replaceOperandWith( + 7, (MDTuple::get(F.getContext(), RetainedNodesVec))); + } if (UseNewDbgInfoFormat) { DbgVariableRecord *NewDVR = diff --git a/llvm/test/Transforms/Coroutines/coro-debug-coro-frame.ll b/llvm/test/Transforms/Coroutines/coro-debug-coro-frame.ll index 1d668fd0222f77..8e5c4ab52e78eb 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug-coro-frame.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-coro-frame.ll @@ -15,7 +15,8 @@ ; ; CHECK-DAG: ![[FILE:[0-9]+]] = !DIFile(filename: "coro-debug.cpp" ; CHECK-DAG: ![[RAMP:[0-9]+]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", -; CHECK-DAG: ![[CORO_FRAME]] = !DILocalVariable(name: "__coro_frame", scope: ![[RAMP]], file: ![[FILE]], line: [[CORO_FRAME_LINE:[0-9]+]], type: ![[FRAME_TYPE:[0-9]+]], flags: DIFlagArtificial) +; CHECK-DAG: ![[RAMP_SCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[RAMP]], file: ![[FILE]], line: 23 +; CHECK-DAG: ![[CORO_FRAME]] = !DILocalVariable(name: "__coro_frame", scope: ![[RAMP_SCOPE]], file: ![[FILE]], line: [[PROMISE_VAR_LINE:[0-9]+]], type: ![[FRAME_TYPE:[0-9]+]], flags: DIFlagArtificial) ; CHECK-DAG: ![[FRAME_TYPE]] = !DICompositeType(tag: DW_TAG_structure_type, name: "f.coro_frame_ty", {{.*}}elements: ![[ELEMENTS:[0-9]+]] ; CHECK-DAG: ![[ELEMENTS]] = !{![[RESUME_FN:[0-9]+]], ![[DESTROY_FN:[0-9]+]], ![[PROMISE:[0-9]+]], ![[VECTOR_TYPE:[0-9]+]], ![[INT64_0:[0-9]+]], ![[DOUBLE_1:[0-9]+]], ![[INT64_PTR:[0-9]+]], ![[INT32_2:[0-9]+]], ![[INT32_3:[0-9]+]], ![[UNALIGNED_UNKNOWN:[0-9]+]], ![[STRUCT:[0-9]+]], ![[CORO_INDEX:[0-9]+]], ![[SMALL_UNKNOWN:[0-9]+]] ; CHECK-DAG: 
![[RESUME_FN]] = !DIDerivedType(tag: DW_TAG_member, name: "__resume_fn"{{.*}}, baseType: ![[RESUME_FN_TYPE:[0-9]+]]{{.*}}, flags: DIFlagArtificial @@ -28,26 +29,25 @@ ; CHECK-DAG: ![[UNKNOWN_TYPE_BASE]] = !DIBasicType(name: "UnknownType", size: 8, encoding: DW_ATE_unsigned_char, flags: DIFlagArtificial) ; CHECK-DAG: ![[VECTOR_TYPE_BASE_ELEMENTS]] = !{![[VECTOR_TYPE_BASE_SUBRANGE:[0-9]+]]} ; CHECK-DAG: ![[VECTOR_TYPE_BASE_SUBRANGE]] = !DISubrange(count: 16, lowerBound: 0) -; CHECK-DAG: ![[INT64_0]] = !DIDerivedType(tag: DW_TAG_member, name: "__int_64_1", scope: ![[FRAME_TYPE]], file: ![[FILE]], line: [[CORO_FRAME_LINE]], baseType: ![[I64_BASE:[0-9]+]],{{.*}}, flags: DIFlagArtificial +; CHECK-DAG: ![[INT64_0]] = !DIDerivedType(tag: DW_TAG_member, name: "__int_64_1", scope: ![[FRAME_TYPE]], file: ![[FILE]], line: [[PROMISE_VAR_LINE]], baseType: ![[I64_BASE:[0-9]+]],{{.*}}, flags: DIFlagArtificial ; CHECK-DAG: ![[I64_BASE]] = !DIBasicType(name: "__int_64", size: 64, encoding: DW_ATE_signed, flags: DIFlagArtificial) -; CHECK-DAG: ![[DOUBLE_1]] = !DIDerivedType(tag: DW_TAG_member, name: "__double__2", scope: ![[FRAME_TYPE]], file: ![[FILE]], line: [[CORO_FRAME_LINE]], baseType: ![[DOUBLE_BASE:[0-9]+]]{{.*}}, flags: DIFlagArtificial +; CHECK-DAG: ![[DOUBLE_1]] = !DIDerivedType(tag: DW_TAG_member, name: "__double__2", scope: ![[FRAME_TYPE]], file: ![[FILE]], line: [[PROMISE_VAR_LINE]], baseType: ![[DOUBLE_BASE:[0-9]+]]{{.*}}, flags: DIFlagArtificial ; CHECK-DAG: ![[DOUBLE_BASE]] = !DIBasicType(name: "__double_", size: 64, encoding: DW_ATE_float, flags: DIFlagArtificial) -; CHECK-DAG: ![[INT32_2]] = !DIDerivedType(tag: DW_TAG_member, name: "__int_32_4", scope: ![[FRAME_TYPE]], file: ![[FILE]], line: [[CORO_FRAME_LINE]], baseType: ![[I32_BASE:[0-9]+]]{{.*}}, flags: DIFlagArtificial +; CHECK-DAG: ![[INT32_2]] = !DIDerivedType(tag: DW_TAG_member, name: "__int_32_4", scope: ![[FRAME_TYPE]], file: ![[FILE]], line: [[PROMISE_VAR_LINE]], baseType: ![[I32_BASE:[0-9]+]]{{.*}}, 
flags: DIFlagArtificial ; CHECK-DAG: ![[I32_BASE]] = !DIBasicType(name: "__int_32", size: 32, encoding: DW_ATE_signed, flags: DIFlagArtificial) -; CHECK-DAG: ![[INT32_3]] = !DIDerivedType(tag: DW_TAG_member, name: "__int_32_5", scope: ![[FRAME_TYPE]], file: ![[FILE]], line: [[CORO_FRAME_LINE]], baseType: ![[I32_BASE]] +; CHECK-DAG: ![[INT32_3]] = !DIDerivedType(tag: DW_TAG_member, name: "__int_32_5", scope: ![[FRAME_TYPE]], file: ![[FILE]], line: [[PROMISE_VAR_LINE]], baseType: ![[I32_BASE]] ; CHECK-DAG: ![[UNALIGNED_UNKNOWN]] = !DIDerivedType(tag: DW_TAG_member, name: "_6",{{.*}}baseType: ![[UNALIGNED_UNKNOWN_BASE:[0-9]+]], size: 9 ; CHECK-DAG: ![[UNALIGNED_UNKNOWN_BASE]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[UNKNOWN_TYPE_BASE]], size: 16,{{.*}} elements: ![[UNALIGNED_UNKNOWN_ELEMENTS:[0-9]+]]) ; CHECK-DAG: ![[UNALIGNED_UNKNOWN_ELEMENTS]] = !{![[UNALIGNED_UNKNOWN_SUBRANGE:[0-9]+]]} ; CHECk-DAG: ![[UNALIGNED_UNKNOWN_SUBRANGE]] = !DISubrange(count: 2, lowerBound: 0) -; CHECK-DAG: ![[STRUCT]] = !DIDerivedType(tag: DW_TAG_member, name: "struct_big_structure_7", scope: ![[FRAME_TYPE]], file: ![[FILE]], line: [[CORO_FRAME_LINE]], baseType: ![[STRUCT_BASE:[0-9]+]] +; CHECK-DAG: ![[STRUCT]] = !DIDerivedType(tag: DW_TAG_member, name: "struct_big_structure_7", scope: ![[FRAME_TYPE]], file: ![[FILE]], line: [[PROMISE_VAR_LINE]], baseType: ![[STRUCT_BASE:[0-9]+]] ; CHECK-DAG: ![[STRUCT_BASE]] = !DICompositeType(tag: DW_TAG_structure_type, name: "struct_big_structure"{{.*}}, align: 64, flags: DIFlagArtificial, elements: ![[STRUCT_ELEMENTS:[0-9]+]] ; CHECK-DAG: ![[STRUCT_ELEMENTS]] = !{![[MEM_TYPE:[0-9]+]]} ; CHECK-DAG: ![[MEM_TYPE]] = !DIDerivedType(tag: DW_TAG_member,{{.*}} baseType: ![[MEM_TYPE_BASE:[0-9]+]], size: 4000 ; CHECK-DAG: ![[MEM_TYPE_BASE]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[UNKNOWN_TYPE_BASE]], size: 4000, ; CHECK-DAG: ![[CORO_INDEX]] = !DIDerivedType(tag: DW_TAG_member, name: "__coro_index" ; CHECK-DAG: ![[SMALL_UNKNOWN]] = 
!DIDerivedType(tag: DW_TAG_member, name: "UnknownType_8",{{.*}} baseType: ![[UNKNOWN_TYPE_BASE]], size: 5 -; CHECK-DAG: ![[PROMISE_VAR:[0-9]+]] = !DILocalVariable(name: "__promise", scope: ![[RAMP_SCOPE:[0-9]+]], file: ![[FILE]] -; CHECK-DAG: ![[RAMP_SCOPE]] = distinct !DILexicalBlock(scope: ![[RAMP]], file: ![[FILE]], line: 23 +; CHECK-DAG: ![[PROMISE_VAR:[0-9]+]] = !DILocalVariable(name: "__promise", scope: ![[RAMP_SCOPE]], file: ![[FILE]], line: [[PROMISE_VAR_LINE]] ; CHECK-DAG: ![[BAR_FUNC:[0-9]+]] = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", ; CHECK-DAG: ![[BAR_SCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[BAR_FUNC]], file: !1 ; CHECK-DAG: ![[FRAME_TYPE_IN_BAR:[0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "bar.coro_frame_ty", file: ![[FILE]], line: [[BAR_LINE:[0-9]+]]{{.*}}elements: ![[ELEMENTS_IN_BAR:[0-9]+]] diff --git a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll index 28f5841bb20af7..0b3acc30a1eee0 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll @@ -25,7 +25,6 @@ ; CHECK-SAME: ptr {{.*}} %[[frame:.*]]) ; CHECK-SAME: !dbg ![[RESUME_FN_DBG_NUM:[0-9]+]] ; CHECK: %[[frame_alloca:.*]] = alloca ptr -; CHECK-NEXT: #dbg_declare(ptr %begin.debug, ![[FRAME_DI_NUM:[0-9]+]], ; CHECK-NEXT: store ptr %[[frame]], ptr %[[frame_alloca]] ; CHECK: init.ready: ; CHECK: #dbg_value(ptr %[[frame_alloca]], ![[XVAR_RESUME:[0-9]+]], @@ -39,7 +38,6 @@ ; CHECK-SAME: !DIExpression(DW_OP_deref, DW_OP_plus_uconst, [[OffsetJ]], DW_OP_deref) ; ; CHECK: ![[RESUME_FN_DBG_NUM]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov" -; CHECK: ![[FRAME_DI_NUM]] = !DILocalVariable(name: "__coro_frame" ; CHECK: ![[IVAR_RESUME]] = !DILocalVariable(name: "i" ; CHECK: ![[XVAR_RESUME]] = !DILocalVariable(name: "x" ; CHECK: ![[JVAR_RESUME]] = !DILocalVariable(name: "j" diff --git 
a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll index 93b22081cf12f6..4f5cdcf15618c7 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll @@ -42,14 +42,12 @@ ; CHECK-NEXT: %[[DBG_PTR:.*]] = alloca ptr ; CHECK-NEXT: #dbg_declare(ptr %[[DBG_PTR]], ![[XVAR_RESUME:[0-9]+]], !DIExpression(DW_OP_deref, DW_OP_plus_uconst, 32), ; CHECK-NEXT: #dbg_declare(ptr %[[DBG_PTR]], ![[IVAR_RESUME:[0-9]+]], !DIExpression(DW_OP_deref, DW_OP_plus_uconst, 20), ![[IDBGLOC_RESUME:[0-9]+]] -; CHECK-NEXT: #dbg_declare(ptr %[[DBG_PTR]], ![[FRAME_RESUME:[0-9]+]], !DIExpression(DW_OP_deref), ; CHECK-NEXT: store ptr {{.*}}, ptr %[[DBG_PTR]] ; CHECK: %[[J:.*]] = alloca i32, align 4 ; CHECK-NEXT: #dbg_declare(ptr %[[J]], ![[JVAR_RESUME:[0-9]+]], !DIExpression(), ![[JDBGLOC_RESUME:[0-9]+]] ; CHECK: init.ready: ; CHECK: await.ready: ; -; CHECK-DAG: ![[FRAME_RESUME]] = !DILocalVariable(name: "__coro_frame" ; CHECK-DAG: ![[IVAR]] = !DILocalVariable(name: "i" ; CHECK-DAG: ![[PROG_SCOPE:[0-9]+]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov" ; CHECK-DAG: ![[BLK_SCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[PROG_SCOPE]], file: !1, line: 23, column: 12) From be7d08cd59b0f23eea88e791b2413b44301949d3 Mon Sep 17 00:00:00 2001 From: Volodymyr Vasylkun Date: Wed, 21 Aug 2024 23:15:24 +0100 Subject: [PATCH 119/426] [InstCombine] Fold `sext(A < B) + zext(A > B)` into `ucmp/scmp(A, B)` (#103833) This change also covers the fold of `zext(A > B) - zext(A < B)` since it is already being canonicalized into the aforementioned pattern. 
Proof: https://alive2.llvm.org/ce/z/AgnfMn --- .../InstCombine/InstCombineAddSub.cpp | 20 ++ llvm/test/Transforms/InstCombine/add.ll | 4 +- .../sext-a-lt-b-plus-zext-a-gt-b-to-uscmp.ll | 184 ++++++++++++++++++ 3 files changed, 206 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/sext-a-lt-b-plus-zext-a-gt-b-to-uscmp.ll diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index dd4a64050f878a..d7758b5fbf1786 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1626,6 +1626,26 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { A->getType()->isIntOrIntVectorTy(1)) return replaceInstUsesWith(I, Constant::getNullValue(I.getType())); + // sext(A < B) + zext(A > B) => ucmp/scmp(A, B) + ICmpInst::Predicate LTPred, GTPred; + if (match(&I, + m_c_Add(m_SExt(m_c_ICmp(LTPred, m_Value(A), m_Value(B))), + m_ZExt(m_c_ICmp(GTPred, m_Deferred(A), m_Deferred(B))))) && + A->getType()->isIntOrIntVectorTy()) { + if (ICmpInst::isGT(LTPred)) { + std::swap(LTPred, GTPred); + std::swap(A, B); + } + + if (ICmpInst::isLT(LTPred) && ICmpInst::isGT(GTPred) && + ICmpInst::isSigned(LTPred) == ICmpInst::isSigned(GTPred)) + return replaceInstUsesWith( + I, Builder.CreateIntrinsic( + Ty, + ICmpInst::isSigned(LTPred) ? Intrinsic::scmp : Intrinsic::ucmp, + {A, B})); + } + // A+B --> A|B iff A and B have no bits set in common. 
WithCache LHSCache(LHS), RHSCache(RHS); if (haveNoCommonBitsSet(LHSCache, RHSCache, SQ.getWithInstruction(&I))) diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll index 36da56d8441bf7..417c3a950d7805 100644 --- a/llvm/test/Transforms/InstCombine/add.ll +++ b/llvm/test/Transforms/InstCombine/add.ll @@ -1315,8 +1315,8 @@ define <2 x i8> @ashr_add_commute(<2 x i1> %x, <2 x i1> %y) { define i32 @cmp_math(i32 %x, i32 %y) { ; CHECK-LABEL: @cmp_math( -; CHECK-NEXT: [[LT:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = zext i1 [[LT]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = zext i1 [[TMP1]] to i32 ; CHECK-NEXT: ret i32 [[R]] ; %gt = icmp ugt i32 %x, %y diff --git a/llvm/test/Transforms/InstCombine/sext-a-lt-b-plus-zext-a-gt-b-to-uscmp.ll b/llvm/test/Transforms/InstCombine/sext-a-lt-b-plus-zext-a-gt-b-to-uscmp.ll new file mode 100644 index 00000000000000..02ae7ce82f13ce --- /dev/null +++ b/llvm/test/Transforms/InstCombine/sext-a-lt-b-plus-zext-a-gt-b-to-uscmp.ll @@ -0,0 +1,184 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +; sext(A s< B) + zext(A s> B) => scmp(A, B) +define i8 @signed_add(i32 %a, i32 %b) { +; CHECK-LABEL: define i8 @signed_add( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: ret i8 [[R]] +; + %lt = icmp slt i32 %a, %b + %lt8 = sext i1 %lt to i8 + %gt = icmp sgt i32 %a, %b + %gt8 = zext i1 %gt to i8 + %r = add i8 %lt8, %gt8 + ret i8 %r +} + +; Unsigned version +define i8 @unsigned_add(i32 %a, i32 %b) { +; CHECK-LABEL: define i8 @unsigned_add( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: ret i8 [[R]] +; + %lt = icmp ult i32 %a, %b + %lt8 = sext i1 %lt to 
i8 + %gt = icmp ugt i32 %a, %b + %gt8 = zext i1 %gt to i8 + %r = add i8 %lt8, %gt8 + ret i8 %r +} + +; Commuted operands +define i8 @signed_add_commuted1(i32 %a, i32 %b) { +; CHECK-LABEL: define i8 @signed_add_commuted1( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[B]], i32 [[A]]) +; CHECK-NEXT: ret i8 [[R]] +; + %lt = icmp slt i32 %a, %b + %lt8 = zext i1 %lt to i8 + %gt = icmp sgt i32 %a, %b + %gt8 = sext i1 %gt to i8 + %r = add i8 %lt8, %gt8 + ret i8 %r +} + +define i8 @signed_add_commuted2(i32 %a, i32 %b) { +; CHECK-LABEL: define i8 @signed_add_commuted2( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: ret i8 [[R]] +; + %lt = icmp sgt i32 %b, %a + %lt8 = sext i1 %lt to i8 + %gt = icmp sgt i32 %a, %b + %gt8 = zext i1 %gt to i8 + %r = add i8 %lt8, %gt8 + ret i8 %r +} + +; zext(A s> B) - zext(A s< B) => scmp(A, B) +define i8 @signed_sub(i32 %a, i32 %b) { +; CHECK-LABEL: define i8 @signed_sub( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: ret i8 [[R]] +; + %lt = icmp slt i32 %a, %b + %lt8 = zext i1 %lt to i8 + %gt = icmp sgt i32 %a, %b + %gt8 = zext i1 %gt to i8 + %r = sub i8 %gt8, %lt8 + ret i8 %r +} + +; Unsigned version +define i8 @unsigned_sub(i32 %a, i32 %b) { +; CHECK-LABEL: define i8 @unsigned_sub( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: ret i8 [[R]] +; + %lt = icmp ult i32 %a, %b + %lt8 = zext i1 %lt to i8 + %gt = icmp ugt i32 %a, %b + %gt8 = zext i1 %gt to i8 + %r = sub i8 %gt8, %lt8 + ret i8 %r +} + +; Negative test: incorrect predicates +define i8 @signed_add_neg1(i32 %a, i32 %b) { +; CHECK-LABEL: define i8 @signed_add_neg1( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[LT:%.*]] = icmp sgt i32 [[A]], [[B]] 
+; CHECK-NEXT: [[LT8:%.*]] = sext i1 [[LT]] to i8 +; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[A]], [[B]] +; CHECK-NEXT: [[GT8:%.*]] = zext i1 [[GT]] to i8 +; CHECK-NEXT: [[R:%.*]] = add nsw i8 [[LT8]], [[GT8]] +; CHECK-NEXT: ret i8 [[R]] +; + %lt = icmp sgt i32 %a, %b + %lt8 = sext i1 %lt to i8 + %gt = icmp sgt i32 %a, %b + %gt8 = zext i1 %gt to i8 + %r = add i8 %lt8, %gt8 + ret i8 %r +} + +define i8 @signed_add_neg2(i32 %a, i32 %b) { +; CHECK-LABEL: define i8 @signed_add_neg2( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[LT:%.*]] = icmp slt i32 [[A]], [[B]] +; CHECK-NEXT: [[LT8:%.*]] = sext i1 [[LT]] to i8 +; CHECK-NEXT: [[GT:%.*]] = icmp ne i32 [[A]], [[B]] +; CHECK-NEXT: [[GT8:%.*]] = zext i1 [[GT]] to i8 +; CHECK-NEXT: [[R:%.*]] = add nsw i8 [[LT8]], [[GT8]] +; CHECK-NEXT: ret i8 [[R]] +; + %lt = icmp slt i32 %a, %b + %lt8 = sext i1 %lt to i8 + %gt = icmp ne i32 %a, %b + %gt8 = zext i1 %gt to i8 + %r = add i8 %lt8, %gt8 + ret i8 %r +} + +; Negative test: mismatched signedness of predicates +define i8 @signed_add_neg3(i32 %a, i32 %b) { +; CHECK-LABEL: define i8 @signed_add_neg3( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[LT:%.*]] = icmp slt i32 [[A]], [[B]] +; CHECK-NEXT: [[LT8:%.*]] = sext i1 [[LT]] to i8 +; CHECK-NEXT: [[GT:%.*]] = icmp ugt i32 [[A]], [[B]] +; CHECK-NEXT: [[GT8:%.*]] = zext i1 [[GT]] to i8 +; CHECK-NEXT: [[R:%.*]] = add nsw i8 [[LT8]], [[GT8]] +; CHECK-NEXT: ret i8 [[R]] +; + %lt = icmp slt i32 %a, %b + %lt8 = sext i1 %lt to i8 + %gt = icmp ugt i32 %a, %b + %gt8 = zext i1 %gt to i8 + %r = add i8 %lt8, %gt8 + ret i8 %r +} + +; Negative test: zext instead of sext or vice-versa (NOT commuted operands) +define i8 @signed_add_neg4(i32 %a, i32 %b) { +; CHECK-LABEL: define i8 @signed_add_neg4( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[LT:%.*]] = icmp slt i32 [[A]], [[B]] +; CHECK-NEXT: [[LT8:%.*]] = sext i1 [[LT]] to i8 +; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[A]], [[B]] +; CHECK-NEXT: 
[[GT8:%.*]] = sext i1 [[GT]] to i8 +; CHECK-NEXT: [[R:%.*]] = add nsw i8 [[LT8]], [[GT8]] +; CHECK-NEXT: ret i8 [[R]] +; + %lt = icmp slt i32 %a, %b + %lt8 = sext i1 %lt to i8 + %gt = icmp sgt i32 %a, %b + %gt8 = sext i1 %gt to i8 + %r = add i8 %lt8, %gt8 + ret i8 %r +} + +define i8 @signed_add_neg5(i32 %a, i32 %b) { +; CHECK-LABEL: define i8 @signed_add_neg5( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[LT:%.*]] = icmp slt i32 [[A]], [[B]] +; CHECK-NEXT: [[LT8:%.*]] = zext i1 [[LT]] to i8 +; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[A]], [[B]] +; CHECK-NEXT: [[GT8:%.*]] = zext i1 [[GT]] to i8 +; CHECK-NEXT: [[R:%.*]] = add nuw nsw i8 [[LT8]], [[GT8]] +; CHECK-NEXT: ret i8 [[R]] +; + %lt = icmp slt i32 %a, %b + %lt8 = zext i1 %lt to i8 + %gt = icmp sgt i32 %a, %b + %gt8 = zext i1 %gt to i8 + %r = add i8 %lt8, %gt8 + ret i8 %r +} From aa4c6557a1281df627cdf06684bdb08da2707200 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 21 Aug 2024 15:23:42 -0700 Subject: [PATCH 120/426] [SandboxIR] Fix use-of-uninitialized in ShuffleVectorInst unit test. (#105592) I accidentally created a dangling ArrayRef local variable. Use a SmallVector instead. 
--- llvm/unittests/SandboxIR/SandboxIRTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 94d8ac27be3bc8..8315ee38dbe187 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -801,7 +801,7 @@ define void @foo(<2 x i8> %v1, <2 x i8> %v2) { // isValidOperands auto *LLVMArgV1 = LLVMF.getArg(0); auto *LLVMArgV2 = LLVMF.getArg(1); - ArrayRef Mask({1, 2}); + SmallVector Mask({1, 2}); EXPECT_EQ( sandboxir::ShuffleVectorInst::isValidOperands(ArgV1, ArgV2, Mask), llvm::ShuffleVectorInst::isValidOperands(LLVMArgV1, LLVMArgV2, Mask)); From 9ebe8b9abde02340494883d1ed1897ef5837473b Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 21 Aug 2024 15:27:00 -0700 Subject: [PATCH 121/426] [NFC][TableGen] Change global variables from anonymous NS to static (#105504) - Move global variables in TableGen.cpp out of anonymous namespace and make them static, per LLVM coding standards. 
--- llvm/utils/TableGen/TableGen.cpp | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/TableGen.cpp index b2ed48cffe6be5..c420843574cbf3 100644 --- a/llvm/utils/TableGen/TableGen.cpp +++ b/llvm/utils/TableGen/TableGen.cpp @@ -33,24 +33,23 @@ cl::opt EmitLongStrLiterals( cl::Hidden, cl::init(true)); } // end namespace llvm -namespace { +static cl::OptionCategory PrintEnumsCat("Options for -print-enums"); +static cl::opt Class("class", + cl::desc("Print Enum list for this class"), + cl::value_desc("class name"), + cl::cat(PrintEnumsCat)); -cl::OptionCategory PrintEnumsCat("Options for -print-enums"); -cl::opt Class("class", cl::desc("Print Enum list for this class"), - cl::value_desc("class name"), - cl::cat(PrintEnumsCat)); - -void PrintRecords(RecordKeeper &Records, raw_ostream &OS) { +static void PrintRecords(RecordKeeper &Records, raw_ostream &OS) { OS << Records; // No argument, dump all contents } -void PrintEnums(RecordKeeper &Records, raw_ostream &OS) { +static void PrintEnums(RecordKeeper &Records, raw_ostream &OS) { for (Record *Rec : Records.getAllDerivedDefinitions(Class)) OS << Rec->getName() << ", "; OS << "\n"; } -void PrintSets(RecordKeeper &Records, raw_ostream &OS) { +static void PrintSets(RecordKeeper &Records, raw_ostream &OS) { SetTheory Sets; Sets.addFieldExpander("Set", "Elements"); for (Record *Rec : Records.getAllDerivedDefinitions("Set")) { @@ -63,7 +62,7 @@ void PrintSets(RecordKeeper &Records, raw_ostream &OS) { } } -TableGen::Emitter::Opt X[] = { +static TableGen::Emitter::Opt X[] = { {"print-records", PrintRecords, "Print all records to stdout (default)", true}, {"print-detailed-records", EmitDetailedRecords, @@ -75,8 +74,6 @@ TableGen::Emitter::Opt X[] = { {"print-sets", PrintSets, "Print expanded sets for testing DAG exprs"}, }; -} // namespace - int main(int argc, char **argv) { InitLLVM X(argc, argv); cl::ParseCommandLineOptions(argc, argv); 
From b5ba726577f7e7af880b62a6352c6208bda4cd0b Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 21 Aug 2024 15:56:55 -0700 Subject: [PATCH 122/426] [SandboxIR] Add tracking for `ShuffleVectorInst::setShuffleMask`. (#105590) --- llvm/include/llvm/SandboxIR/SandboxIR.h | 4 +--- llvm/include/llvm/SandboxIR/Tracker.h | 15 ++++++++++++++ llvm/lib/SandboxIR/SandboxIR.cpp | 5 +++++ llvm/lib/SandboxIR/Tracker.cpp | 14 +++++++++++++ llvm/unittests/SandboxIR/TrackerTest.cpp | 26 ++++++++++++++++++++++++ 5 files changed, 61 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 01ef8013ea42a0..278951113aed84 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -1024,9 +1024,7 @@ class ShuffleVectorInst final static Constant *convertShuffleMaskForBitcode(ArrayRef Mask, Type *ResultTy, Context &Ctx); - void setShuffleMask(ArrayRef Mask) { - cast(Val)->setShuffleMask(Mask); - } + void setShuffleMask(ArrayRef Mask); ArrayRef getShuffleMask() const { return cast(Val)->getShuffleMask(); diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h index 6f205ae2a075c6..c8a9e99a34341d 100644 --- a/llvm/include/llvm/SandboxIR/Tracker.h +++ b/llvm/include/llvm/SandboxIR/Tracker.h @@ -62,6 +62,7 @@ class AllocaInst; class CatchSwitchInst; class SwitchInst; class ConstantInt; +class ShuffleVectorInst; /// The base class for IR Change classes. 
class IRChangeBase { @@ -355,6 +356,20 @@ class CreateAndInsertInst final : public IRChangeBase { #endif }; +class ShuffleVectorSetMask final : public IRChangeBase { + ShuffleVectorInst *SVI; + SmallVector PrevMask; + +public: + ShuffleVectorSetMask(ShuffleVectorInst *SVI); + void revert(Tracker &Tracker) final; + void accept() final {} +#ifndef NDEBUG + void dump(raw_ostream &OS) const final { OS << "ShuffleVectorSetMask"; } + LLVM_DUMP_METHOD void dump() const final; +#endif +}; + /// The tracker collects all the change objects and implements the main API for /// saving / reverting / accepting. class Tracker { diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index a62c879b91e8b9..92054e7cab86ee 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -1868,6 +1868,11 @@ Value *ShuffleVectorInst::create(Value *V1, Value *V2, ArrayRef Mask, return Ctx.getOrCreateConstant(cast(NewV)); } +void ShuffleVectorInst::setShuffleMask(ArrayRef Mask) { + Ctx.getTracker().emplaceIfTracking(this); + cast(Val)->setShuffleMask(Mask); +} + Constant *ShuffleVectorInst::getShuffleMaskForBitcode() const { return Ctx.getOrCreateConstant( cast(Val)->getShuffleMaskForBitcode()); diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp index 38a1c03556650e..953d4bd51353a9 100644 --- a/llvm/lib/SandboxIR/Tracker.cpp +++ b/llvm/lib/SandboxIR/Tracker.cpp @@ -234,6 +234,20 @@ void CreateAndInsertInst::dump() const { } #endif +ShuffleVectorSetMask::ShuffleVectorSetMask(ShuffleVectorInst *SVI) + : SVI(SVI), PrevMask(SVI->getShuffleMask()) {} + +void ShuffleVectorSetMask::revert(Tracker &Tracker) { + SVI->setShuffleMask(PrevMask); +} + +#ifndef NDEBUG +void ShuffleVectorSetMask::dump() const { + dump(dbgs()); + dbgs() << "\n"; +} +#endif + void Tracker::save() { State = TrackerState::Record; } void Tracker::revert() { diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp 
index 9f502375204024..a2c3080011f162 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -13,6 +13,7 @@ #include "llvm/IR/Module.h" #include "llvm/SandboxIR/SandboxIR.h" #include "llvm/Support/SourceMgr.h" +#include "gmock/gmock-matchers.h" #include "gtest/gtest.h" using namespace llvm; @@ -792,6 +793,31 @@ define void @foo(i32 %cond0, i32 %cond1) { EXPECT_EQ(Switch->findCaseDest(BB1), One); } +TEST_F(TrackerTest, ShuffleVectorInstSetters) { + parseIR(C, R"IR( +define void @foo(<2 x i8> %v1, <2 x i8> %v2) { + %shuf = shufflevector <2 x i8> %v1, <2 x i8> %v2, <2 x i32> + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto *F = Ctx.createFunction(&LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *SVI = cast(&*It++); + + // Check setShuffleMask. + SmallVector OrigMask(SVI->getShuffleMask()); + Ctx.save(); + SVI->setShuffleMask(ArrayRef({0, 0})); + EXPECT_THAT(SVI->getShuffleMask(), + testing::Not(testing::ElementsAreArray(OrigMask))); + Ctx.revert(); + EXPECT_THAT(SVI->getShuffleMask(), testing::ElementsAreArray(OrigMask)); +} + TEST_F(TrackerTest, AtomicRMWSetters) { parseIR(C, R"IR( define void @foo(ptr %ptr, i8 %arg) { From 6b98a723653214a6cde05ae3cb5233af328ff101 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 21 Aug 2024 18:02:04 -0500 Subject: [PATCH 123/426] [libc] Add `scanf` support to the GPU build (#104812) Summary: The `scanf` function has a "system file" configuration, which is pretty much what the GPU implementation does at this point. So we should be able to use it in much the same way. 
--- libc/config/gpu/entrypoints.txt | 2 + libc/docs/gpu/support.rst | 2 + libc/src/stdio/CMakeLists.txt | 2 +- libc/src/stdio/scanf_core/CMakeLists.txt | 46 ++++++++++++-------- libc/src/stdio/scanf_core/vfscanf_internal.h | 28 +++++++++++- 5 files changed, 61 insertions(+), 19 deletions(-) diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index bbae3298fae615..d7f35bc1edf5a0 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -192,6 +192,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdio.vsprintf libc.src.stdio.asprintf libc.src.stdio.vasprintf + libc.src.stdio.scanf + libc.src.stdio.fscanf libc.src.stdio.sscanf libc.src.stdio.vsscanf libc.src.stdio.feof diff --git a/libc/docs/gpu/support.rst b/libc/docs/gpu/support.rst index 5ef298a2ba58f2..c8b1052ce16895 100644 --- a/libc/docs/gpu/support.rst +++ b/libc/docs/gpu/support.rst @@ -239,6 +239,8 @@ snprintf |check| vsprintf |check| vsnprintf |check| sscanf |check| +scanf |check| +fscanf |check| putchar |check| |check| fclose |check| |check| fopen |check| |check| diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index bc5ef5fe0e9b48..372b8fc8192455 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -101,7 +101,7 @@ list(APPEND scanf_deps libc.hdr.types.FILE ) -if(LLVM_LIBC_FULL_BUILD) +if(LLVM_LIBC_FULL_BUILD AND NOT LIBC_TARGET_OS_IS_GPU) list(APPEND scanf_deps libc.src.__support.File.file libc.src.__support.File.platform_file diff --git a/libc/src/stdio/scanf_core/CMakeLists.txt b/libc/src/stdio/scanf_core/CMakeLists.txt index e2b49e0c915284..5c00ae0c9973c2 100644 --- a/libc/src/stdio/scanf_core/CMakeLists.txt +++ b/libc/src/stdio/scanf_core/CMakeLists.txt @@ -92,21 +92,33 @@ add_object_library( libc.src.__support.str_to_float ) -if(NOT (TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD) - # Not all platforms have a file implementation. 
If file is unvailable, and a - # full build is requested, then we must skip all file based printf sections. - return() +if(LIBC_TARGET_OS_IS_GPU) + add_header_library( + vfscanf_internal + HDRS + vfscanf_internal.h + DEPENDS + .reader + .scanf_main + libc.include.stdio + libc.src.__support.arg_list + libc.src.stdio.getc + libc.src.stdio.ungetc + libc.src.stdio.ferror + COMPILE_OPTIONS + -DLIBC_COPT_STDIO_USE_SYSTEM_FILE + ) +elseif(TARGET libc.src.__support.File.file OR (NOT LLVM_LIBC_FULL_BUILD)) + add_header_library( + vfscanf_internal + HDRS + vfscanf_internal.h + DEPENDS + .reader + .scanf_main + libc.include.stdio + libc.src.__support.File.file + libc.src.__support.arg_list + ${use_system_file} + ) endif() - -add_header_library( - vfscanf_internal - HDRS - vfscanf_internal.h - DEPENDS - .reader - .scanf_main - libc.include.stdio - libc.src.__support.File.file - libc.src.__support.arg_list - ${use_system_file} -) diff --git a/libc/src/stdio/scanf_core/vfscanf_internal.h b/libc/src/stdio/scanf_core/vfscanf_internal.h index 2b0072a6ae35f3..67126431fcded5 100644 --- a/libc/src/stdio/scanf_core/vfscanf_internal.h +++ b/libc/src/stdio/scanf_core/vfscanf_internal.h @@ -12,9 +12,16 @@ #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/architectures.h" #include "src/stdio/scanf_core/reader.h" #include "src/stdio/scanf_core/scanf_main.h" +#if defined(LIBC_TARGET_ARCH_IS_GPU) +#include "src/stdio/ferror.h" +#include "src/stdio/getc.h" +#include "src/stdio/ungetc.h" +#endif + #include "hdr/types/FILE.h" #include @@ -22,7 +29,26 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { -#ifndef LIBC_COPT_STDIO_USE_SYSTEM_FILE +#if defined(LIBC_TARGET_ARCH_IS_GPU) +// The GPU build provides FILE access through the host operating system's +// library. So here we simply use the public entrypoints like in the SYSTEM_FILE +// interface. 
Entrypoints should normally not call others, this is an exception. +// FIXME: We do not acquire any locks here, so this is not thread safe. +LIBC_INLINE void flockfile(::FILE *) { return; } + +LIBC_INLINE void funlockfile(::FILE *) { return; } + +LIBC_INLINE int getc(void *f) { + return LIBC_NAMESPACE::getc(reinterpret_cast<::FILE *>(f)); +} + +LIBC_INLINE void ungetc(int c, void *f) { + LIBC_NAMESPACE::ungetc(c, reinterpret_cast<::FILE *>(f)); +} + +LIBC_INLINE int ferror_unlocked(::FILE *f) { return LIBC_NAMESPACE::ferror(f); } + +#elif !defined(LIBC_COPT_STDIO_USE_SYSTEM_FILE) LIBC_INLINE void flockfile(FILE *f) { reinterpret_cast(f)->lock(); From c557d8520413476221a4f3bf2b7b3fed17681691 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Wed, 21 Aug 2024 16:08:06 -0700 Subject: [PATCH 124/426] [flang][runtime] Add build-time flags to runtime to adjust SELECTED_x_KIND() (#105575) Add FLANG_RUNTIME_NO_INTEGER_16 and FLANG_RUNTIME_NO_REAL_{2,10,16} to allow one to disable those kinds from being returned from SELECTED_INT_KIND and SELECTED_REAL_KIND even if they are actually available in the C++ build compiler. 
--- flang/runtime/numeric.cpp | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp index 7c40beb31083ff..28687b1971b7ed 100644 --- a/flang/runtime/numeric.cpp +++ b/flang/runtime/numeric.cpp @@ -105,7 +105,7 @@ inline RT_API_ATTRS CppTypeFor SelectedIntKind(T x) { return 4; } else if (x <= 18) { return 8; -#ifdef __SIZEOF_INT128__ +#if defined __SIZEOF_INT128__ && !defined FLANG_RUNTIME_NO_INTEGER_16 } else if (x <= 38) { return 16; #endif @@ -137,23 +137,35 @@ inline RT_API_ATTRS CppTypeFor SelectedRealKind( return -5; } +#ifndef FLANG_RUNTIME_NO_REAL_2 + constexpr bool hasReal2{true}; +#else + constexpr bool hasReal2{false}; +#endif +#if LDBL_MANT_DIG == 64 && !defined FLANG_RUNTIME_NO_REAL_10 + constexpr bool hasReal10{true}; +#else + constexpr bool hasReal10{false}; +#endif +#if (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && \ + !defined FLANG_RUNTIME_NO_REAL_16 + constexpr bool hasReal16{true}; +#else + constexpr bool hasReal16{false}; +#endif + int error{0}; int kind{0}; - if (p <= 3) { + if (hasReal2 && p <= 3) { kind = 2; } else if (p <= 6) { kind = 4; } else if (p <= 15) { kind = 8; -#if LDBL_MANT_DIG == 64 - } else if (p <= 18) { + } else if (hasReal10 && p <= 18) { kind = 10; - } else if (p <= 33) { - kind = 16; -#elif LDBL_MANT_DIG == 113 - } else if (p <= 33) { + } else if (hasReal16 && p <= 33) { kind = 16; -#endif } else { error -= 1; } @@ -164,13 +176,10 @@ inline RT_API_ATTRS CppTypeFor SelectedRealKind( kind = kind < 3 ? (p == 3 ? 4 : 3) : kind; } else if (r <= 307) { kind = kind < 8 ? 8 : kind; -#if LDBL_MANT_DIG == 64 - } else if (r <= 4931) { + } else if (hasReal10 && r <= 4931) { kind = kind < 10 ? 10 : kind; -#elif LDBL_MANT_DIG == 113 - } else if (r <= 4931) { + } else if (hasReal16 && r <= 4931) { kind = kind < 16 ?
16 : kind; -#endif } else { error -= 2; } From ec8fe7ad81af6c211fb26c34824092e5bca08f5e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 21 Aug 2024 16:53:01 -0700 Subject: [PATCH 125/426] [LTO] Use enum class for ImportFailureReason (NFC) (#105564) It turns out that all uses of the enum values here are already qualified like FunctionImporter::ImportFailureReason::None, so we can switch to enum class without touching the rest of the codebase. --- llvm/include/llvm/Transforms/IPO/FunctionImport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h index 6df597c300c180..5dad572532c8ae 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h @@ -42,7 +42,7 @@ class FunctionImporter { /// The different reasons selectCallee will chose not to import a /// candidate. - enum ImportFailureReason { + enum class ImportFailureReason { None, // We can encounter a global variable instead of a function in rare // situations with SamplePGO. 
See comments where this failure type is From fdbc4089e7a6eafa4002a7981bcde94fc378bc18 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 21 Aug 2024 16:53:18 -0700 Subject: [PATCH 126/426] [LTO] Compare std::optional directly with ImportKind (NFC) (#105561) Note that: Opt == Val if and only (Opt && *Opt == Val) where: std::optional Opt; T Val; --- llvm/lib/Transforms/IPO/FunctionImport.cpp | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 6ae89a49b6b9a3..92371720e0eceb 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -1814,10 +1814,7 @@ Expected FunctionImporter::importFunctions( continue; auto GUID = F.getGUID(); auto MaybeImportType = getImportType(ImportGUIDs, GUID); - - bool ImportDefinition = - (MaybeImportType && - (*MaybeImportType == GlobalValueSummary::Definition)); + bool ImportDefinition = MaybeImportType == GlobalValueSummary::Definition; LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not") << " importing function" @@ -1853,10 +1850,7 @@ Expected FunctionImporter::importFunctions( continue; auto GUID = GV.getGUID(); auto MaybeImportType = getImportType(ImportGUIDs, GUID); - - bool ImportDefinition = - (MaybeImportType && - (*MaybeImportType == GlobalValueSummary::Definition)); + bool ImportDefinition = MaybeImportType == GlobalValueSummary::Definition; LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not") << " importing global" @@ -1876,10 +1870,7 @@ Expected FunctionImporter::importFunctions( continue; auto GUID = GA.getGUID(); auto MaybeImportType = getImportType(ImportGUIDs, GUID); - - bool ImportDefinition = - (MaybeImportType && - (*MaybeImportType == GlobalValueSummary::Definition)); + bool ImportDefinition = MaybeImportType == GlobalValueSummary::Definition; LLVM_DEBUG(dbgs() << (MaybeImportType ? 
"Is" : "Not") << " importing alias" From 19d3f3417100dc99caa4394fbd26fc0c4702264e Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Wed, 21 Aug 2024 16:51:54 -0700 Subject: [PATCH 127/426] [lldb] Speculative fix for trap_frame_sym_ctx.test Unfortunately I can't actually reproduce this locally. --- lldb/test/Shell/Unwind/trap_frame_sym_ctx.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/Shell/Unwind/trap_frame_sym_ctx.test b/lldb/test/Shell/Unwind/trap_frame_sym_ctx.test index 1bf1fb1d6e85f9..08a26616240e68 100644 --- a/lldb/test/Shell/Unwind/trap_frame_sym_ctx.test +++ b/lldb/test/Shell/Unwind/trap_frame_sym_ctx.test @@ -15,7 +15,7 @@ breakpoint set -n bar process launch # CHECK: stop reason = breakpoint 1.1 -thread backtrace +thread backtrace -u # CHECK: frame #0: {{.*}}`bar # CHECK: frame #1: {{.*}}`tramp # CHECK: frame #2: {{.*}}`main From 1e70122cbc187c08de91a3fb42843efb1221e0e9 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Wed, 21 Aug 2024 17:17:46 -0700 Subject: [PATCH 128/426] [ctx_prof] API to get the instrumentation of a BB (#105468) Analogous to PR #104491 Issue #89287 --- llvm/include/llvm/Analysis/CtxProfAnalysis.h | 5 +++++ llvm/lib/Analysis/CtxProfAnalysis.cpp | 7 ++++++ .../Analysis/CtxProfAnalysisTest.cpp | 22 +++++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/llvm/include/llvm/Analysis/CtxProfAnalysis.h b/llvm/include/llvm/Analysis/CtxProfAnalysis.h index 23abcbe2c6e9d2..0b4dd8ae3a0dc7 100644 --- a/llvm/include/llvm/Analysis/CtxProfAnalysis.h +++ b/llvm/include/llvm/Analysis/CtxProfAnalysis.h @@ -95,7 +95,12 @@ class CtxProfAnalysis : public AnalysisInfoMixin { PGOContextualProfile run(Module &M, ModuleAnalysisManager &MAM); + /// Get the instruction instrumenting a callsite, or nullptr if that cannot be + /// found. static InstrProfCallsite *getCallsiteInstrumentation(CallBase &CB); + + /// Get the instruction instrumenting a BB, or nullptr if not present. 
+ static InstrProfIncrementInst *getBBInstrumentation(BasicBlock &BB); }; class CtxProfAnalysisPrinterPass diff --git a/llvm/lib/Analysis/CtxProfAnalysis.cpp b/llvm/lib/Analysis/CtxProfAnalysis.cpp index ceebb2cf06d235..3fc1bc34afb97e 100644 --- a/llvm/lib/Analysis/CtxProfAnalysis.cpp +++ b/llvm/lib/Analysis/CtxProfAnalysis.cpp @@ -202,6 +202,13 @@ InstrProfCallsite *CtxProfAnalysis::getCallsiteInstrumentation(CallBase &CB) { return nullptr; } +InstrProfIncrementInst *CtxProfAnalysis::getBBInstrumentation(BasicBlock &BB) { + for (auto &I : BB) + if (auto *Incr = dyn_cast(&I)) + return Incr; + return nullptr; +} + static void preorderVisit(const PGOCtxProfContext::CallTargetMapTy &Profiles, function_ref Visitor) { diff --git a/llvm/unittests/Analysis/CtxProfAnalysisTest.cpp b/llvm/unittests/Analysis/CtxProfAnalysisTest.cpp index 5f9bf3ec540eb3..fbe3a6e45109cc 100644 --- a/llvm/unittests/Analysis/CtxProfAnalysisTest.cpp +++ b/llvm/unittests/Analysis/CtxProfAnalysisTest.cpp @@ -132,4 +132,26 @@ TEST_F(CtxProfAnalysisTest, GetCallsiteIDNegativeTest) { EXPECT_EQ(IndIns, nullptr); } +TEST_F(CtxProfAnalysisTest, GetBBIDTest) { + ModulePassManager MPM; + MPM.addPass(PGOInstrumentationGen(PGOInstrumentationType::CTXPROF)); + EXPECT_FALSE(MPM.run(*M, MAM).areAllPreserved()); + auto *F = M->getFunction("foo"); + ASSERT_NE(F, nullptr); + std::map BBNameAndID; + + for (auto &BB : *F) { + auto *Ins = CtxProfAnalysis::getBBInstrumentation(BB); + if (Ins) + BBNameAndID[BB.getName().str()] = + static_cast(Ins->getIndex()->getZExtValue()); + else + BBNameAndID[BB.getName().str()] = -1; + } + + EXPECT_THAT(BBNameAndID, + testing::UnorderedElementsAre( + testing::Pair("", 0), testing::Pair("yes", 1), + testing::Pair("no", -1), testing::Pair("exit", -1))); +} } // namespace From f25e6515aa04e53a642bc79eb09a96e418cbbb03 Mon Sep 17 00:00:00 2001 From: Connie Zhu <60797237+connieyzhu@users.noreply.github.com> Date: Wed, 21 Aug 2024 17:26:16 -0700 Subject: [PATCH 129/426] 
[compiler-rt][test] Added REQUIRES:shell to fuzzer test with for-loop (#105557) This patch makes the features_dir.test file require a shell when running. This will make the test file unsupported when running llvm-lit with its internal shell implementation, which is enabled by turning on the LIT_USE_INTERNAL_SHELL environment variable. Lit's internal shell currently does not support for-loop syntax. --- compiler-rt/test/fuzzer/features_dir.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/fuzzer/features_dir.test b/compiler-rt/test/fuzzer/features_dir.test index c6beec01bc3ab2..ce63b3920708cc 100644 --- a/compiler-rt/test/fuzzer/features_dir.test +++ b/compiler-rt/test/fuzzer/features_dir.test @@ -1,5 +1,5 @@ # Tests -features_dir=F -# REQUIRES: linux +# REQUIRES: linux, shell RUN: %cpp_compiler %S/SimpleTest.cpp -o %t-SimpleTest RUN: rm -rf %t-C %t-F RUN: mkdir %t-C %t-F From 04c827d0b5e629ba53e8ede94811a13a96db36a4 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 21 Aug 2024 17:37:17 -0700 Subject: [PATCH 130/426] [SandboxIR] Simplify matchers in ShuffleVectorInst unit test (NFC) (#105596) Replace instances of `testing::ContainerEq(ArrayRef({1, 2, 3, 4}))` with `testing::ElementsAre(1, 2, 3, 4)` which is simpler and more readable. 
--- llvm/unittests/SandboxIR/SandboxIRTest.cpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 8315ee38dbe187..b6981027b4c040 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -815,8 +815,7 @@ define void @foo(<2 x i8> %v1, <2 x i8> %v2) { I->commute(); EXPECT_EQ(I->getOperand(0), ArgV2); EXPECT_EQ(I->getOperand(1), ArgV1); - EXPECT_THAT(I->getShuffleMask(), - testing::ContainerEq(ArrayRef({2, 0}))); + EXPECT_THAT(I->getShuffleMask(), testing::ElementsAre(2, 0)); } // getType @@ -828,17 +827,16 @@ define void @foo(<2 x i8> %v1, <2 x i8> %v2) { // getShuffleMask / getShuffleMaskForBitcode { - EXPECT_THAT(SVI->getShuffleMask(), - testing::ContainerEq(ArrayRef({0, 2}))); + EXPECT_THAT(SVI->getShuffleMask(), testing::ElementsAre(0, 2)); SmallVector Result; SVI->getShuffleMask(Result); - EXPECT_THAT(Result, testing::ContainerEq(ArrayRef({0, 2}))); + EXPECT_THAT(Result, testing::ElementsAre(0, 2)); Result.clear(); sandboxir::ShuffleVectorInst::getShuffleMask( SVI->getShuffleMaskForBitcode(), Result); - EXPECT_THAT(Result, testing::ContainerEq(ArrayRef({0, 2}))); + EXPECT_THAT(Result, testing::ElementsAre(0, 2)); } // convertShuffleMaskForBitcode @@ -847,15 +845,14 @@ define void @foo(<2 x i8> %v1, <2 x i8> %v2) { ArrayRef({2, 3}), ArgV1->getType(), Ctx); SmallVector Result; sandboxir::ShuffleVectorInst::getShuffleMask(C, Result); - EXPECT_THAT(Result, testing::ContainerEq(ArrayRef({2, 3}))); + EXPECT_THAT(Result, testing::ElementsAre(2, 3)); } // setShuffleMask { auto *I = CreateShuffleWithMask(0, 1); I->setShuffleMask(ArrayRef({2, 3})); - EXPECT_THAT(I->getShuffleMask(), - testing::ContainerEq(ArrayRef({2, 3}))); + EXPECT_THAT(I->getShuffleMask(), testing::ElementsAre(2, 3)); } // The following functions check different mask properties. 
Note that most @@ -1107,7 +1104,7 @@ define void @foo(<2 x i8> %v1, <2 x i8> %v2) { { SmallVector M = {0, 2, 1, 3}; ShuffleVectorInst::commuteShuffleMask(M, 2); - EXPECT_THAT(M, testing::ContainerEq(ArrayRef({2, 0, 3, 1}))); + EXPECT_THAT(M, testing::ElementsAre(2, 0, 3, 1)); } // isInterleave / isInterleaveMask @@ -1119,7 +1116,7 @@ define void @foo(<2 x i8> %v1, <2 x i8> %v2) { SmallVector StartIndexes; EXPECT_TRUE(sandboxir::ShuffleVectorInst::isInterleaveMask( I->getShuffleMask(), 2, 4, StartIndexes)); - EXPECT_THAT(StartIndexes, testing::ContainerEq(ArrayRef({0, 2}))); + EXPECT_THAT(StartIndexes, testing::ElementsAre(0, 2)); } { auto *I = CreateShuffleWithMask(0, 3, 1, 2); From 64e464349bfca0d90e07f6db2f710d4d53cdacd4 Mon Sep 17 00:00:00 2001 From: eddyz87 Date: Thu, 22 Aug 2024 03:40:56 +0300 Subject: [PATCH 131/426] [BPF] introduce __attribute__((bpf_fastcall)) (#105417) This commit introduces attribute bpf_fastcall to declare BPF functions that do not clobber some of the caller saved registers (R0-R5). The idea is to generate the code complying with generic BPF ABI, but allow compatible Linux Kernel to remove unnecessary spills and fills of non-scratched registers (given some compiler assistance). For such functions do register allocation as-if caller saved registers are not clobbered, but later wrap the calls with spill and fill patterns that are simple to recognize in kernel. For example for the following C code: #define __bpf_fastcall __attribute__((bpf_fastcall)) void bar(void) __bpf_fastcall; void buz(long i, long j, long k); void foo(long i, long j, long k) { bar(); buz(i, j, k); } First allocate registers as if: foo: call bar # note: no spills for i,j,k (r1,r2,r3) call buz exit And later insert spills fills on the peephole phase: foo: *(u64 *)(r10 - 8) = r1; # Such call pattern is *(u64 *)(r10 - 16) = r2; # correct when used with *(u64 *)(r10 - 24) = r3; # old kernels. 
call bar r3 = *(u64 *)(r10 - 24); # But also allows new r2 = *(u64 *)(r10 - 16); # kernels to recognize the r1 = *(u64 *)(r10 - 8); # pattern and remove spills/fills. call buz exit The offsets for generated spills/fills are picked as minimal stack offsets for the function. Allocated stack slots are not used for any other purposes, in order to simplify in-kernel analysis. --- clang/include/clang/Basic/Attr.td | 9 ++ clang/include/clang/Basic/AttrDocs.td | 19 +++ clang/lib/CodeGen/CGCall.cpp | 2 + clang/test/CodeGen/bpf-attr-bpf-fastcall-1.c | 24 ++++ ...a-attribute-supported-attributes-list.test | 1 + clang/test/Sema/bpf-attr-bpf-fastcall.c | 14 +++ llvm/lib/Target/BPF/BPFCallingConv.td | 1 + llvm/lib/Target/BPF/BPFISelLowering.cpp | 31 +++++ llvm/lib/Target/BPF/BPFInstrInfo.td | 4 +- llvm/lib/Target/BPF/BPFMIPeephole.cpp | 84 +++++++++++++ llvm/lib/Target/BPF/BPFRegisterInfo.cpp | 11 ++ llvm/lib/Target/BPF/BPFRegisterInfo.h | 3 + llvm/test/CodeGen/BPF/bpf-fastcall-1.ll | 46 ++++++++ llvm/test/CodeGen/BPF/bpf-fastcall-2.ll | 68 +++++++++++ llvm/test/CodeGen/BPF/bpf-fastcall-3.ll | 62 ++++++++++ .../CodeGen/BPF/bpf-fastcall-regmask-1.ll | 110 ++++++++++++++++++ 16 files changed, 486 insertions(+), 3 deletions(-) create mode 100644 clang/test/CodeGen/bpf-attr-bpf-fastcall-1.c create mode 100644 clang/test/Sema/bpf-attr-bpf-fastcall.c create mode 100644 llvm/test/CodeGen/BPF/bpf-fastcall-1.ll create mode 100644 llvm/test/CodeGen/BPF/bpf-fastcall-2.ll create mode 100644 llvm/test/CodeGen/BPF/bpf-fastcall-3.ll create mode 100644 llvm/test/CodeGen/BPF/bpf-fastcall-regmask-1.ll diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 10a9d9e899e007..98bedfe20f5d98 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2200,6 +2200,15 @@ def BTFTypeTag : TypeAttr { let LangOpts = [COnly]; } +def BPFFastCall : InheritableAttr, + TargetSpecificAttr { + let Spellings = [Clang<"bpf_fastcall">]; + let Subjects = 
SubjectList<[FunctionLike]>; + let Documentation = [BPFFastCallDocs]; + let LangOpts = [COnly]; + let SimpleHandler = 1; +} + def WebAssemblyExportName : InheritableAttr, TargetSpecificAttr { let Spellings = [Clang<"export_name">]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 19cbb9a0111a28..df36a2163b9f0b 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -2345,6 +2345,25 @@ section. }]; } +def BPFFastCallDocs : Documentation { + let Category = DocCatType; + let Content = [{ +Functions annotated with this attribute are likely to be inlined by BPF JIT. +It is assumed that the inlined implementation uses fewer caller-saved registers +than a regular function. +Specifically, the following registers are likely to be preserved: +- ``R0`` if function return value is ``void``; +- ``R2-R5`` if function takes 1 argument; +- ``R3-R5`` if function takes 2 arguments; +- ``R4-R5`` if function takes 3 arguments; +- ``R5`` if function takes 4 arguments; + +For such functions Clang generates a code pattern that allows BPF JIT +to recognize and remove unnecessary spills and fills of the preserved +registers.
+ }]; +} + def MipsInterruptDocs : Documentation { let Category = DocCatFunction; let Heading = "interrupt (MIPS)"; diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 34ca2227608361..ca2c79b51ac96b 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -2421,6 +2421,8 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, FuncAttrs.addAttribute(llvm::Attribute::NoCfCheck); if (TargetDecl->hasAttr()) FuncAttrs.addAttribute(llvm::Attribute::NoCallback); + if (TargetDecl->hasAttr()) + FuncAttrs.addAttribute("bpf_fastcall"); HasOptnone = TargetDecl->hasAttr(); if (auto *AllocSize = TargetDecl->getAttr()) { diff --git a/clang/test/CodeGen/bpf-attr-bpf-fastcall-1.c b/clang/test/CodeGen/bpf-attr-bpf-fastcall-1.c new file mode 100644 index 00000000000000..fa740d8e44ff51 --- /dev/null +++ b/clang/test/CodeGen/bpf-attr-bpf-fastcall-1.c @@ -0,0 +1,24 @@ +// REQUIRES: bpf-registered-target +// RUN: %clang_cc1 -triple bpf -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s + +#define __bpf_fastcall __attribute__((bpf_fastcall)) + +void test(void) __bpf_fastcall; +void (*ptr)(void) __bpf_fastcall; + +void foo(void) { + test(); + (*ptr)(); +} + +// CHECK: @ptr = global ptr null +// CHECK: define {{.*}} void @foo() +// CHECK: entry: +// CHECK: call void @test() #[[call_attr:[0-9]+]] +// CHECK: %[[ptr:.*]] = load ptr, ptr @ptr, align 8 +// CHECK: call void %[[ptr]]() #[[call_attr]] +// CHECK: ret void + +// CHECK: declare void @test() #[[func_attr:[0-9]+]] +// CHECK: attributes #[[func_attr]] = { {{.*}}"bpf_fastcall"{{.*}} } +// CHECK: attributes #[[call_attr]] = { "bpf_fastcall" } diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index 1a71556213bb16..a7e425e3d5f431 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -22,6 +22,7 @@ 
// CHECK-NEXT: AssumeAligned (SubjectMatchRule_objc_method, SubjectMatchRule_function) // CHECK-NEXT: Availability ((SubjectMatchRule_record, SubjectMatchRule_enum, SubjectMatchRule_enum_constant, SubjectMatchRule_field, SubjectMatchRule_function, SubjectMatchRule_namespace, SubjectMatchRule_objc_category, SubjectMatchRule_objc_implementation, SubjectMatchRule_objc_interface, SubjectMatchRule_objc_method, SubjectMatchRule_objc_property, SubjectMatchRule_objc_protocol, SubjectMatchRule_record, SubjectMatchRule_type_alias, SubjectMatchRule_variable)) // CHECK-NEXT: AvailableOnlyInDefaultEvalMethod (SubjectMatchRule_type_alias) +// CHECK-NEXT: BPFFastCall (SubjectMatchRule_hasType_functionType) // CHECK-NEXT: BPFPreserveAccessIndex (SubjectMatchRule_record) // CHECK-NEXT: BPFPreserveStaticOffset (SubjectMatchRule_record) // CHECK-NEXT: BTFDeclTag (SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_record, SubjectMatchRule_field, SubjectMatchRule_type_alias) diff --git a/clang/test/Sema/bpf-attr-bpf-fastcall.c b/clang/test/Sema/bpf-attr-bpf-fastcall.c new file mode 100644 index 00000000000000..178b1f50741e87 --- /dev/null +++ b/clang/test/Sema/bpf-attr-bpf-fastcall.c @@ -0,0 +1,14 @@ +// REQUIRES: bpf-registered-target +// RUN: %clang_cc1 %s -triple bpf -verify + +__attribute__((bpf_fastcall)) int var; // expected-warning {{'bpf_fastcall' attribute only applies to functions and function pointers}} + +__attribute__((bpf_fastcall)) void func(); +__attribute__((bpf_fastcall(1))) void func_invalid(); // expected-error {{'bpf_fastcall' attribute takes no arguments}} + +void (*ptr1)(void) __attribute__((bpf_fastcall)); +void (*ptr2)(void); +void foo(void) { + ptr2 = ptr1; // not an error + ptr1 = ptr2; // not an error +} diff --git a/llvm/lib/Target/BPF/BPFCallingConv.td b/llvm/lib/Target/BPF/BPFCallingConv.td index ef4ef1930aa8fb..a557211437e95f 100644 --- a/llvm/lib/Target/BPF/BPFCallingConv.td +++ b/llvm/lib/Target/BPF/BPFCallingConv.td @@ -46,3 +46,4 
@@ def CC_BPF32 : CallingConv<[ ]>; def CSR : CalleeSavedRegs<(add R6, R7, R8, R9, R10)>; +def CSR_PreserveAll : CalleeSavedRegs<(add R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10)>; diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp index 071fe004806e3e..ff23d3b055d0d5 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -402,6 +402,21 @@ SDValue BPFTargetLowering::LowerFormalArguments( const size_t BPFTargetLowering::MaxArgs = 5; +static void resetRegMaskBit(const TargetRegisterInfo *TRI, uint32_t *RegMask, + MCRegister Reg) { + for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) + RegMask[SubReg / 32] &= ~(1u << (SubReg % 32)); +} + +static uint32_t *regMaskFromTemplate(const TargetRegisterInfo *TRI, + MachineFunction &MF, + const uint32_t *BaseRegMask) { + uint32_t *RegMask = MF.allocateRegMask(); + unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs()); + memcpy(RegMask, BaseRegMask, sizeof(RegMask[0]) * RegMaskSize); + return RegMask; +} + SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; @@ -513,6 +528,22 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, for (auto &Reg : RegsToPass) Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); + bool HasFastCall = + (CLI.CB && isa(CLI.CB) && CLI.CB->hasFnAttr("bpf_fastcall")); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + if (HasFastCall) { + uint32_t *RegMask = regMaskFromTemplate( + TRI, MF, TRI->getCallPreservedMask(MF, CallingConv::PreserveAll)); + for (auto const &RegPair : RegsToPass) + resetRegMaskBit(TRI, RegMask, RegPair.first); + if (!CLI.CB->getType()->isVoidTy()) + resetRegMaskBit(TRI, RegMask, BPF::R0); + Ops.push_back(DAG.getRegisterMask(RegMask)); + } else { + Ops.push_back( + DAG.getRegisterMask(TRI->getCallPreservedMask(MF, 
CLI.CallConv))); + } + if (InGlue.getNode()) Ops.push_back(InGlue); diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td index 2ee630e29790f3..4baeeb017699d6 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.td +++ b/llvm/lib/Target/BPF/BPFInstrInfo.td @@ -677,9 +677,7 @@ let isBranch = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1 in { } // Jump and link -let isCall=1, hasDelaySlot=0, Uses = [R11], - // Potentially clobbered registers - Defs = [R0, R1, R2, R3, R4, R5] in { +let isCall=1, hasDelaySlot=0, Uses = [R11] in { def JAL : CALL<"call">; def JALX : CALLX<"callx">; } diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp index f0edf706bd8fd7..c41eab319dbb9b 100644 --- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp +++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp @@ -24,6 +24,8 @@ #include "BPFInstrInfo.h" #include "BPFTargetMachine.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -319,6 +321,7 @@ struct BPFMIPreEmitPeephole : public MachineFunctionPass { bool in16BitRange(int Num); bool eliminateRedundantMov(); bool adjustBranch(); + bool insertMissingCallerSavedSpills(); public: @@ -333,6 +336,7 @@ struct BPFMIPreEmitPeephole : public MachineFunctionPass { Changed = eliminateRedundantMov(); if (SupportGotol) Changed = adjustBranch() || Changed; + Changed |= insertMissingCallerSavedSpills(); return Changed; } }; @@ -596,6 +600,86 @@ bool BPFMIPreEmitPeephole::adjustBranch() { return Changed; } +static const unsigned CallerSavedRegs[] = {BPF::R0, BPF::R1, BPF::R2, + BPF::R3, BPF::R4, BPF::R5}; + +struct BPFFastCall { + MachineInstr *MI; + unsigned LiveCallerSavedRegs; +}; + +static void collectBPFFastCalls(const TargetRegisterInfo *TRI, + LivePhysRegs &LiveRegs, MachineBasicBlock &BB, + 
SmallVectorImpl &Calls) { + LiveRegs.init(*TRI); + LiveRegs.addLiveOuts(BB); + Calls.clear(); + for (MachineInstr &MI : llvm::reverse(BB)) { + if (MI.isCall()) { + unsigned LiveCallerSavedRegs = 0; + for (MCRegister R : CallerSavedRegs) { + bool DoSpillFill = !MI.definesRegister(R, TRI) && LiveRegs.contains(R); + if (!DoSpillFill) + continue; + LiveCallerSavedRegs |= 1 << R; + } + if (LiveCallerSavedRegs) + Calls.push_back({&MI, LiveCallerSavedRegs}); + } + LiveRegs.stepBackward(MI); + } +} + +static int64_t computeMinFixedObjOffset(MachineFrameInfo &MFI, + unsigned SlotSize) { + int64_t MinFixedObjOffset = 0; + // Same logic as in X86FrameLowering::adjustFrameForMsvcCxxEh() + for (int I = MFI.getObjectIndexBegin(); I < MFI.getObjectIndexEnd(); ++I) { + if (MFI.isDeadObjectIndex(I)) + continue; + MinFixedObjOffset = std::min(MinFixedObjOffset, MFI.getObjectOffset(I)); + } + MinFixedObjOffset -= + (SlotSize + MinFixedObjOffset % SlotSize) & (SlotSize - 1); + return MinFixedObjOffset; +} + +bool BPFMIPreEmitPeephole::insertMissingCallerSavedSpills() { + MachineFrameInfo &MFI = MF->getFrameInfo(); + SmallVector Calls; + LivePhysRegs LiveRegs; + const unsigned SlotSize = 8; + int64_t MinFixedObjOffset = computeMinFixedObjOffset(MFI, SlotSize); + bool Changed = false; + for (MachineBasicBlock &BB : *MF) { + collectBPFFastCalls(TRI, LiveRegs, BB, Calls); + Changed |= !Calls.empty(); + for (BPFFastCall &Call : Calls) { + int64_t CurOffset = MinFixedObjOffset; + for (MCRegister Reg : CallerSavedRegs) { + if (((1 << Reg) & Call.LiveCallerSavedRegs) == 0) + continue; + // Allocate stack object + CurOffset -= SlotSize; + MFI.CreateFixedSpillStackObject(SlotSize, CurOffset); + // Generate spill + BuildMI(BB, Call.MI->getIterator(), Call.MI->getDebugLoc(), + TII->get(BPF::STD)) + .addReg(Reg, RegState::Kill) + .addReg(BPF::R10) + .addImm(CurOffset); + // Generate fill + BuildMI(BB, ++Call.MI->getIterator(), Call.MI->getDebugLoc(), + TII->get(BPF::LDD)) + .addReg(Reg, 
RegState::Define) + .addReg(BPF::R10) + .addImm(CurOffset); + } + } + } + return Changed; +} + } // end default namespace INITIALIZE_PASS(BPFMIPreEmitPeephole, "bpf-mi-pemit-peephole", diff --git a/llvm/lib/Target/BPF/BPFRegisterInfo.cpp b/llvm/lib/Target/BPF/BPFRegisterInfo.cpp index 84af6806abb36c..69e1318954a973 100644 --- a/llvm/lib/Target/BPF/BPFRegisterInfo.cpp +++ b/llvm/lib/Target/BPF/BPFRegisterInfo.cpp @@ -40,6 +40,17 @@ BPFRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_SaveList; } +const uint32_t * +BPFRegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + switch (CC) { + default: + return CSR_RegMask; + case CallingConv::PreserveAll: + return CSR_PreserveAll_RegMask; + } +} + BitVector BPFRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); markSuperRegs(Reserved, BPF::W10); // [W|R]10 is read only frame pointer diff --git a/llvm/lib/Target/BPF/BPFRegisterInfo.h b/llvm/lib/Target/BPF/BPFRegisterInfo.h index f7dea75ebea6f9..db868769a1579a 100644 --- a/llvm/lib/Target/BPF/BPFRegisterInfo.h +++ b/llvm/lib/Target/BPF/BPFRegisterInfo.h @@ -26,6 +26,9 @@ struct BPFRegisterInfo : public BPFGenRegisterInfo { const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const override; + BitVector getReservedRegs(const MachineFunction &MF) const override; bool eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, diff --git a/llvm/test/CodeGen/BPF/bpf-fastcall-1.ll b/llvm/test/CodeGen/BPF/bpf-fastcall-1.ll new file mode 100644 index 00000000000000..fd81314a495ef8 --- /dev/null +++ b/llvm/test/CodeGen/BPF/bpf-fastcall-1.ll @@ -0,0 +1,46 @@ +; RUN: llc -O2 --march=bpfel %s -o - | FileCheck %s + +; Generated from the following C code: +; +; #define __bpf_fastcall __attribute__((bpf_fastcall)) +; +; void bar(void) __bpf_fastcall; +; void 
buz(long i, long j, long k); +; +; void foo(long i, long j, long k) { +; bar(); +; buz(i, j, k); +; } +; +; Using the following command: +; +; clang --target=bpf -emit-llvm -O2 -S -o - t.c +; +; (unnecessary attrs removed maually) + +; Check that function marked with bpf_fastcall does not clobber R1-R5. + +define dso_local void @foo(i64 noundef %i, i64 noundef %j, i64 noundef %k) { +entry: + tail call void @bar() #1 + tail call void @buz(i64 noundef %i, i64 noundef %j, i64 noundef %k) + ret void +} + +; CHECK: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: *(u64 *)(r10 - 8) = r1 +; CHECK-NEXT: *(u64 *)(r10 - 16) = r2 +; CHECK-NEXT: *(u64 *)(r10 - 24) = r3 +; CHECK-NEXT: call bar +; CHECK-NEXT: r3 = *(u64 *)(r10 - 24) +; CHECK-NEXT: r2 = *(u64 *)(r10 - 16) +; CHECK-NEXT: r1 = *(u64 *)(r10 - 8) +; CHECK-NEXT: call buz +; CHECK-NEXT: exit + +declare dso_local void @bar() #0 +declare dso_local void @buz(i64 noundef, i64 noundef, i64 noundef) + +attributes #0 = { "bpf_fastcall" } +attributes #1 = { nounwind "bpf_fastcall" } diff --git a/llvm/test/CodeGen/BPF/bpf-fastcall-2.ll b/llvm/test/CodeGen/BPF/bpf-fastcall-2.ll new file mode 100644 index 00000000000000..e3e29cdddca8ea --- /dev/null +++ b/llvm/test/CodeGen/BPF/bpf-fastcall-2.ll @@ -0,0 +1,68 @@ +; RUN: llc -O2 --march=bpfel %s -o - | FileCheck %s + +; Generated from the following C code: +; +; #define __bpf_fastcall __attribute__((bpf_fastcall)) +; +; void bar(void) __bpf_fastcall; +; void buz(long i, long j); +; +; void foo(long i, long j, long k, long l) { +; bar(); +; if (k > 42l) +; buz(i, 1); +; else +; buz(1, j); +; } +; +; Using the following command: +; +; clang --target=bpf -emit-llvm -O2 -S -o - t.c +; +; (unnecessary attrs removed maually) + +; Check that function marked with bpf_fastcall does not clobber R1-R5. +; Use R1 in one branch following call and R2 in another branch following call. 
+ +define dso_local void @foo(i64 noundef %i, i64 noundef %j, i64 noundef %k, i64 noundef %l) { +entry: + tail call void @bar() #0 + %cmp = icmp sgt i64 %k, 42 + br i1 %cmp, label %if.then, label %if.else + +if.then: + tail call void @buz(i64 noundef %i, i64 noundef 1) + br label %if.end + +if.else: + tail call void @buz(i64 noundef 1, i64 noundef %j) + br label %if.end + +if.end: + ret void +} + +; CHECK: foo: # @foo +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: *(u64 *)(r10 - 8) = r1 +; CHECK-NEXT: *(u64 *)(r10 - 16) = r2 +; CHECK-NEXT: *(u64 *)(r10 - 24) = r3 +; CHECK-NEXT: call bar +; CHECK-NEXT: r3 = *(u64 *)(r10 - 24) +; CHECK-NEXT: r2 = *(u64 *)(r10 - 16) +; CHECK-NEXT: r1 = *(u64 *)(r10 - 8) +; CHECK-NEXT: r4 = 43 +; CHECK-NEXT: if r4 s> r3 goto [[ELSE:.*]] +; CHECK-NEXT: # %bb.1: # %if.then +; CHECK-NEXT: r2 = 1 +; CHECK-NEXT: goto [[END:.*]] +; CHECK-NEXT: [[ELSE]]: # %if.else +; CHECK-NEXT: r1 = 1 +; CHECK-NEXT: [[END]]: # %if.end +; CHECK-NEXT: call buz +; CHECK-NEXT: exit + +declare dso_local void @bar() #0 +declare dso_local void @buz(i64 noundef, i64 noundef) + +attributes #0 = { "bpf_fastcall" } diff --git a/llvm/test/CodeGen/BPF/bpf-fastcall-3.ll b/llvm/test/CodeGen/BPF/bpf-fastcall-3.ll new file mode 100644 index 00000000000000..81ca4e1ac57bc7 --- /dev/null +++ b/llvm/test/CodeGen/BPF/bpf-fastcall-3.ll @@ -0,0 +1,62 @@ +; RUN: llc -O2 --march=bpfel %s -o - | FileCheck %s + +; Generated from the following C code: +; +; #define __bpf_fastcall __attribute__((bpf_fastcall)) +; +; void quux(void *); +; void bar(long) __bpf_fastcall; +; void buz(long i, long j); +; +; void foo(long i, long j) { +; long k; +; bar(i); +; bar(i); +; buz(i, j); +; quux(&k); +; } +; +; Using the following command: +; +; clang --target=bpf -emit-llvm -O2 -S -o - t.c +; +; (unnecessary attrs removed maually) + +; Check that function marked with bpf_fastcall does not clobber R1-R5. +; Check that spills/fills wrapping the call use and reuse lowest stack offsets. 
+ +define dso_local void @foo(i64 noundef %i, i64 noundef %j) { +entry: + %k = alloca i64, align 8 + tail call void @bar(i64 noundef %i) #0 + tail call void @bar(i64 noundef %i) #0 + tail call void @buz(i64 noundef %i, i64 noundef %j) + call void @quux(ptr noundef nonnull %k) + ret void +} + +; CHECK: # %bb.0: +; CHECK-NEXT: r3 = r1 +; CHECK-NEXT: *(u64 *)(r10 - 16) = r2 +; CHECK-NEXT: *(u64 *)(r10 - 24) = r3 +; CHECK-NEXT: call bar +; CHECK-NEXT: r3 = *(u64 *)(r10 - 24) +; CHECK-NEXT: r2 = *(u64 *)(r10 - 16) +; CHECK-NEXT: r1 = r3 +; CHECK-NEXT: *(u64 *)(r10 - 16) = r2 +; CHECK-NEXT: *(u64 *)(r10 - 24) = r3 +; CHECK-NEXT: call bar +; CHECK-NEXT: r3 = *(u64 *)(r10 - 24) +; CHECK-NEXT: r2 = *(u64 *)(r10 - 16) +; CHECK-NEXT: r1 = r3 +; CHECK-NEXT: call buz +; CHECK-NEXT: r1 = r10 +; CHECK-NEXT: r1 += -8 +; CHECK-NEXT: call quux +; CHECK-NEXT: exit + +declare dso_local void @bar(i64 noundef) #0 +declare dso_local void @buz(i64 noundef, i64 noundef) +declare dso_local void @quux(ptr noundef) + +attributes #0 = { "bpf_fastcall" } diff --git a/llvm/test/CodeGen/BPF/bpf-fastcall-regmask-1.ll b/llvm/test/CodeGen/BPF/bpf-fastcall-regmask-1.ll new file mode 100644 index 00000000000000..857d2f000d1d5a --- /dev/null +++ b/llvm/test/CodeGen/BPF/bpf-fastcall-regmask-1.ll @@ -0,0 +1,110 @@ +; RUN: llc -O2 --march=bpfel \ +; RUN: -print-after=stack-slot-coloring %s \ +; RUN: -o /dev/null 2>&1 | FileCheck %s + +; Generated from the following C code: +; +; #define __bpf_fastcall __attribute__((bpf_fastcall)) +; +; void bar1(void) __bpf_fastcall; +; void buz1(long i, long j, long k); +; void foo1(long i, long j, long k) { +; bar1(); +; buz1(i, j, k); +; } +; +; long bar2(void) __bpf_fastcall; +; void buz2(long i, long j, long k); +; void foo2(long i, long j, long k) { +; bar2(); +; buz2(i, j, k); +; } +; +; void bar3(long) __bpf_fastcall; +; void buz3(long i, long j, long k); +; void foo3(long i, long j, long k) { +; bar3(i); +; buz3(i, j, k); +; } +; +; long bar4(long, long) 
__bpf_fastcall; +; void buz4(long i, long j, long k); +; void foo4(long i, long j, long k) { +; bar4(i, j); +; buz4(i, j, k); +; } +; +; Using the following command: +; +; clang --target=bpf -emit-llvm -O2 -S -o - t.c +; +; (unnecessary attrs removed maually) + +; Check regmask for calls to functions marked with bpf_fastcall: +; - void function w/o parameters +; - non-void function w/o parameters +; - void function with parameters +; - non-void function with parameters + +declare dso_local void @bar1() #0 +declare dso_local void @buz1(i64 noundef, i64 noundef, i64 noundef) +define dso_local void @foo1(i64 noundef %i, i64 noundef %j, i64 noundef %k) { +entry: + tail call void @bar1() #1 + tail call void @buz1(i64 noundef %i, i64 noundef %j, i64 noundef %k) + ret void +} + +; CHECK: JAL @bar1, +; CHECK-SAME: , implicit $r11, implicit-def $r11 +; CHECK: JAL @buz1, +; CHECK-SAME: , implicit $r11, implicit $r1, implicit $r2, implicit $r3, implicit-def $r11 + +declare dso_local i64 @bar2() #0 +declare dso_local void @buz2(i64 noundef, i64 noundef, i64 noundef) +define dso_local void @foo2(i64 noundef %i, i64 noundef %j, i64 noundef %k) { +entry: + tail call i64 @bar2() #1 + tail call void @buz2(i64 noundef %i, i64 noundef %j, i64 noundef %k) + ret void +} + +; CHECK: JAL @bar2, +; CHECK-SAME: , implicit $r11, implicit-def $r11, implicit-def dead $r0 +; CHECK: JAL @buz2, +; CHECK-SAME: , implicit $r11, implicit $r1, implicit $r2, implicit $r3, implicit-def $r11 + +declare dso_local void @bar3(i64) #0 +declare dso_local void @buz3(i64 noundef, i64 noundef, i64 noundef) +define dso_local void @foo3(i64 noundef %i, i64 noundef %j, i64 noundef %k) { +entry: + tail call void @bar3(i64 noundef %i) #1 + tail call void @buz3(i64 noundef %i, i64 noundef %j, i64 noundef %k) + ret void +} + +; CHECK: JAL @bar3, +; CHECK-SAME: , implicit $r11, implicit $r1, implicit-def $r11 +; CHECK: JAL @buz3, +; CHECK-SAME: , implicit $r11, implicit $r1, implicit $r2, implicit $r3, implicit-def 
$r11 + +declare dso_local i64 @bar4(i64 noundef, i64 noundef) #0 +declare dso_local void @buz4(i64 noundef, i64 noundef, i64 noundef) +define dso_local void @foo4(i64 noundef %i, i64 noundef %j, i64 noundef %k) { +entry: + tail call i64 @bar4(i64 noundef %i, i64 noundef %j) #1 + tail call void @buz4(i64 noundef %i, i64 noundef %j, i64 noundef %k) + ret void +} + +; CHECK: JAL @bar4, +; CHECK-SAME: , implicit $r11, implicit $r1, implicit $r2, implicit-def $r11, implicit-def dead $r0 +; CHECK: JAL @buz4, +; CHECK-SAME: , implicit $r11, implicit $r1, implicit $r2, implicit $r3, implicit-def $r11 + +attributes #0 = { "bpf_fastcall" } +attributes #1 = { nounwind "bpf_fastcall" } From e2b97f3802ac5a75a603c9cacd2f3ab19b6cf9b5 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 21 Aug 2024 17:44:05 -0700 Subject: [PATCH 132/426] Revert "Speculative fix for asan/TestCases/Darwin/cstring_section.c" This fix is not enough, and the breaking patch was reverted with 2704b804bec50c2b016bf678bd534c330ec655b6. This reverts commit bf71c64839c0082e761a4f070ed92e01ced0187c. --- compiler-rt/test/asan/TestCases/Darwin/cstring_section.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/test/asan/TestCases/Darwin/cstring_section.c b/compiler-rt/test/asan/TestCases/Darwin/cstring_section.c index e40c4b1b8ed6ba..d72b0ba8a8bb33 100644 --- a/compiler-rt/test/asan/TestCases/Darwin/cstring_section.c +++ b/compiler-rt/test/asan/TestCases/Darwin/cstring_section.c @@ -6,10 +6,10 @@ // Check that "Hello.\n" is in __asan_cstring and not in __cstring. // CHECK: Contents of section {{.*}}__asan_cstring: // CHECK: 48656c6c {{.*}} Hello. -// CHECK: Contents of section {{.*}}__cstring: -// CHECK-NOT: 48656c6c {{.*}} Hello. // CHECK: Contents of section {{.*}}__const: // CHECK-NOT: 48656c6c {{.*}} Hello. +// CHECK: Contents of section {{.*}}__cstring: +// CHECK-NOT: 48656c6c {{.*}} Hello. 
int main(int argc, char *argv[]) { argv[0] = "Hello.\n"; From 359c704004ec0826059578c79974d9ea29a8fbff Mon Sep 17 00:00:00 2001 From: Shubham Sandeep Rastogi Date: Wed, 21 Aug 2024 17:52:37 -0700 Subject: [PATCH 133/426] Handle #dbg_values in SROA. (#94070) This patch properly handles #dbg_values in SROA by making sure that any #dbg_values get moved to before a store just like #dbg_declares do, or the #dbg_value is correctly updated with the right alloca after an aggregate alloca is broken up. The issue stems from swift where #dbg_values are emitted and not dbg.declares, the SROA pass doesn't handle the #dbg_values correctly and it causes them to all have undefs If we look at this simple-ish testcase (This is all I could reduce it down to, and I am still relatively bad at writing llvm IR by hand so I apologize in advance): ``` %T4main1TV13TangentVectorV = type <{ %T4main1UV13TangentVectorV, [7 x i8], %T4main1UV13TangentVectorV }> %T4main1UV13TangentVectorV = type <{ %T1M1SVySfG, [7 x i8], %T4main1VV13TangentVectorV }> %T1M1SVySfG = type <{ ptr, %Ts4Int8V }> %Ts4Int8V = type <{ i8 }> %T4main1VV13TangentVectorV = type <{ %T1M1SVySfG }> define hidden swiftcc void @"$s4main1TV13TangentVectorV1poiyA2E_AEtFZ"(ptr noalias nocapture sret(%T4main1TV13TangentVectorV) %0, ptr noalias nocapture dereferenceable(57) %1, ptr noalias nocapture dereferenceable(57) %2) #0 !dbg !44 { entry: %3 = alloca %T4main1VV13TangentVectorV %4 = alloca %T4main1UV13TangentVectorV %5 = alloca %T4main1VV13TangentVectorV %6 = alloca %T4main1UV13TangentVectorV %7 = alloca %T4main1VV13TangentVectorV %8 = alloca %T4main1UV13TangentVectorV %9 = alloca %T4main1VV13TangentVectorV %10 = alloca %T4main1UV13TangentVectorV call void @llvm.lifetime.start.p0(i64 9, ptr %3) call void @llvm.lifetime.start.p0(i64 25, ptr %4) call void @llvm.lifetime.start.p0(i64 9, ptr %5) call void @llvm.lifetime.start.p0(i64 25, ptr %6) call void @llvm.lifetime.start.p0(i64 9, ptr %7) call void @llvm.lifetime.start.p0(i64 25, 
ptr %8) call void @llvm.lifetime.start.p0(i64 9, ptr %9) call void @llvm.lifetime.start.p0(i64 25, ptr %10) %.u1 = getelementptr inbounds %T4main1TV13TangentVectorV, ptr %1, i32 0, i32 0 call void @llvm.memcpy.p0.p0.i64(ptr align 8 %4, ptr align 8 %.u1, i64 25, i1 false) %.u11 = getelementptr inbounds %T4main1TV13TangentVectorV, ptr %2, i32 0, i32 0 call void @llvm.memcpy.p0.p0.i64(ptr align 8 %6, ptr align 8 %.u11, i64 25, i1 false) call void @llvm.dbg.value(metadata ptr %4, metadata !62, metadata !DIExpression(DW_OP_deref)), !dbg !75 %.s = getelementptr inbounds %T4main1UV13TangentVectorV, ptr %4, i32 0, i32 0 %.s.c = getelementptr inbounds %T1M1SVySfG, ptr %.s, i32 0, i32 0 %11 = load ptr, ptr %.s.c %.s.b = getelementptr inbounds %T1M1SVySfG, ptr %.s, i32 0, i32 1 %.s.b._value = getelementptr inbounds %Ts4Int8V, ptr %.s.b, i32 0, i32 0 %12 = load i8, ptr %.s.b._value %.s2 = getelementptr inbounds %T4main1UV13TangentVectorV, ptr %6, i32 0, i32 0 %.s2.c = getelementptr inbounds %T1M1SVySfG, ptr %.s2, i32 0, i32 0 %13 = load ptr, ptr %.s2.c %.s2.b = getelementptr inbounds %T1M1SVySfG, ptr %.s2, i32 0, i32 1 %.s2.b._value = getelementptr inbounds %Ts4Int8V, ptr %.s2.b, i32 0, i32 0 %14 = load i8, ptr %.s2.b._value %.v = getelementptr inbounds %T4main1UV13TangentVectorV, ptr %4, i32 0, i32 2 call void @llvm.memcpy.p0.p0.i64(ptr align 8 %3, ptr align 8 %.v, i64 9, i1 false) %.v3 = getelementptr inbounds %T4main1UV13TangentVectorV, ptr %6, i32 0, i32 2 call void @llvm.memcpy.p0.p0.i64(ptr align 8 %5, ptr align 8 %.v3, i64 9, i1 false) %.s4 = getelementptr inbounds %T4main1VV13TangentVectorV, ptr %3, i32 0, i32 0 %.s4.c = getelementptr inbounds %T1M1SVySfG, ptr %.s4, i32 0, i32 0 %18 = load ptr, ptr %.s4.c %.s5 = getelementptr inbounds %T4main1VV13TangentVectorV, ptr %5, i32 0, i32 0 %.s5.c = getelementptr inbounds %T1M1SVySfG, ptr %.s5, i32 0, i32 0 %20 = load ptr, ptr %.s5.c %.u2 = getelementptr inbounds %T4main1TV13TangentVectorV, ptr %1, i32 0, i32 2 call void 
@llvm.memcpy.p0.p0.i64(ptr align 8 %8, ptr align 8 %.u2, i64 25, i1 false) %.u26 = getelementptr inbounds %T4main1TV13TangentVectorV, ptr %2, i32 0, i32 2 call void @llvm.memcpy.p0.p0.i64(ptr align 8 %10, ptr align 8 %.u26, i64 25, i1 false) %.s7 = getelementptr inbounds %T4main1UV13TangentVectorV, ptr %8, i32 0, i32 0 %.s7.c = getelementptr inbounds %T1M1SVySfG, ptr %.s7, i32 0, i32 0 %25 = load ptr, ptr %.s7.c %.s7.b = getelementptr inbounds %T1M1SVySfG, ptr %.s7, i32 0, i32 1 %.s7.b._value = getelementptr inbounds %Ts4Int8V, ptr %.s7.b, i32 0, i32 0 %26 = load i8, ptr %.s7.b._value %.s8 = getelementptr inbounds %T4main1UV13TangentVectorV, ptr %10, i32 0, i32 0 %.s8.c = getelementptr inbounds %T1M1SVySfG, ptr %.s8, i32 0, i32 0 %27 = load ptr, ptr %.s8.c %.s8.b = getelementptr inbounds %T1M1SVySfG, ptr %.s8, i32 0, i32 1 %.s8.b._value = getelementptr inbounds %Ts4Int8V, ptr %.s8.b, i32 0, i32 0 %28 = load i8, ptr %.s8.b._value %.v9 = getelementptr inbounds %T4main1UV13TangentVectorV, ptr %8, i32 0, i32 2 call void @llvm.memcpy.p0.p0.i64(ptr align 8 %7, ptr align 8 %.v9, i64 9, i1 false) %.v10 = getelementptr inbounds %T4main1UV13TangentVectorV, ptr %10, i32 0, i32 2 call void @llvm.memcpy.p0.p0.i64(ptr align 8 %9, ptr align 8 %.v10, i64 9, i1 false) %.s11 = getelementptr inbounds %T4main1VV13TangentVectorV, ptr %7, i32 0, i32 0 %.s11.c = getelementptr inbounds %T1M1SVySfG, ptr %.s11, i32 0, i32 0 %32 = load ptr, ptr %.s11.c %.s12 = getelementptr inbounds %T4main1VV13TangentVectorV, ptr %9, i32 0, i32 0 %.s12.c = getelementptr inbounds %T1M1SVySfG, ptr %.s12, i32 0, i32 0 %34 = load ptr, ptr %.s12.c call void @llvm.lifetime.end.p0(i64 25, ptr %10) call void @llvm.lifetime.end.p0(i64 9, ptr %9) call void @llvm.lifetime.end.p0(i64 25, ptr %8) call void @llvm.lifetime.end.p0(i64 9, ptr %7) call void @llvm.lifetime.end.p0(i64 25, ptr %6) call void @llvm.lifetime.end.p0(i64 9, ptr %5) call void @llvm.lifetime.end.p0(i64 25, ptr %4) call void @llvm.lifetime.end.p0(i64 
9, ptr %3) ret void } !llvm.module.flags = !{!0, !1, !2, !3, !4, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15} !swift.module.flags = !{!33} !llvm.linker.options = !{!34, !35, !36, !37, !38, !39, !40, !41, !42, !43} !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 14, i32 4]} !1 = !{i32 1, !"Objective-C Version", i32 2} !2 = !{i32 1, !"Objective-C Image Info Version", i32 0} !3 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, no_dead_strip"} !4 = !{i32 1, !"Objective-C Garbage Collection", i8 0} !6 = !{i32 7, !"Dwarf Version", i32 4} !7 = !{i32 2, !"Debug Info Version", i32 3} !8 = !{i32 1, !"wchar_size", i32 4} !9 = !{i32 8, !"PIC Level", i32 2} !10 = !{i32 7, !"uwtable", i32 1} !11 = !{i32 7, !"frame-pointer", i32 1} !12 = !{i32 1, !"Swift Version", i32 7} !13 = !{i32 1, !"Swift ABI Version", i32 7} !14 = !{i32 1, !"Swift Major Version", i8 6} !15 = !{i32 1, !"Swift Minor Version", i8 0} !16 = distinct !DICompileUnit(language: DW_LANG_Swift, file: !17, imports: !18, sdk: "MacOSX14.4.sdk") !17 = !DIFile(filename: "/Users/emilpedersen/swift2/swift/test/IRGen/debug_scope_distinct.swift", directory: "/Users/emilpedersen/swift2") !18 = !{!19, !21, !23, !25, !27, !29, !31} !19 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !17, entity: !20, file: !17) !20 = !DIModule(scope: null, name: "main", includePath: "/Users/emilpedersen/swift2/swift/test/IRGen") !21 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !17, entity: !22, file: !17) !22 = !DIModule(scope: null, name: "Swift", includePath: "/Users/emilpedersen/swift2/_build/Ninja-RelWithDebInfoAssert+stdlib-RelWithDebInfo/swift-macosx-arm64/lib/swift/macosx/Swift.swiftmodule/arm64-apple-macos.swiftmodule") !23 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !17, entity: !24, line: 60) !24 = !DIModule(scope: null, name: "_Differentiation", includePath: 
"/Users/emilpedersen/swift2/_build/Ninja-RelWithDebInfoAssert+stdlib-RelWithDebInfo/swift-macosx-arm64/lib/swift/macosx/_Differentiation.swiftmodule/arm64-apple-macos.swiftmodule") !25 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !17, entity: !26, line: 61) !26 = !DIModule(scope: null, name: "M", includePath: "/Users/emilpedersen/swift2/_build/Ninja-RelWithDebInfoAssert+stdlib-RelWithDebInfo/swift-macosx-arm64/test-macosx-arm64/IRGen/Output/debug_scope_distinct.swift.tmp/M.swiftmodule") !27 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !17, entity: !28, file: !17) !28 = !DIModule(scope: null, name: "_StringProcessing", includePath: "/Users/emilpedersen/swift2/_build/Ninja-RelWithDebInfoAssert+stdlib-RelWithDebInfo/swift-macosx-arm64/lib/swift/macosx/_StringProcessing.swiftmodule/arm64-apple-macos.swiftmodule") !29 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !17, entity: !30, file: !17) !30 = !DIModule(scope: null, name: "_SwiftConcurrencyShims", includePath: "/Users/emilpedersen/swift2/_build/Ninja-RelWithDebInfoAssert+stdlib-RelWithDebInfo/swift-macosx-arm64/lib/swift/shims") !31 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !17, entity: !32, file: !17) !32 = !DIModule(scope: null, name: "_Concurrency", includePath: "/Users/emilpedersen/swift2/_build/Ninja-RelWithDebInfoAssert+stdlib-RelWithDebInfo/swift-macosx-arm64/lib/swift/macosx/_Concurrency.swiftmodule/arm64-apple-macos.swiftmodule") !33 = !{i1 false} !34 = !{!"-lswiftCore"} !35 = !{!"-lswift_StringProcessing"} !36 = !{!"-lswift_Differentiation"} !37 = !{!"-lswiftDarwin"} !38 = !{!"-lswift_Concurrency"} !39 = !{!"-lswiftSwiftOnoneSupport"} !40 = !{!"-lobjc"} !41 = !{!"-lswiftCompatibilityConcurrency"} !42 = !{!"-lswiftCompatibility56"} !43 = !{!"-lswiftCompatibilityPacks"} !44 = distinct !DISubprogram( unit: !16, declaration: !52, retainedNodes: !53) !45 = !DIFile(filename: "", directory: "/") !46 = !DICompositeType(tag: DW_TAG_structure_type, scope: !47, elements: 
!48, identifier: "$s4main1TV13TangentVectorVD") !47 = !DICompositeType(tag: DW_TAG_structure_type, identifier: "$s4main1TVD") !48 = !{} !49 = !DISubroutineType(types: !50) !50 = !{!51} !51 = !DICompositeType(tag: DW_TAG_structure_type, identifier: "$s4main1TV13TangentVectorVXMtD") !52 = !DISubprogram( file: !45, type: !49, spFlags: DISPFlagOptimized) !53 = !{!54, !56, !57} !54 = !DILocalVariable( scope: !44, type: !55, flags: DIFlagArtificial) !55 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !46) !56 = !DILocalVariable( scope: !44, flags: DIFlagArtificial) !57 = !DILocalVariable( scope: !44, type: !58, flags: DIFlagArtificial) !58 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !51) !62 = !DILocalVariable( scope: !63, type: !72, flags: DIFlagArtificial) !63 = distinct !DISubprogram( type: !66, unit: !16, declaration: !69, retainedNodes: !70) !64 = !DICompositeType(tag: DW_TAG_structure_type, scope: !65, identifier: "$s4main1UV13TangentVectorVD") !65 = !DICompositeType(tag: DW_TAG_structure_type, identifier: "$s4main1UVD") !66 = !DISubroutineType(types: !67) !67 = !{!68} !68 = !DICompositeType(tag: DW_TAG_structure_type, identifier: "$s4main1UV13TangentVectorVXMtD") !69 = !DISubprogram( spFlags: DISPFlagOptimized) !70 = !{!71, !73} !71 = !DILocalVariable( scope: !63, flags: DIFlagArtificial) !72 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !64) !73 = !DILocalVariable( scope: !63, type: !74, flags: DIFlagArtificial) !74 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !68) !75 = !DILocation( scope: !63, inlinedAt: !76) !76 = distinct !DILocation( scope: !44) ``` if we run ` opt -S -passes=sroa file.ll -o -` With this patch we will see ``` %.sroa.5.sroa.021 = alloca [7 x i8], align 8 tail call void @llvm.dbg.value(metadata ptr %.sroa.5.sroa.021, metadata !59, metadata !DIExpression(DW_OP_deref, DW_OP_LLVM_fragment, 72, 56)), !dbg !72 %.sroa.5.sroa.014 = alloca [7 x i8], align 8 ``` Without this patch we will see: ``` %.sroa.5.sroa.021 = alloca [7 x 
i8], align 8 %.sroa.5.sroa.014 = alloca [7 x i8], align 8 ``` Thus this patch ensures that llvm.dbg.values that use allocas that are broken up still have the correct metadata and debug information is preserved This is part of a stack of patches and is preceded by: https://github.com/llvm/llvm-project/pull/94068 --- llvm/include/llvm/IR/DebugInfo.h | 2 + .../include/llvm/IR/DebugProgramInstruction.h | 5 + llvm/include/llvm/IR/IntrinsicInst.h | 8 ++ llvm/include/llvm/Transforms/Utils/Local.h | 10 ++ llvm/lib/IR/DebugInfo.cpp | 21 ++- llvm/lib/Transforms/Scalar/SROA.cpp | 23 ++-- llvm/lib/Transforms/Utils/Local.cpp | 34 +++++ .../Utils/PromoteMemoryToRegister.cpp | 4 + .../Generic/mem2reg-promote-alloca-1.ll | 2 +- llvm/test/DebugInfo/sroa-handle-dbg-value.ll | 110 ++++++++++++++++ llvm/test/Transforms/SROA/alignment.ll | 56 ++++---- llvm/test/Transforms/SROA/vector-promotion.ll | 120 +++++++++++------- 12 files changed, 313 insertions(+), 82 deletions(-) create mode 100644 llvm/test/DebugInfo/sroa-handle-dbg-value.ll diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h index 5b80218d6c5ccd..73f45c3769be44 100644 --- a/llvm/include/llvm/IR/DebugInfo.h +++ b/llvm/include/llvm/IR/DebugInfo.h @@ -43,6 +43,8 @@ class Module; TinyPtrVector findDbgDeclares(Value *V); /// As above, for DVRDeclares. TinyPtrVector findDVRDeclares(Value *V); +/// As above, for DVRValues. +TinyPtrVector findDVRValues(Value *V); /// Finds the llvm.dbg.value intrinsics describing a value. void findDbgValues( diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h index 8d7427cc67e2d9..e6dd1e979794e2 100644 --- a/llvm/include/llvm/IR/DebugProgramInstruction.h +++ b/llvm/include/llvm/IR/DebugProgramInstruction.h @@ -427,6 +427,11 @@ class DbgVariableRecord : public DbgRecord, protected DebugValueUser { /// Does this describe the address of a local variable. 
True for dbg.addr /// and dbg.declare, but not dbg.value, which describes its value. bool isAddressOfVariable() const { return Type == LocationType::Declare; } + + /// Determine if this describes the value of a local variable. It is false for + /// dbg.declare, but true for dbg.value, which describes its value. + bool isValueOfVariable() const { return Type == LocationType::Value; } + LocationType getType() const { return Type; } void setKillLocation(); diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index 2f1e2c08c3ecec..c188bec631a239 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -344,6 +344,14 @@ class DbgVariableIntrinsic : public DbgInfoIntrinsic { return getIntrinsicID() == Intrinsic::dbg_declare; } + /// Determine if this describes the value of a local variable. It is true for + /// dbg.value, but false for dbg.declare, which describes its address, and + /// false for dbg.assign, which describes a combination of the variable's + /// value and address. + bool isValueOfVariable() const { + return getIntrinsicID() == Intrinsic::dbg_value; + } + void setKillLocation() { // TODO: When/if we remove duplicate values from DIArgLists, we don't need // this set anymore. diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h index b17ff6539a25a4..bbf29e6f46b47b 100644 --- a/llvm/include/llvm/Transforms/Utils/Local.h +++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -259,6 +259,16 @@ CallInst *changeToCall(InvokeInst *II, DomTreeUpdater *DTU = nullptr); /// Dbg Intrinsic utilities /// +/// Creates and inserts a dbg_value record intrinsic before a store +/// that has an associated llvm.dbg.value intrinsic. +void InsertDebugValueAtStoreLoc(DbgVariableRecord *DVR, StoreInst *SI, + DIBuilder &Builder); + +/// Creates and inserts an llvm.dbg.value intrinsic before a store +/// that has an associated llvm.dbg.value intrinsic. 
+void InsertDebugValueAtStoreLoc(DbgVariableIntrinsic *DII, StoreInst *SI, + DIBuilder &Builder); + /// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value /// that has an associated llvm.dbg.declare intrinsic. void ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 7fa1f9696d43b2..e50b6f6335ef5f 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -46,7 +46,7 @@ using namespace llvm::dwarf; TinyPtrVector llvm::findDbgDeclares(Value *V) { // This function is hot. Check whether the value has any metadata to avoid a - // DenseMap lookup. + // DenseMap lookup. This check is a bitfield datamember lookup. if (!V->isUsedByMetadata()) return {}; auto *L = LocalAsMetadata::getIfExists(V); @@ -65,7 +65,7 @@ TinyPtrVector llvm::findDbgDeclares(Value *V) { } TinyPtrVector llvm::findDVRDeclares(Value *V) { // This function is hot. Check whether the value has any metadata to avoid a - // DenseMap lookup. + // DenseMap lookup. This check is a bitfield datamember lookup. if (!V->isUsedByMetadata()) return {}; auto *L = LocalAsMetadata::getIfExists(V); @@ -80,6 +80,23 @@ TinyPtrVector llvm::findDVRDeclares(Value *V) { return Declares; } +TinyPtrVector llvm::findDVRValues(Value *V) { + // This function is hot. Check whether the value has any metadata to avoid a + // DenseMap lookup. This check is a bitfield datamember lookup. 
+ if (!V->isUsedByMetadata()) + return {}; + auto *L = LocalAsMetadata::getIfExists(V); + if (!L) + return {}; + + TinyPtrVector Values; + for (DbgVariableRecord *DVR : L->getAllDbgVariableRecordUsers()) + if (DVR->isValueOfVariable()) + Values.push_back(DVR); + + return Values; +} + template static void findDbgIntrinsics(SmallVectorImpl &Result, Value *V, diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index c738a2a6f39a45..26b62cb79cdedf 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -4977,8 +4977,6 @@ const Value *getAddress(const DbgVariableIntrinsic *DVI) { } const Value *getAddress(const DbgVariableRecord *DVR) { - assert(DVR->getType() == DbgVariableRecord::LocationType::Declare || - DVR->getType() == DbgVariableRecord::LocationType::Assign); return DVR->getAddress(); } @@ -4989,8 +4987,6 @@ bool isKillAddress(const DbgVariableIntrinsic *DVI) { } bool isKillAddress(const DbgVariableRecord *DVR) { - assert(DVR->getType() == DbgVariableRecord::LocationType::Declare || - DVR->getType() == DbgVariableRecord::LocationType::Assign); if (DVR->getType() == DbgVariableRecord::LocationType::Assign) return DVR->isKillAddress(); return DVR->isKillLocation(); @@ -5003,8 +4999,6 @@ const DIExpression *getAddressExpression(const DbgVariableIntrinsic *DVI) { } const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) { - assert(DVR->getType() == DbgVariableRecord::LocationType::Declare || - DVR->getType() == DbgVariableRecord::LocationType::Assign); if (DVR->getType() == DbgVariableRecord::LocationType::Assign) return DVR->getAddressExpression(); return DVR->getExpression(); @@ -5187,6 +5181,19 @@ insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr, return; } + if (Orig->isDbgValue()) { + DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord( + NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc()); + // Drop debug 
information if the expression doesn't start with a + // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value + // describes the address of alloca rather than the value inside the alloca. + if (!NewFragmentExpr->startsWithDeref()) + DVR->setKillAddress(); + BeforeInst->getParent()->insertDbgRecordBefore(DVR, + BeforeInst->getIterator()); + return; + } + // Apply a DIAssignID to the store if it doesn't already have it. if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) { NewAddr->setMetadata(LLVMContext::MD_DIAssignID, @@ -5389,7 +5396,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { }; for_each(findDbgDeclares(Fragment.Alloca), RemoveOne); for_each(findDVRDeclares(Fragment.Alloca), RemoveOne); - + for_each(findDVRValues(Fragment.Alloca), RemoveOne); insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI, NewDbgFragment, BitExtractOffset); } @@ -5399,6 +5406,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { // and the individual partitions. for_each(findDbgDeclares(&AI), MigrateOne); for_each(findDVRDeclares(&AI), MigrateOne); + for_each(findDVRValues(&AI), MigrateOne); for_each(at::getAssignmentMarkers(&AI), MigrateOne); for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne); @@ -5545,7 +5553,6 @@ bool SROA::deleteDeadInstructions( } return Changed; } - /// Promote the allocas, using the best available technique. /// /// This attempts to promote whatever allocas have been identified as viable in diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index efb02fdec56d7e..d3710de1964ece 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1731,6 +1731,26 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, SI->getIterator()); } +static DIExpression *dropInitialDeref(const DIExpression *DIExpr) { + int NumEltDropped = DIExpr->getElements()[0] == dwarf::DW_OP_LLVM_arg ? 
3 : 1; + return DIExpression::get(DIExpr->getContext(), + DIExpr->getElements().drop_front(NumEltDropped)); +} + +void llvm::InsertDebugValueAtStoreLoc(DbgVariableIntrinsic *DII, StoreInst *SI, + DIBuilder &Builder) { + auto *DIVar = DII->getVariable(); + assert(DIVar && "Missing variable"); + auto *DIExpr = DII->getExpression(); + DIExpr = dropInitialDeref(DIExpr); + Value *DV = SI->getValueOperand(); + + DebugLoc NewLoc = getDebugValueLoc(DII); + + insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc, + SI->getIterator()); +} + /// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value /// that has an associated llvm.dbg.declare intrinsic. void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, @@ -1805,6 +1825,20 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, SI->getParent()->insertDbgRecordBefore(NewDVR, SI->getIterator()); } +void llvm::InsertDebugValueAtStoreLoc(DbgVariableRecord *DVR, StoreInst *SI, + DIBuilder &Builder) { + auto *DIVar = DVR->getVariable(); + assert(DIVar && "Missing variable"); + auto *DIExpr = DVR->getExpression(); + DIExpr = dropInitialDeref(DIExpr); + Value *DV = SI->getValueOperand(); + + DebugLoc NewLoc = getDebugValueLoc(DVR); + + insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc, + SI->getIterator()); +} + /// Inserts a llvm.dbg.value intrinsic after a phi that has an associated /// llvm.dbg.declare intrinsic. 
void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index cfae63405966ff..5251eb86bca926 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -596,6 +596,10 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, if (DbgItem->isAddressOfVariable()) { ConvertDebugDeclareToDebugValue(DbgItem, Info.OnlyStore, DIB); DbgItem->eraseFromParent(); + } else if (DbgItem->isValueOfVariable() && + DbgItem->getExpression()->startsWithDeref()) { + InsertDebugValueAtStoreLoc(DbgItem, Info.OnlyStore, DIB); + DbgItem->eraseFromParent(); } else if (DbgItem->getExpression()->startsWithDeref()) { DbgItem->eraseFromParent(); } diff --git a/llvm/test/DebugInfo/Generic/mem2reg-promote-alloca-1.ll b/llvm/test/DebugInfo/Generic/mem2reg-promote-alloca-1.ll index 3d469965d1cfa2..d76dcfe317b31f 100644 --- a/llvm/test/DebugInfo/Generic/mem2reg-promote-alloca-1.ll +++ b/llvm/test/DebugInfo/Generic/mem2reg-promote-alloca-1.ll @@ -21,7 +21,7 @@ ; CHECK: define dso_local void @fun(i32 %param) ; CHECK-NEXT: entry: ; CHECK-NEXT: #dbg_value(i32 %param, ![[PARAM:[0-9]+]], !DIExpression(), -; CHECK-NOT: #dbg_value({{.*}}, ![[PARAM]] +; CHECK-NEXT: #dbg_value(i32 %param, ![[PARAM]], !DIExpression(), ; CHECK: ![[PARAM]] = !DILocalVariable(name: "param", @g = dso_local global i32 0, align 4, !dbg !0 diff --git a/llvm/test/DebugInfo/sroa-handle-dbg-value.ll b/llvm/test/DebugInfo/sroa-handle-dbg-value.ll new file mode 100644 index 00000000000000..dc9abde884b376 --- /dev/null +++ b/llvm/test/DebugInfo/sroa-handle-dbg-value.ll @@ -0,0 +1,110 @@ +; This test was obtained from swift source code and then automatically reducing it via Delta. +; The swift source code was from the test test/DebugInfo/debug_scope_distinct.swift. 
+ +; RUN: opt %s -S -p=sroa -o - | FileCheck %s + +; CHECK: [[SROA_5_SROA_21:%.*]] = alloca [7 x i8], align 8 +; CHECK-NEXT: #dbg_value(ptr [[SROA_5_SROA_21]], !59, !DIExpression(DW_OP_deref, DW_OP_LLVM_fragment, 72, 56), [[DBG72:![0-9]+]]) + +; CHECK: #dbg_value(ptr [[REG1:%[0-9]+]], [[META54:![0-9]+]], !DIExpression(DW_OP_deref), [[DBG78:![0-9]+]]) +; CHECK-NEXT: #dbg_value(ptr [[REG2:%[0-9]+]], [[META56:![0-9]+]], !DIExpression(DW_OP_deref), [[DBG78]]) +; CHECK-NEXT: #dbg_value(i64 0, [[META57:![0-9]+]], !DIExpression(), [[DBG78]]) + +; CHECK: [[SROA_418_SROA_COPYLOAD:%.*]] = load i8, ptr [[SROA_418_0_U1_IDX:%.*]], align 8, !dbg [[DBG78]] +; CHECK-NEXT #dbg_value(i8 [[SROA_418_SROA_COPYLOAD]], [[META59]], !DIExpression(DW_OP_deref, DW_OP_LLVM_fragment, 64, 8), [[DBG72]]) + +%T4main1TV13TangentVectorV = type <{ %T4main1UV13TangentVectorV, [7 x i8], %T4main1UV13TangentVectorV }> +%T4main1UV13TangentVectorV = type <{ %T1M1SVySfG, [7 x i8], %T4main1VV13TangentVectorV }> +%T1M1SVySfG = type <{ ptr, %Ts4Int8V }> +%Ts4Int8V = type <{ i8 }> +%T4main1VV13TangentVectorV = type <{ %T1M1SVySfG }> +define hidden swiftcc void @"$s4main1TV13TangentVectorV1poiyA2E_AEtFZ"(ptr noalias nocapture sret(%T4main1TV13TangentVectorV) %0, ptr noalias nocapture dereferenceable(57) %1, ptr noalias nocapture dereferenceable(57) %2) #0 !dbg !44 { +entry: + %3 = alloca %T4main1VV13TangentVectorV + %4 = alloca %T4main1UV13TangentVectorV + call void @llvm.dbg.value(metadata ptr %1, metadata !54, metadata !DIExpression(DW_OP_deref)), !dbg !61 + call void @llvm.dbg.value(metadata ptr %2, metadata !56, metadata !DIExpression(DW_OP_deref)), !dbg !61 + call void @llvm.dbg.value(metadata i64 0, metadata !57, metadata !DIExpression()), !dbg !61 + %.u1 = getelementptr inbounds %T4main1TV13TangentVectorV, ptr %1, i32 0, i32 0 + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %4, ptr align 8 %.u1, i64 25, i1 false), !dbg !61 + call void @llvm.dbg.value(metadata ptr %4, metadata !62, metadata 
!DIExpression(DW_OP_deref)), !dbg !75 + %.s = getelementptr inbounds %T4main1UV13TangentVectorV, ptr %4, i32 0, i32 0 + %.s.b = getelementptr inbounds %T1M1SVySfG, ptr %.s, i32 0, i32 1 + %.s.b._value = getelementptr inbounds %Ts4Int8V, ptr %.s.b, i32 0, i32 0 + %12 = load i8, ptr %.s.b._value + %.v = getelementptr inbounds %T4main1UV13TangentVectorV, ptr %4, i32 0, i32 2 + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %3, ptr align 8 %.v, i64 9, i1 false) + %.s4 = getelementptr inbounds %T4main1VV13TangentVectorV, ptr %3, i32 0, i32 0 + %.s4.c = getelementptr inbounds %T1M1SVySfG, ptr %.s4, i32 0, i32 0 + %18 = load ptr, ptr %.s4.c + ret void +} +!llvm.module.flags = !{!0, !1, !2, !3, !4, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15} +!swift.module.flags = !{!33} +!llvm.linker.options = !{!34, !35, !36, !37, !38, !39, !40, !41, !42, !43} +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 14, i32 4]} +!1 = !{i32 1, !"Objective-C Version", i32 2} +!2 = !{i32 1, !"Objective-C Image Info Version", i32 0} +!3 = !{i32 1, !"Objective-C Image Info Section", !"__DATA,no_dead_strip"} +!4 = !{i32 1, !"Objective-C Garbage Collection", i8 0} +!6 = !{i32 7, !"Dwarf Version", i32 4} +!7 = !{i32 2, !"Debug Info Version", i32 3} +!8 = !{i32 1, !"wchar_size", i32 4} +!9 = !{i32 8, !"PIC Level", i32 2} +!10 = !{i32 7, !"uwtable", i32 1} +!11 = !{i32 7, !"frame-pointer", i32 1} +!12 = !{i32 1, !"Swift Version", i32 7} +!13 = !{i32 1, !"Swift ABI Version", i32 7} +!14 = !{i32 1, !"Swift Major Version", i8 6} +!15 = !{i32 1, !"Swift Minor Version", i8 0} +!16 = distinct !DICompileUnit(language: DW_LANG_Swift, file: !17, imports: !18, sdk: "MacOSX14.4.sdk") +!17 = !DIFile(filename: "swift/swift/test/IRGen/debug_scope_distinct.swift", directory: "swift") +!18 = !{!19, !21, !23, !25, !27, !29, !31} +!19 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !17, entity: !20, file: !17) +!20 = !DIModule(scope: null, name: "main", includePath: "swift/swift/test/IRGen") +!21 = 
!DIImportedEntity(tag: DW_TAG_imported_module, scope: !17, entity: !22, file: !17) +!22 = !DIModule(scope: null, name: "Swift", includePath: "swift/_build/Ninja-RelWithDebInfoAssert+stdlib-RelWithDebInfo/swift-macosx-arm64/lib/swift/macosx/Swift.swiftmodule/arm64-apple-macos.swiftmodule") +!23 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !17, entity: !24, line: 60) +!24 = !DIModule(scope: null, name: "_Differentiation", includePath: "swift/_build/Ninja-RelWithDebInfoAssert+stdlib-RelWithDebInfo/swift-macosx-arm64/lib/swift/macosx/_Differentiation.swiftmodule/arm64-apple-macos.swiftmodule") +!25 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !17, entity: !26, line: 61) +!26 = !DIModule(scope: null, name: "M", includePath: "swift/_build/Ninja-RelWithDebInfoAssert+stdlib-RelWithDebInfo/swift-macosx-arm64/test-macosx-arm64/IRGen/Output/debug_scope_distinct.swift.tmp/M.swiftmodule") +!27 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !17, entity: !28, file: !17) +!28 = !DIModule(scope: null, name: "_StringProcessing", includePath: "swift/_build/Ninja-RelWithDebInfoAssert+stdlib-RelWithDebInfo/swift-macosx-arm64/lib/swift/macosx/_StringProcessing.swiftmodule/arm64-apple-macos.swiftmodule") +!29 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !17, entity: !30, file: !17) +!30 = !DIModule(scope: null, name: "_SwiftConcurrencyShims", includePath: "swift/_build/Ninja-RelWithDebInfoAssert+stdlib-RelWithDebInfo/swift-macosx-arm64/lib/swift/shims") +!31 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !17, entity: !32, file: !17) +!32 = !DIModule(scope: null, name: "_Concurrency", includePath: "swift/_build/Ninja-RelWithDebInfoAssert+stdlib-RelWithDebInfo/swift-macosx-arm64/lib/swift/macosx/_Concurrency.swiftmodule/arm64-apple-macos.swiftmodule") +!33 = !{ i1 false} +!34 = !{!"-lswiftCore"} +!35 = !{!"-lswift_StringProcessing"} +!36 = !{!"-lswift_Differentiation"} +!37 = !{!"-lswiftDarwin"} +!38 = !{!"-lswift_Concurrency"} +!39 = 
!{!"-lswiftSwiftOnoneSupport"} +!40 = !{!"-lobjc"} +!41 = !{!"-lswiftCompatibilityConcurrency"} +!42 = !{!"-lswiftCompatibility56"} +!43 = !{!"-lswiftCompatibilityPacks"} +!44 = distinct !DISubprogram(file: !45, type: !49, unit: !16, declaration: !52, retainedNodes: !53) +!45 = !DIFile(filename: "", directory: "/") +!46 = !DICompositeType(tag: DW_TAG_structure_type, scope: !47, elements: !48, identifier: "$s4main1TV13TangentVectorVD") +!47 = !DICompositeType(tag: DW_TAG_structure_type, identifier: "$s4main1TVD") +!48 = !{} +!49 = !DISubroutineType(types: !50) +!50 = !{ !51} +!51 = !DICompositeType(tag: DW_TAG_structure_type, identifier: "$s4main1TV13TangentVectorVXMtD") +!52 = !DISubprogram(spFlags: DISPFlagOptimized) +!53 = !{!54, !56, !57} +!54 = !DILocalVariable(name: "a", scope: !44, flags: DIFlagArtificial) +!55 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !46) +!56 = !DILocalVariable(name: "b", scope: !44, type: !55, flags: DIFlagArtificial) +!57 = !DILocalVariable(name: "c", scope: !44, type: !58, flags: DIFlagArtificial) +!58 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !51) +!61 = !DILocation(scope: !44) +!62 = !DILocalVariable(name: "d", scope: !63, type: !72, flags: DIFlagArtificial) +!63 = distinct !DISubprogram(unit: !16, retainedNodes: !70) +!64 = !DICompositeType(tag: DW_TAG_structure_type, size: 200, identifier: "$s4main1UV13TangentVectorVD") +!70 = !{} +!72 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !64) +!75 = !DILocation(scope: !63, inlinedAt: !76) +!76 = distinct !DILocation(scope: !44) diff --git a/llvm/test/Transforms/SROA/alignment.ll b/llvm/test/Transforms/SROA/alignment.ll index 98be495e5eb354..8322da189aeeaa 100644 --- a/llvm/test/Transforms/SROA/alignment.ll +++ b/llvm/test/Transforms/SROA/alignment.ll @@ -23,7 +23,9 @@ define void @test1(ptr %a, ptr %b) { ; ; CHECK-DEBUGLOC-LABEL: @test1( ; CHECK-DEBUGLOC-NEXT: entry: -; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META9:![0-9]+]], !DIExpression(), [[META14:![0-9]+]]) 
+; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META9:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 8), [[META14:![0-9]+]]) +; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META9]], !DIExpression(DW_OP_LLVM_fragment, 8, 8), [[META14]]) +; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META9]], !DIExpression(), [[META14]]) ; CHECK-DEBUGLOC-NEXT: [[GEP_A:%.*]] = getelementptr { i8, i8 }, ptr [[A:%.*]], i32 0, i32 0, !dbg [[DBG15:![0-9]+]] ; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr [[GEP_A]], [[META11:![0-9]+]], !DIExpression(), [[DBG15]]) ; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META12:![0-9]+]], !DIExpression(), [[META16:![0-9]+]]) @@ -57,24 +59,25 @@ define void @test2() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca i16, align 2 ; CHECK-NEXT: store volatile i16 0, ptr [[A_SROA_0]], align 2 -; CHECK-NEXT: [[A_SROA_0_1_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 1 -; CHECK-NEXT: [[A_SROA_0_1_A_SROA_0_2_RESULT:%.*]] = load i8, ptr [[A_SROA_0_1_SROA_IDX]], align 1 -; CHECK-NEXT: [[A_SROA_0_1_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 1 -; CHECK-NEXT: store i8 42, ptr [[A_SROA_0_1_SROA_IDX2]], align 1 +; CHECK-NEXT: [[A_SROA_0_1_GEP2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 1 +; CHECK-NEXT: [[A_SROA_0_1_A_SROA_0_2_RESULT:%.*]] = load i8, ptr [[A_SROA_0_1_GEP2_SROA_IDX]], align 1 +; CHECK-NEXT: [[A_SROA_0_1_GEP2_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 1 +; CHECK-NEXT: store i8 42, ptr [[A_SROA_0_1_GEP2_SROA_IDX2]], align 1 ; CHECK-NEXT: ret void ; ; CHECK-DEBUGLOC-LABEL: @test2( ; CHECK-DEBUGLOC-NEXT: entry: ; CHECK-DEBUGLOC-NEXT: [[A_SROA_0:%.*]] = alloca i16, align 2, !dbg [[DBG28:![0-9]+]] -; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META23:![0-9]+]], !DIExpression(), [[DBG28]]) +; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr [[A_SROA_0]], [[META23:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 8, 16), [[DBG28]]) +; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, 
[[META23]], !DIExpression(), [[DBG28]]) ; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META24:![0-9]+]], !DIExpression(), [[META29:![0-9]+]]) ; CHECK-DEBUGLOC-NEXT: store volatile i16 0, ptr [[A_SROA_0]], align 2, !dbg [[DBG30:![0-9]+]] ; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META25:![0-9]+]], !DIExpression(), [[META31:![0-9]+]]) -; CHECK-DEBUGLOC-NEXT: [[A_SROA_0_1_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 1, !dbg [[DBG32:![0-9]+]] -; CHECK-DEBUGLOC-NEXT: [[A_SROA_0_1_A_SROA_0_2_RESULT:%.*]] = load i8, ptr [[A_SROA_0_1_SROA_IDX]], align 1, !dbg [[DBG32]] +; CHECK-DEBUGLOC-NEXT: [[A_SROA_0_1_GEP2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 1, !dbg [[DBG32:![0-9]+]] +; CHECK-DEBUGLOC-NEXT: [[A_SROA_0_1_A_SROA_0_2_RESULT:%.*]] = load i8, ptr [[A_SROA_0_1_GEP2_SROA_IDX]], align 1, !dbg [[DBG32]] ; CHECK-DEBUGLOC-NEXT: #dbg_value(i8 [[A_SROA_0_1_A_SROA_0_2_RESULT]], [[META26:![0-9]+]], !DIExpression(), [[DBG32]]) -; CHECK-DEBUGLOC-NEXT: [[A_SROA_0_1_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 1, !dbg [[DBG33:![0-9]+]] -; CHECK-DEBUGLOC-NEXT: store i8 42, ptr [[A_SROA_0_1_SROA_IDX2]], align 1, !dbg [[DBG33]] +; CHECK-DEBUGLOC-NEXT: [[A_SROA_0_1_GEP2_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 1, !dbg [[DBG33:![0-9]+]] +; CHECK-DEBUGLOC-NEXT: store i8 42, ptr [[A_SROA_0_1_GEP2_SROA_IDX2]], align 1, !dbg [[DBG33]] ; CHECK-DEBUGLOC-NEXT: ret void, !dbg [[DBG34:![0-9]+]] ; entry: @@ -117,7 +120,6 @@ define void @test3(ptr %x) { ; expecting. However, also check that any offset within an alloca can in turn ; reduce the alignment. 
; -; ; CHECK-LABEL: @test3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca [22 x i8], align 8 @@ -129,9 +131,11 @@ define void @test3(ptr %x) { ; CHECK-DEBUGLOC-LABEL: @test3( ; CHECK-DEBUGLOC-NEXT: entry: ; CHECK-DEBUGLOC-NEXT: [[A_SROA_0:%.*]] = alloca [22 x i8], align 8, !dbg [[DBG47:![0-9]+]] -; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META44:![0-9]+]], !DIExpression(), [[DBG47]]) +; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr [[A_SROA_0]], [[META44:![0-9]+]], !DIExpression(), [[DBG47]]) +; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META44]], !DIExpression(), [[DBG47]]) ; CHECK-DEBUGLOC-NEXT: [[B_SROA_0:%.*]] = alloca [18 x i8], align 2, !dbg [[DBG48:![0-9]+]] -; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META45:![0-9]+]], !DIExpression(), [[DBG48]]) +; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr [[B_SROA_0]], [[META45:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 48, 16), [[DBG48]]) +; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META45]], !DIExpression(), [[DBG48]]) ; CHECK-DEBUGLOC-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[A_SROA_0]], ptr align 8 [[X:%.*]], i32 22, i1 false), !dbg [[DBG49:![0-9]+]] ; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META46:![0-9]+]], !DIExpression(), [[META50:![0-9]+]]) ; CHECK-DEBUGLOC-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 2 [[B_SROA_0]], ptr align 2 [[X]], i32 18, i1 false), !dbg [[DBG51:![0-9]+]] @@ -158,31 +162,32 @@ define void @test5() { ; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca [9 x i8], align 1 ; CHECK-NEXT: [[A_SROA_3:%.*]] = alloca [9 x i8], align 1 ; CHECK-NEXT: store volatile double 0.000000e+00, ptr [[A_SROA_0]], align 1 -; CHECK-NEXT: [[A_SROA_0_7_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 7 -; CHECK-NEXT: [[A_SROA_0_7_A_SROA_0_7_WEIRD_LOAD1:%.*]] = load volatile i16, ptr [[A_SROA_0_7_SROA_IDX1]], align 1 +; CHECK-NEXT: [[A_SROA_0_7_WEIRD_GEP1_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 7 +; CHECK-NEXT: [[A_SROA_0_7_A_SROA_0_7_WEIRD_LOAD1:%.*]] 
= load volatile i16, ptr [[A_SROA_0_7_WEIRD_GEP1_SROA_IDX1]], align 1 ; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_0_D1:%.*]] = load double, ptr [[A_SROA_0]], align 1 ; CHECK-NEXT: store volatile double [[A_SROA_0_0_A_SROA_0_0_D1]], ptr [[A_SROA_3]], align 1 -; CHECK-NEXT: [[A_SROA_3_7_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_3]], i64 7 -; CHECK-NEXT: [[A_SROA_3_7_A_SROA_3_16_WEIRD_LOAD2:%.*]] = load volatile i16, ptr [[A_SROA_3_7_SROA_IDX]], align 1 +; CHECK-NEXT: [[A_SROA_3_7_WEIRD_GEP2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_3]], i64 7 +; CHECK-NEXT: [[A_SROA_3_7_A_SROA_3_16_WEIRD_LOAD2:%.*]] = load volatile i16, ptr [[A_SROA_3_7_WEIRD_GEP2_SROA_IDX]], align 1 ; CHECK-NEXT: ret void ; ; CHECK-DEBUGLOC-LABEL: @test5( ; CHECK-DEBUGLOC-NEXT: entry: ; CHECK-DEBUGLOC-NEXT: [[A_SROA_0:%.*]] = alloca [9 x i8], align 1, !dbg [[DBG63:![0-9]+]] ; CHECK-DEBUGLOC-NEXT: [[A_SROA_3:%.*]] = alloca [9 x i8], align 1, !dbg [[DBG63]] -; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META55:![0-9]+]], !DIExpression(), [[DBG63]]) +; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr [[A_SROA_0]], [[META55:![0-9]+]], !DIExpression(), [[DBG63]]) +; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META55]], !DIExpression(), [[DBG63]]) ; CHECK-DEBUGLOC-NEXT: store volatile double 0.000000e+00, ptr [[A_SROA_0]], align 1, !dbg [[DBG64:![0-9]+]] ; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META56:![0-9]+]], !DIExpression(), [[META65:![0-9]+]]) -; CHECK-DEBUGLOC-NEXT: [[A_SROA_0_7_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 7, !dbg [[DBG66:![0-9]+]] -; CHECK-DEBUGLOC-NEXT: [[A_SROA_0_7_A_SROA_0_7_WEIRD_LOAD1:%.*]] = load volatile i16, ptr [[A_SROA_0_7_SROA_IDX1]], align 1, !dbg [[DBG66]] +; CHECK-DEBUGLOC-NEXT: [[A_SROA_0_7_WEIRD_GEP1_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_0]], i64 7, !dbg [[DBG66:![0-9]+]] +; CHECK-DEBUGLOC-NEXT: [[A_SROA_0_7_A_SROA_0_7_WEIRD_LOAD1:%.*]] = load volatile i16, ptr [[A_SROA_0_7_WEIRD_GEP1_SROA_IDX1]], align 1, 
!dbg [[DBG66]] ; CHECK-DEBUGLOC-NEXT: #dbg_value(i16 [[A_SROA_0_7_A_SROA_0_7_WEIRD_LOAD1]], [[META57:![0-9]+]], !DIExpression(), [[DBG66]]) ; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META59:![0-9]+]], !DIExpression(), [[META67:![0-9]+]]) ; CHECK-DEBUGLOC-NEXT: [[A_SROA_0_0_A_SROA_0_0_D1:%.*]] = load double, ptr [[A_SROA_0]], align 1, !dbg [[DBG68:![0-9]+]] ; CHECK-DEBUGLOC-NEXT: #dbg_value(double [[A_SROA_0_0_A_SROA_0_0_D1]], [[META60:![0-9]+]], !DIExpression(), [[DBG68]]) ; CHECK-DEBUGLOC-NEXT: store volatile double [[A_SROA_0_0_A_SROA_0_0_D1]], ptr [[A_SROA_3]], align 1, !dbg [[DBG69:![0-9]+]] ; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META61:![0-9]+]], !DIExpression(), [[META70:![0-9]+]]) -; CHECK-DEBUGLOC-NEXT: [[A_SROA_3_7_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_3]], i64 7, !dbg [[DBG71:![0-9]+]] -; CHECK-DEBUGLOC-NEXT: [[A_SROA_3_7_A_SROA_3_16_WEIRD_LOAD2:%.*]] = load volatile i16, ptr [[A_SROA_3_7_SROA_IDX]], align 1, !dbg [[DBG71]] +; CHECK-DEBUGLOC-NEXT: [[A_SROA_3_7_WEIRD_GEP2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[A_SROA_3]], i64 7, !dbg [[DBG71:![0-9]+]] +; CHECK-DEBUGLOC-NEXT: [[A_SROA_3_7_A_SROA_3_16_WEIRD_LOAD2:%.*]] = load volatile i16, ptr [[A_SROA_3_7_WEIRD_GEP2_SROA_IDX]], align 1, !dbg [[DBG71]] ; CHECK-DEBUGLOC-NEXT: #dbg_value(i16 [[A_SROA_3_7_A_SROA_3_16_WEIRD_LOAD2]], [[META62:![0-9]+]], !DIExpression(), [[DBG71]]) ; CHECK-DEBUGLOC-NEXT: ret void, !dbg [[DBG72:![0-9]+]] ; @@ -219,7 +224,8 @@ define void @test6() { ; CHECK-DEBUGLOC-NEXT: entry: ; CHECK-DEBUGLOC-NEXT: [[A_SROA_0:%.*]] = alloca double, align 8, !dbg [[DBG78:![0-9]+]] ; CHECK-DEBUGLOC-NEXT: [[A_SROA_2:%.*]] = alloca double, align 8, !dbg [[DBG78]] -; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META75:![0-9]+]], !DIExpression(), [[DBG78]]) +; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr [[A_SROA_0]], [[META75:![0-9]+]], !DIExpression(), [[DBG78]]) +; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META75]], !DIExpression(), [[DBG78]]) ; 
CHECK-DEBUGLOC-NEXT: store volatile double 0.000000e+00, ptr [[A_SROA_0]], align 8, !dbg [[DBG79:![0-9]+]] ; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META76:![0-9]+]], !DIExpression(), [[META80:![0-9]+]]) ; CHECK-DEBUGLOC-NEXT: [[A_SROA_0_0_A_SROA_0_0_VAL:%.*]] = load double, ptr [[A_SROA_0]], align 8, !dbg [[DBG81:![0-9]+]] @@ -256,6 +262,7 @@ define void @test7(ptr %out) { ; CHECK-DEBUGLOC-LABEL: @test7( ; CHECK-DEBUGLOC-NEXT: entry: ; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META86:![0-9]+]], !DIExpression(), [[META90:![0-9]+]]) +; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META86]], !DIExpression(), [[META90]]) ; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META87:![0-9]+]], !DIExpression(), [[META91:![0-9]+]]) ; CHECK-DEBUGLOC-NEXT: [[A_SROA_0_0_COPYLOAD:%.*]] = load double, ptr [[OUT:%.*]], align 1, !dbg [[DBG92:![0-9]+]] ; CHECK-DEBUGLOC-NEXT: [[A_SROA_4_0_OUT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OUT]], i64 8, !dbg [[DBG92]] @@ -442,7 +449,8 @@ define dso_local i32 @pr45010(ptr %A) { ; ; CHECK-DEBUGLOC-LABEL: @pr45010( ; CHECK-DEBUGLOC-NEXT: [[B_SROA_0:%.*]] = alloca i32, align 4, !dbg [[DBG129:![0-9]+]] -; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META125:![0-9]+]], !DIExpression(), [[DBG129]]) +; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr [[B_SROA_0]], [[META125:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), [[DBG129]]) +; CHECK-DEBUGLOC-NEXT: #dbg_value(ptr undef, [[META125]], !DIExpression(), [[DBG129]]) ; CHECK-DEBUGLOC-NEXT: [[TMP1:%.*]] = load i32, ptr [[A:%.*]], align 4, !dbg [[DBG130:![0-9]+]] ; CHECK-DEBUGLOC-NEXT: #dbg_value(i32 [[TMP1]], [[META126:![0-9]+]], !DIExpression(), [[DBG130]]) ; CHECK-DEBUGLOC-NEXT: store atomic volatile i32 [[TMP1]], ptr [[B_SROA_0]] release, align 4, !dbg [[DBG131:![0-9]+]] diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll index 8624ab27ed3cc9..08863dce1c7879 100644 --- a/llvm/test/Transforms/SROA/vector-promotion.ll +++ 
b/llvm/test/Transforms/SROA/vector-promotion.ll @@ -23,6 +23,7 @@ define i32 @test1(<4 x i32> %x, <4 x i32> %y) { ; DEBUG-LABEL: @test1( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: #dbg_value(ptr undef, [[META9:![0-9]+]], !DIExpression(), [[META21:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META9]], !DIExpression(), [[META21]]) ; DEBUG-NEXT: #dbg_value(ptr undef, [[META11:![0-9]+]], !DIExpression(), [[META22:![0-9]+]]) ; DEBUG-NEXT: #dbg_value(ptr undef, [[META12:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) ; DEBUG-NEXT: [[A_SROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2, !dbg [[DBG24:![0-9]+]] @@ -72,6 +73,7 @@ define i32 @test2(<4 x i32> %x, <4 x i32> %y) { ; DEBUG-LABEL: @test2( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: #dbg_value(ptr undef, [[META34:![0-9]+]], !DIExpression(), [[META45:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META34]], !DIExpression(), [[META45]]) ; DEBUG-NEXT: #dbg_value(ptr undef, [[META35:![0-9]+]], !DIExpression(), [[META46:![0-9]+]]) ; DEBUG-NEXT: #dbg_value(ptr undef, [[META36:![0-9]+]], !DIExpression(), [[META47:![0-9]+]]) ; DEBUG-NEXT: [[A_SROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2, !dbg [[DBG48:![0-9]+]] @@ -124,6 +126,7 @@ define i32 @test3(<4 x i32> %x, <4 x i32> %y) { ; DEBUG-LABEL: @test3( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: #dbg_value(ptr undef, [[META59:![0-9]+]], !DIExpression(), [[META69:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META59]], !DIExpression(), [[META69]]) ; DEBUG-NEXT: #dbg_value(ptr undef, [[META60:![0-9]+]], !DIExpression(), [[META70:![0-9]+]]) ; DEBUG-NEXT: #dbg_value(ptr undef, [[META61:![0-9]+]], !DIExpression(), [[META71:![0-9]+]]) ; DEBUG-NEXT: [[A_SROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X:%.*]], i32 -1, i32 2, !dbg [[DBG72:![0-9]+]] @@ -180,6 +183,7 @@ define i32 @test4(<4 x i32> %x, <4 x i32> %y, ptr %z) { ; DEBUG-LABEL: @test4( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: #dbg_value(ptr undef, [[META83:![0-9]+]], !DIExpression(), 
[[META94:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META83]], !DIExpression(), [[META94]]) ; DEBUG-NEXT: #dbg_value(ptr undef, [[META84:![0-9]+]], !DIExpression(), [[META95:![0-9]+]]) ; DEBUG-NEXT: [[A_SROA_3_16_COPYLOAD:%.*]] = load <4 x i32>, ptr [[Z:%.*]], align 1, !dbg [[DBG96:![0-9]+]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META85:![0-9]+]], !DIExpression(), [[META97:![0-9]+]]) @@ -244,6 +248,7 @@ define i32 @test4_as1(<4 x i32> %x, <4 x i32> %y, ptr addrspace(1) %z) { ; DEBUG-LABEL: @test4_as1( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: #dbg_value(ptr undef, [[META110:![0-9]+]], !DIExpression(), [[META121:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META110]], !DIExpression(), [[META121]]) ; DEBUG-NEXT: #dbg_value(ptr undef, [[META111:![0-9]+]], !DIExpression(), [[META122:![0-9]+]]) ; DEBUG-NEXT: [[A_SROA_3_16_COPYLOAD:%.*]] = load <4 x i32>, ptr addrspace(1) [[Z:%.*]], align 1, !dbg [[DBG123:![0-9]+]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META112:![0-9]+]], !DIExpression(), [[META124:![0-9]+]]) @@ -306,6 +311,7 @@ define i32 @test5(<4 x i32> %x, <4 x i32> %y, ptr %z) { ; DEBUG-LABEL: @test5( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: #dbg_value(ptr undef, [[META137:![0-9]+]], !DIExpression(), [[META148:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META137]], !DIExpression(), [[META148]]) ; DEBUG-NEXT: #dbg_value(ptr undef, [[META138:![0-9]+]], !DIExpression(), [[META149:![0-9]+]]) ; DEBUG-NEXT: #dbg_value(ptr undef, [[META139:![0-9]+]], !DIExpression(), [[META150:![0-9]+]]) ; DEBUG-NEXT: [[Z_TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr [[Z:%.*]], i64 0, i64 2, !dbg [[DBG151:![0-9]+]] @@ -596,7 +602,8 @@ define i32 @PR14212(<3 x i8> %val) { ; ; DEBUG-LABEL: @PR14212( ; DEBUG-NEXT: entry: -; DEBUG-NEXT: #dbg_value(ptr undef, [[META250:![0-9]+]], !DIExpression(), [[META252:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META250:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 24, 8), [[META252:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META250]], 
!DIExpression(), [[META252]]) ; DEBUG-NEXT: [[TMP0:%.*]] = bitcast <3 x i8> [[VAL:%.*]] to i24, !dbg [[DBG253:![0-9]+]] ; DEBUG-NEXT: [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i8 undef to i32, !dbg [[DBG254:![0-9]+]] ; DEBUG-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl i32 [[RETVAL_SROA_2_0_INSERT_EXT]], 24, !dbg [[DBG254]] @@ -630,7 +637,9 @@ define <2 x i8> @PR14349.1(i32 %x) { ; ; DEBUG-LABEL: @PR14349.1( ; DEBUG-NEXT: entry: -; DEBUG-NEXT: #dbg_value(ptr undef, [[META257:![0-9]+]], !DIExpression(), [[META260:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META257:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 16), [[META260:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META257]], !DIExpression(DW_OP_LLVM_fragment, 16, 16), [[META260]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META257]], !DIExpression(), [[META260]]) ; DEBUG-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[X:%.*]] to i16, !dbg [[DBG261:![0-9]+]] ; DEBUG-NEXT: [[TMP0:%.*]] = bitcast i16 [[A_SROA_0_0_EXTRACT_TRUNC]] to <2 x i8>, !dbg [[DBG261]] ; DEBUG-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[X]], 16, !dbg [[DBG261]] @@ -666,7 +675,9 @@ define i32 @PR14349.2(<2 x i8> %x) { ; ; DEBUG-LABEL: @PR14349.2( ; DEBUG-NEXT: entry: -; DEBUG-NEXT: #dbg_value(ptr undef, [[META266:![0-9]+]], !DIExpression(), [[META268:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META266:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 16), [[META268:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META266]], !DIExpression(DW_OP_LLVM_fragment, 16, 16), [[META268]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META266]], !DIExpression(), [[META268]]) ; DEBUG-NEXT: [[TMP0:%.*]] = bitcast <2 x i8> [[X:%.*]] to i16, !dbg [[DBG269:![0-9]+]] ; DEBUG-NEXT: [[A_SROA_2_0_INSERT_EXT:%.*]] = zext i16 undef to i32, !dbg [[DBG270:![0-9]+]] ; DEBUG-NEXT: [[A_SROA_2_0_INSERT_SHIFT:%.*]] = shl i32 [[A_SROA_2_0_INSERT_EXT]], 16, !dbg [[DBG270]] @@ -703,6 +714,7 @@ define i32 @test7(<2 x i32> %x, <2 x i32> %y) { ; 
DEBUG-LABEL: @test7( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: #dbg_value(ptr undef, [[META273:![0-9]+]], !DIExpression(), [[META283:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META273]], !DIExpression(), [[META283]]) ; DEBUG-NEXT: #dbg_value(ptr undef, [[META274:![0-9]+]], !DIExpression(), [[META284:![0-9]+]]) ; DEBUG-NEXT: #dbg_value(ptr undef, [[META275:![0-9]+]], !DIExpression(), [[META285:![0-9]+]]) ; DEBUG-NEXT: [[A_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x i32> [[X:%.*]], i32 1, !dbg [[DBG286:![0-9]+]] @@ -752,6 +764,7 @@ define i32 @test8(<2 x i32> %x) { ; DEBUG-LABEL: @test8( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: #dbg_value(ptr undef, [[META296:![0-9]+]], !DIExpression(), [[META301:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META296]], !DIExpression(), [[META301]]) ; DEBUG-NEXT: [[A_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x i32> [[X:%.*]], i32 0, !dbg [[DBG302:![0-9]+]] ; DEBUG-NEXT: #dbg_value(i32 [[A_SROA_0_0_VEC_EXTRACT]], [[META297:![0-9]+]], !DIExpression(), [[DBG302]]) ; DEBUG-NEXT: #dbg_value(ptr undef, [[META298:![0-9]+]], !DIExpression(), [[META303:![0-9]+]]) @@ -787,6 +800,7 @@ define <2 x i32> @test9(i32 %x, i32 %y) { ; DEBUG-LABEL: @test9( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: #dbg_value(ptr undef, [[META309:![0-9]+]], !DIExpression(), [[META312:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META309]], !DIExpression(), [[META312]]) ; DEBUG-NEXT: [[A_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i32> undef, i32 [[X:%.*]], i32 0, !dbg [[DBG313:![0-9]+]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META310:![0-9]+]], !DIExpression(), [[META314:![0-9]+]]) ; DEBUG-NEXT: [[A_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[A_SROA_0_0_VEC_INSERT]], i32 [[Y:%.*]], i32 1, !dbg [[DBG315:![0-9]+]] @@ -818,6 +832,7 @@ define <2 x i32> @test10(<4 x i16> %x, i32 %y) { ; DEBUG-LABEL: @test10( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: #dbg_value(ptr undef, [[META320:![0-9]+]], !DIExpression(), [[META323:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, 
[[META320]], !DIExpression(), [[META323]]) ; DEBUG-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[X:%.*]] to <2 x i32>, !dbg [[DBG324:![0-9]+]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META321:![0-9]+]], !DIExpression(), [[META325:![0-9]+]]) ; DEBUG-NEXT: [[A_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Y:%.*]], i32 1, !dbg [[DBG326:![0-9]+]] @@ -851,6 +866,7 @@ define <2 x float> @test11(<4 x i16> %x, i32 %y) { ; DEBUG-LABEL: @test11( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: #dbg_value(ptr undef, [[META331:![0-9]+]], !DIExpression(), [[META334:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META331]], !DIExpression(), [[META334]]) ; DEBUG-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[X:%.*]] to <2 x i32>, !dbg [[DBG335:![0-9]+]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META332:![0-9]+]], !DIExpression(), [[META336:![0-9]+]]) ; DEBUG-NEXT: [[A_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Y:%.*]], i32 1, !dbg [[DBG337:![0-9]+]] @@ -877,6 +893,7 @@ define <4 x float> @test12(<4 x i32> %val) { ; ; DEBUG-LABEL: @test12( ; DEBUG-NEXT: #dbg_value(ptr undef, [[META342:![0-9]+]], !DIExpression(), [[META344:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META342]], !DIExpression(), [[META344]]) ; DEBUG-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL:%.*]] to <4 x float>, !dbg [[DBG345:![0-9]+]] ; DEBUG-NEXT: #dbg_value(<4 x float> [[TMP1]], [[META343:![0-9]+]], !DIExpression(), [[DBG345]]) ; DEBUG-NEXT: ret <4 x float> [[TMP1]], !dbg [[DBG346:![0-9]+]] @@ -905,6 +922,7 @@ define <2 x i64> @test13(i32 %a, i32 %b, i32 %c, i32 %d) { ; DEBUG-LABEL: @test13( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: #dbg_value(ptr undef, [[META349:![0-9]+]], !DIExpression(), [[META354:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META349]], !DIExpression(), [[META354]]) ; DEBUG-NEXT: [[X_SROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0, !dbg [[DBG355:![0-9]+]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META350:![0-9]+]], !DIExpression(), 
[[META356:![0-9]+]]) ; DEBUG-NEXT: [[X_SROA_0_4_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_0_VEC_INSERT]], i32 [[B:%.*]], i32 1, !dbg [[DBG357:![0-9]+]] @@ -947,6 +965,7 @@ define i32 @test14(<2 x i64> %x) { ; DEBUG-LABEL: @test14( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: #dbg_value(ptr undef, [[META366:![0-9]+]], !DIExpression(), [[META378:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META366]], !DIExpression(), [[META378]]) ; DEBUG-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[X:%.*]] to <4 x i32>, !dbg [[DBG379:![0-9]+]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META367:![0-9]+]], !DIExpression(), [[META380:![0-9]+]]) ; DEBUG-NEXT: [[X_ADDR_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0, !dbg [[DBG381:![0-9]+]] @@ -990,29 +1009,30 @@ define <4 x ptr> @test15(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[X_SROA_0:%.*]] = alloca <4 x ptr>, align 32 ; CHECK-NEXT: store i32 [[A:%.*]], ptr [[X_SROA_0]], align 32 -; CHECK-NEXT: [[X_SROA_0_4_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 4 -; CHECK-NEXT: store i32 [[B:%.*]], ptr [[X_SROA_0_4_SROA_IDX1]], align 4 -; CHECK-NEXT: [[X_SROA_0_8_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 8 -; CHECK-NEXT: store i32 [[C:%.*]], ptr [[X_SROA_0_8_SROA_IDX2]], align 8 -; CHECK-NEXT: [[X_SROA_0_12_SROA_IDX3:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 12 -; CHECK-NEXT: store i32 [[D:%.*]], ptr [[X_SROA_0_12_SROA_IDX3]], align 4 +; CHECK-NEXT: [[X_SROA_0_4_X_TMP2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 4 +; CHECK-NEXT: store i32 [[B:%.*]], ptr [[X_SROA_0_4_X_TMP2_SROA_IDX1]], align 4 +; CHECK-NEXT: [[X_SROA_0_8_X_TMP3_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 8 +; CHECK-NEXT: store i32 [[C:%.*]], ptr [[X_SROA_0_8_X_TMP3_SROA_IDX2]], align 8 +; CHECK-NEXT: [[X_SROA_0_12_X_TMP4_SROA_IDX3:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 12 +; CHECK-NEXT: store i32 [[D:%.*]], 
ptr [[X_SROA_0_12_X_TMP4_SROA_IDX3]], align 4 ; CHECK-NEXT: [[X_SROA_0_0_X_SROA_0_0_RESULT:%.*]] = load <4 x ptr>, ptr [[X_SROA_0]], align 32 ; CHECK-NEXT: ret <4 x ptr> [[X_SROA_0_0_X_SROA_0_0_RESULT]] ; ; DEBUG-LABEL: @test15( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: [[X_SROA_0:%.*]] = alloca <4 x ptr>, align 32, !dbg [[DBG400:![0-9]+]] -; DEBUG-NEXT: #dbg_value(ptr undef, [[META394:![0-9]+]], !DIExpression(), [[DBG400]]) +; DEBUG-NEXT: #dbg_value(ptr [[X_SROA_0]], [[META394:![0-9]+]], !DIExpression(), [[DBG400]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META394]], !DIExpression(), [[DBG400]]) ; DEBUG-NEXT: store i32 [[A:%.*]], ptr [[X_SROA_0]], align 32, !dbg [[DBG401:![0-9]+]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META395:![0-9]+]], !DIExpression(), [[META402:![0-9]+]]) -; DEBUG-NEXT: [[X_SROA_0_4_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 4, !dbg [[DBG403:![0-9]+]] -; DEBUG-NEXT: store i32 [[B:%.*]], ptr [[X_SROA_0_4_SROA_IDX1]], align 4, !dbg [[DBG403]] +; DEBUG-NEXT: [[X_SROA_0_4_X_TMP2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 4, !dbg [[DBG403:![0-9]+]] +; DEBUG-NEXT: store i32 [[B:%.*]], ptr [[X_SROA_0_4_X_TMP2_SROA_IDX1]], align 4, !dbg [[DBG403]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META396:![0-9]+]], !DIExpression(), [[META404:![0-9]+]]) -; DEBUG-NEXT: [[X_SROA_0_8_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 8, !dbg [[DBG405:![0-9]+]] -; DEBUG-NEXT: store i32 [[C:%.*]], ptr [[X_SROA_0_8_SROA_IDX2]], align 8, !dbg [[DBG405]] +; DEBUG-NEXT: [[X_SROA_0_8_X_TMP3_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 8, !dbg [[DBG405:![0-9]+]] +; DEBUG-NEXT: store i32 [[C:%.*]], ptr [[X_SROA_0_8_X_TMP3_SROA_IDX2]], align 8, !dbg [[DBG405]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META397:![0-9]+]], !DIExpression(), [[META406:![0-9]+]]) -; DEBUG-NEXT: [[X_SROA_0_12_SROA_IDX3:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 12, !dbg [[DBG407:![0-9]+]] -; DEBUG-NEXT: store i32 
[[D:%.*]], ptr [[X_SROA_0_12_SROA_IDX3]], align 4, !dbg [[DBG407]] +; DEBUG-NEXT: [[X_SROA_0_12_X_TMP4_SROA_IDX3:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 12, !dbg [[DBG407:![0-9]+]] +; DEBUG-NEXT: store i32 [[D:%.*]], ptr [[X_SROA_0_12_X_TMP4_SROA_IDX3]], align 4, !dbg [[DBG407]] ; DEBUG-NEXT: [[X_SROA_0_0_X_SROA_0_0_RESULT:%.*]] = load <4 x ptr>, ptr [[X_SROA_0]], align 32, !dbg [[DBG408:![0-9]+]] ; DEBUG-NEXT: #dbg_value(<4 x ptr> [[X_SROA_0_0_X_SROA_0_0_RESULT]], [[META398:![0-9]+]], !DIExpression(), [[DBG408]]) ; DEBUG-NEXT: ret <4 x ptr> [[X_SROA_0_0_X_SROA_0_0_RESULT]], !dbg [[DBG409:![0-9]+]] @@ -1046,6 +1066,7 @@ define <4 x ptr> @test16(i64 %a, i64 %b, i64 %c, i64 %d) { ; DEBUG-LABEL: @test16( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: #dbg_value(ptr undef, [[META412:![0-9]+]], !DIExpression(), [[META417:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META412]], !DIExpression(), [[META417]]) ; DEBUG-NEXT: [[TMP0:%.*]] = inttoptr i64 [[A:%.*]] to ptr, !dbg [[DBG418:![0-9]+]] ; DEBUG-NEXT: [[X_SROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x ptr> undef, ptr [[TMP0]], i32 0, !dbg [[DBG418]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META413:![0-9]+]], !DIExpression(), [[META419:![0-9]+]]) @@ -1078,29 +1099,30 @@ define <4 x ptr> @test17(i32 %a, i32 %b, i64 %c, i64 %d) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[X_SROA_0:%.*]] = alloca <4 x ptr>, align 32 ; CHECK-NEXT: store i32 [[A:%.*]], ptr [[X_SROA_0]], align 32 -; CHECK-NEXT: [[X_SROA_0_4_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 4 -; CHECK-NEXT: store i32 [[B:%.*]], ptr [[X_SROA_0_4_SROA_IDX1]], align 4 -; CHECK-NEXT: [[X_SROA_0_16_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 16 -; CHECK-NEXT: store i64 [[C:%.*]], ptr [[X_SROA_0_16_SROA_IDX2]], align 16 -; CHECK-NEXT: [[X_SROA_0_24_SROA_IDX3:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 24 -; CHECK-NEXT: store i64 [[D:%.*]], ptr [[X_SROA_0_24_SROA_IDX3]], align 8 +; CHECK-NEXT: 
[[X_SROA_0_4_X_TMP2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 4 +; CHECK-NEXT: store i32 [[B:%.*]], ptr [[X_SROA_0_4_X_TMP2_SROA_IDX1]], align 4 +; CHECK-NEXT: [[X_SROA_0_16_X_TMP3_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 16 +; CHECK-NEXT: store i64 [[C:%.*]], ptr [[X_SROA_0_16_X_TMP3_SROA_IDX2]], align 16 +; CHECK-NEXT: [[X_SROA_0_24_X_TMP4_SROA_IDX3:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 24 +; CHECK-NEXT: store i64 [[D:%.*]], ptr [[X_SROA_0_24_X_TMP4_SROA_IDX3]], align 8 ; CHECK-NEXT: [[X_SROA_0_0_X_SROA_0_0_RESULT:%.*]] = load <4 x ptr>, ptr [[X_SROA_0]], align 32 ; CHECK-NEXT: ret <4 x ptr> [[X_SROA_0_0_X_SROA_0_0_RESULT]] ; ; DEBUG-LABEL: @test17( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: [[X_SROA_0:%.*]] = alloca <4 x ptr>, align 32, !dbg [[DBG434:![0-9]+]] -; DEBUG-NEXT: #dbg_value(ptr undef, [[META429:![0-9]+]], !DIExpression(), [[DBG434]]) +; DEBUG-NEXT: #dbg_value(ptr [[X_SROA_0]], [[META429:![0-9]+]], !DIExpression(), [[DBG434]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META429]], !DIExpression(), [[DBG434]]) ; DEBUG-NEXT: store i32 [[A:%.*]], ptr [[X_SROA_0]], align 32, !dbg [[DBG435:![0-9]+]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META430:![0-9]+]], !DIExpression(), [[META436:![0-9]+]]) -; DEBUG-NEXT: [[X_SROA_0_4_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 4, !dbg [[DBG437:![0-9]+]] -; DEBUG-NEXT: store i32 [[B:%.*]], ptr [[X_SROA_0_4_SROA_IDX1]], align 4, !dbg [[DBG437]] +; DEBUG-NEXT: [[X_SROA_0_4_X_TMP2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 4, !dbg [[DBG437:![0-9]+]] +; DEBUG-NEXT: store i32 [[B:%.*]], ptr [[X_SROA_0_4_X_TMP2_SROA_IDX1]], align 4, !dbg [[DBG437]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META431:![0-9]+]], !DIExpression(), [[META438:![0-9]+]]) -; DEBUG-NEXT: [[X_SROA_0_16_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 16, !dbg [[DBG439:![0-9]+]] -; DEBUG-NEXT: store i64 [[C:%.*]], ptr 
[[X_SROA_0_16_SROA_IDX2]], align 16, !dbg [[DBG439]] +; DEBUG-NEXT: [[X_SROA_0_16_X_TMP3_SROA_IDX2:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 16, !dbg [[DBG439:![0-9]+]] +; DEBUG-NEXT: store i64 [[C:%.*]], ptr [[X_SROA_0_16_X_TMP3_SROA_IDX2]], align 16, !dbg [[DBG439]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META432:![0-9]+]], !DIExpression(), [[META440:![0-9]+]]) -; DEBUG-NEXT: [[X_SROA_0_24_SROA_IDX3:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 24, !dbg [[DBG441:![0-9]+]] -; DEBUG-NEXT: store i64 [[D:%.*]], ptr [[X_SROA_0_24_SROA_IDX3]], align 8, !dbg [[DBG441]] +; DEBUG-NEXT: [[X_SROA_0_24_X_TMP4_SROA_IDX3:%.*]] = getelementptr inbounds i8, ptr [[X_SROA_0]], i64 24, !dbg [[DBG441:![0-9]+]] +; DEBUG-NEXT: store i64 [[D:%.*]], ptr [[X_SROA_0_24_X_TMP4_SROA_IDX3]], align 8, !dbg [[DBG441]] ; DEBUG-NEXT: [[X_SROA_0_0_X_SROA_0_0_RESULT:%.*]] = load <4 x ptr>, ptr [[X_SROA_0]], align 32, !dbg [[DBG442:![0-9]+]] ; DEBUG-NEXT: #dbg_value(<4 x ptr> [[X_SROA_0_0_X_SROA_0_0_RESULT]], [[META433:![0-9]+]], !DIExpression(), [[DBG442]]) ; DEBUG-NEXT: ret <4 x ptr> [[X_SROA_0_0_X_SROA_0_0_RESULT]], !dbg [[DBG443:![0-9]+]] @@ -1129,7 +1151,8 @@ define i1 @test18() { ; ; DEBUG-LABEL: @test18( ; DEBUG-NEXT: [[A_SROA_0:%.*]] = alloca <2 x i64>, align 32, !dbg [[DBG449:![0-9]+]] -; DEBUG-NEXT: #dbg_value(ptr undef, [[META446:![0-9]+]], !DIExpression(), [[DBG449]]) +; DEBUG-NEXT: #dbg_value(ptr [[A_SROA_0]], [[META446:![0-9]+]], !DIExpression(), [[DBG449]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META446]], !DIExpression(), [[DBG449]]) ; DEBUG-NEXT: store <2 x i64> , ptr [[A_SROA_0]], align 32, !dbg [[DBG450:![0-9]+]] ; DEBUG-NEXT: [[A_SROA_0_0_A_SROA_0_0_L:%.*]] = load i1, ptr [[A_SROA_0]], align 32, !dbg [[DBG451:![0-9]+]] ; DEBUG-NEXT: #dbg_value(i1 [[A_SROA_0_0_A_SROA_0_0_L]], [[META447:![0-9]+]], !DIExpression(), [[DBG451]]) @@ -1150,6 +1173,7 @@ define void @swap-8bytes(ptr %x, ptr %y) { ; ; DEBUG-LABEL: @swap-8bytes( ; DEBUG-NEXT: #dbg_value(ptr undef, 
[[META455:![0-9]+]], !DIExpression(), [[META456:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META455]], !DIExpression(), [[META456]]) ; DEBUG-NEXT: [[TMP_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[X:%.*]], align 1, !dbg [[DBG457:![0-9]+]] ; DEBUG-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[X]], ptr [[Y:%.*]], i64 8, i1 false), !dbg [[DBG458:![0-9]+]] ; DEBUG-NEXT: store i64 [[TMP_SROA_0_0_COPYLOAD]], ptr [[Y]], align 1, !dbg [[DBG459:![0-9]+]] @@ -1276,10 +1300,10 @@ define <4 x float> @ptrLoadStoreTysFloat(ptr %init, float %val2) { ; CHECK-NEXT: [[OBJ:%.*]] = alloca <4 x float>, align 16 ; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[OBJ]], align 16 ; CHECK-NEXT: store ptr [[VAL0]], ptr [[OBJ]], align 16 -; CHECK-NEXT: [[OBJ_8_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8 -; CHECK-NEXT: store float [[VAL2:%.*]], ptr [[OBJ_8_SROA_IDX]], align 8 -; CHECK-NEXT: [[OBJ_12_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12 -; CHECK-NEXT: store float 1.310720e+05, ptr [[OBJ_12_SROA_IDX]], align 4 +; CHECK-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8 +; CHECK-NEXT: store float [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8 +; CHECK-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12 +; CHECK-NEXT: store float 1.310720e+05, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4 ; CHECK-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x float>, ptr [[OBJ]], align 16 ; CHECK-NEXT: ret <4 x float> [[OBJ_0_SROAVAL]] ; @@ -1291,11 +1315,11 @@ define <4 x float> @ptrLoadStoreTysFloat(ptr %init, float %val2) { ; DEBUG-NEXT: store <4 x float> zeroinitializer, ptr [[OBJ]], align 16, !dbg [[DBG510:![0-9]+]] ; DEBUG-NEXT: store ptr [[VAL0]], ptr [[OBJ]], align 16, !dbg [[DBG511:![0-9]+]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META505:![0-9]+]], !DIExpression(), [[META512:![0-9]+]]) -; DEBUG-NEXT: [[OBJ_8_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG513:![0-9]+]] -; 
DEBUG-NEXT: store float [[VAL2:%.*]], ptr [[OBJ_8_SROA_IDX]], align 8, !dbg [[DBG513]] +; DEBUG-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG513:![0-9]+]] +; DEBUG-NEXT: store float [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8, !dbg [[DBG513]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META506:![0-9]+]], !DIExpression(), [[META514:![0-9]+]]) -; DEBUG-NEXT: [[OBJ_12_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12, !dbg [[DBG515:![0-9]+]] -; DEBUG-NEXT: store float 1.310720e+05, ptr [[OBJ_12_SROA_IDX]], align 4, !dbg [[DBG515]] +; DEBUG-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12, !dbg [[DBG515:![0-9]+]] +; DEBUG-NEXT: store float 1.310720e+05, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4, !dbg [[DBG515]] ; DEBUG-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x float>, ptr [[OBJ]], align 16, !dbg [[DBG516:![0-9]+]] ; DEBUG-NEXT: #dbg_value(<4 x float> [[OBJ_0_SROAVAL]], [[META507:![0-9]+]], !DIExpression(), [[DBG516]]) ; DEBUG-NEXT: ret <4 x float> [[OBJ_0_SROAVAL]], !dbg [[DBG517:![0-9]+]] @@ -1356,10 +1380,10 @@ define <4 x ptr> @ptrLoadStoreTysPtr(ptr %init, i64 %val2) { ; CHECK-NEXT: [[OBJ:%.*]] = alloca <4 x ptr>, align 16 ; CHECK-NEXT: store <4 x ptr> zeroinitializer, ptr [[OBJ]], align 16 ; CHECK-NEXT: store ptr [[VAL0]], ptr [[OBJ]], align 16 -; CHECK-NEXT: [[OBJ_8_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8 -; CHECK-NEXT: store i64 [[VAL2:%.*]], ptr [[OBJ_8_SROA_IDX]], align 8 -; CHECK-NEXT: [[OBJ_12_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12 -; CHECK-NEXT: store i64 131072, ptr [[OBJ_12_SROA_IDX]], align 4 +; CHECK-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8 +; CHECK-NEXT: store i64 [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8 +; CHECK-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12 +; CHECK-NEXT: store i64 131072, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4 
; CHECK-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x ptr>, ptr [[OBJ]], align 16 ; CHECK-NEXT: ret <4 x ptr> [[OBJ_0_SROAVAL]] ; @@ -1371,11 +1395,11 @@ define <4 x ptr> @ptrLoadStoreTysPtr(ptr %init, i64 %val2) { ; DEBUG-NEXT: store <4 x ptr> zeroinitializer, ptr [[OBJ]], align 16, !dbg [[DBG543:![0-9]+]] ; DEBUG-NEXT: store ptr [[VAL0]], ptr [[OBJ]], align 16, !dbg [[DBG544:![0-9]+]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META538:![0-9]+]], !DIExpression(), [[META545:![0-9]+]]) -; DEBUG-NEXT: [[OBJ_8_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG546:![0-9]+]] -; DEBUG-NEXT: store i64 [[VAL2:%.*]], ptr [[OBJ_8_SROA_IDX]], align 8, !dbg [[DBG546]] +; DEBUG-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG546:![0-9]+]] +; DEBUG-NEXT: store i64 [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8, !dbg [[DBG546]] ; DEBUG-NEXT: #dbg_value(ptr undef, [[META539:![0-9]+]], !DIExpression(), [[META547:![0-9]+]]) -; DEBUG-NEXT: [[OBJ_12_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12, !dbg [[DBG548:![0-9]+]] -; DEBUG-NEXT: store i64 131072, ptr [[OBJ_12_SROA_IDX]], align 4, !dbg [[DBG548]] +; DEBUG-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12, !dbg [[DBG548:![0-9]+]] +; DEBUG-NEXT: store i64 131072, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4, !dbg [[DBG548]] ; DEBUG-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x ptr>, ptr [[OBJ]], align 16, !dbg [[DBG549:![0-9]+]] ; DEBUG-NEXT: #dbg_value(<4 x ptr> [[OBJ_0_SROAVAL]], [[META540:![0-9]+]], !DIExpression(), [[DBG549]]) ; DEBUG-NEXT: ret <4 x ptr> [[OBJ_0_SROAVAL]], !dbg [[DBG550:![0-9]+]] @@ -1405,6 +1429,7 @@ define <4 x i32> @validLoadStoreTy([2 x i64] %cond.coerce) { ; DEBUG-LABEL: @validLoadStoreTy( ; DEBUG-NEXT: entry: ; DEBUG-NEXT: #dbg_value(ptr undef, [[META553:![0-9]+]], !DIExpression(), [[META557:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META553]], !DIExpression(), [[META557]]) ; DEBUG-NEXT: 
#dbg_value(ptr undef, [[META554:![0-9]+]], !DIExpression(), [[META558:![0-9]+]]) ; DEBUG-NEXT: [[COND_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[COND_COERCE:%.*]], 0, !dbg [[DBG559:![0-9]+]] ; DEBUG-NEXT: [[COND_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[COND_COERCE_FCA_0_EXTRACT]], i32 0, !dbg [[DBG559]] @@ -1455,7 +1480,8 @@ define noundef zeroext i1 @CandidateTysRealloc() personality ptr null { ; ; DEBUG-LABEL: @CandidateTysRealloc( ; DEBUG-NEXT: entry: -; DEBUG-NEXT: #dbg_value(ptr undef, [[META565:![0-9]+]], !DIExpression(), [[META570:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr poison, [[META565:![0-9]+]], !DIExpression(), [[META570:![0-9]+]]) +; DEBUG-NEXT: #dbg_value(ptr undef, [[META565]], !DIExpression(), [[META570]]) ; DEBUG-NEXT: br label [[BB_1:%.*]], !dbg [[DBG571:![0-9]+]] ; DEBUG: bb.1: ; DEBUG-NEXT: br label [[BB_1]], !dbg [[DBG572:![0-9]+]] From d23c24f336674727d281258157fc5b15ce9040a4 Mon Sep 17 00:00:00 2001 From: Alexander Shaposhnikov Date: Wed, 21 Aug 2024 18:08:31 -0700 Subject: [PATCH 134/426] [llvm][nsan] Skip function declarations (#105598) Skip function declarations in the instrumentation pass. 
--- .../Transforms/Instrumentation/NumericalStabilitySanitizer.cpp | 3 ++- llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp index 5872396669435a..ffd9faff1d3a53 100644 --- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp @@ -2038,7 +2038,8 @@ static void moveFastMathFlags(Function &F, bool NumericalStabilitySanitizer::sanitizeFunction( Function &F, const TargetLibraryInfo &TLI) { - if (!F.hasFnAttribute(Attribute::SanitizeNumericalStability)) + if (!F.hasFnAttribute(Attribute::SanitizeNumericalStability) || + F.isDeclaration()) return false; // This is required to prevent instrumenting call to __nsan_init from within diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll index 5da68320d91f90..2131162bf4bf3f 100644 --- a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll +++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll @@ -4,6 +4,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +declare float @declaration_only(float %a) sanitize_numerical_stability + ; Tests with simple control flow. 
@float_const = private unnamed_addr constant float 0.5 From 2b66417d08d8e87f42cd154370ad1722ae7842c8 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 21 Aug 2024 20:08:39 -0500 Subject: [PATCH 135/426] [libc] Fix accidentally using system file on GPU Summary: Forgot to delete this --- libc/src/stdio/scanf_core/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/libc/src/stdio/scanf_core/CMakeLists.txt b/libc/src/stdio/scanf_core/CMakeLists.txt index 5c00ae0c9973c2..a8935d464417c2 100644 --- a/libc/src/stdio/scanf_core/CMakeLists.txt +++ b/libc/src/stdio/scanf_core/CMakeLists.txt @@ -105,8 +105,6 @@ if(LIBC_TARGET_OS_IS_GPU) libc.src.stdio.getc libc.src.stdio.ungetc libc.src.stdio.ferror - COMPILE_OPTIONS - -DLIBC_COPT_STDIO_USE_SYSTEM_FILE ) elseif(TARGET libc.src.__support.File.file OR (NOT LLVM_LIBC_FULL_BUILD)) add_header_library( From 8e0b9c85924ca22a65d57988ea2c5c22a5181ed9 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 21 Aug 2024 18:52:48 -0700 Subject: [PATCH 136/426] [lldb-dap] Skip the lldb-dap output test on windows, it seems all the lldb-dap tests are disabled on windows. 
(#105604) This should fix https://lab.llvm.org/buildbot/#/builders/141/builds/1747 --- lldb/test/API/tools/lldb-dap/output/TestDAP_output.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py index 0d40ce993dc31c..02c34ba10321bd 100644 --- a/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py +++ b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py @@ -8,6 +8,7 @@ class TestDAP_output(lldbdap_testcase.DAPTestCaseBase): + @skipIfWindows def test_output(self): program = self.getBuildArtifact("a.out") self.build_and_launch(program) From 7854b16d2699ca7cc02d4ea066230d370c751ba9 Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 21 Aug 2024 19:05:30 -0700 Subject: [PATCH 137/426] [SandboxIR] Implement FuncletPadInst, CatchPadInst and CleanupInst (#105294) This patch implements sandboxir::FuncletPadInst,CatchInst,CleanupInst mirroring their llvm:: counterparts. --- llvm/include/llvm/SandboxIR/SandboxIR.h | 75 ++++++++++++++++ .../llvm/SandboxIR/SandboxIRValues.def | 2 + llvm/lib/SandboxIR/SandboxIR.cpp | 83 ++++++++++++++++- llvm/unittests/SandboxIR/SandboxIRTest.cpp | 90 +++++++++++++++++++ llvm/unittests/SandboxIR/TrackerTest.cpp | 51 +++++++++++ 5 files changed, 300 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 278951113aed84..ed5b6f9c9da852 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -127,6 +127,9 @@ class CallBase; class CallInst; class InvokeInst; class CallBrInst; +class FuncletPadInst; +class CatchPadInst; +class CleanupPadInst; class GetElementPtrInst; class CastInst; class PtrToIntInst; @@ -256,6 +259,9 @@ class Value { friend class CallInst; // For getting `Val`. friend class InvokeInst; // For getting `Val`. friend class CallBrInst; // For getting `Val`. + friend class FuncletPadInst; // For getting `Val`. 
+ friend class CatchPadInst; // For getting `Val`. + friend class CleanupPadInst; // For getting `Val`. friend class GetElementPtrInst; // For getting `Val`. friend class CatchSwitchInst; // For getting `Val`. friend class SwitchInst; // For getting `Val`. @@ -679,6 +685,8 @@ class Instruction : public sandboxir::User { friend class CallInst; // For getTopmostLLVMInstruction(). friend class InvokeInst; // For getTopmostLLVMInstruction(). friend class CallBrInst; // For getTopmostLLVMInstruction(). + friend class CatchPadInst; // For getTopmostLLVMInstruction(). + friend class CleanupPadInst; // For getTopmostLLVMInstruction(). friend class GetElementPtrInst; // For getTopmostLLVMInstruction(). friend class CatchSwitchInst; // For getTopmostLLVMInstruction(). friend class SwitchInst; // For getTopmostLLVMInstruction(). @@ -845,6 +853,7 @@ template class SingleLLVMInstructionImpl : public Instruction { #include "llvm/SandboxIR/SandboxIRValues.def" friend class UnaryInstruction; friend class CallBase; + friend class FuncletPadInst; Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final { return getOperandUseDefault(OpIdx, Verify); @@ -1843,6 +1852,68 @@ class CallBrInst final : public CallBase { } }; +class FuncletPadInst : public SingleLLVMInstructionImpl { + FuncletPadInst(ClassID SubclassID, Opcode Opc, llvm::Instruction *I, + Context &Ctx) + : SingleLLVMInstructionImpl(SubclassID, Opc, I, Ctx) {} + friend class CatchPadInst; // For constructor. + friend class CleanupPadInst; // For constructor. + +public: + /// Return the number of funcletpad arguments. + unsigned arg_size() const { + return cast(Val)->arg_size(); + } + /// Return the outer EH-pad this funclet is nested within. + /// + /// Note: This returns the associated CatchSwitchInst if this FuncletPadInst + /// is a CatchPadInst. + Value *getParentPad() const; + void setParentPad(Value *ParentPad); + /// Return the Idx-th funcletpad argument. 
+ Value *getArgOperand(unsigned Idx) const; + /// Set the Idx-th funcletpad argument. + void setArgOperand(unsigned Idx, Value *V); + + // TODO: Implement missing functions: arg_operands(). + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::CatchPad || + From->getSubclassID() == ClassID::CleanupPad; + } +}; + +class CatchPadInst : public FuncletPadInst { + CatchPadInst(llvm::CatchPadInst *CPI, Context &Ctx) + : FuncletPadInst(ClassID::CatchPad, Opcode::CatchPad, CPI, Ctx) {} + friend class Context; // For constructor. + +public: + CatchSwitchInst *getCatchSwitch() const; + // TODO: We have not implemented setCatchSwitch() because we can't revert it + // for now, as there is no CatchPadInst member function that can undo it. + + static CatchPadInst *create(Value *ParentPad, ArrayRef Args, + BBIterator WhereIt, BasicBlock *WhereBB, + Context &Ctx, const Twine &Name = ""); + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::CatchPad; + } +}; + +class CleanupPadInst : public FuncletPadInst { + CleanupPadInst(llvm::CleanupPadInst *CPI, Context &Ctx) + : FuncletPadInst(ClassID::CleanupPad, Opcode::CleanupPad, CPI, Ctx) {} + friend class Context; // For constructor. + +public: + static CleanupPadInst *create(Value *ParentPad, ArrayRef Args, + BBIterator WhereIt, BasicBlock *WhereBB, + Context &Ctx, const Twine &Name = ""); + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::CleanupPad; + } +}; + class GetElementPtrInst final : public SingleLLVMInstructionImpl { /// Use Context::createGetElementPtrInst(). 
Don't call @@ -2745,6 +2816,10 @@ class Context { friend InvokeInst; // For createInvokeInst() CallBrInst *createCallBrInst(llvm::CallBrInst *I); friend CallBrInst; // For createCallBrInst() + CatchPadInst *createCatchPadInst(llvm::CatchPadInst *I); + friend CatchPadInst; // For createCatchPadInst() + CleanupPadInst *createCleanupPadInst(llvm::CleanupPadInst *I); + friend CleanupPadInst; // For createCleanupPadInst() GetElementPtrInst *createGetElementPtrInst(llvm::GetElementPtrInst *I); friend GetElementPtrInst; // For createGetElementPtrInst() CatchSwitchInst *createCatchSwitchInst(llvm::CatchSwitchInst *I); diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index 56720f564a7cae..a75f872bc88acb 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -46,6 +46,8 @@ DEF_INSTR(Ret, OP(Ret), ReturnInst) DEF_INSTR(Call, OP(Call), CallInst) DEF_INSTR(Invoke, OP(Invoke), InvokeInst) DEF_INSTR(CallBr, OP(CallBr), CallBrInst) +DEF_INSTR(CatchPad, OP(CatchPad), CatchPadInst) +DEF_INSTR(CleanupPad, OP(CleanupPad), CleanupPadInst) DEF_INSTR(GetElementPtr, OP(GetElementPtr), GetElementPtrInst) DEF_INSTR(CatchSwitch, OP(CatchSwitch), CatchSwitchInst) DEF_INSTR(Switch, OP(Switch), SwitchInst) diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 92054e7cab86ee..1ff82a968a717f 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -1043,6 +1043,68 @@ BasicBlock *CallBrInst::getSuccessor(unsigned Idx) const { Ctx.getValue(cast(Val)->getSuccessor(Idx))); } +Value *FuncletPadInst::getParentPad() const { + return Ctx.getValue(cast(Val)->getParentPad()); +} + +void FuncletPadInst::setParentPad(Value *ParentPad) { + Ctx.getTracker() + .emplaceIfTracking>(this); + cast(Val)->setParentPad(ParentPad->Val); +} + +Value *FuncletPadInst::getArgOperand(unsigned Idx) const { + return 
Ctx.getValue(cast(Val)->getArgOperand(Idx)); +} + +void FuncletPadInst::setArgOperand(unsigned Idx, Value *V) { + Ctx.getTracker() + .emplaceIfTracking>( + this, Idx); + cast(Val)->setArgOperand(Idx, V->Val); +} + +CatchSwitchInst *CatchPadInst::getCatchSwitch() const { + return cast( + Ctx.getValue(cast(Val)->getCatchSwitch())); +} + +CatchPadInst *CatchPadInst::create(Value *ParentPad, ArrayRef Args, + BBIterator WhereIt, BasicBlock *WhereBB, + Context &Ctx, const Twine &Name) { + auto &Builder = Ctx.getLLVMIRBuilder(); + if (WhereIt != WhereBB->end()) + Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction()); + else + Builder.SetInsertPoint(cast(WhereBB->Val)); + SmallVector LLVMArgs; + LLVMArgs.reserve(Args.size()); + for (auto *Arg : Args) + LLVMArgs.push_back(Arg->Val); + llvm::CatchPadInst *LLVMI = + Builder.CreateCatchPad(ParentPad->Val, LLVMArgs, Name); + return Ctx.createCatchPadInst(LLVMI); +} + +CleanupPadInst *CleanupPadInst::create(Value *ParentPad, ArrayRef Args, + BBIterator WhereIt, BasicBlock *WhereBB, + Context &Ctx, const Twine &Name) { + auto &Builder = Ctx.getLLVMIRBuilder(); + if (WhereIt != WhereBB->end()) + Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction()); + else + Builder.SetInsertPoint(cast(WhereBB->Val)); + SmallVector LLVMArgs; + LLVMArgs.reserve(Args.size()); + for (auto *Arg : Args) + LLVMArgs.push_back(Arg->Val); + llvm::CleanupPadInst *LLVMI = + Builder.CreateCleanupPad(ParentPad->Val, LLVMArgs, Name); + return Ctx.createCleanupPadInst(LLVMI); +} + Value *GetElementPtrInst::create(Type *Ty, Value *Ptr, ArrayRef IdxList, BasicBlock::iterator WhereIt, @@ -2064,6 +2126,18 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { It->second = std::unique_ptr(new CallBrInst(LLVMCallBr, *this)); return It->second.get(); } + case llvm::Instruction::CatchPad: { + auto *LLVMCPI = cast(LLVMV); + It->second = + std::unique_ptr(new CatchPadInst(LLVMCPI, *this)); + return It->second.get(); + } + case 
llvm::Instruction::CleanupPad: { + auto *LLVMCPI = cast(LLVMV); + It->second = + std::unique_ptr(new CleanupPadInst(LLVMCPI, *this)); + return It->second.get(); + } case llvm::Instruction::GetElementPtr: { auto *LLVMGEP = cast(LLVMV); It->second = std::unique_ptr( @@ -2240,7 +2314,14 @@ UnreachableInst *Context::createUnreachableInst(llvm::UnreachableInst *UI) { std::unique_ptr(new UnreachableInst(UI, *this)); return cast(registerValue(std::move(NewPtr))); } - +CatchPadInst *Context::createCatchPadInst(llvm::CatchPadInst *I) { + auto NewPtr = std::unique_ptr(new CatchPadInst(I, *this)); + return cast(registerValue(std::move(NewPtr))); +} +CleanupPadInst *Context::createCleanupPadInst(llvm::CleanupPadInst *I) { + auto NewPtr = std::unique_ptr(new CleanupPadInst(I, *this)); + return cast(registerValue(std::move(NewPtr))); +} GetElementPtrInst * Context::createGetElementPtrInst(llvm::GetElementPtrInst *I) { auto NewPtr = diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index b6981027b4c040..28894397a60d6f 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -1867,6 +1867,96 @@ define void @foo(i8 %arg) { } } +TEST_F(SandboxIRTest, FuncletPadInst_CatchPadInst_CleanupPadInst) { + parseIR(C, R"IR( +define void @foo() { +dispatch: + %cs = catchswitch within none [label %handler0] unwind to caller +handler0: + %catchpad = catchpad within %cs [ptr @foo] + ret void +handler1: + %cleanuppad = cleanuppad within %cs [ptr @foo] + ret void +bb: + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + BasicBlock *LLVMDispatch = getBasicBlockByName(LLVMF, "dispatch"); + BasicBlock *LLVMHandler0 = getBasicBlockByName(LLVMF, "handler0"); + BasicBlock *LLVMHandler1 = getBasicBlockByName(LLVMF, "handler1"); + auto *LLVMCP = cast(&*LLVMHandler0->begin()); + auto *LLVMCLP = cast(&*LLVMHandler1->begin()); + + sandboxir::Context Ctx(C); + [[maybe_unused]] auto &F = 
*Ctx.createFunction(&LLVMF); + auto *Dispatch = cast(Ctx.getValue(LLVMDispatch)); + auto *Handler0 = cast(Ctx.getValue(LLVMHandler0)); + auto *Handler1 = cast(Ctx.getValue(LLVMHandler1)); + auto *BB = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "bb"))); + auto *BBRet = cast(&*BB->begin()); + auto *CS = cast(&*Dispatch->begin()); + [[maybe_unused]] auto *CP = + cast(&*Handler0->begin()); + [[maybe_unused]] auto *CLP = + cast(&*Handler1->begin()); + + // Check getCatchSwitch(). + EXPECT_EQ(CP->getCatchSwitch(), CS); + EXPECT_EQ(CP->getCatchSwitch(), Ctx.getValue(LLVMCP->getCatchSwitch())); + + for (llvm::FuncletPadInst *LLVMFPI : + {static_cast(LLVMCP), + static_cast(LLVMCLP)}) { + auto *FPI = cast(Ctx.getValue(LLVMFPI)); + // Check arg_size(). + EXPECT_EQ(FPI->arg_size(), LLVMFPI->arg_size()); + // Check getParentPad(). + EXPECT_EQ(FPI->getParentPad(), Ctx.getValue(LLVMFPI->getParentPad())); + // Check setParentPad(). + auto *OrigParentPad = FPI->getParentPad(); + auto *NewParentPad = Dispatch; + EXPECT_NE(NewParentPad, OrigParentPad); + FPI->setParentPad(NewParentPad); + EXPECT_EQ(FPI->getParentPad(), NewParentPad); + FPI->setParentPad(OrigParentPad); + EXPECT_EQ(FPI->getParentPad(), OrigParentPad); + // Check getArgOperand(). + for (auto Idx : seq(0, FPI->arg_size())) + EXPECT_EQ(FPI->getArgOperand(Idx), + Ctx.getValue(LLVMFPI->getArgOperand(Idx))); + // Check setArgOperand(). + auto *OrigArgOperand = FPI->getArgOperand(0); + auto *NewArgOperand = Dispatch; + EXPECT_NE(NewArgOperand, OrigArgOperand); + FPI->setArgOperand(0, NewArgOperand); + EXPECT_EQ(FPI->getArgOperand(0), NewArgOperand); + FPI->setArgOperand(0, OrigArgOperand); + EXPECT_EQ(FPI->getArgOperand(0), OrigArgOperand); + } + // Check CatchPadInst::create(). 
+ auto *NewCPI = cast(sandboxir::CatchPadInst::create( + CS, {}, BBRet->getIterator(), BB, Ctx, "NewCPI")); + EXPECT_EQ(NewCPI->getCatchSwitch(), CS); + EXPECT_EQ(NewCPI->arg_size(), 0u); + EXPECT_EQ(NewCPI->getNextNode(), BBRet); +#ifndef NDEBUG + EXPECT_EQ(NewCPI->getName(), "NewCPI"); +#endif // NDEBUG + // Check CleanupPadInst::create(). + auto *NewCLPI = + cast(sandboxir::CleanupPadInst::create( + CS, {}, BBRet->getIterator(), BB, Ctx, "NewCLPI")); + EXPECT_EQ(NewCLPI->getParentPad(), CS); + EXPECT_EQ(NewCLPI->arg_size(), 0u); + EXPECT_EQ(NewCLPI->getNextNode(), BBRet); +#ifndef NDEBUG + EXPECT_EQ(NewCLPI->getName(), "NewCLPI"); +#endif // NDEBUG +} + TEST_F(SandboxIRTest, GetElementPtrInstruction) { parseIR(C, R"IR( define void @foo(ptr %ptr, <2 x ptr> %ptrs) { diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp index a2c3080011f162..c2faf60a57f3b8 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -1033,6 +1033,57 @@ define void @foo(i8 %arg) { EXPECT_EQ(CallBr->getIndirectDest(0), OrigIndirectDest); } +TEST_F(TrackerTest, FuncletPadInstSetters) { + parseIR(C, R"IR( +define void @foo() { +dispatch: + %cs = catchswitch within none [label %handler0] unwind to caller +handler0: + %catchpad = catchpad within %cs [ptr @foo] + ret void +handler1: + %cleanuppad = cleanuppad within %cs [ptr @foo] + ret void +bb: + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF); + auto *Dispatch = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "dispatch"))); + auto *Handler0 = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "handler0"))); + auto *Handler1 = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "handler1"))); + auto *CP = cast(&*Handler0->begin()); + auto *CLP = cast(&*Handler1->begin()); + + for (auto *FPI : {static_cast(CP), + static_cast(CLP)}) { + // Check setParentPad(). 
+ auto *OrigParentPad = FPI->getParentPad(); + auto *NewParentPad = Dispatch; + EXPECT_NE(NewParentPad, OrigParentPad); + Ctx.save(); + FPI->setParentPad(NewParentPad); + EXPECT_EQ(FPI->getParentPad(), NewParentPad); + Ctx.revert(); + EXPECT_EQ(FPI->getParentPad(), OrigParentPad); + + // Check setArgOperand(). + auto *OrigArgOperand = FPI->getArgOperand(0); + auto *NewArgOperand = Dispatch; + EXPECT_NE(NewArgOperand, OrigArgOperand); + Ctx.save(); + FPI->setArgOperand(0, NewArgOperand); + EXPECT_EQ(FPI->getArgOperand(0), NewArgOperand); + Ctx.revert(); + EXPECT_EQ(FPI->getArgOperand(0), OrigArgOperand); + } +} + TEST_F(TrackerTest, PHINodeSetters) { parseIR(C, R"IR( define void @foo(i8 %arg0, i8 %arg1, i8 %arg2) { From 0ca77f6656a772624a591261957f6b313a0d544e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 21 Aug 2024 19:23:07 -0700 Subject: [PATCH 138/426] [RISCV] Add CSRs and an instruction for Smctr and Ssctr extensions. (#105148) https://github.com/riscv/riscv-control-transfer-records/releases/tag/v1.0_rc3 --- .../Driver/print-supported-extensions-riscv.c | 2 + .../test/Preprocessor/riscv-target-features.c | 18 ++++++++ llvm/docs/RISCVUsage.rst | 3 ++ llvm/docs/ReleaseNotes.rst | 1 + llvm/lib/Target/RISCV/RISCVFeatures.td | 13 ++++++ llvm/lib/Target/RISCV/RISCVInstrInfo.td | 8 ++++ llvm/lib/Target/RISCV/RISCVSystemOperands.td | 9 ++++ llvm/test/CodeGen/RISCV/attributes.ll | 8 ++++ llvm/test/MC/RISCV/attribute-arch.s | 6 +++ llvm/test/MC/RISCV/hypervisor-csr-names.s | 17 ++++++++ llvm/test/MC/RISCV/machine-csr-names.s | 17 ++++++++ llvm/test/MC/RISCV/smctr-ssctr-valid.s | 30 +++++++++++++ llvm/test/MC/RISCV/supervisor-csr-names.s | 43 +++++++++++++++++++ .../TargetParser/RISCVISAInfoTest.cpp | 2 + 14 files changed, 177 insertions(+) create mode 100644 llvm/test/MC/RISCV/smctr-ssctr-valid.s diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 9497d01a832604..312c462f715d5e 100644 --- 
a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -175,8 +175,10 @@ // CHECK-NEXT: zalasr 0.1 'Zalasr' (Load-Acquire and Store-Release Instructions) // CHECK-NEXT: zvbc32e 0.7 'Zvbc32e' (Vector Carryless Multiplication with 32-bits elements) // CHECK-NEXT: zvkgs 0.7 'Zvkgs' (Vector-Scalar GCM instructions for Cryptography) +// CHECK-NEXT: smctr 1.0 'Smctr' (Control Transfer Records Machine Level) // CHECK-NEXT: smmpm 1.0 'Smmpm' (Machine-level Pointer Masking for M-mode) // CHECK-NEXT: smnpm 1.0 'Smnpm' (Machine-level Pointer Masking for next lower privilege mode) +// CHECK-NEXT: ssctr 1.0 'Ssctr' (Control Transfer Records Supervisor Level) // CHECK-NEXT: ssnpm 1.0 'Ssnpm' (Supervisor-level Pointer Masking for next lower privilege mode) // CHECK-NEXT: sspm 1.0 'Sspm' (Indicates Supervisor-mode Pointer Masking) // CHECK-NEXT: supm 1.0 'Supm' (Indicates User-mode Pointer Masking) diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 5bb6c10f85f1a7..60675065495bba 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -176,8 +176,10 @@ // Experimental extensions +// CHECK-NOT: __riscv_smctr{{.*$}} // CHECK-NOT: __riscv_smmpm{{.*$}} // CHECK-NOT: __riscv_smnpm{{.*$}} +// CHECK-NOT: __riscv_ssctr{{.*$}} // CHECK-NOT: __riscv_ssnpm{{.*$}} // CHECK-NOT: __riscv_sspm{{.*$}} // CHECK-NOT: __riscv_supm{{.*$}} @@ -1748,6 +1750,22 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-SUPM-EXT %s // CHECK-SUPM-EXT: __riscv_supm 1000000{{$}} +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_smctr1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SMCTR-EXT %s +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_smctr1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SMCTR-EXT %s +// 
CHECK-SMCTR-EXT: __riscv_smctr 1000000{{$}} + +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_ssctr1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SSCTR-EXT %s +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_ssctr1p0 -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SSCTR-EXT %s +// CHECK-SSCTR-EXT: __riscv_ssctr 1000000{{$}} + // Misaligned // RUN: %clang --target=riscv32-unknown-linux-gnu -march=rv32i -E -dM %s \ diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 4e50f55e4cb60b..8846b82fcaea59 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -303,6 +303,9 @@ The primary goal of experimental support is to assist in the process of ratifica ``experimental-zvbc32e``, ``experimental-zvkgs`` LLVM implements the `0.7 release specification `__. +``experimental-smctr``, ``experimental-ssctr`` + LLVM implements the `1.0-rc3 specification `__. + To use an experimental extension from `clang`, you must add `-menable-experimental-extensions` to the command line, and specify the exact version of the experimental extension you are using. To use an experimental extension with LLVM's internal developer tools (e.g. `llc`, `llvm-objdump`, `llvm-mc`), you must prefix the extension name with `experimental-`. Note that you don't need to specify the version with internal tools, and shouldn't include the `experimental-` prefix with `clang`. Vendor Extensions diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 65fa21e517940b..c9eb5eea896905 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -114,6 +114,7 @@ Changes to the RISC-V Backend means Zve32x and Zve32f will also require Zvl64b. The prior support was largely untested. * The ``Zvbc32e`` and ``Zvkgs`` extensions are now supported experimentally. +* Added ``Smctr`` and ``Ssctr`` extensions. 
Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index d448f9301f3ae8..fa141c31f94dbd 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1054,6 +1054,19 @@ def FeatureStdExtSupm : RISCVExperimentalExtension<"supm", 1, 0, "'Supm' (Indicates User-mode Pointer Masking)">; +def FeatureStdExtSmctr + : RISCVExperimentalExtension<"smctr", 1, 0, + "'Smctr' (Control Transfer Records Machine Level)", + [FeatureStdExtSscsrind]>; +def FeatureStdExtSsctr + : RISCVExperimentalExtension<"ssctr" ,1, 0, + "'Ssctr' (Control Transfer Records Supervisor Level)", + [FeatureStdExtSscsrind]>; +def HasStdExtSmctrOrSsctr : Predicate<"Subtarget->hasStdExtSmctrOrSsctr()">, + AssemblerPredicate<(any_of FeatureStdExtSmctr, FeatureStdExtSsctr), + "'Smctr' (Control Transfer Records Machine Level) or " + "'Ssctr' (Control Transfer Records Supervisor Level)">; + //===----------------------------------------------------------------------===// // Vendor extensions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 74406bf4b10471..6d0952a42eda9f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -839,6 +839,14 @@ def HLV_D : HLoad_r<0b0110110, 0b00000, "hlv.d">, Sched<[]>; def HSV_D : HStore_rr<0b0110111, "hsv.d">, Sched<[]>; } +let Predicates = [HasStdExtSmctrOrSsctr] in { +def SCTRCLR : Priv<"sctrclr", 0b0001000>, Sched<[]> { + let rd = 0; + let rs1 = 0; + let rs2 = 0b00100; +} +} + //===----------------------------------------------------------------------===// // Debug instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td 
b/llvm/lib/Target/RISCV/RISCVSystemOperands.td index a836227e18957c..d85b4a9cf77b33 100644 --- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td +++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td @@ -455,3 +455,12 @@ def : SysReg<"mnscratch", 0x740>; def : SysReg<"mnepc", 0x741>; def : SysReg<"mncause", 0x742>; def : SysReg<"mnstatus", 0x744>; + +//===----------------------------------------------- +// Control Transfer Records CSRs +//===----------------------------------------------- +def : SysReg<"sctrctl", 0x14e>; +def : SysReg<"sctrstatus", 0x14f>; +def : SysReg<"sctrdepth", 0x15f>; +def : SysReg<"vsctrctl", 0x24e>; +def : SysReg<"mctrctl", 0x34e>; diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 2a02327cd3c7b0..1d4a634c89a22f 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -133,6 +133,8 @@ ; RUN: llc -mtriple=riscv32 -mattr=+experimental-smmpm %s -o - | FileCheck --check-prefix=RV32SMMPM %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-sspm %s -o - | FileCheck --check-prefix=RV32SSPM %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-supm %s -o - | FileCheck --check-prefix=RV32SUPM %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-smctr %s -o - | FileCheck --check-prefix=RV32SMCTR %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-ssctr %s -o - | FileCheck --check-prefix=RV32SSCTR %s ; RUN: llc -mtriple=riscv64 %s -o - | FileCheck %s ; RUN: llc -mtriple=riscv64 -mattr=+m %s -o - | FileCheck --check-prefixes=CHECK,RV64M %s @@ -273,6 +275,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+experimental-smmpm %s -o - | FileCheck --check-prefix=RV64SMMPM %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-sspm %s -o - | FileCheck --check-prefix=RV64SSPM %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-supm %s -o - | FileCheck --check-prefix=RV64SUPM %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-smctr %s -o - | FileCheck --check-prefix=RV64SMCTR %s +; RUN: llc 
-mtriple=riscv64 -mattr=+experimental-ssctr %s -o - | FileCheck --check-prefix=RV64SSCTR %s ; Tests for profile features. ; RUN: llc -mtriple=riscv32 -mattr=+rvi20u32 %s -o - | FileCheck --check-prefix=RVI20U32 %s @@ -421,6 +425,8 @@ ; RV32SMMPM: .attribute 5, "rv32i2p1_smmpm1p0" ; RV32SSPM: .attribute 5, "rv32i2p1_sspm1p0" ; RV32SUPM: .attribute 5, "rv32i2p1_supm1p0" +; RV32SMCTR: .attribute 5, "rv32i2p1_smctr1p0_sscsrind1p0" +; RV32SSCTR: .attribute 5, "rv32i2p1_sscsrind1p0_ssctr1p0" ; RV64M: .attribute 5, "rv64i2p1_m2p0_zmmul1p0" ; RV64ZMMUL: .attribute 5, "rv64i2p1_zmmul1p0" @@ -559,6 +565,8 @@ ; RV64SMMPM: .attribute 5, "rv64i2p1_smmpm1p0" ; RV64SSPM: .attribute 5, "rv64i2p1_sspm1p0" ; RV64SUPM: .attribute 5, "rv64i2p1_supm1p0" +; RV64SMCTR: .attribute 5, "rv64i2p1_smctr1p0_sscsrind1p0" +; RV64SSCTR: .attribute 5, "rv64i2p1_sscsrind1p0_ssctr1p0" ; RVI20U32: .attribute 5, "rv32i2p1" ; RVI20U64: .attribute 5, "rv64i2p1" diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s index 0ba15cfd489cb1..1c0b2a59d0693f 100644 --- a/llvm/test/MC/RISCV/attribute-arch.s +++ b/llvm/test/MC/RISCV/attribute-arch.s @@ -446,3 +446,9 @@ .attribute arch, "rv64i_supm1p0" # CHECK: attribute 5, "rv64i2p1_supm1p0" + +.attribute arch, "rv32i_smctr1p0" +# CHECK: attribute 5, "rv32i2p1_smctr1p0_sscsrind1p0" + +.attribute arch, "rv32i_ssctr1p0" +# CHECK: attribute 5, "rv32i2p1_sscsrind1p0_ssctr1p0" diff --git a/llvm/test/MC/RISCV/hypervisor-csr-names.s b/llvm/test/MC/RISCV/hypervisor-csr-names.s index 950570c74746a9..2f29e5dacbeb95 100644 --- a/llvm/test/MC/RISCV/hypervisor-csr-names.s +++ b/llvm/test/MC/RISCV/hypervisor-csr-names.s @@ -633,3 +633,20 @@ csrrs t2, 0x25C, zero csrrs t1, vstopi, zero # uimm12 csrrs t2, 0xEB0, zero + +################################## +# Control Transfer Records +################################## + +# vsctrctl +# name +# CHECK-INST: csrrs t1, vsctrctl, zero +# CHECK-ENC: encoding: [0x73,0x23,0xe0,0x24] +# CHECK-INST-ALIAS: 
csrr t1, vsctrctl +# uimm12 +# CHECK-INST: csrrs t2, vsctrctl, zero +# CHECK-ENC: encoding: [0xf3,0x23,0xe0,0x24] +# CHECK-INST-ALIAS: csrr t2, vsctrctl +csrrs t1, vsctrctl, zero +# uimm12 +csrrs t2, 0x24E, zero diff --git a/llvm/test/MC/RISCV/machine-csr-names.s b/llvm/test/MC/RISCV/machine-csr-names.s index 5f668aea00485d..ae1af1fc8abc35 100644 --- a/llvm/test/MC/RISCV/machine-csr-names.s +++ b/llvm/test/MC/RISCV/machine-csr-names.s @@ -2568,3 +2568,20 @@ csrrs t2, 0x308, zero csrrs t1, mvip, zero # uimm12 csrrs t2, 0x309, zero + +################################## +# Control Transfer Records +################################## + +# mctrctl +# name +# CHECK-INST: csrrs t1, mctrctl, zero +# CHECK-ENC: encoding: [0x73,0x23,0xe0,0x34] +# CHECK-INST-ALIAS: csrr t1, mctrctl +# uimm12 +# CHECK-INST: csrrs t2, mctrctl, zero +# CHECK-ENC: encoding: [0xf3,0x23,0xe0,0x34] +# CHECK-INST-ALIAS: csrr t2, mctrctl +csrrs t1, mctrctl, zero +# uimm12 +csrrs t2, 0x34E, zero diff --git a/llvm/test/MC/RISCV/smctr-ssctr-valid.s b/llvm/test/MC/RISCV/smctr-ssctr-valid.s new file mode 100644 index 00000000000000..0b4fe47ae33f4b --- /dev/null +++ b/llvm/test/MC/RISCV/smctr-ssctr-valid.s @@ -0,0 +1,30 @@ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-smctr -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-smctr -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-ssctr -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-ssctr -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-smctr < %s \ +# RUN: | llvm-objdump --mattr=+experimental-smctr -M no-aliases -d - \ +# RUN: | FileCheck 
-check-prefix=CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+experimental-smctr < %s \ +# RUN: | llvm-objdump --mattr=+experimental-smctr -M no-aliases -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-ssctr < %s \ +# RUN: | llvm-objdump --mattr=+experimental-ssctr -M no-aliases -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+experimental-ssctr < %s \ +# RUN: | llvm-objdump --mattr=+experimental-ssctr -M no-aliases -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s + +# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s +# RUN: not llvm-mc -triple riscv64 -defsym=RV64=1 -riscv-no-aliases -show-encoding < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s + +# CHECK-INST: sctrclr +# CHECK: encoding: [0x73,0x00,0x40,0x10] +# CHECK-NO-EXT: error: instruction requires the following: 'Smctr' (Control Transfer Records Machine Level) or 'Ssctr' (Control Transfer Records Supervisor Level){{$}} +sctrclr diff --git a/llvm/test/MC/RISCV/supervisor-csr-names.s b/llvm/test/MC/RISCV/supervisor-csr-names.s index 481f11e0082b8d..db0fcb381ef2a4 100644 --- a/llvm/test/MC/RISCV/supervisor-csr-names.s +++ b/llvm/test/MC/RISCV/supervisor-csr-names.s @@ -457,3 +457,46 @@ csrrs t2, 0xDB0, zero csrrs t1, scountinhibit, zero # uimm12 csrrs t2, 0x120, zero + +################################## +# Control Transfer Records +################################## + +# sctrctl +# name +# CHECK-INST: csrrs t1, sctrctl, zero +# CHECK-ENC: encoding: [0x73,0x23,0xe0,0x14] +# CHECK-INST-ALIAS: csrr t1, sctrctl +# uimm12 +# CHECK-INST: csrrs t2, sctrctl, zero +# CHECK-ENC: encoding: [0xf3,0x23,0xe0,0x14] +# CHECK-INST-ALIAS: csrr t2, sctrctl +csrrs t1, sctrctl, zero +# uimm12 +csrrs t2, 0x14E, zero + +# sctrstatus +# name +# CHECK-INST: csrrs t1, sctrstatus, 
zero +# CHECK-ENC: encoding: [0x73,0x23,0xf0,0x14] +# CHECK-INST-ALIAS: csrr t1, sctrstatus +# uimm12 +# CHECK-INST: csrrs t2, sctrstatus, zero +# CHECK-ENC: encoding: [0xf3,0x23,0xf0,0x14] +# CHECK-INST-ALIAS: csrr t2, sctrstatus +csrrs t1, sctrstatus, zero +# uimm12 +csrrs t2, 0x14F, zero + +# sctrdepth +# name +# CHECK-INST: csrrs t1, sctrdepth, zero +# CHECK-ENC: encoding: [0x73,0x23,0xf0,0x15] +# CHECK-INST-ALIAS: csrr t1, sctrdepth +# uimm12 +# CHECK-INST: csrrs t2, sctrdepth, zero +# CHECK-ENC: encoding: [0xf3,0x23,0xf0,0x15] +# CHECK-INST-ALIAS: csrr t2, sctrdepth +csrrs t1, sctrdepth, zero +# uimm12 +csrrs t2, 0x15F, zero diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index 6172e48c484ce8..6662421eb26d9d 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -1120,8 +1120,10 @@ Experimental extensions zalasr 0.1 zvbc32e 0.7 zvkgs 0.7 + smctr 1.0 smmpm 1.0 smnpm 1.0 + ssctr 1.0 ssnpm 1.0 sspm 1.0 supm 1.0 From 65f66d2c605f0c9b0af26244f4d42ca93f552ec8 Mon Sep 17 00:00:00 2001 From: "Ivan R. 
Ivanov" Date: Wed, 21 Aug 2024 22:59:11 +0900 Subject: [PATCH 139/426] [flang][NFC] Move OpenMP related passes into a separate directory (#104732) Reapplied with fixed library dependencies for shared lib build --- flang/docs/OpenMP-declare-target.md | 4 +- flang/docs/OpenMP-descriptor-management.md | 4 +- flang/include/flang/Optimizer/CMakeLists.txt | 1 + .../flang/Optimizer/OpenMP/CMakeLists.txt | 4 ++ flang/include/flang/Optimizer/OpenMP/Passes.h | 30 ++++++++++++++ .../include/flang/Optimizer/OpenMP/Passes.td | 40 +++++++++++++++++++ .../flang/Optimizer/Transforms/Passes.td | 26 ------------ flang/include/flang/Tools/CLOptions.inc | 7 ++-- flang/lib/Frontend/CMakeLists.txt | 1 + flang/lib/Optimizer/CMakeLists.txt | 1 + flang/lib/Optimizer/OpenMP/CMakeLists.txt | 26 ++++++++++++ .../FunctionFiltering.cpp} | 18 ++++----- .../MapInfoFinalization.cpp} | 21 +++++----- .../MarkDeclareTarget.cpp} | 26 ++++++++---- flang/lib/Optimizer/Transforms/CMakeLists.txt | 3 -- flang/tools/bbc/CMakeLists.txt | 1 + flang/tools/fir-opt/CMakeLists.txt | 1 + flang/tools/fir-opt/fir-opt.cpp | 2 + flang/tools/tco/CMakeLists.txt | 1 + 19 files changed, 154 insertions(+), 63 deletions(-) create mode 100644 flang/include/flang/Optimizer/OpenMP/CMakeLists.txt create mode 100644 flang/include/flang/Optimizer/OpenMP/Passes.h create mode 100644 flang/include/flang/Optimizer/OpenMP/Passes.td create mode 100644 flang/lib/Optimizer/OpenMP/CMakeLists.txt rename flang/lib/Optimizer/{Transforms/OMPFunctionFiltering.cpp => OpenMP/FunctionFiltering.cpp} (90%) rename flang/lib/Optimizer/{Transforms/OMPMapInfoFinalization.cpp => OpenMP/MapInfoFinalization.cpp} (96%) rename flang/lib/Optimizer/{Transforms/OMPMarkDeclareTarget.cpp => OpenMP/MarkDeclareTarget.cpp} (80%) diff --git a/flang/docs/OpenMP-declare-target.md b/flang/docs/OpenMP-declare-target.md index d29a46807e1eaf..45062469007b65 100644 --- a/flang/docs/OpenMP-declare-target.md +++ b/flang/docs/OpenMP-declare-target.md @@ -149,7 +149,7 @@ 
flang/lib/Lower/OpenMP.cpp function `genDeclareTargetIntGlobal`. There are currently two passes within Flang that are related to the processing of `declare target`: -* `OMPMarkDeclareTarget` - This pass is in charge of marking functions captured +* `MarkDeclareTarget` - This pass is in charge of marking functions captured (called from) in `target` regions or other `declare target` marked functions as `declare target`. It does so recursively, i.e. nested calls will also be implicitly marked. It currently will try to mark things as conservatively as @@ -157,7 +157,7 @@ possible, e.g. if captured in a `target` region it will apply `nohost`, unless it encounters a `host` `declare target` in which case it will apply the `any` device type. Functions are handled similarly, except we utilise the parent's device type where possible. -* `OMPFunctionFiltering` - This is executed after the `OMPMarkDeclareTarget` +* `FunctionFiltering` - This is executed after the `MarkDeclareTarget` pass, and its job is to conservatively remove host functions from the module where possible when compiling for the device. This helps make sure that most incompatible code for the host is not lowered for the diff --git a/flang/docs/OpenMP-descriptor-management.md b/flang/docs/OpenMP-descriptor-management.md index d0eb01b00f9bb9..66c153914f70da 100644 --- a/flang/docs/OpenMP-descriptor-management.md +++ b/flang/docs/OpenMP-descriptor-management.md @@ -44,7 +44,7 @@ Currently, Flang will lower these descriptor types in the OpenMP lowering (lower to all other map types, generating an omp.MapInfoOp containing relevant information required for lowering the OpenMP dialect to LLVM-IR during the final stages of the MLIR lowering. 
However, after the lowering to FIR/HLFIR has been performed an OpenMP dialect specific pass for Fortran, -`OMPMapInfoFinalizationPass` (Optimizer/OMPMapInfoFinalization.cpp) will expand the +`MapInfoFinalizationPass` (Optimizer/OpenMP/MapInfoFinalization.cpp) will expand the `omp.MapInfoOp`'s containing descriptors (which currently will be a `BoxType` or `BoxAddrOp`) into multiple mappings, with one extra per pointer member in the descriptor that is supported on top of the original descriptor map operation. These pointers members are linked to the parent descriptor by adding them to @@ -53,7 +53,7 @@ owning operation's (`omp.TargetOp`, `omp.TargetDataOp` etc.) map operand list an operation is `IsolatedFromAbove`, it also inserts them as `BlockArgs` to canonicalize the mappings and simplify lowering. -An example transformation by the `OMPMapInfoFinalizationPass`: +An example transformation by the `MapInfoFinalizationPass`: ``` diff --git a/flang/include/flang/Optimizer/CMakeLists.txt b/flang/include/flang/Optimizer/CMakeLists.txt index 89e43a9ee8d621..3336ac935e1012 100644 --- a/flang/include/flang/Optimizer/CMakeLists.txt +++ b/flang/include/flang/Optimizer/CMakeLists.txt @@ -2,3 +2,4 @@ add_subdirectory(CodeGen) add_subdirectory(Dialect) add_subdirectory(HLFIR) add_subdirectory(Transforms) +add_subdirectory(OpenMP) diff --git a/flang/include/flang/Optimizer/OpenMP/CMakeLists.txt b/flang/include/flang/Optimizer/OpenMP/CMakeLists.txt new file mode 100644 index 00000000000000..d59573f0f7fd91 --- /dev/null +++ b/flang/include/flang/Optimizer/OpenMP/CMakeLists.txt @@ -0,0 +1,4 @@ +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls -name FlangOpenMP) + +add_public_tablegen_target(FlangOpenMPPassesIncGen) diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.h b/flang/include/flang/Optimizer/OpenMP/Passes.h new file mode 100644 index 00000000000000..403d79667bf448 --- /dev/null +++ b/flang/include/flang/Optimizer/OpenMP/Passes.h @@ -0,0 
+1,30 @@ +//===- Passes.h - OpenMP pass entry points ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header declares the flang OpenMP passes. +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_OPTIMIZER_OPENMP_PASSES_H +#define FORTRAN_OPTIMIZER_OPENMP_PASSES_H + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassRegistry.h" + +#include + +namespace flangomp { +#define GEN_PASS_DECL +#define GEN_PASS_REGISTRATION +#include "flang/Optimizer/OpenMP/Passes.h.inc" + +} // namespace flangomp + +#endif // FORTRAN_OPTIMIZER_OPENMP_PASSES_H diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td new file mode 100644 index 00000000000000..395178e26a5762 --- /dev/null +++ b/flang/include/flang/Optimizer/OpenMP/Passes.td @@ -0,0 +1,40 @@ +//===-- Passes.td - flang OpenMP pass definition -----------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_OPTIMIZER_OPENMP_PASSES +#define FORTRAN_OPTIMIZER_OPENMP_PASSES + +include "mlir/Pass/PassBase.td" + +def MapInfoFinalizationPass + : Pass<"omp-map-info-finalization"> { + let summary = "expands OpenMP MapInfo operations containing descriptors"; + let description = [{ + Expands MapInfo operations containing descriptor types into multiple + MapInfo's for each pointer element in the descriptor that requires + explicit individual mapping by the OpenMP runtime. + }]; + let dependentDialects = ["mlir::omp::OpenMPDialect"]; +} + +def MarkDeclareTargetPass + : Pass<"omp-mark-declare-target", "mlir::ModuleOp"> { + let summary = "Marks all functions called by an OpenMP declare target function as declare target"; + let dependentDialects = ["mlir::omp::OpenMPDialect"]; +} + +def FunctionFiltering : Pass<"omp-function-filtering"> { + let summary = "Filters out functions intended for the host when compiling " + "for the target device."; + let dependentDialects = [ + "mlir::func::FuncDialect", + "fir::FIROpsDialect" + ]; +} + +#endif //FORTRAN_OPTIMIZER_OPENMP_PASSES diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index a0211384667ed1..49bd4f5349a754 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -358,32 +358,6 @@ def LoopVersioning : Pass<"loop-versioning", "mlir::func::FuncOp"> { let dependentDialects = [ "fir::FIROpsDialect" ]; } -def OMPMapInfoFinalizationPass - : Pass<"omp-map-info-finalization"> { - let summary = "expands OpenMP MapInfo operations containing descriptors"; - let description = [{ - Expands MapInfo operations containing descriptor types into multiple - MapInfo's for each pointer element in the descriptor that requires - explicit individual mapping by 
the OpenMP runtime. - }]; - let dependentDialects = ["mlir::omp::OpenMPDialect"]; -} - -def OMPMarkDeclareTargetPass - : Pass<"omp-mark-declare-target", "mlir::ModuleOp"> { - let summary = "Marks all functions called by an OpenMP declare target function as declare target"; - let dependentDialects = ["mlir::omp::OpenMPDialect"]; -} - -def OMPFunctionFiltering : Pass<"omp-function-filtering"> { - let summary = "Filters out functions intended for the host when compiling " - "for the target device."; - let dependentDialects = [ - "mlir::func::FuncDialect", - "fir::FIROpsDialect" - ]; -} - def VScaleAttr : Pass<"vscale-attr", "mlir::func::FuncOp"> { let summary = "Add vscale_range attribute to functions"; let description = [{ diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index 57b90017d052e4..1881e23b00045a 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -17,6 +17,7 @@ #include "mlir/Transforms/Passes.h" #include "flang/Optimizer/CodeGen/CodeGen.h" #include "flang/Optimizer/HLFIR/Passes.h" +#include "flang/Optimizer/OpenMP/Passes.h" #include "flang/Optimizer/Transforms/Passes.h" #include "llvm/Passes/OptimizationLevel.h" #include "llvm/Support/CommandLine.h" @@ -367,10 +368,10 @@ inline void createHLFIRToFIRPassPipeline( inline void createOpenMPFIRPassPipeline( mlir::PassManager &pm, bool isTargetDevice) { addNestedPassToAllTopLevelOperations( - pm, fir::createOMPMapInfoFinalizationPass); - pm.addPass(fir::createOMPMarkDeclareTargetPass()); + pm, flangomp::createMapInfoFinalizationPass); + pm.addPass(flangomp::createMarkDeclareTargetPass()); if (isTargetDevice) - pm.addPass(fir::createOMPFunctionFiltering()); + pm.addPass(flangomp::createFunctionFiltering()); } #if !defined(FLANG_EXCLUDE_CODEGEN) diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt index c20b9096aff496..ecdcc73d61ec1f 100644 --- a/flang/lib/Frontend/CMakeLists.txt +++ 
b/flang/lib/Frontend/CMakeLists.txt @@ -38,6 +38,7 @@ add_flang_library(flangFrontend FIRTransforms HLFIRDialect HLFIRTransforms + FlangOpenMPTransforms MLIRTransforms MLIRBuiltinToLLVMIRTranslation MLIRLLVMToLLVMIRTranslation diff --git a/flang/lib/Optimizer/CMakeLists.txt b/flang/lib/Optimizer/CMakeLists.txt index 4a602162ed2b77..dd153ac33c0fbb 100644 --- a/flang/lib/Optimizer/CMakeLists.txt +++ b/flang/lib/Optimizer/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(HLFIR) add_subdirectory(Support) add_subdirectory(Transforms) add_subdirectory(Analysis) +add_subdirectory(OpenMP) diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt new file mode 100644 index 00000000000000..92051634f0378b --- /dev/null +++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt @@ -0,0 +1,26 @@ +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) + +add_flang_library(FlangOpenMPTransforms + FunctionFiltering.cpp + MapInfoFinalization.cpp + MarkDeclareTarget.cpp + + DEPENDS + FIRDialect + HLFIROpsIncGen + FlangOpenMPPassesIncGen + + LINK_LIBS + FIRAnalysis + FIRBuilder + FIRCodeGen + FIRDialect + FIRDialectSupport + FIRSupport + FortranCommon + MLIRFuncDialect + MLIROpenMPDialect + HLFIRDialect + MLIRIR + MLIRPass +) diff --git a/flang/lib/Optimizer/Transforms/OMPFunctionFiltering.cpp b/flang/lib/Optimizer/OpenMP/FunctionFiltering.cpp similarity index 90% rename from flang/lib/Optimizer/Transforms/OMPFunctionFiltering.cpp rename to flang/lib/Optimizer/OpenMP/FunctionFiltering.cpp index 0c472246c2a44c..bd9005d3e2df6f 100644 --- a/flang/lib/Optimizer/Transforms/OMPFunctionFiltering.cpp +++ b/flang/lib/Optimizer/OpenMP/FunctionFiltering.cpp @@ -1,4 +1,4 @@ -//===- OMPFunctionFiltering.cpp -------------------------------------------===// +//===- FunctionFiltering.cpp -------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. @@ -13,7 +13,7 @@ #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROpsSupport.h" -#include "flang/Optimizer/Transforms/Passes.h" +#include "flang/Optimizer/OpenMP/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" @@ -21,18 +21,18 @@ #include "mlir/IR/BuiltinOps.h" #include "llvm/ADT/SmallVector.h" -namespace fir { -#define GEN_PASS_DEF_OMPFUNCTIONFILTERING -#include "flang/Optimizer/Transforms/Passes.h.inc" -} // namespace fir +namespace flangomp { +#define GEN_PASS_DEF_FUNCTIONFILTERING +#include "flang/Optimizer/OpenMP/Passes.h.inc" +} // namespace flangomp using namespace mlir; namespace { -class OMPFunctionFilteringPass - : public fir::impl::OMPFunctionFilteringBase { +class FunctionFilteringPass + : public flangomp::impl::FunctionFilteringBase { public: - OMPFunctionFilteringPass() = default; + FunctionFilteringPass() = default; void runOnOperation() override { MLIRContext *context = &getContext(); diff --git a/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp similarity index 96% rename from flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp rename to flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index ddaa3c5f404f0b..6e9cd03dca8f3f 100644 --- a/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -1,5 +1,4 @@ -//===- OMPMapInfoFinalization.cpp -//---------------------------------------------------===// +//===- MapInfoFinalization.cpp -----------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -28,7 +27,7 @@ #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Dialect/Support/KindMapping.h" -#include "flang/Optimizer/Transforms/Passes.h" +#include "flang/Optimizer/OpenMP/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/BuiltinDialect.h" @@ -41,15 +40,15 @@ #include "llvm/Frontend/OpenMP/OMPConstants.h" #include -namespace fir { -#define GEN_PASS_DEF_OMPMAPINFOFINALIZATIONPASS -#include "flang/Optimizer/Transforms/Passes.h.inc" -} // namespace fir +namespace flangomp { +#define GEN_PASS_DEF_MAPINFOFINALIZATIONPASS +#include "flang/Optimizer/OpenMP/Passes.h.inc" +} // namespace flangomp namespace { -class OMPMapInfoFinalizationPass - : public fir::impl::OMPMapInfoFinalizationPassBase< - OMPMapInfoFinalizationPass> { +class MapInfoFinalizationPass + : public flangomp::impl::MapInfoFinalizationPassBase< + MapInfoFinalizationPass> { void genDescriptorMemberMaps(mlir::omp::MapInfoOp op, fir::FirOpBuilder &builder, @@ -245,7 +244,7 @@ class OMPMapInfoFinalizationPass // all users appropriately, making sure to only add a single member link // per new generation for the original originating descriptor MapInfoOp. 
assert(llvm::hasSingleElement(op->getUsers()) && - "OMPMapInfoFinalization currently only supports single users " + "MapInfoFinalization currently only supports single users " "of a MapInfoOp"); if (!op.getMembers().empty()) { diff --git a/flang/lib/Optimizer/Transforms/OMPMarkDeclareTarget.cpp b/flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp similarity index 80% rename from flang/lib/Optimizer/Transforms/OMPMarkDeclareTarget.cpp rename to flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp index 4946e13b22865d..a7ffd5fda82b7f 100644 --- a/flang/lib/Optimizer/Transforms/OMPMarkDeclareTarget.cpp +++ b/flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp @@ -1,4 +1,16 @@ -#include "flang/Optimizer/Transforms/Passes.h" +//===- MarkDeclareTarget.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Mark functions called from explicit target code as implicitly declare target. 
+// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/OpenMP/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" @@ -10,14 +22,14 @@ #include "mlir/Support/LLVM.h" #include "llvm/ADT/SmallPtrSet.h" -namespace fir { -#define GEN_PASS_DEF_OMPMARKDECLARETARGETPASS -#include "flang/Optimizer/Transforms/Passes.h.inc" -} // namespace fir +namespace flangomp { +#define GEN_PASS_DEF_MARKDECLARETARGETPASS +#include "flang/Optimizer/OpenMP/Passes.h.inc" +} // namespace flangomp namespace { -class OMPMarkDeclareTargetPass - : public fir::impl::OMPMarkDeclareTargetPassBase { +class MarkDeclareTargetPass + : public flangomp::impl::MarkDeclareTargetPassBase { void markNestedFuncs(mlir::omp::DeclareTargetDeviceType parentDevTy, mlir::omp::DeclareTargetCaptureClause parentCapClause, diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index bf0a8d14d95df6..b32f2ef86fca44 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -22,9 +22,6 @@ add_flang_library(FIRTransforms AddDebugInfo.cpp PolymorphicOpConversion.cpp LoopVersioning.cpp - OMPFunctionFiltering.cpp - OMPMapInfoFinalization.cpp - OMPMarkDeclareTarget.cpp StackReclaim.cpp VScaleAttr.cpp FunctionAttr.cpp diff --git a/flang/tools/bbc/CMakeLists.txt b/flang/tools/bbc/CMakeLists.txt index 9410fd00566006..69316d4dc61de3 100644 --- a/flang/tools/bbc/CMakeLists.txt +++ b/flang/tools/bbc/CMakeLists.txt @@ -25,6 +25,7 @@ FIRTransforms FIRBuilder HLFIRDialect HLFIRTransforms +FlangOpenMPTransforms ${dialect_libs} ${extension_libs} MLIRAffineToStandard diff --git a/flang/tools/fir-opt/CMakeLists.txt b/flang/tools/fir-opt/CMakeLists.txt index 43679a9d535782..4c6dbf7d9c8c37 100644 --- a/flang/tools/fir-opt/CMakeLists.txt +++ b/flang/tools/fir-opt/CMakeLists.txt @@ -19,6 
+19,7 @@ target_link_libraries(fir-opt PRIVATE FIRCodeGen HLFIRDialect HLFIRTransforms + FlangOpenMPTransforms FIRAnalysis ${test_libs} ${dialect_libs} diff --git a/flang/tools/fir-opt/fir-opt.cpp b/flang/tools/fir-opt/fir-opt.cpp index 1846c1b317848f..f75fba27c68f08 100644 --- a/flang/tools/fir-opt/fir-opt.cpp +++ b/flang/tools/fir-opt/fir-opt.cpp @@ -14,6 +14,7 @@ #include "mlir/Tools/mlir-opt/MlirOptMain.h" #include "flang/Optimizer/CodeGen/CodeGen.h" #include "flang/Optimizer/HLFIR/Passes.h" +#include "flang/Optimizer/OpenMP/Passes.h" #include "flang/Optimizer/Support/InitFIR.h" #include "flang/Optimizer/Transforms/Passes.h" @@ -34,6 +35,7 @@ int main(int argc, char **argv) { fir::registerOptCodeGenPasses(); fir::registerOptTransformPasses(); hlfir::registerHLFIRPasses(); + flangomp::registerFlangOpenMPPasses(); #ifdef FLANG_INCLUDE_TESTS fir::test::registerTestFIRAliasAnalysisPass(); mlir::registerSideEffectTestPasses(); diff --git a/flang/tools/tco/CMakeLists.txt b/flang/tools/tco/CMakeLists.txt index 808219ac361f2a..698a398547c773 100644 --- a/flang/tools/tco/CMakeLists.txt +++ b/flang/tools/tco/CMakeLists.txt @@ -17,6 +17,7 @@ target_link_libraries(tco PRIVATE FIRBuilder HLFIRDialect HLFIRTransforms + FlangOpenMPTransforms ${dialect_libs} ${extension_libs} MLIRIR From bf88db78bd80cb624b49510c628ba841fb1fed04 Mon Sep 17 00:00:00 2001 From: itrofimow Date: Thu, 22 Aug 2024 06:53:41 +0400 Subject: [PATCH 140/426] [Symbolizer, DebugInfo] Clean up LLVMSymbolizer API: const string& -> StringRef (#104541) Nothing in the affected code depends on the `ModuleName` being null-terminated, so take it by `StringRef` instead of `const std::string &`. This change simplifies API consumption, since one doesn't always have a `std::string` at the call site (might have `std::string_view` instead), and also gives some minor performance improvements by removing string-copies in the cache-hit path of `getOrCreateModuleInfo`. 
--- .../llvm/DebugInfo/Symbolize/Symbolize.h | 16 +++++++------- llvm/lib/DebugInfo/Symbolize/Symbolize.cpp | 21 ++++++++++--------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h b/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h index bd8de070f84c5a..df2e806259b369 100644 --- a/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h +++ b/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h @@ -77,7 +77,7 @@ class LLVMSymbolizer { // Overloads accepting ObjectFile does not support COFF currently Expected symbolizeCode(const ObjectFile &Obj, object::SectionedAddress ModuleOffset); - Expected symbolizeCode(const std::string &ModuleName, + Expected symbolizeCode(StringRef ModuleName, object::SectionedAddress ModuleOffset); Expected symbolizeCode(ArrayRef BuildID, object::SectionedAddress ModuleOffset); @@ -85,7 +85,7 @@ class LLVMSymbolizer { symbolizeInlinedCode(const ObjectFile &Obj, object::SectionedAddress ModuleOffset); Expected - symbolizeInlinedCode(const std::string &ModuleName, + symbolizeInlinedCode(StringRef ModuleName, object::SectionedAddress ModuleOffset); Expected symbolizeInlinedCode(ArrayRef BuildID, @@ -93,15 +93,14 @@ class LLVMSymbolizer { Expected symbolizeData(const ObjectFile &Obj, object::SectionedAddress ModuleOffset); - Expected symbolizeData(const std::string &ModuleName, + Expected symbolizeData(StringRef ModuleName, object::SectionedAddress ModuleOffset); Expected symbolizeData(ArrayRef BuildID, object::SectionedAddress ModuleOffset); Expected> symbolizeFrame(const ObjectFile &Obj, object::SectionedAddress ModuleOffset); Expected> - symbolizeFrame(const std::string &ModuleName, - object::SectionedAddress ModuleOffset); + symbolizeFrame(StringRef ModuleName, object::SectionedAddress ModuleOffset); Expected> symbolizeFrame(ArrayRef BuildID, object::SectionedAddress ModuleOffset); @@ -109,7 +108,7 @@ class LLVMSymbolizer { Expected> findSymbol(const ObjectFile &Obj, StringRef 
Symbol, uint64_t Offset); Expected> - findSymbol(const std::string &ModuleName, StringRef Symbol, uint64_t Offset); + findSymbol(StringRef ModuleName, StringRef Symbol, uint64_t Offset); Expected> findSymbol(ArrayRef BuildID, StringRef Symbol, uint64_t Offset); @@ -132,8 +131,7 @@ class LLVMSymbolizer { /// Only one attempt is made to load a module, and errors during loading are /// only reported once. Subsequent calls to get module info for a module that /// failed to load will return nullptr. - Expected - getOrCreateModuleInfo(const std::string &ModuleName); + Expected getOrCreateModuleInfo(StringRef ModuleName); private: // Bundles together object file with code/data and object file with @@ -210,7 +208,7 @@ class LLVMSymbolizer { ObjectPairForPathArch; /// Contains parsed binary for each path, or parsing error. - std::map BinaryForPath; + std::map> BinaryForPath; /// A list of cached binaries in LRU order. simple_ilist LRUBinaries; diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp index 9a18095edf35a8..42799b0826a02b 100644 --- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp +++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp @@ -87,7 +87,7 @@ LLVMSymbolizer::symbolizeCode(const ObjectFile &Obj, } Expected -LLVMSymbolizer::symbolizeCode(const std::string &ModuleName, +LLVMSymbolizer::symbolizeCode(StringRef ModuleName, object::SectionedAddress ModuleOffset) { return symbolizeCodeCommon(ModuleName, ModuleOffset); } @@ -138,7 +138,7 @@ LLVMSymbolizer::symbolizeInlinedCode(const ObjectFile &Obj, } Expected -LLVMSymbolizer::symbolizeInlinedCode(const std::string &ModuleName, +LLVMSymbolizer::symbolizeInlinedCode(StringRef ModuleName, object::SectionedAddress ModuleOffset) { return symbolizeInlinedCodeCommon(ModuleName, ModuleOffset); } @@ -183,7 +183,7 @@ LLVMSymbolizer::symbolizeData(const ObjectFile &Obj, } Expected -LLVMSymbolizer::symbolizeData(const std::string &ModuleName, +LLVMSymbolizer::symbolizeData(StringRef 
ModuleName, object::SectionedAddress ModuleOffset) { return symbolizeDataCommon(ModuleName, ModuleOffset); } @@ -224,7 +224,7 @@ LLVMSymbolizer::symbolizeFrame(const ObjectFile &Obj, } Expected> -LLVMSymbolizer::symbolizeFrame(const std::string &ModuleName, +LLVMSymbolizer::symbolizeFrame(StringRef ModuleName, object::SectionedAddress ModuleOffset) { return symbolizeFrameCommon(ModuleName, ModuleOffset); } @@ -272,7 +272,7 @@ LLVMSymbolizer::findSymbol(const ObjectFile &Obj, StringRef Symbol, } Expected> -LLVMSymbolizer::findSymbol(const std::string &ModuleName, StringRef Symbol, +LLVMSymbolizer::findSymbol(StringRef ModuleName, StringRef Symbol, uint64_t Offset) { return findSymbolCommon(ModuleName, Symbol, Offset); } @@ -604,13 +604,13 @@ LLVMSymbolizer::createModuleInfo(const ObjectFile *Obj, } Expected -LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) { - std::string BinaryName = ModuleName; - std::string ArchName = Opts.DefaultArch; +LLVMSymbolizer::getOrCreateModuleInfo(StringRef ModuleName) { + StringRef BinaryName = ModuleName; + StringRef ArchName = Opts.DefaultArch; size_t ColonPos = ModuleName.find_last_of(':'); // Verify that substring after colon form a valid arch name. if (ColonPos != std::string::npos) { - std::string ArchStr = ModuleName.substr(ColonPos + 1); + StringRef ArchStr = ModuleName.substr(ColonPos + 1); if (Triple(ArchStr).getArch() != Triple::UnknownArch) { BinaryName = ModuleName.substr(0, ColonPos); ArchName = ArchStr; @@ -623,7 +623,8 @@ LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) { return I->second.get(); } - auto ObjectsOrErr = getOrCreateObjectPair(BinaryName, ArchName); + auto ObjectsOrErr = + getOrCreateObjectPair(std::string{BinaryName}, std::string{ArchName}); if (!ObjectsOrErr) { // Failed to find valid object file. 
Modules.emplace(ModuleName, std::unique_ptr()); From c62fa63ff1a043dc62b88270680657483f307fae Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 21 Aug 2024 20:08:11 -0700 Subject: [PATCH 141/426] [ELF] Move mainPart to Ctx. NFC Ctx was introduced in March 2022 as a more suitable place for such singletons. --- lld/ELF/Arch/PPC.cpp | 2 +- lld/ELF/Arch/RISCV.cpp | 4 +-- lld/ELF/Arch/SystemZ.cpp | 2 +- lld/ELF/Arch/X86.cpp | 2 +- lld/ELF/Arch/X86_64.cpp | 2 +- lld/ELF/Config.h | 2 ++ lld/ELF/Driver.cpp | 3 +- lld/ELF/Relocations.cpp | 33 ++++++++++++---------- lld/ELF/SyntheticSections.cpp | 53 ++++++++++++++++++----------------- lld/ELF/SyntheticSections.h | 2 -- lld/ELF/Thunks.cpp | 2 +- lld/ELF/Writer.cpp | 29 ++++++++++--------- 12 files changed, 72 insertions(+), 64 deletions(-) diff --git a/lld/ELF/Arch/PPC.cpp b/lld/ELF/Arch/PPC.cpp index 1b0838456428f1..186dcf229b6f6d 100644 --- a/lld/ELF/Arch/PPC.cpp +++ b/lld/ELF/Arch/PPC.cpp @@ -187,7 +187,7 @@ void PPC::writeGotHeader(uint8_t *buf) const { // _GLOBAL_OFFSET_TABLE_[0] = _DYNAMIC // glibc stores _dl_runtime_resolve in _GLOBAL_OFFSET_TABLE_[1], // link_map in _GLOBAL_OFFSET_TABLE_[2]. 
- write32(buf, mainPart->dynamic->getVA()); + write32(buf, ctx.mainPart->dynamic->getVA()); } void PPC::writeGotPlt(uint8_t *buf, const Symbol &s) const { diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index 1e939caf591cef..dc9e541d5d8bef 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -200,9 +200,9 @@ int64_t RISCV::getImplicitAddend(const uint8_t *buf, RelType type) const { void RISCV::writeGotHeader(uint8_t *buf) const { if (config->is64) - write64le(buf, mainPart->dynamic->getVA()); + write64le(buf, ctx.mainPart->dynamic->getVA()); else - write32le(buf, mainPart->dynamic->getVA()); + write32le(buf, ctx.mainPart->dynamic->getVA()); } void RISCV::writeGotPlt(uint8_t *buf, const Symbol &s) const { diff --git a/lld/ELF/Arch/SystemZ.cpp b/lld/ELF/Arch/SystemZ.cpp index 0921bc11925189..293df50708952b 100644 --- a/lld/ELF/Arch/SystemZ.cpp +++ b/lld/ELF/Arch/SystemZ.cpp @@ -179,7 +179,7 @@ RelExpr SystemZ::getRelExpr(RelType type, const Symbol &s, void SystemZ::writeGotHeader(uint8_t *buf) const { // _GLOBAL_OFFSET_TABLE_[0] holds the value of _DYNAMIC. // _GLOBAL_OFFSET_TABLE_[1] and [2] are reserved. 
- write64be(buf, mainPart->dynamic->getVA()); + write64be(buf, ctx.mainPart->dynamic->getVA()); } void SystemZ::writeGotPlt(uint8_t *buf, const Symbol &s) const { diff --git a/lld/ELF/Arch/X86.cpp b/lld/ELF/Arch/X86.cpp index 8d4f258e2cf24e..20b69adc12bc05 100644 --- a/lld/ELF/Arch/X86.cpp +++ b/lld/ELF/Arch/X86.cpp @@ -164,7 +164,7 @@ RelExpr X86::adjustTlsExpr(RelType type, RelExpr expr) const { } void X86::writeGotPltHeader(uint8_t *buf) const { - write32le(buf, mainPart->dynamic->getVA()); + write32le(buf, ctx.mainPart->dynamic->getVA()); } void X86::writeGotPlt(uint8_t *buf, const Symbol &s) const { diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp index 3f95247e0035c3..65a81fe12f8709 100644 --- a/lld/ELF/Arch/X86_64.cpp +++ b/lld/ELF/Arch/X86_64.cpp @@ -411,7 +411,7 @@ void X86_64::writeGotPltHeader(uint8_t *buf) const { // in the psABI and glibc before Aug 2021 used the entry to compute run-time // load address of the shared object (note that this is relevant for linking // ld.so, not any other program). - write64le(buf, mainPart->dynamic->getVA()); + write64le(buf, ctx.mainPart->dynamic->getVA()); } void X86_64::writeGotPlt(uint8_t *buf, const Symbol &s) const { diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 035b385ba37ec3..0c7bfe1bef7e59 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -46,6 +46,7 @@ class Defined; class Symbol; class BitcodeCompiler; class OutputSection; +struct Partition; struct PhdrEntry; enum ELFKind : uint8_t { @@ -486,6 +487,7 @@ struct Ctx { // These variables are initialized by Writer and should not be used before // Writer is initialized. 
uint8_t *bufferStart; + Partition *mainPart; PhdrEntry *tlsPhdr; struct OutSections { OutputSection *elfHeader; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 36552e4bb035af..ced06a1c46a826 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -95,6 +95,7 @@ void Ctx::reset() { driver = LinkerDriver(); bufferStart = nullptr; + mainPart = nullptr; tlsPhdr = nullptr; out = OutSections{}; outputSections.clear(); @@ -3094,7 +3095,7 @@ template void LinkerDriver::link(opt::InputArgList &args) { // Now that the number of partitions is fixed, save a pointer to the main // partition. - mainPart = &partitions[0]; + ctx.mainPart = &partitions[0]; // Read .note.gnu.property sections from input object files which // contain a hint to tweak linker's and loader's behaviors. diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 9dbb4567495a81..9ccef389d48e38 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -399,7 +399,7 @@ template static void addCopyRelSymbol(SharedSymbol &ss) { for (SharedSymbol *sym : getSymbolsAt(ss)) replaceWithDefined(*sym, *sec, 0, sym->size); - mainPart->relaDyn->addSymbolReloc(target->copyRel, *sec, 0, ss); + ctx.mainPart->relaDyn->addSymbolReloc(target->copyRel, *sec, 0, ss); } // .eh_frame sections are mergeable input sections, so their input @@ -927,8 +927,9 @@ void elf::addGotEntry(Symbol &sym) { // If preemptible, emit a GLOB_DAT relocation. 
if (sym.isPreemptible) { - mainPart->relaDyn->addReloc({target->gotRel, in.got.get(), off, - DynamicReloc::AgainstSymbol, sym, 0, R_ABS}); + ctx.mainPart->relaDyn->addReloc({target->gotRel, in.got.get(), off, + DynamicReloc::AgainstSymbol, sym, 0, + R_ABS}); return; } @@ -947,7 +948,7 @@ static void addTpOffsetGotEntry(Symbol &sym) { in.got->addConstant({R_TPREL, target->symbolicRel, off, 0, &sym}); return; } - mainPart->relaDyn->addAddendOnlyRelocIfNonPreemptible( + ctx.mainPart->relaDyn->addAddendOnlyRelocIfNonPreemptible( target->tlsGotRel, *in.got, off, sym, target->symbolicRel); } @@ -1085,7 +1086,8 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, if (LLVM_UNLIKELY(isIfunc) && config->zIfuncNoplt) { std::lock_guard lock(relocMutex); sym.exportDynamic = true; - mainPart->relaDyn->addSymbolReloc(type, *sec, offset, sym, addend, type); + ctx.mainPart->relaDyn->addSymbolReloc(type, *sec, offset, sym, addend, + type); return; } @@ -1727,7 +1729,8 @@ static bool handleNonPreemptibleIfunc(Symbol &sym, uint16_t flags) { // IRELATIVE in .rela.plt. auto *directSym = makeDefined(cast(sym)); directSym->allocateAux(); - auto &dyn = config->androidPackDynRelocs ? *in.relaPlt : *mainPart->relaDyn; + auto &dyn = + config->androidPackDynRelocs ? 
*in.relaPlt : *ctx.mainPart->relaDyn; addPltEntry(*in.iplt, *in.igotPlt, dyn, target->iRelativeRel, *directSym); sym.allocateAux(); ctx.symAux.back().pltIdx = ctx.symAux[directSym->auxIdx].pltIdx; @@ -1758,7 +1761,7 @@ void elf::postScanRelocations() { return; if (sym.isTagged() && sym.isDefined()) - mainPart->memtagGlobalDescriptors->addSymbol(sym); + ctx.mainPart->memtagGlobalDescriptors->addSymbol(sym); if (!sym.needsDynReloc()) return; @@ -1799,7 +1802,7 @@ void elf::postScanRelocations() { if (flags & NEEDS_TLSDESC) { got->addTlsDescEntry(sym); - mainPart->relaDyn->addAddendOnlyRelocIfNonPreemptible( + ctx.mainPart->relaDyn->addAddendOnlyRelocIfNonPreemptible( target->tlsDescRel, *got, got->getTlsDescOffset(sym), sym, target->tlsDescRel); } @@ -1810,22 +1813,22 @@ void elf::postScanRelocations() { // Write one to the GOT slot. got->addConstant({R_ADDEND, target->symbolicRel, off, 1, &sym}); else - mainPart->relaDyn->addSymbolReloc(target->tlsModuleIndexRel, *got, off, - sym); + ctx.mainPart->relaDyn->addSymbolReloc(target->tlsModuleIndexRel, *got, + off, sym); // If the symbol is preemptible we need the dynamic linker to write // the offset too. 
uint64_t offsetOff = off + config->wordsize; if (sym.isPreemptible) - mainPart->relaDyn->addSymbolReloc(target->tlsOffsetRel, *got, offsetOff, - sym); + ctx.mainPart->relaDyn->addSymbolReloc(target->tlsOffsetRel, *got, + offsetOff, sym); else got->addConstant({R_ABS, target->tlsOffsetRel, offsetOff, 0, &sym}); } if (flags & NEEDS_TLSGD_TO_IE) { got->addEntry(sym); - mainPart->relaDyn->addSymbolReloc(target->tlsGotRel, *got, - sym.getGotOffset(), sym); + ctx.mainPart->relaDyn->addSymbolReloc(target->tlsGotRel, *got, + sym.getGotOffset(), sym); } if (flags & NEEDS_GOT_DTPREL) { got->addEntry(sym); @@ -1841,7 +1844,7 @@ void elf::postScanRelocations() { if (ctx.needsTlsLd.load(std::memory_order_relaxed) && got->addTlsIndex()) { static Undefined dummy(ctx.internalFile, "", STB_LOCAL, 0, 0); if (config->shared) - mainPart->relaDyn->addReloc( + ctx.mainPart->relaDyn->addReloc( {target->tlsModuleIndexRel, got, got->getTlsIndexOff()}); else got->addConstant( diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 7d26fa9aea74ab..1dbbd117290a4d 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -1017,9 +1017,9 @@ void MipsGotSection::build() { // for the TP-relative offset as we don't know how much other data will // be allocated before us in the static TLS block. 
if (s->isPreemptible || config->shared) - mainPart->relaDyn->addReloc({target->tlsGotRel, this, offset, - DynamicReloc::AgainstSymbolWithTargetVA, - *s, 0, R_ABS}); + ctx.mainPart->relaDyn->addReloc( + {target->tlsGotRel, this, offset, + DynamicReloc::AgainstSymbolWithTargetVA, *s, 0, R_ABS}); } for (std::pair &p : got.dynTlsSymbols) { Symbol *s = p.first; @@ -1027,7 +1027,8 @@ void MipsGotSection::build() { if (s == nullptr) { if (!config->shared) continue; - mainPart->relaDyn->addReloc({target->tlsModuleIndexRel, this, offset}); + ctx.mainPart->relaDyn->addReloc( + {target->tlsModuleIndexRel, this, offset}); } else { // When building a shared library we still need a dynamic relocation // for the module index. Therefore only checking for @@ -1035,15 +1036,15 @@ void MipsGotSection::build() { // thread-locals that have been marked as local through a linker script) if (!s->isPreemptible && !config->shared) continue; - mainPart->relaDyn->addSymbolReloc(target->tlsModuleIndexRel, *this, - offset, *s); + ctx.mainPart->relaDyn->addSymbolReloc(target->tlsModuleIndexRel, *this, + offset, *s); // However, we can skip writing the TLS offset reloc for non-preemptible // symbols since it is known even in shared libraries if (!s->isPreemptible) continue; offset += config->wordsize; - mainPart->relaDyn->addSymbolReloc(target->tlsOffsetRel, *this, offset, - *s); + ctx.mainPart->relaDyn->addSymbolReloc(target->tlsOffsetRel, *this, + offset, *s); } } @@ -1055,8 +1056,8 @@ void MipsGotSection::build() { // Dynamic relocations for "global" entries. 
for (const std::pair &p : got.global) { uint64_t offset = p.second * config->wordsize; - mainPart->relaDyn->addSymbolReloc(target->relativeRel, *this, offset, - *p.first); + ctx.mainPart->relaDyn->addSymbolReloc(target->relativeRel, *this, offset, + *p.first); } if (!config->isPic) continue; @@ -1066,15 +1067,15 @@ void MipsGotSection::build() { size_t pageCount = l.second.count; for (size_t pi = 0; pi < pageCount; ++pi) { uint64_t offset = (l.second.firstIndex + pi) * config->wordsize; - mainPart->relaDyn->addReloc({target->relativeRel, this, offset, l.first, - int64_t(pi * 0x10000)}); + ctx.mainPart->relaDyn->addReloc({target->relativeRel, this, offset, + l.first, int64_t(pi * 0x10000)}); } } for (const std::pair &p : got.local16) { uint64_t offset = p.second * config->wordsize; - mainPart->relaDyn->addReloc({target->relativeRel, this, offset, - DynamicReloc::AddendOnlyWithTargetVA, - *p.first.first, p.first.second, R_ABS}); + ctx.mainPart->relaDyn->addReloc({target->relativeRel, this, offset, + DynamicReloc::AddendOnlyWithTargetVA, + *p.first.first, p.first.second, R_ABS}); } } } @@ -1473,10 +1474,11 @@ DynamicSection::computeContents() { addInt(DT_AARCH64_MEMTAG_MODE, config->androidMemtagMode == NT_MEMTAG_LEVEL_ASYNC); addInt(DT_AARCH64_MEMTAG_HEAP, config->androidMemtagHeap); addInt(DT_AARCH64_MEMTAG_STACK, config->androidMemtagStack); - if (mainPart->memtagGlobalDescriptors->isNeeded()) { - addInSec(DT_AARCH64_MEMTAG_GLOBALS, *mainPart->memtagGlobalDescriptors); + if (ctx.mainPart->memtagGlobalDescriptors->isNeeded()) { + addInSec(DT_AARCH64_MEMTAG_GLOBALS, + *ctx.mainPart->memtagGlobalDescriptors); addInt(DT_AARCH64_MEMTAG_GLOBALSSZ, - mainPart->memtagGlobalDescriptors->getSize()); + ctx.mainPart->memtagGlobalDescriptors->getSize()); } } } @@ -1617,7 +1619,7 @@ uint32_t DynamicReloc::getSymIndex(SymbolTableBaseSection *symTab) const { size_t index = symTab->getSymbolIndex(*sym); assert((index != 0 || (type != target->gotRel && type != target->pltRel) || - 
!mainPart->dynSymTab->getParent()) && + !ctx.mainPart->dynSymTab->getParent()) && "GOT or PLT relocation must refer to symbol in dynamic symbol table"); return index; } @@ -2149,7 +2151,7 @@ void SymbolTableBaseSection::finalizeContents() { // Only the main partition's dynsym indexes are stored in the symbols // themselves. All other partitions use a lookup table. - if (this == mainPart->dynSymTab.get()) { + if (this == ctx.mainPart->dynSymTab.get()) { size_t i = 0; for (const SymbolTableEntry &s : symbols) s.sym->dynsymIndex = ++i; @@ -2193,7 +2195,7 @@ void SymbolTableBaseSection::addSymbol(Symbol *b) { } size_t SymbolTableBaseSection::getSymbolIndex(const Symbol &sym) { - if (this == mainPart->dynSymTab.get()) + if (this == ctx.mainPart->dynSymTab.get()) return sym.dynsymIndex; // Initializes symbol lookup tables lazily. This is used only for -r, @@ -3968,7 +3970,7 @@ void elf::combineEhSections() { llvm::append_range(eh.dependentSections, sec->dependentSections); } - if (!mainPart->armExidx) + if (!ctx.mainPart->armExidx) return; llvm::erase_if(ctx.inputSections, [](InputSectionBase *s) { // Ignore dead sections and the partition end marker (.part.end), @@ -4439,13 +4441,15 @@ size_t PartitionIndexSection::getSize() const { void PartitionIndexSection::finalizeContents() { for (size_t i = 1; i != partitions.size(); ++i) - partitions[i].nameStrTab = mainPart->dynStrTab->addString(partitions[i].name); + partitions[i].nameStrTab = + ctx.mainPart->dynStrTab->addString(partitions[i].name); } void PartitionIndexSection::writeTo(uint8_t *buf) { uint64_t va = getVA(); for (size_t i = 1; i != partitions.size(); ++i) { - write32(buf, mainPart->dynStrTab->getVA() + partitions[i].nameStrTab - va); + write32(buf, + ctx.mainPart->dynStrTab->getVA() + partitions[i].nameStrTab - va); write32(buf + 4, partitions[i].elfHeader->getVA() - (va + 4)); SyntheticSection *next = i == partitions.size() - 1 @@ -4922,7 +4926,6 @@ template void elf::createSyntheticSections() { InStruct 
elf::in; std::vector elf::partitions; -Partition *elf::mainPart; template void elf::splitSections(); template void elf::splitSections(); diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index 43eb82cbb3e28b..56647f46b5fc41 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -1474,8 +1474,6 @@ struct Partition { unsigned getNumber() const { return this - &partitions[0] + 1; } }; -LLVM_LIBRARY_VISIBILITY extern Partition *mainPart; - inline Partition &SectionBase::getPartition() const { assert(isLive()); return partitions[partition - 1]; diff --git a/lld/ELF/Thunks.cpp b/lld/ELF/Thunks.cpp index f912f61e372943..478d956f43d9b1 100644 --- a/lld/ELF/Thunks.cpp +++ b/lld/ELF/Thunks.cpp @@ -474,7 +474,7 @@ class PPC64PILongBranchThunk final : public PPC64LongBranchThunk { assert(!dest.isPreemptible); if (std::optional index = in.ppc64LongBranchTarget->addEntry(&dest, addend)) { - mainPart->relaDyn->addRelativeReloc( + ctx.mainPart->relaDyn->addRelativeReloc( target->relativeRel, *in.ppc64LongBranchTarget, *index * UINT64_C(8), dest, addend + getPPC64GlobalEntryToLocalEntryOffset(dest.stOther), target->symbolicRel, R_ABS); diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 82d9ea24d9bd3f..2091058c9c8874 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -315,7 +315,7 @@ template void Writer::run() { sec->maybeCompress(); if (script->hasSectionsCommand) - script->allocateHeaders(mainPart->phdrs); + script->allocateHeaders(ctx.mainPart->phdrs); // Remove empty PT_LOAD to avoid causing the dynamic linker to try to mmap a // 0 sized region. This has to be done late since only after assignAddresses @@ -833,10 +833,10 @@ template void Writer::setReservedSymbolSections() { } // .rela_iplt_{start,end} mark the start and the end of .rel[a].dyn. 
- if (ctx.sym.relaIpltStart && mainPart->relaDyn->isNeeded()) { - ctx.sym.relaIpltStart->section = mainPart->relaDyn.get(); - ctx.sym.relaIpltEnd->section = mainPart->relaDyn.get(); - ctx.sym.relaIpltEnd->value = mainPart->relaDyn->getSize(); + if (ctx.sym.relaIpltStart && ctx.mainPart->relaDyn->isNeeded()) { + ctx.sym.relaIpltStart->section = ctx.mainPart->relaDyn.get(); + ctx.sym.relaIpltEnd->section = ctx.mainPart->relaDyn.get(); + ctx.sym.relaIpltEnd->value = ctx.mainPart->relaDyn->getSize(); } PhdrEntry *last = nullptr; @@ -1681,7 +1681,7 @@ static void removeUnusedSyntheticSections() { // we would fail to remove it here. if (config->emachine == EM_AARCH64 && config->relrPackDynRelocs) if (auto *relSec = dyn_cast(sec)) - if (relSec == mainPart->relaDyn.get()) + if (relSec == ctx.mainPart->relaDyn.get()) return false; unused.insert(sec); return true; @@ -1721,10 +1721,10 @@ template void Writer::finalizeSections() { // It should be okay as no one seems to care about the type. // Even the author of gold doesn't remember why gold behaves that way. // https://sourceware.org/ml/binutils/2002-03/msg00360.html - if (mainPart->dynamic->parent) { + if (ctx.mainPart->dynamic->parent) { Symbol *s = symtab.addSymbol(Defined{ ctx.internalFile, "_DYNAMIC", STB_WEAK, STV_HIDDEN, STT_NOTYPE, - /*value=*/0, /*size=*/0, mainPart->dynamic.get()}); + /*value=*/0, /*size=*/0, ctx.mainPart->dynamic.get()}); s->isUsedInRegularObj = true; } @@ -1951,13 +1951,14 @@ template void Writer::finalizeSections() { addPhdrForSection(part, SHT_RISCV_ATTRIBUTES, PT_RISCV_ATTRIBUTES, PF_R); } - ctx.out.programHeaders->size = sizeof(Elf_Phdr) * mainPart->phdrs.size(); + ctx.out.programHeaders->size = + sizeof(Elf_Phdr) * ctx.mainPart->phdrs.size(); // Find the TLS segment. This happens before the section layout loop so that // Android relocation packing can look up TLS symbol addresses. 
We only need // to care about the main partition here because all TLS symbols were moved // to the main partition (see MarkLive.cpp). - for (PhdrEntry *p : mainPart->phdrs) + for (PhdrEntry *p : ctx.mainPart->phdrs) if (p->p_type == PT_TLS) ctx.tlsPhdr = p; } @@ -2720,8 +2721,8 @@ static uint16_t getELFType() { } template void Writer::writeHeader() { - writeEhdr(ctx.bufferStart, *mainPart); - writePhdrs(ctx.bufferStart + sizeof(Elf_Ehdr), *mainPart); + writeEhdr(ctx.bufferStart, *ctx.mainPart); + writePhdrs(ctx.bufferStart + sizeof(Elf_Ehdr), *ctx.mainPart); auto *eHdr = reinterpret_cast(ctx.bufferStart); eHdr->e_type = getELFType(); @@ -2884,7 +2885,7 @@ computeHash(llvm::MutableArrayRef hashBuf, } template void Writer::writeBuildId() { - if (!mainPart->buildId || !mainPart->buildId->getParent()) + if (!ctx.mainPart->buildId || !ctx.mainPart->buildId->getParent()) return; if (config->buildId == BuildIdKind::Hexstring) { @@ -2894,7 +2895,7 @@ template void Writer::writeBuildId() { } // Compute a hash of all sections of the output file. - size_t hashSize = mainPart->buildId->hashSize; + size_t hashSize = ctx.mainPart->buildId->hashSize; std::unique_ptr buildId(new uint8_t[hashSize]); MutableArrayRef output(buildId.get(), hashSize); llvm::ArrayRef input{ctx.bufferStart, size_t(fileSize)}; From 796787d07c30cb9448e1f9ff3f3da06c2fc96ccd Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 21 Aug 2024 20:27:44 -0700 Subject: [PATCH 142/426] [ELF] Remove unneeded script->. 
NFC --- lld/ELF/LinkerScript.cpp | 35 +++++++++++++++-------------------- lld/ELF/LinkerScript.h | 1 + 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index 01c15d79e748ae..2f781379a27243 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -50,7 +50,7 @@ static bool isSectionPrefix(StringRef prefix, StringRef name) { return name.consume_front(prefix) && (name.empty() || name[0] == '.'); } -static StringRef getOutputSectionName(const InputSectionBase *s) { +StringRef LinkerScript::getOutputSectionName(const InputSectionBase *s) const { // This is for --emit-relocs and -r. If .text.foo is emitted as .text.bar, we // want to emit .rela.text.foo as .rela.text.bar for consistency (this is not // technically required, but not doing it is odd). This code guarantees that. @@ -77,7 +77,7 @@ static StringRef getOutputSectionName(const InputSectionBase *s) { if (s->name == "COMMON") return ".bss"; - if (script->hasSectionsCommand) + if (hasSectionsCommand) return s->name; // When no SECTIONS is specified, emulate GNU ld's internal linker scripts @@ -596,7 +596,7 @@ LinkerScript::computeInputSections(const InputSectionDescription *cmd, sortByPositionThenCommandLine(sizeAfterPrevSort, ret.size()); } else { SectionClassDesc *scd = - script->sectionClasses.lookup(CachedHashStringRef(cmd->classRef)); + sectionClasses.lookup(CachedHashStringRef(cmd->classRef)); if (!scd) { errorOrWarn("undefined section class '" + cmd->classRef + "'"); return ret; @@ -1160,7 +1160,7 @@ bool LinkerScript::assignOffsets(OutputSection *sec) { } state->outSec = sec; - if (!(sec->addrExpr && script->hasSectionsCommand)) { + if (!(sec->addrExpr && hasSectionsCommand)) { // ALIGN is respected. sec->alignment is the max of ALIGN and the maximum of // input section alignments. 
const uint64_t pos = dot; @@ -1419,15 +1419,6 @@ void LinkerScript::adjustSectionsAfterSorting() { maybePropagatePhdrs(osd->osec, defPhdrs); } -static uint64_t computeBase(uint64_t min, bool allocateHeaders) { - // If there is no SECTIONS or if the linkerscript is explicit about program - // headers, do our best to allocate them. - if (!script->hasSectionsCommand || allocateHeaders) - return 0; - // Otherwise only allocate program headers if that would not add a page. - return alignDown(min, config->maxPageSize); -} - // When the SECTIONS command is used, try to find an address for the file and // program headers output sections, which can be added to the first PT_LOAD // segment when program headers are created. @@ -1453,8 +1444,13 @@ void LinkerScript::allocateHeaders(SmallVector &phdrs) { }); bool paged = !config->omagic && !config->nmagic; uint64_t headerSize = getHeaderSize(); - if ((paged || hasExplicitHeaders) && - headerSize <= min - computeBase(min, hasExplicitHeaders)) { + + uint64_t base = 0; + // If SECTIONS is present and the linkerscript is not explicit about program + // headers, only allocate program headers if that would not add a page. + if (hasSectionsCommand && !hasExplicitHeaders) + base = alignDown(min, config->maxPageSize); + if ((paged || hasExplicitHeaders) && headerSize <= min - base) { min = alignDown(min - headerSize, config->maxPageSize); ctx.out.elfHeader->addr = min; ctx.out.programHeaders->addr = min + ctx.out.elfHeader->size; @@ -1487,7 +1483,7 @@ LinkerScript::AddressState::AddressState() { // that has changed its section or value (or nullptr if no symbol has changed). std::pair LinkerScript::assignAddresses() { - if (script->hasSectionsCommand) { + if (hasSectionsCommand) { // With a linker script, assignment of addresses to headers is covered by // allocateHeaders(). 
dot = config->imageBase.value_or(0); @@ -1805,10 +1801,9 @@ void LinkerScript::addScriptReferencedSymbolsToSymTable() { for (StringRef name : *symRefsVec.pop_back_val()) { reference(name); // Prevent the symbol from being discarded by --gc-sections. - script->referencedSymbols.push_back(name); - auto it = script->provideMap.find(name); - if (it != script->provideMap.end() && - LinkerScript::shouldAddProvideSym(name) && + referencedSymbols.push_back(name); + auto it = provideMap.find(name); + if (it != provideMap.end() && shouldAddProvideSym(name) && added.insert(name).second) { symRefsVec.push_back(&it->second); } diff --git a/lld/ELF/LinkerScript.h b/lld/ELF/LinkerScript.h index 90090ce16de547..b057051f772bf8 100644 --- a/lld/ELF/LinkerScript.h +++ b/lld/ELF/LinkerScript.h @@ -300,6 +300,7 @@ class LinkerScript final { llvm::DenseMap nameToOutputSection; + StringRef getOutputSectionName(const InputSectionBase *s) const; void addSymbol(SymbolAssignment *cmd); void assignSymbol(SymbolAssignment *cmd, bool inSec); void setDot(Expr e, const Twine &loc, bool inSec); From 88636854b007affdbe324369b26c9ded66934b22 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 21 Aug 2024 20:37:37 -0700 Subject: [PATCH 143/426] [RISCV][GISel] Correct register classes in vector sext/zext.mir tests. NFC The liveins were always for an LMUL=1 register class even if the first instruction used a larger register class. One test in zext.mir used the wrong class for the first instruction. 
--- .../instruction-select/rvv/sext.mir | 48 ++++++++--------- .../instruction-select/rvv/zext.mir | 54 +++++++++---------- 2 files changed, 51 insertions(+), 51 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/sext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/sext.mir index 382166fb20544e..a52e7203761ab9 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/sext.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/sext.mir @@ -393,10 +393,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m2 ; RV32I-LABEL: name: sext_nxv16i16_nxv16i8 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m2 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -405,7 +405,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m4 ; ; RV64I-LABEL: name: sext_nxv16i16_nxv16i8 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m2 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -425,10 +425,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m2 ; RV32I-LABEL: name: sext_nxv16i32_nxv16i8 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m2 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -437,7 +437,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: sext_nxv16i32_nxv16i8 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m2 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -457,10 +457,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m4 ; RV32I-LABEL: name: sext_nxv32i16_nxv32i8 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m4 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: 
[[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -469,7 +469,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: sext_nxv32i16_nxv32i8 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m4 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -681,10 +681,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m2 ; RV32I-LABEL: name: sext_nxv8i32_nxv8i16 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m2 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -693,7 +693,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m4 ; ; RV64I-LABEL: name: sext_nxv8i32_nxv8i16 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m2 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -713,10 +713,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m2 ; RV32I-LABEL: name: sext_nxv8i64_nxv8i16 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m2 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -725,7 +725,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: sext_nxv8i64_nxv8i16 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m2 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -745,10 +745,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m4 ; RV32I-LABEL: name: sext_nxv16i32_nxv16i16 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m4 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -757,7 +757,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; 
RV64I-LABEL: name: sext_nxv16i32_nxv16i16 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m4 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -841,10 +841,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m2 ; RV32I-LABEL: name: sext_nxv4i64_nxv4i32 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m2 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -853,7 +853,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m4 ; ; RV64I-LABEL: name: sext_nxv4i64_nxv4i32 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m2 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -873,10 +873,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m4 ; RV32I-LABEL: name: sext_nxv8i64_nxv8i32 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m4 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -885,7 +885,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: sext_nxv8i64_nxv8i32 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m4 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/zext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/zext.mir index 2fc9e05602a8b0..ad151b4d9c7fee 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/zext.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/zext.mir @@ -393,10 +393,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m2 ; RV32I-LABEL: name: zext_nxv16i16_nxv16i8 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m2 ; 
RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -405,7 +405,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m4 ; ; RV64I-LABEL: name: zext_nxv16i16_nxv16i8 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m2 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -425,10 +425,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m2 ; RV32I-LABEL: name: zext_nxv16i32_nxv16i8 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m2 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -437,7 +437,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: zext_nxv16i32_nxv16i8 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m2 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -457,10 +457,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m4 ; RV32I-LABEL: name: zext_nxv32i16_nxv32i8 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m4 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -469,7 +469,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: zext_nxv32i16_nxv32i8 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m4 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -681,10 +681,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m2 ; RV32I-LABEL: name: zext_nxv8i32_nxv8i16 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m2 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -693,7 +693,7 @@ body: | ; 
RV32I-NEXT: PseudoRET implicit $v8m4 ; ; RV64I-LABEL: name: zext_nxv8i32_nxv8i16 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m2 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -713,26 +713,26 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m2 ; RV32I-LABEL: name: zext_nxv8i64_nxv8i16 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m2 ; RV32I-NEXT: {{ $}} - ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m4 + ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */ ; RV32I-NEXT: $v8m8 = COPY %1 ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: zext_nxv8i64_nxv8i16 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m2 ; RV64I-NEXT: {{ $}} - ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m4 + ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */ ; RV64I-NEXT: $v8m8 = COPY %1 ; RV64I-NEXT: PseudoRET implicit $v8m8 - %0:vrb() = COPY $v8m4 + %0:vrb() = COPY $v8m2 %1:vrb() = G_ZEXT %0() $v8m8 = COPY %1() PseudoRET implicit $v8m8 @@ -745,10 +745,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m4 ; RV32I-LABEL: name: zext_nxv16i32_nxv16i16 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m4 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -757,7 +757,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: zext_nxv16i32_nxv16i16 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m4 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -841,10 
+841,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m2 ; RV32I-LABEL: name: zext_nxv4i64_nxv4i32 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m2 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -853,7 +853,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m4 ; ; RV64I-LABEL: name: zext_nxv4i64_nxv4i32 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m2 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -873,10 +873,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m4 ; RV32I-LABEL: name: zext_nxv8i64_nxv8i32 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m4 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -885,7 +885,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: zext_nxv8i64_nxv8i32 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m4 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF From 503907dc505db1e439e7061113bf84dd105f2e35 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 21 Aug 2024 21:12:18 -0700 Subject: [PATCH 144/426] [ELF] LinkerScript: initialize dot. NFC Ensure that `dot` is initialized even if `script` uses default-initialization. 
--- lld/ELF/LinkerScript.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/ELF/LinkerScript.h b/lld/ELF/LinkerScript.h index b057051f772bf8..6634478160bc60 100644 --- a/lld/ELF/LinkerScript.h +++ b/lld/ELF/LinkerScript.h @@ -332,7 +332,7 @@ class LinkerScript final { OutputSection *aether; - uint64_t dot; + uint64_t dot = 0; public: OutputDesc *createOutputSection(StringRef name, StringRef location); From 4629aa17976b4110e6e94e7c92926c789730702e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 21 Aug 2024 21:23:28 -0700 Subject: [PATCH 145/426] [ELF] Move script into Ctx. NFC Ctx was introduced in March 2022 as a more suitable place for such singletons. We now use default-initialization for `LinkerScript` and should pay attention to non-class types (e.g. `dot` is initialized by commit 503907dc505db1e439e7061113bf84dd105f2e35). --- lld/ELF/Config.h | 2 + lld/ELF/Driver.cpp | 22 ++++---- lld/ELF/ICF.cpp | 2 +- lld/ELF/LinkerScript.cpp | 6 +- lld/ELF/LinkerScript.h | 7 --- lld/ELF/MapFile.cpp | 2 +- lld/ELF/MarkLive.cpp | 4 +- lld/ELF/Relocations.cpp | 4 +- lld/ELF/ScriptParser.cpp | 100 ++++++++++++++++++---------------- lld/ELF/SyntheticSections.cpp | 13 +++-- lld/ELF/Writer.cpp | 93 +++++++++++++++---------------- 11 files changed, 128 insertions(+), 127 deletions(-) diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 0c7bfe1bef7e59..5987edee0e93e7 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -46,6 +46,7 @@ class Defined; class Symbol; class BitcodeCompiler; class OutputSection; +class LinkerScript; struct Partition; struct PhdrEntry; @@ -483,6 +484,7 @@ struct DuplicateSymbol { struct Ctx { LinkerDriver driver; + LinkerScript *script; // These variables are initialized by Writer and should not be used before // Writer is initialized. 
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index ced06a1c46a826..308fd86c29ba12 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -93,6 +93,7 @@ void elf::errorOrWarn(const Twine &msg) { void Ctx::reset() { driver = LinkerDriver(); + script = nullptr; bufferStart = nullptr; mainPart = nullptr; @@ -160,8 +161,9 @@ bool link(ArrayRef args, llvm::raw_ostream &stdoutOS, "--error-limit=0 to see all errors)"; config = ConfigWrapper(); - script = ScriptWrapper(); + LinkerScript script; + elf::ctx.script = &script; elf::ctx.symAux.emplace_back(); partitions.clear(); @@ -463,7 +465,7 @@ static void checkOptions() { if (config->emachine != EM_AARCH64) error("--execute-only is only supported on AArch64 targets"); - if (config->singleRoRx && !script->hasSectionsCommand) + if (config->singleRoRx && !ctx.script->hasSectionsCommand) error("--execute-only and --no-rosegment cannot be used together"); } @@ -2456,10 +2458,10 @@ static void readSymbolPartitionSection(InputSectionBase *s) { // Forbid partitions from being used on incompatible targets, and forbid them // from being used together with various linker features that assume a single // set of output sections. - if (script->hasSectionsCommand) + if (ctx.script->hasSectionsCommand) error(toString(s->file) + ": partitions cannot be used with the SECTIONS command"); - if (script->hasPhdrsCommands()) + if (ctx.script->hasPhdrsCommands()) error(toString(s->file) + ": partitions cannot be used with the PHDRS command"); if (!config->sectionStartMap.empty()) @@ -2873,7 +2875,7 @@ template void LinkerDriver::link(opt::InputArgList &args) { // After potential archive member extraction involving ENTRY and // -u/--undefined-glob, check whether PROVIDE symbols should be defined (the // RHS may refer to definitions in just extracted object files). - script->addScriptReferencedSymbolsToSymTable(); + ctx.script->addScriptReferencedSymbolsToSymTable(); // Prevent LTO from removing any definition referenced by -u. 
for (StringRef name : config->undefined) @@ -2939,7 +2941,7 @@ template void LinkerDriver::link(opt::InputArgList &args) { // We want to declare linker script's symbols early, // so that we can version them. // They also might be exported if referenced by DSOs. - script->declareSymbols(); + ctx.script->declareSymbols(); // Handle --exclude-libs. This is before scanVersionScript() due to a // workaround for Android ndk: for a defined versioned symbol in an archive @@ -3158,13 +3160,13 @@ template void LinkerDriver::link(opt::InputArgList &args) { llvm::TimeTraceScope timeScope("Assign sections"); // Create output sections described by SECTIONS commands. - script->processSectionCommands(); + ctx.script->processSectionCommands(); // Linker scripts control how input sections are assigned to output // sections. Input sections that were not handled by scripts are called // "orphans", and they are assigned to output sections by the default rule. // Process that. - script->addOrphanSections(); + ctx.script->addOrphanSections(); } { @@ -3174,9 +3176,9 @@ template void LinkerDriver::link(opt::InputArgList &args) { // merging MergeInputSections into a single MergeSyntheticSection. From this // point onwards InputSectionDescription::sections should be used instead of // sectionBases. - for (SectionCommand *cmd : script->sectionCommands) + for (SectionCommand *cmd : ctx.script->sectionCommands) if (auto *osd = dyn_cast(cmd)) - osd->osec.finalizeInputSections(&script.s); + osd->osec.finalizeInputSections(ctx.script); } // Two input sections with different output sections should not be folded. diff --git a/lld/ELF/ICF.cpp b/lld/ELF/ICF.cpp index 44e8a71cc62869..92b3bbb46cc95d 100644 --- a/lld/ELF/ICF.cpp +++ b/lld/ELF/ICF.cpp @@ -577,7 +577,7 @@ template void ICF::run() { // InputSectionDescription::sections is populated by processSectionCommands(). // ICF may fold some input sections assigned to output sections. Remove them. 
- for (SectionCommand *cmd : script->sectionCommands) + for (SectionCommand *cmd : ctx.script->sectionCommands) if (auto *osd = dyn_cast(cmd)) for (SectionCommand *subCmd : osd->osec.commands) if (auto *isd = dyn_cast(subCmd)) diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index 2f781379a27243..9ddda99d90f02d 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -44,8 +44,6 @@ using namespace llvm::support::endian; using namespace lld; using namespace lld::elf; -ScriptWrapper elf::script; - static bool isSectionPrefix(StringRef prefix, StringRef name) { return name.consume_front(prefix) && (name.empty() || name[0] == '.'); } @@ -862,7 +860,7 @@ static OutputSection *findByName(ArrayRef vec, } static OutputDesc *createSection(InputSectionBase *isec, StringRef outsecName) { - OutputDesc *osd = script->createOutputSection(outsecName, ""); + OutputDesc *osd = ctx.script->createOutputSection(outsecName, ""); osd->osec.recordSection(isec); return osd; } @@ -1470,7 +1468,7 @@ void LinkerScript::allocateHeaders(SmallVector &phdrs) { } LinkerScript::AddressState::AddressState() { - for (auto &mri : script->memoryRegions) { + for (auto &mri : ctx.script->memoryRegions) { MemoryRegion *mr = mri.second; mr->curPos = (mr->origin)().getValue(); } diff --git a/lld/ELF/LinkerScript.h b/lld/ELF/LinkerScript.h index 6634478160bc60..dee558722299f7 100644 --- a/lld/ELF/LinkerScript.h +++ b/lld/ELF/LinkerScript.h @@ -446,13 +446,6 @@ class LinkerScript final { llvm::DenseMap sectionClasses; }; -struct ScriptWrapper { - LinkerScript s; - LinkerScript *operator->() { return &s; } -}; - -LLVM_LIBRARY_VISIBILITY extern ScriptWrapper script; - } // end namespace lld::elf #endif // LLD_ELF_LINKER_SCRIPT_H diff --git a/lld/ELF/MapFile.cpp b/lld/ELF/MapFile.cpp index 1bad529b403299..26de8e4f0d8358 100644 --- a/lld/ELF/MapFile.cpp +++ b/lld/ELF/MapFile.cpp @@ -158,7 +158,7 @@ static void writeMapFile(raw_fd_ostream &os) { << " Size Align Out In Symbol\n"; 
OutputSection *osec = nullptr; - for (SectionCommand *cmd : script->sectionCommands) { + for (SectionCommand *cmd : ctx.script->sectionCommands) { if (auto *assign = dyn_cast(cmd)) { if (assign->provide && !assign->sym) continue; diff --git a/lld/ELF/MarkLive.cpp b/lld/ELF/MarkLive.cpp index 16e5883c2002c3..b2558a20ba1a78 100644 --- a/lld/ELF/MarkLive.cpp +++ b/lld/ELF/MarkLive.cpp @@ -234,7 +234,7 @@ template void MarkLive::run() { markSymbol(symtab.find(config->fini)); for (StringRef s : config->undefined) markSymbol(symtab.find(s)); - for (StringRef s : script->referencedSymbols) + for (StringRef s : ctx.script->referencedSymbols) markSymbol(symtab.find(s)); for (auto [symName, _] : symtab.cmseSymMap) { markSymbol(symtab.cmseSymMap[symName].sym); @@ -293,7 +293,7 @@ template void MarkLive::run() { // Preserve special sections and those which are specified in linker // script KEEP command. - if (isReserved(sec) || script->shouldKeep(sec)) { + if (isReserved(sec) || ctx.script->shouldKeep(sec)) { enqueue(sec, 0); } else if ((!config->zStartStopGC || sec->name.starts_with("__libc_")) && isValidCIdentifier(sec->name)) { diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 9ccef389d48e38..fa94842f3636b3 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -66,7 +66,7 @@ using namespace lld; using namespace lld::elf; static std::optional getLinkerScriptLocation(const Symbol &sym) { - for (SectionCommand *cmd : script->sectionCommands) + for (SectionCommand *cmd : ctx.script->sectionCommands) if (auto *assign = dyn_cast(cmd)) if (assign->sym == &sym) return assign->location; @@ -2420,7 +2420,7 @@ static void scanCrossRefs(const NoCrossRefCommand &cmd, OutputSection *osec, // scan relocations from its input sections for prohibited cross references. 
template void elf::checkNoCrossRefs() { for (OutputSection *osec : ctx.outputSections) { - for (const NoCrossRefCommand &noxref : script->noCrossRefs) { + for (const NoCrossRefCommand &noxref : ctx.script->noCrossRefs) { if (!llvm::is_contained(noxref.outputSections, osec->name) || (noxref.toFirst && noxref.outputSections[0] == osec->name)) continue; diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index bdbce396cba1f8..08773bfb6ffe07 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -145,7 +145,7 @@ static void moveAbsRight(ExprValue &a, ExprValue &b) { if (a.sec == nullptr || (a.forceAbsolute && !b.isAbsolute())) std::swap(a, b); if (!b.isAbsolute()) - script->recordError( + ctx.script->recordError( a.loc + ": at least one side of the expression must be absolute"); } @@ -278,7 +278,7 @@ void ScriptParser::readLinkerScript() { } else if (tok == "NOCROSSREFS_TO") { readNoCrossRefs(/*to=*/true); } else if (SymbolAssignment *cmd = readAssignment(tok)) { - script->sectionCommands.push_back(cmd); + ctx.script->sectionCommands.push_back(cmd); } else { setError("unknown directive: " + tok); } @@ -296,7 +296,7 @@ void ScriptParser::readDefsym() { setError("EOF expected, but got " + next()); auto *cmd = make( name, e, 0, getCurrentMB().getBufferIdentifier().str()); - script->sectionCommands.push_back(cmd); + ctx.script->sectionCommands.push_back(cmd); } void ScriptParser::readNoCrossRefs(bool to) { @@ -307,7 +307,7 @@ void ScriptParser::readNoCrossRefs(bool to) { if (cmd.outputSections.size() < 2) warn(getCurrentLocation() + ": ignored with fewer than 2 output sections"); else - script->noCrossRefs.push_back(std::move(cmd)); + ctx.script->noCrossRefs.push_back(std::move(cmd)); } void ScriptParser::addFile(StringRef s) { @@ -529,7 +529,7 @@ void ScriptParser::readPhdrs() { setError("unexpected header attribute: " + next()); } - script->phdrsCommands.push_back(cmd); + ctx.script->phdrsCommands.push_back(cmd); } } @@ -540,11 +540,11 @@ 
void ScriptParser::readRegionAlias() { StringRef name = readName(); expect(")"); - if (script->memoryRegions.count(alias)) + if (ctx.script->memoryRegions.count(alias)) setError("redefinition of memory region '" + alias + "'"); - if (!script->memoryRegions.count(name)) + if (!ctx.script->memoryRegions.count(name)) setError("memory region '" + name + "' is not defined"); - script->memoryRegions.insert({alias, script->memoryRegions[name]}); + ctx.script->memoryRegions.insert({alias, ctx.script->memoryRegions[name]}); } void ScriptParser::readSearchDir() { @@ -562,7 +562,7 @@ void ScriptParser::readSearchDir() { SmallVector ScriptParser::readOverlay() { Expr addrExpr; if (consume(":")) { - addrExpr = [] { return script->getDot(); }; + addrExpr = [] { return ctx.script->getDot(); }; } else { addrExpr = readExpr(); expect(":"); @@ -570,7 +570,7 @@ SmallVector ScriptParser::readOverlay() { // When AT is omitted, LMA should equal VMA. script->getDot() when evaluating // lmaExpr will ensure this, even if the start address is specified. Expr lmaExpr = - consume("AT") ? readParenExpr() : [] { return script->getDot(); }; + consume("AT") ? 
readParenExpr() : [] { return ctx.script->getDot(); }; expect("{"); SmallVector v; @@ -610,7 +610,8 @@ SmallVector ScriptParser::readOverlay() { SectionClassDesc *ScriptParser::readSectionClassDescription() { StringRef name = readSectionClassName(); SectionClassDesc *desc = make(name); - if (!script->sectionClasses.insert({CachedHashStringRef(name), desc}).second) + if (!ctx.script->sectionClasses.insert({CachedHashStringRef(name), desc}) + .second) setError("section class '" + name + "' already defined"); expect("{"); while (auto tok = till("}")) { @@ -637,7 +638,7 @@ StringRef ScriptParser::readSectionClassName() { void ScriptParser::readOverwriteSections() { expect("{"); while (auto tok = till("}")) - script->overwriteSections.push_back(readOutputSectionDescription(tok)); + ctx.script->overwriteSections.push_back(readOutputSectionDescription(tok)); } void ScriptParser::readSections() { @@ -666,16 +667,16 @@ void ScriptParser::readSections() { // If DATA_SEGMENT_RELRO_END is absent, for sections after DATA_SEGMENT_ALIGN, // the relro fields should be cleared. 
- if (!script->seenRelroEnd) + if (!ctx.script->seenRelroEnd) for (SectionCommand *cmd : v) if (auto *osd = dyn_cast(cmd)) osd->osec.relro = false; - script->sectionCommands.insert(script->sectionCommands.end(), v.begin(), - v.end()); + ctx.script->sectionCommands.insert(ctx.script->sectionCommands.end(), + v.begin(), v.end()); if (atEOF() || !consume("INSERT")) { - script->hasSectionsCommand = true; + ctx.script->hasSectionsCommand = true; return; } @@ -690,7 +691,7 @@ void ScriptParser::readSections() { if (auto *os = dyn_cast(cmd)) names.push_back(os->osec.name); if (!names.empty()) - script->insertCommands.push_back({std::move(names), isAfter, where}); + ctx.script->insertCommands.push_back({std::move(names), isAfter, where}); } void ScriptParser::readTarget() { @@ -865,7 +866,7 @@ ScriptParser::readInputSectionDescription(StringRef tok) { else cmd = readInputSectionRules(tok, withFlags, withoutFlags); expect(")"); - script->keptSections.push_back(cmd); + ctx.script->keptSections.push_back(cmd); return cmd; } if (tok == "INPUT_SECTION_FLAGS") { @@ -894,7 +895,7 @@ Expr ScriptParser::readAssert() { return [=] { if (!e().getValue()) errorOrWarn(msg); - return script->getDot(); + return ctx.script->getDot(); }; } @@ -984,7 +985,7 @@ static Expr checkAlignment(Expr e, std::string &loc) { OutputDesc *ScriptParser::readOverlaySectionDescription() { OutputDesc *osd = - script->createOutputSection(readName(), getCurrentLocation()); + ctx.script->createOutputSection(readName(), getCurrentLocation()); osd->osec.inOverlay = true; expect("{"); while (auto tok = till("}")) { @@ -1007,12 +1008,12 @@ OutputDesc *ScriptParser::readOverlaySectionDescription() { OutputDesc *ScriptParser::readOutputSectionDescription(StringRef outSec) { OutputDesc *cmd = - script->createOutputSection(unquote(outSec), getCurrentLocation()); + ctx.script->createOutputSection(unquote(outSec), getCurrentLocation()); OutputSection *osec = &cmd->osec; // Maybe relro. 
Will reset to false if DATA_SEGMENT_RELRO_END is absent. - osec->relro = script->seenDataAlign && !script->seenRelroEnd; + osec->relro = ctx.script->seenDataAlign && !ctx.script->seenRelroEnd; - size_t symbolsReferenced = script->referencedSymbols.size(); + size_t symbolsReferenced = ctx.script->referencedSymbols.size(); if (peek() != ":") readSectionAddressType(osec); @@ -1095,7 +1096,7 @@ OutputDesc *ScriptParser::readOutputSectionDescription(StringRef outSec) { // Consume optional comma following output section command. consume(","); - if (script->referencedSymbols.size() > symbolsReferenced) + if (ctx.script->referencedSymbols.size() > symbolsReferenced) osec->expressionsUseSymbols = true; return cmd; } @@ -1158,7 +1159,7 @@ SymbolAssignment *ScriptParser::readAssignment(StringRef tok) { const char *oldS = prevTok.data(); SymbolAssignment *cmd = nullptr; - bool savedSeenRelroEnd = script->seenRelroEnd; + bool savedSeenRelroEnd = ctx.script->seenRelroEnd; const StringRef op = peek(); { SaveAndRestore saved(inExpr, true); @@ -1178,7 +1179,7 @@ SymbolAssignment *ScriptParser::readAssignment(StringRef tok) { } if (cmd) { - cmd->dataSegmentRelroEnd = !savedSeenRelroEnd && script->seenRelroEnd; + cmd->dataSegmentRelroEnd = !savedSeenRelroEnd && ctx.script->seenRelroEnd; cmd->commandString = StringRef(oldS, curTok.data() - oldS).str(); squeezeSpaces(cmd->commandString); expect(";"); @@ -1197,7 +1198,7 @@ SymbolAssignment *ScriptParser::readSymbolAssignment(StringRef name) { if (op != "=") { std::string loc = getCurrentLocation(); e = [=, c = op[0]]() -> ExprValue { - ExprValue lhs = script->getSymbolValue(name, loc); + ExprValue lhs = ctx.script->getSymbolValue(name, loc); switch (c) { case '*': return lhs.getValue() * e().getValue(); @@ -1460,8 +1461,8 @@ StringRef ScriptParser::readParenName() { } static void checkIfExists(const OutputSection &osec, StringRef location) { - if (osec.location.empty() && script->errorOnMissingSection) - script->recordError(location + 
": undefined section " + osec.name); + if (osec.location.empty() && ctx.script->errorOnMissingSection) + ctx.script->recordError(location + ": undefined section " + osec.name); } static bool isValidSymbolName(StringRef s) { @@ -1503,7 +1504,7 @@ Expr ScriptParser::readPrimary() { } if (tok == "ADDR") { StringRef name = readParenName(); - OutputSection *osec = &script->getOrCreateOutputSection(name)->osec; + OutputSection *osec = &ctx.script->getOrCreateOutputSection(name)->osec; osec->usedInExpression = true; return [=]() -> ExprValue { checkIfExists(*osec, location); @@ -1515,7 +1516,8 @@ Expr ScriptParser::readPrimary() { Expr e = readExpr(); if (consume(")")) { e = checkAlignment(e, location); - return [=] { return alignToPowerOf2(script->getDot(), e().getValue()); }; + return + [=] { return alignToPowerOf2(ctx.script->getDot(), e().getValue()); }; } expect(","); Expr e2 = checkAlignment(readExpr(), location); @@ -1528,7 +1530,7 @@ Expr ScriptParser::readPrimary() { } if (tok == "ALIGNOF") { StringRef name = readParenName(); - OutputSection *osec = &script->getOrCreateOutputSection(name)->osec; + OutputSection *osec = &ctx.script->getOrCreateOutputSection(name)->osec; return [=] { checkIfExists(*osec, location); return osec->addralign; @@ -1544,17 +1546,17 @@ Expr ScriptParser::readPrimary() { expect(","); readExpr(); expect(")"); - script->seenDataAlign = true; + ctx.script->seenDataAlign = true; return [=] { uint64_t align = std::max(uint64_t(1), e().getValue()); - return (script->getDot() + align - 1) & -align; + return (ctx.script->getDot() + align - 1) & -align; }; } if (tok == "DATA_SEGMENT_END") { expect("("); expect("."); expect(")"); - return [] { return script->getDot(); }; + return [] { return ctx.script->getDot(); }; } if (tok == "DATA_SEGMENT_RELRO_END") { // GNU linkers implements more complicated logic to handle @@ -1565,8 +1567,10 @@ Expr ScriptParser::readPrimary() { expect(","); readExpr(); expect(")"); - script->seenRelroEnd = true; - return 
[=] { return alignToPowerOf2(script->getDot(), config->maxPageSize); }; + ctx.script->seenRelroEnd = true; + return [=] { + return alignToPowerOf2(ctx.script->getDot(), config->maxPageSize); + }; } if (tok == "DEFINED") { StringRef name = readParenName(); @@ -1581,15 +1585,15 @@ Expr ScriptParser::readPrimary() { } if (tok == "LENGTH") { StringRef name = readParenName(); - if (script->memoryRegions.count(name) == 0) { + if (ctx.script->memoryRegions.count(name) == 0) { setError("memory region not defined: " + name); return [] { return 0; }; } - return script->memoryRegions[name]->length; + return ctx.script->memoryRegions[name]->length; } if (tok == "LOADADDR") { StringRef name = readParenName(); - OutputSection *osec = &script->getOrCreateOutputSection(name)->osec; + OutputSection *osec = &ctx.script->getOrCreateOutputSection(name)->osec; osec->usedInExpression = true; return [=] { checkIfExists(*osec, location); @@ -1617,11 +1621,11 @@ Expr ScriptParser::readPrimary() { } if (tok == "ORIGIN") { StringRef name = readParenName(); - if (script->memoryRegions.count(name) == 0) { + if (ctx.script->memoryRegions.count(name) == 0) { setError("memory region not defined: " + name); return [] { return 0; }; } - return script->memoryRegions[name]->origin; + return ctx.script->memoryRegions[name]->origin; } if (tok == "SEGMENT_START") { expect("("); @@ -1633,7 +1637,7 @@ Expr ScriptParser::readPrimary() { } if (tok == "SIZEOF") { StringRef name = readParenName(); - OutputSection *cmd = &script->getOrCreateOutputSection(name)->osec; + OutputSection *cmd = &ctx.script->getOrCreateOutputSection(name)->osec; // Linker script does not create an output section if its content is empty. // We want to allow SIZEOF(.foo) where .foo is a section which happened to // be empty. @@ -1644,7 +1648,7 @@ Expr ScriptParser::readPrimary() { // Tok is the dot. 
if (tok == ".") - return [=] { return script->getSymbolValue(tok, location); }; + return [=] { return ctx.script->getSymbolValue(tok, location); }; // Tok is a literal number. if (std::optional val = parseInt(tok)) @@ -1656,10 +1660,10 @@ Expr ScriptParser::readPrimary() { else if (!isValidSymbolName(tok)) setError("malformed number: " + tok); if (activeProvideSym) - script->provideMap[*activeProvideSym].push_back(tok); + ctx.script->provideMap[*activeProvideSym].push_back(tok); else - script->referencedSymbols.push_back(tok); - return [=] { return script->getSymbolValue(tok, location); }; + ctx.script->referencedSymbols.push_back(tok); + return [=] { return ctx.script->getSymbolValue(tok, location); }; } Expr ScriptParser::readTernary(Expr cond) { @@ -1849,7 +1853,7 @@ void ScriptParser::readMemory() { // Add the memory region to the region map. MemoryRegion *mr = make(tok, origin, length, flags, invFlags, negFlags, negInvFlags); - if (!script->memoryRegions.insert({tok, mr}).second) + if (!ctx.script->memoryRegions.insert({tok, mr}).second) setError("region '" + tok + "' already defined"); } } diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 1dbbd117290a4d..4c2b6db08b99a2 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -2343,7 +2343,7 @@ bool SymtabShndxSection::isNeeded() const { // late, and we do not know them here. For simplicity, we just always create // a .symtab_shndx section when the amount of output sections is huge. 
size_t size = 0; - for (SectionCommand *cmd : script->sectionCommands) + for (SectionCommand *cmd : ctx.script->sectionCommands) if (isa(cmd)) ++size; return size >= SHN_LORESERVE; @@ -4495,7 +4495,7 @@ void InStruct::reset() { static bool needsInterpSection() { return !config->relocatable && !config->shared && - !config->dynamicLinker.empty() && script->needsInterpSection(); + !config->dynamicLinker.empty() && ctx.script->needsInterpSection(); } bool elf::hasMemtag() { @@ -4630,7 +4630,7 @@ size_t MemtagGlobalDescriptors::getSize() const { } static OutputSection *findSection(StringRef name) { - for (SectionCommand *cmd : script->sectionCommands) + for (SectionCommand *cmd : ctx.script->sectionCommands) if (auto *osd = dyn_cast(cmd)) if (osd->osec.name == name) return &osd->osec; @@ -4682,7 +4682,8 @@ template void elf::createSyntheticSections() { // If there is a SECTIONS command and a .data.rel.ro section name use name // .data.rel.ro.bss so that we match in the .data.rel.ro output section. // This makes sure our relro is contiguous. - bool hasDataRelRo = script->hasSectionsCommand && findSection(".data.rel.ro"); + bool hasDataRelRo = + ctx.script->hasSectionsCommand && findSection(".data.rel.ro"); in.bssRelRo = std::make_unique( hasDataRelRo ? ".data.rel.ro.bss" : ".bss.rel.ro", 0, 1); add(*in.bssRelRo); @@ -4851,8 +4852,8 @@ template void elf::createSyntheticSections() { // Add .relro_padding if DATA_SEGMENT_RELRO_END is used; otherwise, add the // section in the absence of PHDRS/SECTIONS commands. 
if (config->zRelro && - ((script->phdrsCommands.empty() && !script->hasSectionsCommand) || - script->seenRelroEnd)) { + ((ctx.script->phdrsCommands.empty() && !ctx.script->hasSectionsCommand) || + ctx.script->seenRelroEnd)) { in.relroPadding = std::make_unique(); add(*in.relroPadding); } diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 2091058c9c8874..087804b43918ab 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -231,7 +231,7 @@ void elf::addReservedSymbols() { addOptionalRegular("__dso_handle", ctx.out.elfHeader, 0, STV_HIDDEN); // If linker script do layout we do not need to create any standard symbols. - if (script->hasSectionsCommand) + if (ctx.script->hasSectionsCommand) return; auto add = [](StringRef s, int64_t pos) { @@ -293,7 +293,7 @@ static void demoteSymbolsAndComputeIsPreemptible() { } static OutputSection *findSection(StringRef name, unsigned partition = 1) { - for (SectionCommand *cmd : script->sectionCommands) + for (SectionCommand *cmd : ctx.script->sectionCommands) if (auto *osd = dyn_cast(cmd)) if (osd->osec.name == name && osd->osec.partition == partition) return &osd->osec; @@ -314,8 +314,8 @@ template void Writer::run() { for (OutputSection *sec : ctx.outputSections) sec->maybeCompress(); - if (script->hasSectionsCommand) - script->allocateHeaders(ctx.mainPart->phdrs); + if (ctx.script->hasSectionsCommand) + ctx.script->allocateHeaders(ctx.mainPart->phdrs); // Remove empty PT_LOAD to avoid causing the dynamic linker to try to mmap a // 0 sized region. This has to be done late since only after assignAddresses @@ -338,7 +338,7 @@ template void Writer::run() { // Handle --print-memory-usage option. 
if (config->printMemoryUsage) - script->printMemoryUsage(lld::outs()); + ctx.script->printMemoryUsage(lld::outs()); if (config->checkSections) checkSections(); @@ -496,7 +496,7 @@ static void demoteAndCopyLocalSymbols() { // referring to a section (that happens if the section is a synthetic one), we // don't create a section symbol for that section. template void Writer::addSectionSymbols() { - for (SectionCommand *cmd : script->sectionCommands) { + for (SectionCommand *cmd : ctx.script->sectionCommands) { auto *osd = dyn_cast(cmd); if (!osd) continue; @@ -995,7 +995,8 @@ findOrphanPos(SmallVectorImpl::iterator b, // making a read-only segment writable. If memory regions are defined, an // orphan section should continue the same region as the found section to // better resemble the behavior of GNU ld. - bool mustAfter = script->hasPhdrsCommands() || !script->memoryRegions.empty(); + bool mustAfter = + ctx.script->hasPhdrsCommands() || !ctx.script->memoryRegions.empty(); if (cast(*i)->osec.sortRank <= sec->sortRank || mustAfter) { for (auto j = ++i; j != e; ++j) { if (!isOutputSecWithInputSections(*j)) @@ -1212,7 +1213,7 @@ static void sortSection(OutputSection &osec, if (auto *isd = dyn_cast(b)) sortISDBySectionOrder(isd, order, osec.flags & SHF_EXECINSTR); - if (script->hasSectionsCommand) + if (ctx.script->hasSectionsCommand) return; if (name == ".init_array" || name == ".fini_array") { @@ -1241,7 +1242,7 @@ template void Writer::sortInputSections() { // Build the order once since it is expensive. DenseMap order = buildSectionOrder(); maybeShuffle(order); - for (SectionCommand *cmd : script->sectionCommands) + for (SectionCommand *cmd : ctx.script->sectionCommands) if (auto *osd = dyn_cast(cmd)) sortSection(osd->osec, order); } @@ -1252,33 +1253,33 @@ template void Writer::sortSections() { // Don't sort if using -r. It is not necessary and we want to preserve the // relative order for SHF_LINK_ORDER sections. 
if (config->relocatable) { - script->adjustOutputSections(); + ctx.script->adjustOutputSections(); return; } sortInputSections(); - for (SectionCommand *cmd : script->sectionCommands) + for (SectionCommand *cmd : ctx.script->sectionCommands) if (auto *osd = dyn_cast(cmd)) osd->osec.sortRank = getSectionRank(osd->osec); - if (!script->hasSectionsCommand) { + if (!ctx.script->hasSectionsCommand) { // OutputDescs are mostly contiguous, but may be interleaved with // SymbolAssignments in the presence of INSERT commands. auto mid = std::stable_partition( - script->sectionCommands.begin(), script->sectionCommands.end(), + ctx.script->sectionCommands.begin(), ctx.script->sectionCommands.end(), [](SectionCommand *cmd) { return isa(cmd); }); - std::stable_sort(script->sectionCommands.begin(), mid, compareSections); + std::stable_sort(ctx.script->sectionCommands.begin(), mid, compareSections); } // Process INSERT commands and update output section attributes. From this // point onwards the order of script->sectionCommands is fixed. - script->processInsertCommands(); - script->adjustOutputSections(); + ctx.script->processInsertCommands(); + ctx.script->adjustOutputSections(); - if (script->hasSectionsCommand) + if (ctx.script->hasSectionsCommand) sortOrphanSections(); - script->adjustSectionsAfterSorting(); + ctx.script->adjustSectionsAfterSorting(); } template void Writer::sortOrphanSections() { @@ -1321,8 +1322,8 @@ template void Writer::sortOrphanSections() { // after another commands. For the details, look at shouldSkip // function. 
- auto i = script->sectionCommands.begin(); - auto e = script->sectionCommands.end(); + auto i = ctx.script->sectionCommands.begin(); + auto e = ctx.script->sectionCommands.end(); auto nonScriptI = std::find_if(i, e, [](SectionCommand *cmd) { if (auto *osd = dyn_cast(cmd)) return osd->osec.sectionIndex == UINT32_MAX; @@ -1434,7 +1435,7 @@ template void Writer::finalizeAddressDependentContent() { ThunkCreator tc; AArch64Err843419Patcher a64p; ARMErr657417Patcher a32p; - script->assignAddresses(); + ctx.script->assignAddresses(); // .ARM.exidx and SHF_LINK_ORDER do not require precise addresses, but they // do require the relative addresses of OutputSections because linker scripts @@ -1457,7 +1458,7 @@ template void Writer::finalizeAddressDependentContent() { bool changed = target->needsThunks ? tc.createThunks(pass, ctx.outputSections) : target->relaxOnce(pass); - bool spilled = script->spillSections(); + bool spilled = ctx.script->spillSections(); changed |= spilled; ++pass; @@ -1471,12 +1472,12 @@ template void Writer::finalizeAddressDependentContent() { if (config->fixCortexA53Errata843419) { if (changed) - script->assignAddresses(); + ctx.script->assignAddresses(); changed |= a64p.createFixes(); } if (config->fixCortexA8) { if (changed) - script->assignAddresses(); + ctx.script->assignAddresses(); changed |= a32p.createFixes(); } @@ -1517,7 +1518,7 @@ template void Writer::finalizeAddressDependentContent() { } std::pair changes = - script->assignAddresses(); + ctx.script->assignAddresses(); if (!changed) { // Some symbols may be dependent on section addresses. When we break the // loop, the symbol values are finalized because a previous @@ -1548,7 +1549,7 @@ template void Writer::finalizeAddressDependentContent() { // If addrExpr is set, the address may not be a multiple of the alignment. // Warn because this is error-prone. 
- for (SectionCommand *cmd : script->sectionCommands) + for (SectionCommand *cmd : ctx.script->sectionCommands) if (auto *osd = dyn_cast(cmd)) { OutputSection *osec = &osd->osec; if (osec->addr % osec->addralign != 0) @@ -1559,7 +1560,7 @@ template void Writer::finalizeAddressDependentContent() { // Sizes are no longer allowed to grow, so all allowable spills have been // taken. Remove any leftover potential spills. - script->erasePotentialSpillSections(); + ctx.script->erasePotentialSpillSections(); } // If Input Sections have been shrunk (basic block sections) then @@ -1613,7 +1614,7 @@ template void Writer::optimizeBasicBlockJumps() { assert(config->optimizeBBJumps); SmallVector storage; - script->assignAddresses(); + ctx.script->assignAddresses(); // For every output section that has executable input sections, this // does the following: // 1. Deletes all direct jump instructions in input sections that @@ -1634,7 +1635,7 @@ template void Writer::optimizeBasicBlockJumps() { numDeleted += target->deleteFallThruJmpInsn(sec, sec.file, next); } if (numDeleted > 0) { - script->assignAddresses(); + ctx.script->assignAddresses(); LLVM_DEBUG(llvm::dbgs() << "Removing " << numDeleted << " fall through jumps\n"); } @@ -1697,7 +1698,7 @@ static void removeUnusedSyntheticSections() { llvm::erase_if(isd->sections, [&](InputSection *isec) { return unused.count(isec); }); - llvm::erase_if(script->orphanSections, [&](const InputSectionBase *sec) { + llvm::erase_if(ctx.script->orphanSections, [&](const InputSectionBase *sec) { return unused.count(sec); }); } @@ -1713,7 +1714,7 @@ template void Writer::finalizeSections() { // symbols for sections, so that the runtime can get the start and end // addresses of each section by section name. Add such symbols. 
addStartEndSymbols(); - for (SectionCommand *cmd : script->sectionCommands) + for (SectionCommand *cmd : ctx.script->sectionCommands) if (auto *osd = dyn_cast(cmd)) addStartStopSymbols(osd->osec); @@ -1792,7 +1793,7 @@ template void Writer::finalizeSections() { // Change values of linker-script-defined symbols from placeholders (assigned // by declareSymbols) to actual definitions. - script->processSymbolAssignments(); + ctx.script->processSymbolAssignments(); if (!config->relocatable) { llvm::TimeTraceScope timeScope("Scan relocations"); @@ -1888,14 +1889,14 @@ template void Writer::finalizeSections() { in.mipsGot->build(); removeUnusedSyntheticSections(); - script->diagnoseOrphanHandling(); - script->diagnoseMissingSGSectionAddress(); + ctx.script->diagnoseOrphanHandling(); + ctx.script->diagnoseMissingSGSectionAddress(); sortSections(); // Create a list of OutputSections, assign sectionIndex, and populate // in.shStrTab. If -z nosectionheader is specified, drop non-ALLOC sections. - for (SectionCommand *cmd : script->sectionCommands) + for (SectionCommand *cmd : ctx.script->sectionCommands) if (auto *osd = dyn_cast(cmd)) { OutputSection *osec = &osd->osec; if (!in.shStrTab && !(osec->flags & SHF_ALLOC)) @@ -1935,8 +1936,8 @@ template void Writer::finalizeSections() { // image base and the dynamic section on mips includes the image base. if (!config->relocatable && !config->oFormatBinary) { for (Partition &part : partitions) { - part.phdrs = script->hasPhdrsCommands() ? script->createPhdrs() - : createPhdrs(part); + part.phdrs = ctx.script->hasPhdrsCommands() ? ctx.script->createPhdrs() + : createPhdrs(part); if (config->emachine == EM_ARM) { // PT_ARM_EXIDX is the ARM EHABI equivalent of PT_GNU_EH_FRAME addPhdrForSection(part, SHT_ARM_EXIDX, PT_ARM_EXIDX, PF_R); @@ -1967,7 +1968,7 @@ template void Writer::finalizeSections() { // have the headers, we can find out which sections they point to. 
setReservedSymbolSections(); - if (script->noCrossRefs.size()) { + if (ctx.script->noCrossRefs.size()) { llvm::TimeTraceScope timeScope("Check NOCROSSREFS"); checkNoCrossRefs(); } @@ -2019,7 +2020,7 @@ template void Writer::finalizeSections() { } } - if (!script->hasSectionsCommand && !config->relocatable) + if (!ctx.script->hasSectionsCommand && !config->relocatable) fixSectionAlignments(); // This is used to: @@ -2071,7 +2072,7 @@ template void Writer::finalizeSections() { for (OutputSection *sec : ctx.outputSections) sec->finalize(); - script->checkFinalScriptConditions(); + ctx.script->checkFinalScriptConditions(); if (config->emachine == EM_ARM && !config->isLE && config->armBe8) { addArmInputSectionMappingSymbols(); @@ -2276,7 +2277,7 @@ SmallVector Writer::createPhdrs(Partition &part) { if (load && sec != relroEnd && sec->memRegion == load->firstSec->memRegion && (sameLMARegion || load->lastSec == ctx.out.programHeaders) && - (script->hasSectionsCommand || sec->type == SHT_NOBITS || + (ctx.script->hasSectionsCommand || sec->type == SHT_NOBITS || load->lastSec->type != SHT_NOBITS)) { load->p_flags |= newFlags; } else { @@ -2407,7 +2408,7 @@ template void Writer::fixSectionAlignments() { (prev->p_flags & PF_X) != (p->p_flags & PF_X)) || cmd->type == SHT_LLVM_PART_EHDR) cmd->addrExpr = [] { - return alignToPowerOf2(script->getDot(), config->maxPageSize); + return alignToPowerOf2(ctx.script->getDot(), config->maxPageSize); }; // PT_TLS is at the start of the first RW PT_LOAD. If `p` includes PT_TLS, // it must be the RW. Align to p_align(PT_TLS) to make sure @@ -2424,14 +2425,14 @@ template void Writer::fixSectionAlignments() { // blocks correctly. We need to keep the workaround for a while. 
else if (ctx.tlsPhdr && ctx.tlsPhdr->firstSec == p->firstSec) cmd->addrExpr = [] { - return alignToPowerOf2(script->getDot(), config->maxPageSize) + - alignToPowerOf2(script->getDot() % config->maxPageSize, + return alignToPowerOf2(ctx.script->getDot(), config->maxPageSize) + + alignToPowerOf2(ctx.script->getDot() % config->maxPageSize, ctx.tlsPhdr->p_align); }; else cmd->addrExpr = [] { - return alignToPowerOf2(script->getDot(), config->maxPageSize) + - script->getDot() % config->maxPageSize; + return alignToPowerOf2(ctx.script->getDot(), config->maxPageSize) + + ctx.script->getDot() % config->maxPageSize; }; } }; From f3bf46f5308a9684f4a5493268d6a96396130871 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 21 Aug 2024 21:41:06 -0700 Subject: [PATCH 146/426] [RISCV][GISel] Correct registers classes in vector anyext.mir test. NFC --- .../instruction-select/rvv/anyext.mir | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/anyext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/anyext.mir index eda1180b82854d..40fbd90f3aef59 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/anyext.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/anyext.mir @@ -395,10 +395,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m2 ; RV32I-LABEL: name: anyext_nxv16i16_nxv16i8 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m2 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -407,7 +407,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m4 ; ; RV64I-LABEL: name: anyext_nxv16i16_nxv16i8 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m2 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -427,26 +427,26 @@ regBankSelected: true tracksRegLiveness: true 
body: | bb.0.entry: - liveins: $v8 + liveins: $v8m2 ; RV32I-LABEL: name: anyext_nxv16i32_nxv16i8 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m2 ; RV32I-NEXT: {{ $}} - ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m4 + ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */ ; RV32I-NEXT: $v8m8 = COPY %1 ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: anyext_nxv16i32_nxv16i8 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m2 ; RV64I-NEXT: {{ $}} - ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m4 + ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */ ; RV64I-NEXT: $v8m8 = COPY %1 ; RV64I-NEXT: PseudoRET implicit $v8m8 - %0:vrb() = COPY $v8m4 + %0:vrb() = COPY $v8m2 %1:vrb() = G_ANYEXT %0() $v8m8 = COPY %1() PseudoRET implicit $v8m8 @@ -459,10 +459,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m4 ; RV32I-LABEL: name: anyext_nxv32i16_nxv32i8 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m4 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -471,7 +471,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: anyext_nxv32i16_nxv32i8 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m4 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -683,10 +683,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m2 ; RV32I-LABEL: name: anyext_nxv8i32_nxv8i16 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m2 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = 
IMPLICIT_DEF @@ -695,7 +695,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m4 ; ; RV64I-LABEL: name: anyext_nxv8i32_nxv8i16 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m2 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -715,10 +715,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m2 ; RV32I-LABEL: name: anyext_nxv8i64_nxv8i16 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m2 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -727,7 +727,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: anyext_nxv8i64_nxv8i16 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m2 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -747,10 +747,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m4 ; RV32I-LABEL: name: anyext_nxv16i32_nxv16i16 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m4 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -759,7 +759,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: anyext_nxv16i32_nxv16i16 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m4 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -843,10 +843,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m2 ; RV32I-LABEL: name: anyext_nxv4i64_nxv4i32 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m2 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -855,7 +855,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m4 ; ; RV64I-LABEL: name: anyext_nxv4i64_nxv4i32 - ; RV64I: 
liveins: $v8 + ; RV64I: liveins: $v8m2 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF @@ -875,10 +875,10 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0.entry: - liveins: $v8 + liveins: $v8m4 ; RV32I-LABEL: name: anyext_nxv8i64_nxv8i32 - ; RV32I: liveins: $v8 + ; RV32I: liveins: $v8m4 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF @@ -887,7 +887,7 @@ body: | ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: anyext_nxv8i64_nxv8i32 - ; RV64I: liveins: $v8 + ; RV64I: liveins: $v8m4 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4 ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF From 8039886e6d8985921802295dbc86401546120ac8 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 22 Aug 2024 09:16:37 +0400 Subject: [PATCH 147/426] AMDGPU: Handle folding frame indexes into s_add_i32 (#101694) This does not yet enable producing direct frame index references in s_add_i32, only the lowering. --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 87 ++ .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 34 +- .../eliminate-frame-index-s-add-i32.mir | 870 +++++++++++++++--- llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll | 19 +- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 66 +- llvm/test/CodeGen/AMDGPU/frame-index.mir | 4 +- .../local-stack-alloc-block-sp-reference.ll | 8 +- 7 files changed, 841 insertions(+), 247 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 7523b619748cc7..4c571a36e4896c 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2432,7 +2432,94 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, MI->eraseFromParent(); return true; } + case AMDGPU::S_ADD_I32: { + // TODO: Handle s_or_b32, s_and_b32. 
+ unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1; + MachineOperand &OtherOp = MI->getOperand(OtherOpIdx); + assert(FrameReg || MFI->isBottomOfStack()); + + MachineOperand &DstOp = MI->getOperand(0); + const DebugLoc &DL = MI->getDebugLoc(); + Register MaterializedReg = FrameReg; + + // Defend against live scc, which should never happen in practice. + bool DeadSCC = MI->getOperand(3).isDead(); + + Register TmpReg; + + if (FrameReg && !ST.enableFlatScratch()) { + // FIXME: In the common case where the add does not also read its result + // (i.e. this isn't a reg += fi), it's not finding the dest reg as + // available. + TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, MI, + false, 0); + BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32)) + .addDef(TmpReg, RegState::Renamable) + .addReg(FrameReg) + .addImm(ST.getWavefrontSizeLog2()) + .setOperandDead(3); // Set SCC dead + MaterializedReg = TmpReg; + } + + int64_t Offset = FrameInfo.getObjectOffset(Index); + + // For the non-immediate case, we could fall through to the default + // handling, but we do an in-place update of the result register here to + // avoid scavenging another register. + if (OtherOp.isImm()) { + OtherOp.setImm(OtherOp.getImm() + Offset); + Offset = 0; + + if (MaterializedReg) + FIOp.ChangeToRegister(MaterializedReg, false); + else + FIOp.ChangeToImmediate(0); + } else if (MaterializedReg) { + // If we can't fold the other operand, do another increment. 
+ Register DstReg = DstOp.getReg(); + + if (!TmpReg && MaterializedReg == FrameReg) { + TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, + MI, false, 0); + DstReg = TmpReg; + } + + auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32)) + .addDef(DstReg, RegState::Renamable) + .addReg(MaterializedReg, RegState::Kill) + .add(OtherOp); + if (DeadSCC) + AddI32.setOperandDead(3); + + MaterializedReg = DstReg; + + OtherOp.ChangeToRegister(MaterializedReg, false); + OtherOp.setIsKill(true); + OtherOp.setIsRenamable(true); + FIOp.ChangeToImmediate(Offset); + } else { + // If we don't have any other offset to apply, we can just directly + // interpret the frame index as the offset. + FIOp.ChangeToImmediate(Offset); + } + + if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) { + assert(Offset == 0); + MI->removeOperand(3); + MI->removeOperand(OtherOpIdx); + MI->setDesc(TII->get(FIOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32)); + } else if (DeadSCC && FIOp.isImm() && FIOp.getImm() == 0) { + assert(Offset == 0); + MI->removeOperand(3); + MI->removeOperand(FIOperandNum); + MI->setDesc( + TII->get(OtherOp.isReg() ? 
AMDGPU::COPY : AMDGPU::S_MOV_B32)); + } + + assert(!FIOp.isFI()); + return true; + } default: { // Other access to frame index const DebugLoc &DL = MI->getDebugLoc(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index f4fd803c8dda89..04833eaaa3283b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -15,11 +15,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 -; GFX9-NEXT: s_add_i32 s1, s1, 0 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -36,8 +34,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-NEXT: s_add_i32 s0, s0, 0 -; GFX10-NEXT: s_add_i32 s1, s1, 0 ; GFX10-NEXT: scratch_store_dword off, v0, s0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -51,11 +47,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_add_i32 s1, s1, 0 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_add_i32 s0, s0, 0 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -68,8 +62,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 -; 
GFX11-NEXT: s_add_i32 s0, s0, 0 -; GFX11-NEXT: s_add_i32 s1, s1, 0 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -84,8 +76,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-NEXT: s_add_co_i32 s0, s0, 0 -; GFX12-NEXT: s_add_co_i32 s1, s1, 0 ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -1042,13 +1032,13 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-LABEL: store_load_large_imm_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_movk_i32 s0, 0x3e80 -; GFX9-NEXT: s_add_i32 s1, s32, 4 +; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: s_add_i32 s1, s32, s0 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_add_i32 s0, s1, 4 ; GFX9-NEXT: scratch_store_dword off, v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc @@ -1059,10 +1049,10 @@ define void @store_load_large_imm_offset_foo() { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_movk_i32 s0, 0x3e80 -; GFX10-NEXT: s_add_i32 s1, s32, 4 -; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_add_i32 s1, s32, s0 +; GFX10-NEXT: s_add_i32 s0, s1, 4 ; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 @@ -1074,13 +1064,13 @@ define void @store_load_large_imm_offset_foo() { ; GFX940-LABEL: 
store_load_large_imm_offset_foo: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 13 ; GFX940-NEXT: s_movk_i32 s0, 0x3e80 -; GFX940-NEXT: s_add_i32 s1, s32, 4 +; GFX940-NEXT: v_mov_b32_e32 v0, 13 +; GFX940-NEXT: s_add_i32 s1, s32, s0 ; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: s_add_i32 s0, s0, s1 +; GFX940-NEXT: s_add_i32 s0, s1, 4 ; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 @@ -1092,9 +1082,9 @@ define void @store_load_large_imm_offset_foo() { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 ; GFX11-NEXT: s_movk_i32 s0, 0x3e80 -; GFX11-NEXT: s_add_i32 s1, s32, 4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s1, s32, s0 +; GFX11-NEXT: s_add_i32 s0, s1, 4 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-add-i32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-add-i32.mir index a09b39069e5c9a..585bfb4c58eae2 100644 --- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-add-i32.mir +++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-add-i32.mir @@ -21,13 +21,13 @@ machineFunctionInfo: body: | bb.0: ; MUBUFW64-LABEL: name: s_add_i32__inline_imm__fi_offset0 - ; MUBUFW64: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc - ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 12, killed $sgpr4, implicit-def dead $scc + ; MUBUFW64: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable 
$sgpr7 = S_ADD_I32 12, $sgpr4, implicit-def dead $scc ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7 ; ; MUBUFW32-LABEL: name: s_add_i32__inline_imm__fi_offset0 - ; MUBUFW32: $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc - ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 12, killed $sgpr4, implicit-def dead $scc + ; MUBUFW32: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 12, $sgpr4, implicit-def dead $scc ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7 ; ; FLATSCRW64-LABEL: name: s_add_i32__inline_imm__fi_offset0 @@ -54,13 +54,13 @@ machineFunctionInfo: body: | bb.0: ; MUBUFW64-LABEL: name: s_add_i32__fi_offset0__inline_imm - ; MUBUFW64: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc - ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, 12, implicit-def dead $scc + ; MUBUFW64: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr4, 12, implicit-def dead $scc ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7 ; ; MUBUFW32-LABEL: name: s_add_i32__fi_offset0__inline_imm - ; MUBUFW32: $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc - ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, 12, implicit-def dead $scc + ; MUBUFW32: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr4, 12, implicit-def dead $scc ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7 ; ; FLATSCRW64-LABEL: name: s_add_i32__fi_offset0__inline_imm @@ -88,25 +88,21 @@ machineFunctionInfo: body: | bb.0: ; MUBUFW64-LABEL: name: s_add_i32__inline_imm___fi_offset_inline_imm - ; MUBUFW64: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc - ; MUBUFW64-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 16, implicit-def $scc - ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 12, killed $sgpr4, implicit-def $scc + ; MUBUFW64: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: 
renamable $sgpr7 = S_ADD_I32 28, $sgpr4, implicit-def $scc ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7 ; ; MUBUFW32-LABEL: name: s_add_i32__inline_imm___fi_offset_inline_imm - ; MUBUFW32: $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def $scc - ; MUBUFW32-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 16, implicit-def $scc - ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 12, killed $sgpr4, implicit-def $scc + ; MUBUFW32: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 28, $sgpr4, implicit-def $scc ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7 ; ; FLATSCRW64-LABEL: name: s_add_i32__inline_imm___fi_offset_inline_imm - ; FLATSCRW64: $sgpr4 = S_ADD_I32 $sgpr32, 16, implicit-def $scc - ; FLATSCRW64-NEXT: renamable $sgpr7 = S_ADD_I32 12, killed $sgpr4, implicit-def $scc + ; FLATSCRW64: renamable $sgpr7 = S_ADD_I32 28, $sgpr32, implicit-def $scc ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7 ; ; FLATSCRW32-LABEL: name: s_add_i32__inline_imm___fi_offset_inline_imm - ; FLATSCRW32: $sgpr4 = S_ADD_I32 $sgpr32, 16, implicit-def $scc - ; FLATSCRW32-NEXT: renamable $sgpr7 = S_ADD_I32 12, killed $sgpr4, implicit-def $scc + ; FLATSCRW32: renamable $sgpr7 = S_ADD_I32 28, $sgpr32, implicit-def $scc ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7 renamable $sgpr7 = S_ADD_I32 12, %stack.1, implicit-def $scc SI_RETURN implicit $sgpr7 @@ -125,13 +121,13 @@ machineFunctionInfo: body: | bb.0: ; MUBUFW64-LABEL: name: s_add_i32__literal__fi_offset0 - ; MUBUFW64: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc - ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 68, killed $sgpr4, implicit-def dead $scc + ; MUBUFW64: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 68, $sgpr4, implicit-def dead $scc ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7 ; ; MUBUFW32-LABEL: name: s_add_i32__literal__fi_offset0 - ; MUBUFW32: $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc - ; MUBUFW32-NEXT: 
renamable $sgpr7 = S_ADD_I32 68, killed $sgpr4, implicit-def dead $scc + ; MUBUFW32: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 68, $sgpr4, implicit-def dead $scc ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7 ; ; FLATSCRW64-LABEL: name: s_add_i32__literal__fi_offset0 @@ -158,13 +154,13 @@ machineFunctionInfo: body: | bb.0: ; MUBUFW64-LABEL: name: s_add_i32__fi_offset0__literal - ; MUBUFW64: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc - ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, 68, implicit-def $scc + ; MUBUFW64: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr4, 68, implicit-def $scc ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7 ; ; MUBUFW32-LABEL: name: s_add_i32__fi_offset0__literal - ; MUBUFW32: $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc - ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, 68, implicit-def $scc + ; MUBUFW32: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr4, 68, implicit-def $scc ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7 ; ; FLATSCRW64-LABEL: name: s_add_i32__fi_offset0__literal @@ -192,25 +188,21 @@ machineFunctionInfo: body: | bb.0: ; MUBUFW64-LABEL: name: s_add_i32__literal__fi_offset96 - ; MUBUFW64: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc - ; MUBUFW64-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 96, implicit-def $scc - ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 68, killed $sgpr4, implicit-def $scc + ; MUBUFW64: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 164, $sgpr4, implicit-def $scc ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; MUBUFW32-LABEL: name: s_add_i32__literal__fi_offset96 - ; MUBUFW32: $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def $scc - ; MUBUFW32-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 
96, implicit-def $scc - ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 68, killed $sgpr4, implicit-def $scc + ; MUBUFW32: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 164, $sgpr4, implicit-def $scc ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW64-LABEL: name: s_add_i32__literal__fi_offset96 - ; FLATSCRW64: $sgpr4 = S_ADD_I32 $sgpr32, 96, implicit-def $scc - ; FLATSCRW64-NEXT: renamable $sgpr7 = S_ADD_I32 68, killed $sgpr4, implicit-def $scc + ; FLATSCRW64: renamable $sgpr7 = S_ADD_I32 164, $sgpr32, implicit-def $scc ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW32-LABEL: name: s_add_i32__literal__fi_offset96 - ; FLATSCRW32: $sgpr4 = S_ADD_I32 $sgpr32, 96, implicit-def $scc - ; FLATSCRW32-NEXT: renamable $sgpr7 = S_ADD_I32 68, killed $sgpr4, implicit-def $scc + ; FLATSCRW32: renamable $sgpr7 = S_ADD_I32 164, $sgpr32, implicit-def $scc ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc renamable $sgpr7 = S_ADD_I32 68, %stack.1, implicit-def $scc SI_RETURN implicit $sgpr7, implicit $scc @@ -230,25 +222,21 @@ machineFunctionInfo: body: | bb.0: ; MUBUFW64-LABEL: name: s_add_i32____fi_offset96__literal - ; MUBUFW64: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc - ; MUBUFW64-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 96, implicit-def $scc - ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, 68, implicit-def $scc + ; MUBUFW64: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr4, 164, implicit-def $scc ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; MUBUFW32-LABEL: name: s_add_i32____fi_offset96__literal - ; MUBUFW32: $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def $scc - ; MUBUFW32-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 96, implicit-def $scc - ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, 68, implicit-def $scc + ; MUBUFW32: renamable $sgpr4 = 
S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr4, 164, implicit-def $scc ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW64-LABEL: name: s_add_i32____fi_offset96__literal - ; FLATSCRW64: $sgpr4 = S_ADD_I32 $sgpr32, 96, implicit-def $scc - ; FLATSCRW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, 68, implicit-def $scc + ; FLATSCRW64: renamable $sgpr7 = S_ADD_I32 $sgpr32, 164, implicit-def $scc ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW32-LABEL: name: s_add_i32____fi_offset96__literal - ; FLATSCRW32: $sgpr4 = S_ADD_I32 $sgpr32, 96, implicit-def $scc - ; FLATSCRW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, 68, implicit-def $scc + ; FLATSCRW32: renamable $sgpr7 = S_ADD_I32 $sgpr32, 164, implicit-def $scc ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc renamable $sgpr7 = S_ADD_I32 %stack.1, 68, implicit-def $scc SI_RETURN implicit $sgpr7, implicit $scc @@ -270,27 +258,31 @@ body: | ; MUBUFW64-LABEL: name: s_add_i32__sgpr__fi_offset0 ; MUBUFW64: liveins: $sgpr8 ; MUBUFW64-NEXT: {{ $}} - ; MUBUFW64-NEXT: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc - ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, killed $sgpr4, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr7 ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7 ; ; MUBUFW32-LABEL: name: s_add_i32__sgpr__fi_offset0 ; MUBUFW32: liveins: $sgpr8 ; MUBUFW32-NEXT: {{ $}} - ; MUBUFW32-NEXT: $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc - ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, killed $sgpr4, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, 
$sgpr8, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr7 ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7 ; ; FLATSCRW64-LABEL: name: s_add_i32__sgpr__fi_offset0 ; FLATSCRW64: liveins: $sgpr8 ; FLATSCRW64-NEXT: {{ $}} - ; FLATSCRW64-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, $sgpr32, implicit-def dead $scc + ; FLATSCRW64-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW64-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr4 ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7 ; ; FLATSCRW32-LABEL: name: s_add_i32__sgpr__fi_offset0 ; FLATSCRW32: liveins: $sgpr8 ; FLATSCRW32-NEXT: {{ $}} - ; FLATSCRW32-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, $sgpr32, implicit-def dead $scc + ; FLATSCRW32-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW32-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr4 ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7 renamable $sgpr7 = S_ADD_I32 $sgpr8, %stack.0, implicit-def dead $scc SI_RETURN implicit $sgpr7 @@ -312,27 +304,31 @@ body: | ; MUBUFW64-LABEL: name: s_add_i32__fi_offset0__sgpr ; MUBUFW64: liveins: $sgpr8 ; MUBUFW64-NEXT: {{ $}} - ; MUBUFW64-NEXT: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr7 ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7 ; ; MUBUFW32-LABEL: name: s_add_i32__fi_offset0__sgpr ; MUBUFW32: liveins: $sgpr8 ; MUBUFW32-NEXT: {{ $}} - ; MUBUFW32-NEXT: $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr7 ; MUBUFW32-NEXT: 
SI_RETURN implicit $sgpr7 ; ; FLATSCRW64-LABEL: name: s_add_i32__fi_offset0__sgpr ; FLATSCRW64: liveins: $sgpr8 ; FLATSCRW64-NEXT: {{ $}} - ; FLATSCRW64-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW64-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW64-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr4 ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7 ; ; FLATSCRW32-LABEL: name: s_add_i32__fi_offset0__sgpr ; FLATSCRW32: liveins: $sgpr8 ; FLATSCRW32-NEXT: {{ $}} - ; FLATSCRW32-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW32-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW32-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr4 ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7 renamable $sgpr7 = S_ADD_I32 %stack.0, $sgpr8, implicit-def dead $scc SI_RETURN implicit $sgpr7 @@ -355,31 +351,31 @@ body: | ; MUBUFW64-LABEL: name: s_add_i32__sgpr__fi_literal_offset ; MUBUFW64: liveins: $sgpr8 ; MUBUFW64-NEXT: {{ $}} - ; MUBUFW64-NEXT: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc - ; MUBUFW64-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 80, implicit-def $scc - ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, killed $sgpr4, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed renamable $sgpr7, 80, implicit-def dead $scc ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7 ; ; MUBUFW32-LABEL: name: s_add_i32__sgpr__fi_literal_offset ; MUBUFW32: liveins: $sgpr8 ; MUBUFW32-NEXT: {{ $}} - ; MUBUFW32-NEXT: $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def $scc - ; MUBUFW32-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 80, implicit-def $scc - ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, killed $sgpr4, implicit-def dead $scc 
+ ; MUBUFW32-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed renamable $sgpr7, 80, implicit-def dead $scc ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7 ; ; FLATSCRW64-LABEL: name: s_add_i32__sgpr__fi_literal_offset ; FLATSCRW64: liveins: $sgpr8 ; FLATSCRW64-NEXT: {{ $}} - ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 80, implicit-def $scc - ; FLATSCRW64-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, killed $sgpr4, implicit-def dead $scc + ; FLATSCRW64-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed renamable $sgpr4, 80, implicit-def dead $scc ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7 ; ; FLATSCRW32-LABEL: name: s_add_i32__sgpr__fi_literal_offset ; FLATSCRW32: liveins: $sgpr8 ; FLATSCRW32-NEXT: {{ $}} - ; FLATSCRW32-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 80, implicit-def $scc - ; FLATSCRW32-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, killed $sgpr4, implicit-def dead $scc + ; FLATSCRW32-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed renamable $sgpr4, 80, implicit-def dead $scc ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7 renamable $sgpr7 = S_ADD_I32 $sgpr8, %stack.1, implicit-def dead $scc SI_RETURN implicit $sgpr7 @@ -402,71 +398,157 @@ body: | ; MUBUFW64-LABEL: name: s_add_i32__fi_literal_offset__sgpr ; MUBUFW64: liveins: $sgpr8 ; MUBUFW64-NEXT: {{ $}} - ; MUBUFW64-NEXT: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc - ; MUBUFW64-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 80, implicit-def $scc + ; MUBUFW64-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 80, killed 
renamable $sgpr7, implicit-def $scc ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; MUBUFW32-LABEL: name: s_add_i32__fi_literal_offset__sgpr ; MUBUFW32: liveins: $sgpr8 ; MUBUFW32-NEXT: {{ $}} - ; MUBUFW32-NEXT: $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def $scc - ; MUBUFW32-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 80, implicit-def $scc + ; MUBUFW32-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 80, killed renamable $sgpr7, implicit-def $scc ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW64-LABEL: name: s_add_i32__fi_literal_offset__sgpr ; FLATSCRW64: liveins: $sgpr8 ; FLATSCRW64-NEXT: {{ $}} - ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 80, implicit-def $scc - ; FLATSCRW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def $scc + ; FLATSCRW64-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def $scc + ; FLATSCRW64-NEXT: renamable $sgpr7 = S_ADD_I32 80, killed renamable $sgpr4, implicit-def $scc ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW32-LABEL: name: s_add_i32__fi_literal_offset__sgpr ; FLATSCRW32: liveins: $sgpr8 ; FLATSCRW32-NEXT: {{ $}} - ; FLATSCRW32-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 80, implicit-def $scc - ; FLATSCRW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def $scc + ; FLATSCRW32-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def $scc + ; FLATSCRW32-NEXT: renamable $sgpr7 = S_ADD_I32 80, killed renamable $sgpr4, implicit-def $scc ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc renamable $sgpr7 = S_ADD_I32 %stack.1, $sgpr8, implicit-def $scc SI_RETURN implicit $sgpr7, implicit $scc ... 
-# FIXME: Fail verifier -# --- -# name: s_add_i32__kernel__literal__fi_offset96__offset_literal -# tracksRegLiveness: true -# stack: -# - { id: 0, size: 96, alignment: 16 } -# - { id: 1, size: 128, alignment: 4 } -# machineFunctionInfo: -# scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' -# frameOffsetReg: '$sgpr33' -# stackPtrOffsetReg: '$sgpr32' -# isEntryFunction: true -# body: | -# bb.0: -# renamable $sgpr7 = S_ADD_I32 68, %stack.1, implicit-def dead $scc -# SI_RETURN implicit $sgpr7 -# ... - -# --- -# name: s_add_i32__kernel__fi_offset96__offset_literal__literal -# tracksRegLiveness: true -# stack: -# - { id: 0, size: 96, alignment: 16 } -# - { id: 1, size: 128, alignment: 4 } -# machineFunctionInfo: -# scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' -# frameOffsetReg: '$sgpr33' -# stackPtrOffsetReg: '$sgpr32' -# isEntryFunction: true -# body: | -# bb.0: -# renamable $sgpr7 = S_ADD_I32 %stack.1, 68, implicit-def $scc -# SI_RETURN implicit $sgpr7, implicit $scc - -# ... + +--- +name: s_add_i32__kernel__literal__fi_offset96__offset_literal +tracksRegLiveness: true +stack: + - { id: 0, size: 96, alignment: 16 } + - { id: 1, size: 128, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + ; MUBUFW64-LABEL: name: s_add_i32__kernel__literal__fi_offset96__offset_literal + ; MUBUFW64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $sgpr7 = S_MOV_B32 164 + ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7 + ; + ; MUBUFW32-LABEL: name: s_add_i32__kernel__literal__fi_offset96__offset_literal + ; MUBUFW32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: 
$sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $sgpr7 = S_MOV_B32 164 + ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7 + ; + ; FLATSCRW64-LABEL: name: s_add_i32__kernel__literal__fi_offset96__offset_literal + ; FLATSCRW64: renamable $sgpr7 = S_MOV_B32 164 + ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7 + ; + ; FLATSCRW32-LABEL: name: s_add_i32__kernel__literal__fi_offset96__offset_literal + ; FLATSCRW32: renamable $sgpr7 = S_MOV_B32 164 + ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7 + renamable $sgpr7 = S_ADD_I32 68, %stack.1, implicit-def dead $scc + SI_RETURN implicit $sgpr7 +... + +--- +name: s_add_i32__kernel__literal__fi_offset96__offset_literal_live_scc +tracksRegLiveness: true +stack: + - { id: 0, size: 96, alignment: 16 } + - { id: 1, size: 128, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + ; MUBUFW64-LABEL: name: s_add_i32__kernel__literal__fi_offset96__offset_literal_live_scc + ; MUBUFW64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 164, 0, implicit-def $scc + ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc + ; + ; MUBUFW32-LABEL: name: s_add_i32__kernel__literal__fi_offset96__offset_literal_live_scc + ; MUBUFW32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 
$sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 164, 0, implicit-def $scc + ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc + ; + ; FLATSCRW64-LABEL: name: s_add_i32__kernel__literal__fi_offset96__offset_literal_live_scc + ; FLATSCRW64: renamable $sgpr7 = S_ADD_I32 164, 0, implicit-def $scc + ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc + ; + ; FLATSCRW32-LABEL: name: s_add_i32__kernel__literal__fi_offset96__offset_literal_live_scc + ; FLATSCRW32: renamable $sgpr7 = S_ADD_I32 164, 0, implicit-def $scc + ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc + renamable $sgpr7 = S_ADD_I32 68, %stack.1, implicit-def $scc + SI_RETURN implicit $sgpr7, implicit $scc +... + +--- +name: s_add_i32__kernel__fi_offset96__offset_literal__literal +tracksRegLiveness: true +stack: + - { id: 0, size: 96, alignment: 16 } + - { id: 1, size: 128, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + ; MUBUFW64-LABEL: name: s_add_i32__kernel__fi_offset96__offset_literal__literal + ; MUBUFW64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $sgpr7 = S_MOV_B32 164 + ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7 + ; + ; MUBUFW32-LABEL: name: s_add_i32__kernel__fi_offset96__offset_literal__literal + ; MUBUFW32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, 
implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $sgpr7 = S_MOV_B32 164 + ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7 + ; + ; FLATSCRW64-LABEL: name: s_add_i32__kernel__fi_offset96__offset_literal__literal + ; FLATSCRW64: renamable $sgpr7 = S_MOV_B32 164 + ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7 + ; + ; FLATSCRW32-LABEL: name: s_add_i32__kernel__fi_offset96__offset_literal__literal + ; FLATSCRW32: renamable $sgpr7 = S_MOV_B32 164 + ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7 + renamable $sgpr7 = S_ADD_I32 %stack.1, 68, implicit-def dead $scc + SI_RETURN implicit $sgpr7 + +... --- name: s_add_i32__kernel__sgpr__fi_literal_offset @@ -620,27 +702,31 @@ body: | ; MUBUFW64-LABEL: name: s_add_i32__sgpr__fi_offset0__live_scc ; MUBUFW64: liveins: $sgpr8 ; MUBUFW64-NEXT: {{ $}} - ; MUBUFW64-NEXT: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc - ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, killed $sgpr4, implicit-def $scc + ; MUBUFW64-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed renamable $sgpr7, 0, implicit-def $scc ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; MUBUFW32-LABEL: name: s_add_i32__sgpr__fi_offset0__live_scc ; MUBUFW32: liveins: $sgpr8 ; MUBUFW32-NEXT: {{ $}} - ; MUBUFW32-NEXT: $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc - ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, killed $sgpr4, implicit-def $scc + ; MUBUFW32-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed renamable $sgpr7, 0, implicit-def $scc ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW64-LABEL: name: s_add_i32__sgpr__fi_offset0__live_scc ; FLATSCRW64: liveins: $sgpr8 ; 
FLATSCRW64-NEXT: {{ $}} - ; FLATSCRW64-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, $sgpr32, implicit-def $scc + ; FLATSCRW64-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def $scc + ; FLATSCRW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed renamable $sgpr4, 0, implicit-def $scc ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW32-LABEL: name: s_add_i32__sgpr__fi_offset0__live_scc ; FLATSCRW32: liveins: $sgpr8 ; FLATSCRW32-NEXT: {{ $}} - ; FLATSCRW32-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, $sgpr32, implicit-def $scc + ; FLATSCRW32-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def $scc + ; FLATSCRW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed renamable $sgpr4, 0, implicit-def $scc ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc renamable $sgpr7 = S_ADD_I32 $sgpr8, %stack.0, implicit-def $scc SI_RETURN implicit $sgpr7, implicit $scc @@ -709,31 +795,31 @@ body: | ; MUBUFW64-LABEL: name: s_add_i32__sgpr__fi_literal_offset__live_scc ; MUBUFW64: liveins: $sgpr8 ; MUBUFW64-NEXT: {{ $}} - ; MUBUFW64-NEXT: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc - ; MUBUFW64-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 96, implicit-def $scc - ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, killed $sgpr4, implicit-def $scc + ; MUBUFW64-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed renamable $sgpr7, 96, implicit-def $scc ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; MUBUFW32-LABEL: name: s_add_i32__sgpr__fi_literal_offset__live_scc ; MUBUFW32: liveins: $sgpr8 ; MUBUFW32-NEXT: {{ $}} - ; MUBUFW32-NEXT: $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def $scc - ; MUBUFW32-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 96, implicit-def $scc - ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, killed $sgpr4, implicit-def $scc + ; 
MUBUFW32-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed renamable $sgpr7, 96, implicit-def $scc ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW64-LABEL: name: s_add_i32__sgpr__fi_literal_offset__live_scc ; FLATSCRW64: liveins: $sgpr8 ; FLATSCRW64-NEXT: {{ $}} - ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 96, implicit-def $scc - ; FLATSCRW64-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, killed $sgpr4, implicit-def $scc + ; FLATSCRW64-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def $scc + ; FLATSCRW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed renamable $sgpr4, 96, implicit-def $scc ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW32-LABEL: name: s_add_i32__sgpr__fi_literal_offset__live_scc ; FLATSCRW32: liveins: $sgpr8 ; FLATSCRW32-NEXT: {{ $}} - ; FLATSCRW32-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 96, implicit-def $scc - ; FLATSCRW32-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr8, killed $sgpr4, implicit-def $scc + ; FLATSCRW32-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def $scc + ; FLATSCRW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed renamable $sgpr4, 96, implicit-def $scc ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc renamable $sgpr7 = S_ADD_I32 $sgpr8, %stack.1, implicit-def $scc SI_RETURN implicit $sgpr7, implicit $scc @@ -754,25 +840,21 @@ machineFunctionInfo: body: | bb.0: ; MUBUFW64-LABEL: name: s_add_i32__inlineimm__fi_offset_32__total_offset_inlineimm - ; MUBUFW64: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc - ; MUBUFW64-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 32, implicit-def $scc - ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 8, killed $sgpr4, implicit-def $scc + ; MUBUFW64: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 
40, $sgpr4, implicit-def $scc ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; MUBUFW32-LABEL: name: s_add_i32__inlineimm__fi_offset_32__total_offset_inlineimm - ; MUBUFW32: $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def $scc - ; MUBUFW32-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 32, implicit-def $scc - ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 8, killed $sgpr4, implicit-def $scc + ; MUBUFW32: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 40, $sgpr4, implicit-def $scc ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW64-LABEL: name: s_add_i32__inlineimm__fi_offset_32__total_offset_inlineimm - ; FLATSCRW64: $sgpr4 = S_ADD_I32 $sgpr32, 32, implicit-def $scc - ; FLATSCRW64-NEXT: renamable $sgpr7 = S_ADD_I32 8, killed $sgpr4, implicit-def $scc + ; FLATSCRW64: renamable $sgpr7 = S_ADD_I32 40, $sgpr32, implicit-def $scc ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW32-LABEL: name: s_add_i32__inlineimm__fi_offset_32__total_offset_inlineimm - ; FLATSCRW32: $sgpr4 = S_ADD_I32 $sgpr32, 32, implicit-def $scc - ; FLATSCRW32-NEXT: renamable $sgpr7 = S_ADD_I32 8, killed $sgpr4, implicit-def $scc + ; FLATSCRW32: renamable $sgpr7 = S_ADD_I32 40, $sgpr32, implicit-def $scc ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc renamable $sgpr7 = S_ADD_I32 8, %stack.1, implicit-def $scc SI_RETURN implicit $sgpr7, implicit $scc @@ -792,25 +874,21 @@ machineFunctionInfo: body: | bb.0: ; MUBUFW64-LABEL: name: s_add_i32__fi_offset_32__inlineimm__total_offset_inlineimm - ; MUBUFW64: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc - ; MUBUFW64-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 32, implicit-def $scc - ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, 8, implicit-def $scc + ; MUBUFW64: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr4, 40, implicit-def $scc ; MUBUFW64-NEXT: 
SI_RETURN implicit $sgpr7, implicit $scc ; ; MUBUFW32-LABEL: name: s_add_i32__fi_offset_32__inlineimm__total_offset_inlineimm - ; MUBUFW32: $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def $scc - ; MUBUFW32-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 32, implicit-def $scc - ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, 8, implicit-def $scc + ; MUBUFW32: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 $sgpr4, 40, implicit-def $scc ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW64-LABEL: name: s_add_i32__fi_offset_32__inlineimm__total_offset_inlineimm - ; FLATSCRW64: $sgpr4 = S_ADD_I32 $sgpr32, 32, implicit-def $scc - ; FLATSCRW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, 8, implicit-def $scc + ; FLATSCRW64: renamable $sgpr7 = S_ADD_I32 $sgpr32, 40, implicit-def $scc ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW32-LABEL: name: s_add_i32__fi_offset_32__inlineimm__total_offset_inlineimm - ; FLATSCRW32: $sgpr4 = S_ADD_I32 $sgpr32, 32, implicit-def $scc - ; FLATSCRW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, 8, implicit-def $scc + ; FLATSCRW32: renamable $sgpr7 = S_ADD_I32 $sgpr32, 40, implicit-def $scc ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc renamable $sgpr7 = S_ADD_I32 %stack.1, 8, implicit-def $scc SI_RETURN implicit $sgpr7, implicit $scc @@ -835,7 +913,7 @@ body: | ; MUBUFW64-NEXT: {{ $}} ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 8, 32, implicit-def $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 40, 0, implicit-def $scc ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; MUBUFW32-LABEL: name: s_add_i32__kernel_inlineimm__fi_offset_32__total_offset_inlineimm @@ 
-843,15 +921,15 @@ body: | ; MUBUFW32-NEXT: {{ $}} ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 8, 32, implicit-def $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 40, 0, implicit-def $scc ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW64-LABEL: name: s_add_i32__kernel_inlineimm__fi_offset_32__total_offset_inlineimm - ; FLATSCRW64: renamable $sgpr7 = S_ADD_I32 8, 32, implicit-def $scc + ; FLATSCRW64: renamable $sgpr7 = S_ADD_I32 40, 0, implicit-def $scc ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW32-LABEL: name: s_add_i32__kernel_inlineimm__fi_offset_32__total_offset_inlineimm - ; FLATSCRW32: renamable $sgpr7 = S_ADD_I32 8, 32, implicit-def $scc + ; FLATSCRW32: renamable $sgpr7 = S_ADD_I32 40, 0, implicit-def $scc ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc renamable $sgpr7 = S_ADD_I32 8, %stack.1, implicit-def $scc SI_RETURN implicit $sgpr7, implicit $scc @@ -876,7 +954,7 @@ body: | ; MUBUFW64-NEXT: {{ $}} ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 32, 8, implicit-def $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 0, 40, implicit-def $scc ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; MUBUFW32-LABEL: name: s_add_i32__kernel_fi_offset_32__inlineimm__total_offset_inlineimm @@ -884,17 +962,503 @@ body: | ; MUBUFW32-NEXT: {{ $}} ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, 
implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 32, 8, implicit-def $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 0, 40, implicit-def $scc ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW64-LABEL: name: s_add_i32__kernel_fi_offset_32__inlineimm__total_offset_inlineimm - ; FLATSCRW64: renamable $sgpr7 = S_ADD_I32 32, 8, implicit-def $scc + ; FLATSCRW64: renamable $sgpr7 = S_ADD_I32 0, 40, implicit-def $scc ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc ; ; FLATSCRW32-LABEL: name: s_add_i32__kernel_fi_offset_32__inlineimm__total_offset_inlineimm - ; FLATSCRW32: renamable $sgpr7 = S_ADD_I32 32, 8, implicit-def $scc + ; FLATSCRW32: renamable $sgpr7 = S_ADD_I32 0, 40, implicit-def $scc ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc renamable $sgpr7 = S_ADD_I32 %stack.1, 8, implicit-def $scc SI_RETURN implicit $sgpr7, implicit $scc ... + +--- +name: s_add_i32__0__fi_offset0 +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; MUBUFW64-LABEL: name: s_add_i32__0__fi_offset0 + ; MUBUFW64: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = COPY $sgpr4 + ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7 + ; + ; MUBUFW32-LABEL: name: s_add_i32__0__fi_offset0 + ; MUBUFW32: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = COPY $sgpr4 + ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7 + ; + ; FLATSCRW64-LABEL: name: s_add_i32__0__fi_offset0 + ; FLATSCRW64: renamable $sgpr7 = COPY $sgpr32 + ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7 + ; + ; FLATSCRW32-LABEL: name: s_add_i32__0__fi_offset0 + ; FLATSCRW32: renamable $sgpr7 = COPY $sgpr32 + ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7 + renamable $sgpr7 = S_ADD_I32 0, 
%stack.0, implicit-def dead $scc + SI_RETURN implicit $sgpr7 + +... + +--- +name: s_add_i32__fi_offset0__0 +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; MUBUFW64-LABEL: name: s_add_i32__fi_offset0__0 + ; MUBUFW64: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = COPY $sgpr4 + ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7 + ; + ; MUBUFW32-LABEL: name: s_add_i32__fi_offset0__0 + ; MUBUFW32: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = COPY $sgpr4 + ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7 + ; + ; FLATSCRW64-LABEL: name: s_add_i32__fi_offset0__0 + ; FLATSCRW64: renamable $sgpr7 = COPY $sgpr32 + ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7 + ; + ; FLATSCRW32-LABEL: name: s_add_i32__fi_offset0__0 + ; FLATSCRW32: renamable $sgpr7 = COPY $sgpr32 + ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7 + renamable $sgpr7 = S_ADD_I32 %stack.0, 0, implicit-def dead $scc + SI_RETURN implicit $sgpr7 + +... 
+ +--- +name: s_add_i32__same_sgpr__fi_offset0 +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr7 + ; MUBUFW64-LABEL: name: s_add_i32__same_sgpr__fi_offset0 + ; MUBUFW64: liveins: $sgpr7 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr7, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr7 + ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7 + ; + ; MUBUFW32-LABEL: name: s_add_i32__same_sgpr__fi_offset0 + ; MUBUFW32: liveins: $sgpr7 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr7, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr7 + ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7 + ; + ; FLATSCRW64-LABEL: name: s_add_i32__same_sgpr__fi_offset0 + ; FLATSCRW64: liveins: $sgpr7 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr7, implicit-def dead $scc + ; FLATSCRW64-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr4 + ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7 + ; + ; FLATSCRW32-LABEL: name: s_add_i32__same_sgpr__fi_offset0 + ; FLATSCRW32: liveins: $sgpr7 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr7, implicit-def dead $scc + ; FLATSCRW32-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr4 + ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7 + renamable $sgpr7 = S_ADD_I32 $sgpr7, %stack.0, implicit-def dead $scc + SI_RETURN implicit $sgpr7 + +... 
+ +--- +name: s_add_i32__different_sgpr__fi_offset0 +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + ; MUBUFW64-LABEL: name: s_add_i32__different_sgpr__fi_offset0 + ; MUBUFW64: liveins: $sgpr8 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr7 + ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7 + ; + ; MUBUFW32-LABEL: name: s_add_i32__different_sgpr__fi_offset0 + ; MUBUFW32: liveins: $sgpr8 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr7 + ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7 + ; + ; FLATSCRW64-LABEL: name: s_add_i32__different_sgpr__fi_offset0 + ; FLATSCRW64: liveins: $sgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW64-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr4 + ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7 + ; + ; FLATSCRW32-LABEL: name: s_add_i32__different_sgpr__fi_offset0 + ; FLATSCRW32: liveins: $sgpr8 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW32-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr4 + ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7 + renamable $sgpr7 = S_ADD_I32 $sgpr8, %stack.0, implicit-def dead $scc + SI_RETURN implicit $sgpr7 + +... 
+ +--- +name: s_add_i32__different_sgpr__fi_offset0_live_after +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + ; MUBUFW64-LABEL: name: s_add_i32__different_sgpr__fi_offset0_live_after + ; MUBUFW64: liveins: $sgpr8 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr7 + ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7, implicit $sgpr8 + ; + ; MUBUFW32-LABEL: name: s_add_i32__different_sgpr__fi_offset0_live_after + ; MUBUFW32: liveins: $sgpr8 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr7 + ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7, implicit $sgpr8 + ; + ; FLATSCRW64-LABEL: name: s_add_i32__different_sgpr__fi_offset0_live_after + ; FLATSCRW64: liveins: $sgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW64-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr4 + ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7, implicit $sgpr8 + ; + ; FLATSCRW32-LABEL: name: s_add_i32__different_sgpr__fi_offset0_live_after + ; FLATSCRW32: liveins: $sgpr8 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW32-NEXT: renamable $sgpr7 = COPY killed renamable $sgpr4 + ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7, implicit $sgpr8 + renamable $sgpr7 = S_ADD_I32 $sgpr8, %stack.0, implicit-def dead $scc + 
SI_RETURN implicit $sgpr7, implicit $sgpr8 + +... + +--- +name: s_add_i32__identity_sgpr__fi_offset0__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr8 + + ; MUBUFW64-LABEL: name: s_add_i32__identity_sgpr__fi_offset0__kernel + ; MUBUFW64: liveins: $sgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $sgpr8 = COPY $sgpr8 + ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr8 + ; + ; MUBUFW32-LABEL: name: s_add_i32__identity_sgpr__fi_offset0__kernel + ; MUBUFW32: liveins: $sgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $sgpr8 = COPY $sgpr8 + ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr8 + ; + ; FLATSCRW64-LABEL: name: s_add_i32__identity_sgpr__fi_offset0__kernel + ; FLATSCRW64: liveins: $sgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $sgpr8 = COPY $sgpr8 + ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr8 + ; + ; FLATSCRW32-LABEL: name: s_add_i32__identity_sgpr__fi_offset0__kernel + ; FLATSCRW32: liveins: $sgpr8 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $sgpr8 = COPY $sgpr8 + ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr8 + renamable $sgpr8 = S_ADD_I32 $sgpr8, %stack.0, implicit-def dead $scc + SI_RETURN implicit $sgpr8 + +... 
+ +--- +name: s_add_i32__fi_offset0__identity_sgpr__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr8 + + ; MUBUFW64-LABEL: name: s_add_i32__fi_offset0__identity_sgpr__kernel + ; MUBUFW64: liveins: $sgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $sgpr8 = COPY $sgpr8 + ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr8 + ; + ; MUBUFW32-LABEL: name: s_add_i32__fi_offset0__identity_sgpr__kernel + ; MUBUFW32: liveins: $sgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $sgpr8 = COPY $sgpr8 + ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr8 + ; + ; FLATSCRW64-LABEL: name: s_add_i32__fi_offset0__identity_sgpr__kernel + ; FLATSCRW64: liveins: $sgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $sgpr8 = COPY $sgpr8 + ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr8 + ; + ; FLATSCRW32-LABEL: name: s_add_i32__fi_offset0__identity_sgpr__kernel + ; FLATSCRW32: liveins: $sgpr8 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $sgpr8 = COPY $sgpr8 + ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr8 + renamable $sgpr8 = S_ADD_I32 $sgpr8, %stack.0, implicit-def dead $scc + SI_RETURN implicit $sgpr8 + +... 
+ +--- +name: s_add_i32__identity_sgpr__fi_offset32__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr8 + + ; MUBUFW64-LABEL: name: s_add_i32__identity_sgpr__fi_offset32__kernel + ; MUBUFW64: liveins: $sgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $sgpr8 = S_ADD_I32 $sgpr8, 32, implicit-def dead $scc + ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr8 + ; + ; MUBUFW32-LABEL: name: s_add_i32__identity_sgpr__fi_offset32__kernel + ; MUBUFW32: liveins: $sgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $sgpr8 = S_ADD_I32 $sgpr8, 32, implicit-def dead $scc + ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr8 + ; + ; FLATSCRW64-LABEL: name: s_add_i32__identity_sgpr__fi_offset32__kernel + ; FLATSCRW64: liveins: $sgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $sgpr8 = S_ADD_I32 $sgpr8, 32, implicit-def dead $scc + ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr8 + ; + ; FLATSCRW32-LABEL: name: s_add_i32__identity_sgpr__fi_offset32__kernel + ; FLATSCRW32: liveins: $sgpr8 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $sgpr8 = S_ADD_I32 $sgpr8, 32, implicit-def dead $scc + ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr8 + renamable $sgpr8 = S_ADD_I32 $sgpr8, %stack.1, 
implicit-def dead $scc + SI_RETURN implicit $sgpr8 + +... + +--- +name: s_add_i32__fi_offset32__identity_sgpr__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr8 + + ; MUBUFW64-LABEL: name: s_add_i32__fi_offset32__identity_sgpr__kernel + ; MUBUFW64: liveins: $sgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $sgpr8 = S_ADD_I32 $sgpr8, 32, implicit-def dead $scc + ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr8 + ; + ; MUBUFW32-LABEL: name: s_add_i32__fi_offset32__identity_sgpr__kernel + ; MUBUFW32: liveins: $sgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $sgpr8 = S_ADD_I32 $sgpr8, 32, implicit-def dead $scc + ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr8 + ; + ; FLATSCRW64-LABEL: name: s_add_i32__fi_offset32__identity_sgpr__kernel + ; FLATSCRW64: liveins: $sgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $sgpr8 = S_ADD_I32 $sgpr8, 32, implicit-def dead $scc + ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr8 + ; + ; FLATSCRW32-LABEL: name: s_add_i32__fi_offset32__identity_sgpr__kernel + ; FLATSCRW32: liveins: $sgpr8 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $sgpr8 = S_ADD_I32 $sgpr8, 32, implicit-def dead $scc + ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr8 
+ renamable $sgpr8 = S_ADD_I32 $sgpr8, %stack.1, implicit-def dead $scc + SI_RETURN implicit $sgpr8 + +... + + +--- +name: s_add_i32__identity_sgpr__fi_offset0 +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + + ; MUBUFW64-LABEL: name: s_add_i32__identity_sgpr__fi_offset0 + ; MUBUFW64: liveins: $sgpr8 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr8 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr8 + ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr8 + ; + ; MUBUFW32-LABEL: name: s_add_i32__identity_sgpr__fi_offset0 + ; MUBUFW32: liveins: $sgpr8 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr8 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr8 + ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr8 + ; + ; FLATSCRW64-LABEL: name: s_add_i32__identity_sgpr__fi_offset0 + ; FLATSCRW64: liveins: $sgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW64-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr4 + ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr8 + ; + ; FLATSCRW32-LABEL: name: s_add_i32__identity_sgpr__fi_offset0 + ; FLATSCRW32: liveins: $sgpr8 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW32-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr4 + ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr8 + renamable $sgpr8 = S_ADD_I32 $sgpr8, %stack.0, implicit-def dead $scc + SI_RETURN implicit 
$sgpr8 + +... + +--- +name: s_add_i32__fi_offset32__identity_sgpr +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + + ; MUBUFW64-LABEL: name: s_add_i32__fi_offset32__identity_sgpr + ; MUBUFW64: liveins: $sgpr8 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr8 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def dead $scc + ; MUBUFW64-NEXT: renamable $sgpr8 = S_ADD_I32 killed renamable $sgpr8, 32, implicit-def dead $scc + ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr8 + ; + ; MUBUFW32-LABEL: name: s_add_i32__fi_offset32__identity_sgpr + ; MUBUFW32: liveins: $sgpr8 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr8 = S_ADD_I32 killed $sgpr4, $sgpr8, implicit-def dead $scc + ; MUBUFW32-NEXT: renamable $sgpr8 = S_ADD_I32 killed renamable $sgpr8, 32, implicit-def dead $scc + ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr8 + ; + ; FLATSCRW64-LABEL: name: s_add_i32__fi_offset32__identity_sgpr + ; FLATSCRW64: liveins: $sgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW64-NEXT: renamable $sgpr8 = S_ADD_I32 killed renamable $sgpr4, 32, implicit-def dead $scc + ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr8 + ; + ; FLATSCRW32-LABEL: name: s_add_i32__fi_offset32__identity_sgpr + ; FLATSCRW32: liveins: $sgpr8 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $sgpr4 = S_ADD_I32 killed $sgpr32, $sgpr8, implicit-def dead $scc + ; FLATSCRW32-NEXT: renamable $sgpr8 = S_ADD_I32 killed renamable $sgpr4, 32, implicit-def dead $scc + ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr8 + renamable $sgpr8 
= S_ADD_I32 $sgpr8, %stack.1, implicit-def dead $scc + SI_RETURN implicit $sgpr8 + +... diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 89da9b8e75bc9c..9d9d5b239a12c8 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -101,7 +101,6 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -237,7 +236,6 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -375,7 +373,6 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -514,8 +511,6 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; 
GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -652,11 +647,10 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -795,11 +789,10 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -939,8 +932,6 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_co_i32 s0, 
s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -1077,11 +1068,10 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS @@ -1219,11 +1209,10 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 14d8b71c5167a2..9a9fd289e2d0c4 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -381,11 +381,9 @@ define amdgpu_kernel void 
@store_load_sindex_kernel(i32 %idx) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 -; GFX9-NEXT: s_add_i32 s1, s1, 0 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -402,8 +400,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-NEXT: s_add_i32 s0, s0, 0 -; GFX10-NEXT: s_add_i32 s1, s1, 0 ; GFX10-NEXT: scratch_store_dword off, v0, s0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -418,8 +414,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-NEXT: s_add_i32 s0, s0, 0 -; GFX11-NEXT: s_add_i32 s1, s1, 0 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -434,8 +428,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-NEXT: s_add_co_i32 s0, s0, 0 -; GFX12-NEXT: s_add_co_i32 s1, s1, 0 ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -455,11 +447,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt 
vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s0, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm @@ -471,11 +461,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 ; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_add_i32 s1, s1, 0 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_add_i32 s0, s0, 0 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -497,8 +485,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-PAL-NEXT: s_add_i32 s0, s0, 0 -; GFX10-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -513,8 +499,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-PAL-NEXT: s_add_i32 s0, s0, 0 -; GFX11-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -529,8 +513,6 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 0 -; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 0 ; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -552,13 +534,11 @@ 
define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_lshl_b32 s0, s2, 2 -; GFX9-NEXT: s_add_i32 s0, s0, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s2, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-NEXT: s_add_i32 s0, s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -573,8 +553,6 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: s_lshl_b32 s1, s2, 2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-NEXT: s_add_i32 s1, s1, 0 -; GFX10-NEXT: s_add_i32 s0, s0, 0 ; GFX10-NEXT: scratch_store_dword off, v0, s1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc @@ -587,8 +565,6 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-NEXT: s_add_i32 s0, s0, 0 -; GFX11-NEXT: s_add_i32 s1, s1, 0 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -601,8 +577,6 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX12-NEXT: s_and_b32 s1, s0, 15 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-NEXT: s_lshl_b32 s1, s1, 2 -; GFX12-NEXT: s_add_co_i32 s0, s0, 0 -; GFX12-NEXT: s_add_co_i32 s1, s1, 0 ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -621,11 +595,9 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-PAL-NEXT: s_add_i32 s1, s1, 0 
-; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s0, 0 +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm @@ -633,13 +605,11 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX940-LABEL: store_load_sindex_foo: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 -; GFX940-NEXT: s_and_b32 s0, s0, 15 -; GFX940-NEXT: s_add_i32 s1, s1, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, 15 -; GFX940-NEXT: s_lshl_b32 s0, s0, 2 +; GFX940-NEXT: s_and_b32 s0, s0, 15 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_add_i32 s0, s0, 0 +; GFX940-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm @@ -659,8 +629,6 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-PAL-NEXT: s_add_i32 s0, s0, 0 -; GFX10-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc @@ -673,8 +641,6 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 -; GFX11-PAL-NEXT: s_add_i32 s0, s0, 0 -; GFX11-PAL-NEXT: s_add_i32 s1, s1, 0 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc @@ -687,8 +653,6 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-PAL-NEXT: 
s_lshl_b32 s1, s1, 2 -; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, 0 -; GFX12-PAL-NEXT: s_add_co_i32 s1, s1, 0 ; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS @@ -3693,12 +3657,12 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-LABEL: store_load_large_imm_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_movk_i32 s0, 0x3000 -; GFX9-NEXT: s_add_i32 s1, s32, 4 +; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: s_add_i32 s1, s32, s0 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_add_i32 s0, s1, 4 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3710,10 +3674,10 @@ define void @store_load_large_imm_offset_foo() { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_movk_i32 s0, 0x3800 -; GFX10-NEXT: s_add_i32 s1, s32, 4 -; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_add_i32 s1, s32, s0 +; GFX10-NEXT: s_add_i32 s0, s1, 4 ; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 @@ -3755,12 +3719,12 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 -; GFX9-PAL-NEXT: s_add_i32 s1, s32, 4 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-PAL-NEXT: s_add_i32 s1, s32, s0 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-PAL-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s0, s1 +; GFX9-PAL-NEXT: s_add_i32 s0, s1, 4 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3786,10 +3750,10 @@ define void @store_load_large_imm_offset_foo() { ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 -; GFX10-PAL-NEXT: s_add_i32 s1, s32, 4 -; GFX10-PAL-NEXT: s_add_i32 s0, s0, s1 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-PAL-NEXT: s_add_i32 s1, s32, s0 +; GFX10-PAL-NEXT: s_add_i32 s0, s1, 4 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 diff --git a/llvm/test/CodeGen/AMDGPU/frame-index.mir b/llvm/test/CodeGen/AMDGPU/frame-index.mir index 34c7614ae36f98..f388aeb0470291 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-index.mir +++ b/llvm/test/CodeGen/AMDGPU/frame-index.mir @@ -55,8 +55,8 @@ body: | ; GCN-LABEL: name: func_add_constant_to_fi_uniform_i32 ; GCN: liveins: $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr0 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc - ; GCN-NEXT: renamable $sgpr4 = nuw S_ADD_I32 killed $sgpr0, 4, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr0 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4 = nuw S_ADD_I32 $sgpr0, 4, implicit-def dead $scc ; GCN-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr4, implicit $exec ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index 49531e3b4f8f30..f6a77a763c2cde 100644 --- 
a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -161,8 +161,8 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: .LBB1_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLATSCR-NEXT: s_add_i32 s3, s33, 0x3000 -; FLATSCR-NEXT: s_add_i32 s1, s0, s3 +; FLATSCR-NEXT: s_add_i32 s3, s33, s0 +; FLATSCR-NEXT: s_add_i32 s1, s3, 0x3000 ; FLATSCR-NEXT: s_add_i32 s0, s0, 1 ; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120 ; FLATSCR-NEXT: scratch_store_byte off, v2, s1 @@ -170,8 +170,8 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_1 ; FLATSCR-NEXT: ; %bb.2: ; %split ; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 -; FLATSCR-NEXT: s_add_i32 s1, s33, 0x3000 -; FLATSCR-NEXT: s_add_i32 s0, s0, s1 +; FLATSCR-NEXT: s_add_i32 s1, s33, s0 +; FLATSCR-NEXT: s_add_i32 s0, s1, 0x3000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_add_i32 s0, s33, 0x3000 From ded6dd244cce3e683201a668ce321d4474baa8fb Mon Sep 17 00:00:00 2001 From: h-vetinari Date: Thu, 22 Aug 2024 16:32:40 +1100 Subject: [PATCH 148/426] [clang][NFC] remove resolved issue from StandardCPlusPlusModules.rst (#105610) This landed as https://github.com/llvm/llvm-project/pull/102287 for main & https://github.com/llvm/llvm-project/pull/102561 for 19.x CC @ChuanqiXu9 --- clang/docs/StandardCPlusPlusModules.rst | 70 +------------------------ 1 file changed, 1 insertion(+), 69 deletions(-) diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst index ccc0cb59f8e710..1469feb3ad45bd 100644 --- a/clang/docs/StandardCPlusPlusModules.rst +++ b/clang/docs/StandardCPlusPlusModules.rst @@ -1297,74 +1297,6 @@ A high-level overview of support for standards features, including modules, can be found on the 
`C++ Feature Status `_ page. -Missing VTables for classes attached to modules -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Now the compiler may miss emitting the definition of vtables -for classes attached to modules, if the definition of the class -doesn't contain any key function in that module units -(The key function is the first non-pure virtual function that is -not inline at the point of class definition.) - -(Note: technically, the key function is not a thing for modules. -We use the concept here for convinient.) - -For example, - -.. code-block:: c++ - - // layer1.cppm - export module foo:layer1; - struct Fruit { - virtual ~Fruit() = default; - virtual void eval() = 0; - }; - struct Banana : public Fruit { - Banana() {} - void eval() override; - }; - - // layer2.cppm - export module foo:layer2; - import :layer1; - export void layer2_fun() { - Banana *b = new Banana(); - b->eval(); - } - void Banana::eval() { - } - -For the above example, we can't find the definition for the vtable of -class ``Banana`` in any object files. - -The expected behavior is, for dynamic classes attached to named modules, -the vtable should always be emitted to the module units the class attaches -to. - -To workaround the problem, users can add the key function manually in the -corresponding module units. e.g., - -.. code-block:: c++ - - // layer1.cppm - export module foo:layer1; - struct Fruit { - virtual ~Fruit() = default; - virtual void eval() = 0; - }; - struct Banana : public Fruit { - // Hack a key function to hint the compiler to emit the virtual table. - virtual void anchor(); - - Banana() {} - void eval() override; - }; - - void Banana::anchor() {} - -This is tracked by -`#70585 `_. - Including headers after import is not well-supported ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1436,7 +1368,7 @@ non-module unit depending on the definition of some macros. However, this usage is forbidden by P1857R3 which is not yet implemented in Clang. 
This means that is possible to write invalid modules which will no longer be accepted once P1857R3 is implemented. This is tracked by -`#56917 `_. +`#54047 `_. Until then, it is recommended not to mix macros with module declarations. From fde2d23ee2a204050a210f2f7b290643a272f737 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Thu, 22 Aug 2024 01:10:54 -0500 Subject: [PATCH 149/426] [PGO][OpenMP] Instrumentation for GPU devices (Revision of #76587) (#102691) This pull request is a revised version of #76587. This pull request fixes some build issues that were present in the previous version of this change. > This pull request is the first part of an ongoing effort to extends PGO instrumentation to GPU device code. This PR makes the following changes: > > - Adds blank registration functions to device RTL > - Gives PGO globals protected visibility when targeting a supported GPU > - Handles any addrspace casts for PGO calls > - Implements PGO global extraction in GPU plugins (currently only dumps info) > > These changes can be tested by supplying `-fprofile-instrument=clang` while targeting a GPU. 
--- clang/lib/CodeGen/CodeGenPGO.cpp | 13 ++- .../include/llvm/Frontend/OpenMP/OMPKinds.def | 3 + llvm/include/llvm/ProfileData/InstrProf.h | 4 + llvm/lib/ProfileData/InstrProf.cpp | 25 ++++- .../Instrumentation/InstrProfiling.cpp | 44 +++++++-- .../Instrumentation/PGOInstrumentation.cpp | 24 +++-- offload/DeviceRTL/CMakeLists.txt | 2 + offload/DeviceRTL/include/Profiling.h | 21 ++++ offload/DeviceRTL/src/Profiling.cpp | 22 +++++ offload/plugins-nextgen/common/CMakeLists.txt | 3 +- .../common/include/GlobalHandler.h | 29 +++++- .../common/src/GlobalHandler.cpp | 96 +++++++++++++++++++ .../common/src/PluginInterface.cpp | 14 +++ offload/test/CMakeLists.txt | 6 ++ offload/test/lit.cfg | 3 + offload/test/lit.site.cfg.in | 2 +- offload/test/offloading/pgo1.c | 74 ++++++++++++++ 17 files changed, 357 insertions(+), 28 deletions(-) create mode 100644 offload/DeviceRTL/include/Profiling.h create mode 100644 offload/DeviceRTL/src/Profiling.cpp create mode 100644 offload/test/offloading/pgo1.c diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index cfcdb5911b581c..2bc0fe909efd14 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -1195,10 +1195,15 @@ void CodeGenPGO::emitCounterSetOrIncrement(CGBuilderTy &Builder, const Stmt *S, unsigned Counter = (*RegionCounterMap)[S]; - llvm::Value *Args[] = {FuncNameVar, - Builder.getInt64(FunctionHash), - Builder.getInt32(NumRegionCounters), - Builder.getInt32(Counter), StepV}; + // Make sure that pointer to global is passed in with zero addrspace + // This is relevant during GPU profiling + auto *NormalizedFuncNameVarPtr = + llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( + FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0)); + + llvm::Value *Args[] = { + NormalizedFuncNameVarPtr, Builder.getInt64(FunctionHash), + Builder.getInt32(NumRegionCounters), Builder.getInt32(Counter), StepV}; if (llvm::EnableSingleByteCoverage) 
Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::instrprof_cover), diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index d9e9c14af3b157..d8f3c8fa06b747 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -506,6 +506,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32) __OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,) __OMP_RTL(__kmpc_syncwarp, false, Void, Int64) +__OMP_RTL(__llvm_profile_register_function, false, Void, VoidPtr) +__OMP_RTL(__llvm_profile_register_names_function, false, Void, VoidPtr, Int64) + __OMP_RTL(__last, false, Void, ) #undef __OMP_RTL diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index 824dcf2372c832..c4270478565d9f 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -181,6 +181,10 @@ inline StringRef getInstrProfBitmapBiasVarName() { /// Return the marker used to separate PGO names during serialization. inline StringRef getInstrProfNameSeparator() { return "\01"; } +/// Determines whether module targets a GPU eligable for PGO +/// instrumentation +bool isGPUProfTarget(const Module &M); + /// Please use getIRPGOFuncName for LLVM IR instrumentation. This function is /// for front-end (Clang, etc) instrumentation. 
/// Return the modified name for function \c F suitable to be diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index e38855c92b1a33..b9937c9429b77d 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -437,13 +437,31 @@ std::string getPGOFuncNameVarName(StringRef FuncName, return VarName; } +bool isGPUProfTarget(const Module &M) { + const auto &T = Triple(M.getTargetTriple()); + return T.isAMDGPU() || T.isNVPTX(); +} + +void setPGOFuncVisibility(Module &M, GlobalVariable *FuncNameVar) { + // If the target is a GPU, make the symbol protected so it can + // be read from the host device + if (isGPUProfTarget(M)) + FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility); + // Hide the symbol so that we correctly get a copy for each executable. + else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage())) + FuncNameVar->setVisibility(GlobalValue::HiddenVisibility); +} + GlobalVariable *createPGOFuncNameVar(Module &M, GlobalValue::LinkageTypes Linkage, StringRef PGOFuncName) { + // Ensure profiling variables on GPU are visible to be read from host + if (isGPUProfTarget(M)) + Linkage = GlobalValue::ExternalLinkage; // We generally want to match the function's linkage, but available_externally // and extern_weak both have the wrong semantics, and anything that doesn't // need to link across compilation units doesn't need to be visible at all. - if (Linkage == GlobalValue::ExternalWeakLinkage) + else if (Linkage == GlobalValue::ExternalWeakLinkage) Linkage = GlobalValue::LinkOnceAnyLinkage; else if (Linkage == GlobalValue::AvailableExternallyLinkage) Linkage = GlobalValue::LinkOnceODRLinkage; @@ -457,10 +475,7 @@ GlobalVariable *createPGOFuncNameVar(Module &M, new GlobalVariable(M, Value->getType(), true, Linkage, Value, getPGOFuncNameVarName(PGOFuncName, Linkage)); - // Hide the symbol so that we correctly get a copy for each executable. 
- if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage())) - FuncNameVar->setVisibility(GlobalValue::HiddenVisibility); - + setPGOFuncVisibility(M, FuncNameVar); return FuncNameVar; } diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 1b3954a36699a0..25bed6da3ad40f 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1059,6 +1059,8 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { llvm::InstrProfValueKind::IPVK_MemOPSize); CallInst *Call = nullptr; auto *TLI = &GetTLI(*Ind->getFunction()); + auto *NormalizedDataVarPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + DataVar, PointerType::get(M.getContext(), 0)); // To support value profiling calls within Windows exception handlers, funclet // information contained within operand bundles needs to be copied over to @@ -1067,11 +1069,13 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { SmallVector OpBundles; Ind->getOperandBundlesAsDefs(OpBundles); if (!IsMemOpSize) { - Value *Args[3] = {Ind->getTargetValue(), DataVar, Builder.getInt32(Index)}; + Value *Args[3] = {Ind->getTargetValue(), NormalizedDataVarPtr, + Builder.getInt32(Index)}; Call = Builder.CreateCall(getOrInsertValueProfilingCall(M, *TLI), Args, OpBundles); } else { - Value *Args[3] = {Ind->getTargetValue(), DataVar, Builder.getInt32(Index)}; + Value *Args[3] = {Ind->getTargetValue(), NormalizedDataVarPtr, + Builder.getInt32(Index)}; Call = Builder.CreateCall( getOrInsertValueProfilingCall(M, *TLI, ValueProfilingCallType::MemOp), Args, OpBundles); @@ -1814,7 +1818,8 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { getInstrProfSectionName(IPSK_vals, TT.getObjectFormat())); ValuesVar->setAlignment(Align(8)); maybeSetComdat(ValuesVar, Fn, CntsVarName); - ValuesPtrExpr = ValuesVar; + ValuesPtrExpr = 
ConstantExpr::getPointerBitCastOrAddrSpaceCast( + ValuesVar, PointerType::get(Fn->getContext(), 0)); } uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); @@ -1838,6 +1843,10 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]); + if (isGPUProfTarget(M)) { + Linkage = GlobalValue::ExternalLinkage; + Visibility = GlobalValue::ProtectedVisibility; + } // If the data variable is not referenced by code (if we don't emit // @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the // data variable live under linker GC, the data variable can be private. This @@ -1849,9 +1858,9 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { // If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees // that other copies must have the same CFG and cannot have value profiling. // If no hash suffix, other profd copies may be referenced by code. 
- if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) && - (TT.isOSBinFormatELF() || - (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) { + else if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) && + (TT.isOSBinFormatELF() || + (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) { Linkage = GlobalValue::PrivateLinkage; Visibility = GlobalValue::DefaultVisibility; } @@ -1974,6 +1983,13 @@ void InstrLowerer::emitNameData() { NamesVar = new GlobalVariable(M, NamesVal->getType(), true, GlobalValue::PrivateLinkage, NamesVal, getInstrProfNamesVarName()); + + // Make names variable public if current target is a GPU + if (isGPUProfTarget(M)) { + NamesVar->setLinkage(GlobalValue::ExternalLinkage); + NamesVar->setVisibility(GlobalValue::VisibilityTypes::ProtectedVisibility); + } + NamesSize = CompressedNameStr.size(); setGlobalVariableLargeSection(TT, *NamesVar); NamesVar->setSection( @@ -2040,10 +2056,13 @@ void InstrLowerer::emitRegistration() { IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", RegisterF)); for (Value *Data : CompilerUsedVars) if (!isa(Data)) - IRB.CreateCall(RuntimeRegisterF, Data); + // Check for addrspace cast when profiling GPU + IRB.CreateCall(RuntimeRegisterF, + IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy)); for (Value *Data : UsedVars) if (Data != NamesVar && !isa(Data)) - IRB.CreateCall(RuntimeRegisterF, Data); + IRB.CreateCall(RuntimeRegisterF, + IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy)); if (NamesVar) { Type *ParamTypes[] = {VoidPtrTy, Int64Ty}; @@ -2052,7 +2071,9 @@ void InstrLowerer::emitRegistration() { auto *NamesRegisterF = Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage, getInstrProfNamesRegFuncName(), M); - IRB.CreateCall(NamesRegisterF, {NamesVar, IRB.getInt64(NamesSize)}); + IRB.CreateCall(NamesRegisterF, {IRB.CreatePointerBitCastOrAddrSpaceCast( + NamesVar, VoidPtrTy), + IRB.getInt64(NamesSize)}); } IRB.CreateRetVoid(); @@ -2073,7 +2094,10 @@ bool 
InstrLowerer::emitRuntimeHook() { auto *Var = new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage, nullptr, getInstrProfRuntimeHookVarName()); - Var->setVisibility(GlobalValue::HiddenVisibility); + if (isGPUProfTarget(M)) + Var->setVisibility(GlobalValue::ProtectedVisibility); + else + Var->setVisibility(GlobalValue::HiddenVisibility); if (TT.isOSBinFormatELF() && !TT.isPS()) { // Mark the user variable as used so that it isn't stripped out. diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index b3644031c5a44b..39cf94daab7d3b 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -909,6 +909,10 @@ void FunctionInstrumenter::instrument() { auto Name = FuncInfo.FuncNameVar; auto CFGHash = ConstantInt::get(Type::getInt64Ty(M.getContext()), FuncInfo.FunctionHash); + // Make sure that pointer to global is passed in with zero addrspace + // This is relevant during GPU profiling + auto *NormalizedNamePtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + Name, PointerType::get(M.getContext(), 0)); if (PGOFunctionEntryCoverage) { auto &EntryBB = F.getEntryBlock(); IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt()); @@ -916,7 +920,7 @@ void FunctionInstrumenter::instrument() { // i32 ) Builder.CreateCall( Intrinsic::getDeclaration(&M, Intrinsic::instrprof_cover), - {Name, CFGHash, Builder.getInt32(1), Builder.getInt32(0)}); + {NormalizedNamePtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)}); return; } @@ -971,7 +975,8 @@ void FunctionInstrumenter::instrument() { // i32 ) Builder.CreateCall( Intrinsic::getDeclaration(&M, Intrinsic::instrprof_timestamp), - {Name, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I)}); + {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters), + Builder.getInt32(I)}); I += PGOBlockCoverage ? 
8 : 1; } @@ -985,7 +990,8 @@ void FunctionInstrumenter::instrument() { Intrinsic::getDeclaration(&M, PGOBlockCoverage ? Intrinsic::instrprof_cover : Intrinsic::instrprof_increment), - {Name, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I++)}); + {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters), + Builder.getInt32(I++)}); } // Now instrument select instructions: @@ -1028,11 +1034,14 @@ void FunctionInstrumenter::instrument() { ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty()); assert(ToProfile && "value profiling Value is of unexpected type"); + auto *NormalizedNamePtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + Name, PointerType::get(M.getContext(), 0)); + SmallVector OpBundles; populateEHOperandBundle(Cand, BlockColors, OpBundles); Builder.CreateCall( Intrinsic::getDeclaration(&M, Intrinsic::instrprof_value_profile), - {FuncInfo.FuncNameVar, Builder.getInt64(FuncInfo.FunctionHash), + {NormalizedNamePtr, Builder.getInt64(FuncInfo.FunctionHash), ToProfile, Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}, OpBundles); } @@ -1709,10 +1718,13 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) { IRBuilder<> Builder(&SI); Type *Int64Ty = Builder.getInt64Ty(); auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty); + auto *NormalizedFuncNameVarPtr = + ConstantExpr::getPointerBitCastOrAddrSpaceCast( + FuncNameVar, PointerType::get(M->getContext(), 0)); Builder.CreateCall( Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step), - {FuncNameVar, Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs), - Builder.getInt32(*CurCtrIdx), Step}); + {NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash), + Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step}); ++(*CurCtrIdx); } diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt index 7818c8d752599c..f30afd9674a072 100644 --- a/offload/DeviceRTL/CMakeLists.txt +++ b/offload/DeviceRTL/CMakeLists.txt @@ 
-77,6 +77,7 @@ set(include_files ${include_directory}/Interface.h ${include_directory}/LibC.h ${include_directory}/Mapping.h + ${include_directory}/Profiling.h ${include_directory}/State.h ${include_directory}/Synchronization.h ${include_directory}/Types.h @@ -93,6 +94,7 @@ set(src_files ${source_directory}/Mapping.cpp ${source_directory}/Misc.cpp ${source_directory}/Parallelism.cpp + ${source_directory}/Profiling.cpp ${source_directory}/Reduction.cpp ${source_directory}/State.cpp ${source_directory}/Synchronization.cpp diff --git a/offload/DeviceRTL/include/Profiling.h b/offload/DeviceRTL/include/Profiling.h new file mode 100644 index 00000000000000..d9947522541219 --- /dev/null +++ b/offload/DeviceRTL/include/Profiling.h @@ -0,0 +1,21 @@ +//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_PROFILING_H +#define OMPTARGET_DEVICERTL_PROFILING_H + +extern "C" { +void __llvm_profile_register_function(void *Ptr); +void __llvm_profile_register_names_function(void *Ptr, long int I); +void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2); +} + +#endif diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp new file mode 100644 index 00000000000000..bb3caaadcc03dd --- /dev/null +++ b/offload/DeviceRTL/src/Profiling.cpp @@ -0,0 +1,22 @@ +//===------- Profiling.cpp ---------------------------------------- C++ ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Profiling.h" + +#pragma omp begin declare target device_type(nohost) + +extern "C" { + +// Provides empty implementations for certain functions in compiler-rt +// that are emitted by the PGO instrumentation. +void __llvm_profile_register_function(void *Ptr) {} +void __llvm_profile_register_names_function(void *Ptr, long int I) {} +void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {} +} + +#pragma omp end declare target diff --git a/offload/plugins-nextgen/common/CMakeLists.txt b/offload/plugins-nextgen/common/CMakeLists.txt index aea20c6ec31435..4dca5422087bba 100644 --- a/offload/plugins-nextgen/common/CMakeLists.txt +++ b/offload/plugins-nextgen/common/CMakeLists.txt @@ -7,7 +7,7 @@ add_library(PluginCommon OBJECT src/RPC.cpp src/Utils/ELF.cpp ) -add_dependencies(PluginCommon intrinsics_gen) +add_dependencies(PluginCommon intrinsics_gen LLVMProfileData) # Only enable JIT for those targets that LLVM can support. 
set(supported_jit_targets AMDGPU NVPTX) @@ -52,6 +52,7 @@ target_compile_definitions(PluginCommon PRIVATE target_compile_options(PluginCommon PUBLIC ${offload_compile_flags}) target_link_options(PluginCommon PUBLIC ${offload_link_flags}) +target_link_libraries(PluginCommon PRIVATE LLVMProfileData) target_include_directories(PluginCommon PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h index 829b4b72911935..d2914e7cd0eb4f 100644 --- a/offload/plugins-nextgen/common/include/GlobalHandler.h +++ b/offload/plugins-nextgen/common/include/GlobalHandler.h @@ -13,10 +13,11 @@ #ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H #define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H -#include +#include #include "llvm/ADT/DenseMap.h" #include "llvm/Object/ELFObjectFile.h" +#include "llvm/ProfileData/InstrProf.h" #include "Shared/Debug.h" #include "Shared/Utils.h" @@ -55,6 +56,23 @@ class GlobalTy { void setPtr(void *P) { Ptr = P; } }; +using IntPtrT = void *; +struct __llvm_profile_data { +#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) \ + std::remove_const::type Name; +#include "llvm/ProfileData/InstrProfData.inc" +}; + +/// PGO profiling data extracted from a GPU device +struct GPUProfGlobals { + SmallVector NamesData; + SmallVector> Counts; + SmallVector<__llvm_profile_data> Data; + Triple TargetTriple; + + void dump() const; +}; + /// Subclass of GlobalTy that holds the memory for a global of \p Ty. template class StaticGlobalTy : public GlobalTy { Ty Data; @@ -164,6 +182,15 @@ class GenericGlobalHandlerTy { return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, /*D2H=*/false); } + + /// Checks whether a given image contains profiling globals. + bool hasProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image); + + /// Reads profiling data from a GPU image to supplied profdata struct. 
+ /// Iterates through the image symbol table and stores global values + /// with profiling prefixes. + Expected readProfilingGlobals(GenericDeviceTy &Device, + DeviceImageTy &Image); }; } // namespace plugin diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp index ba0aa47f8e51c3..59719027f122a8 100644 --- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp @@ -19,6 +19,7 @@ #include "llvm/Support/Error.h" #include +#include using namespace llvm; using namespace omp; @@ -161,3 +162,98 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device, return Plugin::success(); } + +bool GenericGlobalHandlerTy::hasProfilingGlobals(GenericDeviceTy &Device, + DeviceImageTy &Image) { + GlobalTy global(getInstrProfNamesVarName().str(), 0); + if (auto Err = getGlobalMetadataFromImage(Device, Image, global)) { + consumeError(std::move(Err)); + return false; + } + return true; +} + +Expected +GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, + DeviceImageTy &Image) { + GPUProfGlobals DeviceProfileData; + auto ObjFile = getELFObjectFile(Image); + if (!ObjFile) + return ObjFile.takeError(); + + std::unique_ptr ELFObj( + static_cast(ObjFile->release())); + DeviceProfileData.TargetTriple = ELFObj->makeTriple(); + + // Iterate through elf symbols + for (auto &Sym : ELFObj->symbols()) { + auto NameOrErr = Sym.getName(); + if (!NameOrErr) + return NameOrErr.takeError(); + + // Check if given current global is a profiling global based + // on name + if (*NameOrErr == getInstrProfNamesVarName()) { + // Read in profiled function names + DeviceProfileData.NamesData = SmallVector(Sym.getSize(), 0); + GlobalTy NamesGlobal(NameOrErr->str(), Sym.getSize(), + DeviceProfileData.NamesData.data()); + if (auto Err = readGlobalFromDevice(Device, Image, NamesGlobal)) + return Err; + } else if 
(NameOrErr->starts_with(getInstrProfCountersVarPrefix())) { + // Read global variable profiling counts + SmallVector Counts(Sym.getSize() / sizeof(int64_t), 0); + GlobalTy CountGlobal(NameOrErr->str(), Sym.getSize(), Counts.data()); + if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal)) + return Err; + DeviceProfileData.Counts.push_back(std::move(Counts)); + } else if (NameOrErr->starts_with(getInstrProfDataVarPrefix())) { + // Read profiling data for this global variable + __llvm_profile_data Data{}; + GlobalTy DataGlobal(NameOrErr->str(), Sym.getSize(), &Data); + if (auto Err = readGlobalFromDevice(Device, Image, DataGlobal)) + return Err; + DeviceProfileData.Data.push_back(std::move(Data)); + } + } + return DeviceProfileData; +} + +void GPUProfGlobals::dump() const { + outs() << "======= GPU Profile =======\nTarget: " << TargetTriple.str() + << "\n"; + + outs() << "======== Counters =========\n"; + for (const auto &Count : Counts) { + outs() << "["; + for (size_t i = 0; i < Count.size(); i++) { + if (i == 0) + outs() << " "; + outs() << Count[i] << " "; + } + outs() << "]\n"; + } + + outs() << "========== Data ===========\n"; + for (const auto &ProfData : Data) { + outs() << "{ "; +#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) \ + outs() << ProfData.Name << " "; +#include "llvm/ProfileData/InstrProfData.inc" + outs() << "}\n"; + } + + outs() << "======== Functions ========\n"; + std::string s; + s.reserve(NamesData.size()); + for (uint8_t Name : NamesData) { + s.push_back((char)Name); + } + + InstrProfSymtab Symtab; + if (Error Err = Symtab.create(StringRef(s))) { + consumeError(std::move(Err)); + } + Symtab.dumpNames(outs()); + outs() << "===========================\n"; +} diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 84d946507ea74a..60f7c918d7adb2 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ 
b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -842,6 +842,20 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { DeviceMemoryPoolTracking.AllocationMax); } + for (auto *Image : LoadedImages) { + GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); + if (!Handler.hasProfilingGlobals(*this, *Image)) + continue; + + GPUProfGlobals profdata; + auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image); + if (!ProfOrErr) + return ProfOrErr.takeError(); + + // TODO: write data to profiling file + ProfOrErr->dump(); + } + // Delete the memory manager before deinitializing the device. Otherwise, // we may delete device allocations after the device is deinitialized. if (MemoryManager) diff --git a/offload/test/CMakeLists.txt b/offload/test/CMakeLists.txt index 3ac5d7907e2cc2..495d1ef62226e7 100644 --- a/offload/test/CMakeLists.txt +++ b/offload/test/CMakeLists.txt @@ -12,6 +12,12 @@ else() set(LIBOMPTARGET_DEBUG False) endif() +if (NOT OPENMP_STANDALONE_BUILD AND "compiler-rt" IN_LIST LLVM_ENABLE_RUNTIMES) + set(LIBOMPTARGET_TEST_GPU_PGO True) +else() + set(LIBOMPTARGET_TEST_GPU_PGO False) +endif() + # Replace the space from user's input with ";" in case that CMake add escape # char into the lit command. string(REPLACE " " ";" LIBOMPTARGET_LIT_ARG_LIST "${LIBOMPTARGET_LIT_ARGS}") diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index b4fc7d3b333b35..dc39ecb6708d9a 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -112,6 +112,9 @@ config.available_features.add(config.libomptarget_current_target) if config.libomptarget_has_libc: config.available_features.add('libc') +if config.libomptarget_test_pgo: + config.available_features.add('pgo') + # Determine whether the test system supports unified memory. # For CUDA, this is the case with compute capability 70 (Volta) or higher. # For all other targets, we currently assume it is. 
diff --git a/offload/test/lit.site.cfg.in b/offload/test/lit.site.cfg.in index 62ada1d81721d6..a1cb5acc38a405 100644 --- a/offload/test/lit.site.cfg.in +++ b/offload/test/lit.site.cfg.in @@ -27,6 +27,6 @@ config.offload_device_info = "@OFFLOAD_DEVICE_INFO_EXECUTABLE@" config.libomptarget_debug = @LIBOMPTARGET_DEBUG@ config.has_libomptarget_ompt = @LIBOMPTARGET_OMPT_SUPPORT@ config.libomptarget_has_libc = @LIBOMPTARGET_GPU_LIBC_SUPPORT@ - +config.libomptarget_test_pgo = @LIBOMPTARGET_TEST_GPU_PGO@ # Let the main config do the real work. lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c new file mode 100644 index 00000000000000..c0d698323adf06 --- /dev/null +++ b/offload/test/offloading/pgo1.c @@ -0,0 +1,74 @@ +// RUN: %libomptarget-compile-generic -fprofile-instr-generate \ +// RUN: -Xclang "-fprofile-instrument=clang" +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \ +// RUN: --check-prefix="CLANG-PGO" +// RUN: %libomptarget-compile-generic -fprofile-generate \ +// RUN: -Xclang "-fprofile-instrument=llvm" +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \ +// RUN: --check-prefix="LLVM-PGO" + +// REQUIRES: gpu +// REQUIRES: pgo + +#ifdef _OPENMP +#include +#endif + +int test1(int a) { return a / 2; } +int test2(int a) { return a * 2; } + +int main() { + int m = 2; +#pragma omp target + for (int i = 0; i < 10; i++) { + m = test1(m); + for (int j = 0; j < 2; j++) { + m = test2(m); + } + } +} + +// CLANG-PGO: ======== Counters ========= +// CLANG-PGO-NEXT: [ 0 11 20 ] +// CLANG-PGO-NEXT: [ 10 ] +// CLANG-PGO-NEXT: [ 20 ] +// CLANG-PGO-NEXT: ========== Data =========== +// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} +// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} +// CLANG-PGO-SAME: 
{{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} +// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// CLANG-PGO-NEXT: ======== Functions ======== +// CLANG-PGO-NEXT: pgo1.c: +// CLANG-PGO-SAME: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}} +// CLANG-PGO-NEXT: test1 +// CLANG-PGO-NEXT: test2 + +// LLVM-PGO: ======== Counters ========= +// LLVM-PGO-NEXT: [ 20 10 2 1 ] +// LLVM-PGO-NEXT: [ 10 ] +// LLVM-PGO-NEXT: [ 20 ] +// LLVM-PGO-NEXT: ========== Data =========== +// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} +// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} +// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} +// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// LLVM-PGO-NEXT: ======== Functions ======== +// LLVM-PGO-NEXT: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}} +// LLVM-PGO-NEXT: test1 +// LLVM-PGO-NEXT: test2 From 410f751144e8b2e9574f03e0d0fb8560fe3cb797 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Thu, 22 Aug 2024 06:11:34 +0000 Subject: [PATCH 150/426] =?UTF-8?q?[Flang][Runtime]=20Fix=20type=20used=20?= =?UTF-8?q?to=20store=20result=20of=20typeInfo::Value::Ge=E2=80=A6=20(#105?= =?UTF-8?q?589)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …tValue Current choice was only working out of 
accident on 64 bit machine, it led to an implicit cast to smaller type on 32 bit machine. Use the exact type instead. --- flang/runtime/copy.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flang/runtime/copy.cpp b/flang/runtime/copy.cpp index 41d2aef1a6865a..b20f68f019498b 100644 --- a/flang/runtime/copy.cpp +++ b/flang/runtime/copy.cpp @@ -183,8 +183,9 @@ RT_API_ATTRS void CopyElement(const Descriptor &to, const SubscriptValue toAt[], const typeInfo::Value *bounds{component->bounds()}; std::size_t elements{1}; for (int dim{0}; dim < component->rank(); ++dim) { - SubscriptValue lb{bounds[2 * dim].GetValue(&curTo).value_or(0)}; - SubscriptValue ub{ + typeInfo::TypeParameterValue lb{ + bounds[2 * dim].GetValue(&curTo).value_or(0)}; + typeInfo::TypeParameterValue ub{ bounds[2 * dim + 1].GetValue(&curTo).value_or(0)}; extents[dim] = ub >= lb ? ub - lb + 1 : 0; elements *= extents[dim]; From 820396c3a874f57205bfe52cc82bcac3a0035b3d Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 21 Aug 2024 23:32:13 -0700 Subject: [PATCH 151/426] [Transforms] Construct SmallVector with iterator ranges (NFC) (#105607) --- llvm/lib/Transforms/HipStdPar/HipStdPar.cpp | 2 +- llvm/lib/Transforms/IPO/AttributorAttributes.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp index 1a8096f647d847..d740500ef1f8f6 100644 --- a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp +++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp @@ -80,7 +80,7 @@ static inline bool checkIfSupported(GlobalVariable &G) { << G.getName(); Instruction *I = nullptr; - SmallVector Tmp(G.user_begin(), G.user_end()); + SmallVector Tmp(G.users()); SmallPtrSet Visited; do { auto U = std::move(Tmp.back()); diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 5052a40efe6026..5469eab6f3dfee 100644 --- 
a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -12334,7 +12334,7 @@ struct AAIndirectCallInfoCallSite : public AAIndirectCallInfo { bool CBIsVoid = CB->getType()->isVoidTy(); BasicBlock::iterator IP = CB->getIterator(); FunctionType *CSFT = CB->getFunctionType(); - SmallVector CSArgs(CB->arg_begin(), CB->arg_end()); + SmallVector CSArgs(CB->args()); // If we know all callees and there are none, the call site is (effectively) // dead (or UB). From 0534c4f693d4643e71f7a02c7937b655fdcd9c82 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 21 Aug 2024 23:41:33 -0700 Subject: [PATCH 152/426] [asan][Darwin] Simplify test (#105599) Checking order of sections is not a goal of the test. The goal is make sure there is only one "Hello" string and it's in __asan_cstring. --- compiler-rt/test/asan/TestCases/Darwin/cstring_section.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/compiler-rt/test/asan/TestCases/Darwin/cstring_section.c b/compiler-rt/test/asan/TestCases/Darwin/cstring_section.c index d72b0ba8a8bb33..93ed009bb197de 100644 --- a/compiler-rt/test/asan/TestCases/Darwin/cstring_section.c +++ b/compiler-rt/test/asan/TestCases/Darwin/cstring_section.c @@ -1,15 +1,11 @@ // Test that AddressSanitizer moves constant strings into a separate section. // RUN: %clang_asan -c -o %t %s -// RUN: llvm-objdump -s %t | FileCheck %s +// RUN: llvm-objdump -s %t | FileCheck %s --implicit-check-not="Hello." // Check that "Hello.\n" is in __asan_cstring and not in __cstring. // CHECK: Contents of section {{.*}}__asan_cstring: -// CHECK: 48656c6c {{.*}} Hello. -// CHECK: Contents of section {{.*}}__const: -// CHECK-NOT: 48656c6c {{.*}} Hello. -// CHECK: Contents of section {{.*}}__cstring: -// CHECK-NOT: 48656c6c {{.*}} Hello. +// CHECK-NEXT: 48656c6c {{.*}} Hello. 
int main(int argc, char *argv[]) { argv[0] = "Hello.\n"; From 5f6172f0684b6a224d207ff8d093fc9aad92e331 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Thu, 22 Aug 2024 11:43:12 +0530 Subject: [PATCH 153/426] [Transforms] Refactor CreateControlFlowHub (#103013) CreateControlFlowHub is a method that redirects control flow edges from a set of incoming blocks to a set of outgoing blocks through a new set of "guard" blocks. This is now refactored into a separate file with one enhancement: The input to the method is now a set of branches rather than two sets of blocks. The original implementation reroutes every edge from incoming blocks to outgoing blocks. But it is possible that for some incoming block InBB, some successor S might be in the set of outgoing blocks, but that particular edge should not be rerouted. The new implementation makes this possible by allowing the user to specify the targets of each branch that need to be rerouted. This is needed when improving the implementation of FixIrreducible #101386. Current use in FixIrreducible does not demonstrate this finer control over the edges being rerouted. But in UnifyLoopExits, when only one successor of an exiting block is an exit block, this refinement now reroutes only the relevant control-flow through the edge; the non-exit successor is not rerouted. This results in fewer branches and PHI nodes in the hub. 
--- .../llvm/Transforms/Utils/BasicBlockUtils.h | 75 ---- .../llvm/Transforms/Utils/ControlFlowUtils.h | 123 +++++++ llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 314 ---------------- llvm/lib/Transforms/Utils/CMakeLists.txt | 1 + .../lib/Transforms/Utils/ControlFlowUtils.cpp | 342 ++++++++++++++++++ llvm/lib/Transforms/Utils/FixIrreducible.cpp | 35 +- llvm/lib/Transforms/Utils/UnifyLoopExits.cpp | 67 ++-- ...cannot-create-empty-or-backward-segment.ll | 13 +- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 32 +- llvm/test/Transforms/FixIrreducible/basic.ll | 10 +- .../Transforms/FixIrreducible/bug45623.ll | 3 +- llvm/test/Transforms/FixIrreducible/nested.ll | 3 +- llvm/test/Transforms/FixIrreducible/switch.ll | 3 +- .../Transforms/FixIrreducible/unreachable.ll | 4 +- .../workarounds/needs-unified-loop-exits.ll | 52 ++- .../UnifyLoopExits/integer_guards.ll | 6 +- llvm/test/Transforms/UnifyLoopExits/nested.ll | 2 +- .../Transforms/UnifyLoopExits/restore-ssa.ll | 4 +- .../Transforms/UnifyLoopExits/undef-phis.ll | 4 +- 19 files changed, 585 insertions(+), 508 deletions(-) create mode 100644 llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h create mode 100644 llvm/lib/Transforms/Utils/ControlFlowUtils.cpp diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h index c99df6bf94d025..b447942ffbd676 100644 --- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -602,81 +602,6 @@ bool SplitIndirectBrCriticalEdges(Function &F, bool IgnoreBlocksWithoutPHI, BranchProbabilityInfo *BPI = nullptr, BlockFrequencyInfo *BFI = nullptr); -/// Given a set of incoming and outgoing blocks, create a "hub" such that every -/// edge from an incoming block InBB to an outgoing block OutBB is now split -/// into two edges, one from InBB to the hub and another from the hub to -/// OutBB. 
The hub consists of a series of guard blocks, one for each outgoing -/// block. Each guard block conditionally branches to the corresponding outgoing -/// block, or the next guard block in the chain. These guard blocks are returned -/// in the argument vector. -/// -/// Since the control flow edges from InBB to OutBB have now been replaced, the -/// function also updates any PHINodes in OutBB. For each such PHINode, the -/// operands corresponding to incoming blocks are moved to a new PHINode in the -/// hub, and the hub is made an operand of the original PHINode. -/// -/// Input CFG: -/// ---------- -/// -/// Def -/// | -/// v -/// In1 In2 -/// | | -/// | | -/// v v -/// Foo ---> Out1 Out2 -/// | -/// v -/// Use -/// -/// -/// Create hub: Incoming = {In1, In2}, Outgoing = {Out1, Out2} -/// ---------------------------------------------------------- -/// -/// Def -/// | -/// v -/// In1 In2 Foo -/// | Hub | | -/// | + - - | - - + | -/// | ' v ' V -/// +------> Guard1 -----> Out1 -/// ' | ' -/// ' v ' -/// ' Guard2 -----> Out2 -/// ' ' | -/// + - - - - - + | -/// v -/// Use -/// -/// Limitations: -/// ----------- -/// 1. This assumes that all terminators in the CFG are direct branches (the -/// "br" instruction). The presence of any other control flow such as -/// indirectbr, switch or callbr will cause an assert. -/// -/// 2. The updates to the PHINodes are not sufficient to restore SSA -/// form. Consider a definition Def, its use Use, incoming block In2 and -/// outgoing block Out2, such that: -/// a. In2 is reachable from D or contains D. -/// b. U is reachable from Out2 or is contained in Out2. -/// c. U is not a PHINode if U is contained in Out2. -/// -/// Clearly, Def dominates Out2 since the program is valid SSA. But when the -/// hub is introduced, there is a new path through the hub along which Use is -/// reachable from entry without passing through Def, and SSA is no longer -/// valid. 
To fix this, we need to look at all the blocks post-dominated by -/// the hub on the one hand, and dominated by Out2 on the other. This is left -/// for the caller to accomplish, since each specific use of this function -/// may have additional information which simplifies this fixup. For example, -/// see restoreSSA() in the UnifyLoopExits pass. -BasicBlock *CreateControlFlowHub( - DomTreeUpdater *DTU, SmallVectorImpl &GuardBlocks, - const SetVector &Predecessors, - const SetVector &Successors, const StringRef Prefix, - std::optional MaxControlFlowBooleans = std::nullopt); - // Utility function for inverting branch condition and for swapping its // successors void InvertBranch(BranchInst *PBI, IRBuilderBase &Builder); diff --git a/llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h b/llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h new file mode 100644 index 00000000000000..f789c3af75c685 --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/ControlFlowUtils.h @@ -0,0 +1,123 @@ +//===- Transforms/Utils/ControlFlowUtils.h --------------------*- C++ -*---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Utilities to manipulate the CFG and restore SSA for the new control flow. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_CONTROLFLOWUTILS_H +#define LLVM_TRANSFORMS_UTILS_CONTROLFLOWUTILS_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" + +namespace llvm { + +class BasicBlock; +class DomTreeUpdater; + +/// Given a set of branch descriptors [BB, Succ0, Succ1], create a "hub" such +/// that the control flow from each BB to a successor is now split into two +/// edges, one from BB to the hub and another from the hub to the successor. The +/// hub consists of a series of guard blocks, one for each outgoing block. Each +/// guard block conditionally branches to the corresponding outgoing block, or +/// the next guard block in the chain. These guard blocks are returned in the +/// argument vector. +/// +/// This also updates any PHINodes in the successor. For each such PHINode, the +/// operands corresponding to incoming blocks are moved to a new PHINode in the +/// hub, and the hub is made an operand of the original PHINode. +/// +/// Note that for some block BB with a conditional branch, it is not necessary +/// that both successors are rerouted. The client specifies this by setting +/// either Succ0 or Succ1 to nullptr, in which case, the corresponding successor +/// is not rerouted. +/// +/// Input CFG: +/// ---------- +/// +/// Def +/// | +/// v +/// In1 In2 +/// | | +/// | | +/// v v +/// Foo ---> Out1 Out2 +/// | +/// v +/// Use +/// +/// +/// Create hub: Incoming = {In1, In2}, Outgoing = {Out1, Out2} +/// ---------------------------------------------------------- +/// +/// Def +/// | +/// v +/// In1 In2 Foo +/// | Hub | | +/// | + - - | - - + | +/// | ' v ' V +/// +------> Guard1 -----> Out1 +/// ' | ' +/// ' v ' +/// ' Guard2 -----> Out2 +/// ' ' | +/// + - - - - - + | +/// v +/// Use +/// +/// Limitations: +/// ----------- +/// 1. This assumes that all terminators in the CFG are direct branches (the +/// "br" instruction). 
The presence of any other control flow such as +/// indirectbr, switch or callbr will cause an assert. +/// +/// 2. The updates to the PHINodes are not sufficient to restore SSA +/// form. Consider a definition Def, its use Use, incoming block In2 and +/// outgoing block Out2, such that: +/// a. In2 is reachable from D or contains D. +/// b. U is reachable from Out2 or is contained in Out2. +/// c. U is not a PHINode if U is contained in Out2. +/// +/// Clearly, Def dominates Out2 since the program is valid SSA. But when the +/// hub is introduced, there is a new path through the hub along which Use is +/// reachable from entry without passing through Def, and SSA is no longer +/// valid. To fix this, we need to look at all the blocks post-dominated by +/// the hub on the one hand, and dominated by Out2 on the other. This is left +/// for the caller to accomplish, since each specific use of this function +/// may have additional information which simplifies this fixup. For example, +/// see restoreSSA() in the UnifyLoopExits pass. 
+struct ControlFlowHub { + struct BranchDescriptor { + BasicBlock *BB; + BasicBlock *Succ0; + BasicBlock *Succ1; + + BranchDescriptor(BasicBlock *BB, BasicBlock *Succ0, BasicBlock *Succ1) + : BB(BB), Succ0(Succ0), Succ1(Succ1) {} + }; + + void addBranch(BasicBlock *BB, BasicBlock *Succ0, BasicBlock *Succ1) { + assert(BB); + assert(Succ0 || Succ1); + Branches.emplace_back(BB, Succ0, Succ1); + } + + BasicBlock * + finalize(DomTreeUpdater *DTU, SmallVectorImpl &GuardBlocks, + const StringRef Prefix, + std::optional MaxControlFlowBooleans = std::nullopt); + + SmallVector Branches; +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_UTILS_CONTROLFLOWUTILS_H diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index c78965aaeba3ca..4144c7993b7e42 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -1891,320 +1891,6 @@ BranchInst *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, return BI; } -// After creating a control flow hub, the operands of PHINodes in an outgoing -// block Out no longer match the predecessors of that block. Predecessors of Out -// that are incoming blocks to the hub are now replaced by just one edge from -// the hub. To match this new control flow, the corresponding values from each -// PHINode must now be moved a new PHINode in the first guard block of the hub. -// -// This operation cannot be performed with SSAUpdater, because it involves one -// new use: If the block Out is in the list of Incoming blocks, then the newly -// created PHI in the Hub will use itself along that edge from Out to Hub. 
-static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock, - const SetVector &Incoming, - BasicBlock *FirstGuardBlock) { - auto I = Out->begin(); - while (I != Out->end() && isa(I)) { - auto Phi = cast(I); - auto NewPhi = - PHINode::Create(Phi->getType(), Incoming.size(), - Phi->getName() + ".moved", FirstGuardBlock->begin()); - bool AllUndef = true; - for (auto *In : Incoming) { - Value *V = UndefValue::get(Phi->getType()); - if (In == Out) { - V = NewPhi; - } else if (Phi->getBasicBlockIndex(In) != -1) { - V = Phi->removeIncomingValue(In, false); - AllUndef &= isa(V); - } - NewPhi->addIncoming(V, In); - } - assert(NewPhi->getNumIncomingValues() == Incoming.size()); - Value *NewV = NewPhi; - if (AllUndef) { - NewPhi->eraseFromParent(); - NewV = UndefValue::get(Phi->getType()); - } - if (Phi->getNumOperands() == 0) { - Phi->replaceAllUsesWith(NewV); - I = Phi->eraseFromParent(); - continue; - } - Phi->addIncoming(NewV, GuardBlock); - ++I; - } -} - -using BBPredicates = DenseMap; -using BBSetVector = SetVector; - -// Redirects the terminator of the incoming block to the first guard -// block in the hub. The condition of the original terminator (if it -// was conditional) and its original successors are returned as a -// tuple . The function additionally filters -// out successors that are not in the set of outgoing blocks. -// -// - condition is non-null iff the branch is conditional. -// - Succ1 is non-null iff the sole/taken target is an outgoing block. -// - Succ2 is non-null iff condition is non-null and the fallthrough -// target is an outgoing block. -static std::tuple -redirectToHub(BasicBlock *BB, BasicBlock *FirstGuardBlock, - const BBSetVector &Outgoing) { - assert(isa(BB->getTerminator()) && - "Only support branch terminator."); - auto Branch = cast(BB->getTerminator()); - auto Condition = Branch->isConditional() ? 
Branch->getCondition() : nullptr; - - BasicBlock *Succ0 = Branch->getSuccessor(0); - BasicBlock *Succ1 = nullptr; - Succ0 = Outgoing.count(Succ0) ? Succ0 : nullptr; - - if (Branch->isUnconditional()) { - Branch->setSuccessor(0, FirstGuardBlock); - assert(Succ0); - } else { - Succ1 = Branch->getSuccessor(1); - Succ1 = Outgoing.count(Succ1) ? Succ1 : nullptr; - assert(Succ0 || Succ1); - if (Succ0 && !Succ1) { - Branch->setSuccessor(0, FirstGuardBlock); - } else if (Succ1 && !Succ0) { - Branch->setSuccessor(1, FirstGuardBlock); - } else { - Branch->eraseFromParent(); - BranchInst::Create(FirstGuardBlock, BB); - } - } - - assert(Succ0 || Succ1); - return std::make_tuple(Condition, Succ0, Succ1); -} -// Setup the branch instructions for guard blocks. -// -// Each guard block terminates in a conditional branch that transfers -// control to the corresponding outgoing block or the next guard -// block. The last guard block has two outgoing blocks as successors -// since the condition for the final outgoing block is trivially -// true. So we create one less block (including the first guard block) -// than the number of outgoing blocks. -static void setupBranchForGuard(SmallVectorImpl &GuardBlocks, - const BBSetVector &Outgoing, - BBPredicates &GuardPredicates) { - // To help keep the loop simple, temporarily append the last - // outgoing block to the list of guard blocks. - GuardBlocks.push_back(Outgoing.back()); - - for (int i = 0, e = GuardBlocks.size() - 1; i != e; ++i) { - auto Out = Outgoing[i]; - assert(GuardPredicates.count(Out)); - BranchInst::Create(Out, GuardBlocks[i + 1], GuardPredicates[Out], - GuardBlocks[i]); - } - - // Remove the last block from the guard list. - GuardBlocks.pop_back(); -} - -/// We are using one integer to represent the block we are branching to. Then at -/// each guard block, the predicate was calcuated using a simple `icmp eq`. 
-static void calcPredicateUsingInteger( - const BBSetVector &Incoming, const BBSetVector &Outgoing, - SmallVectorImpl &GuardBlocks, BBPredicates &GuardPredicates) { - auto &Context = Incoming.front()->getContext(); - auto FirstGuardBlock = GuardBlocks.front(); - - auto Phi = PHINode::Create(Type::getInt32Ty(Context), Incoming.size(), - "merged.bb.idx", FirstGuardBlock); - - for (auto In : Incoming) { - Value *Condition; - BasicBlock *Succ0; - BasicBlock *Succ1; - std::tie(Condition, Succ0, Succ1) = - redirectToHub(In, FirstGuardBlock, Outgoing); - Value *IncomingId = nullptr; - if (Succ0 && Succ1) { - // target_bb_index = Condition ? index_of_succ0 : index_of_succ1. - auto Succ0Iter = find(Outgoing, Succ0); - auto Succ1Iter = find(Outgoing, Succ1); - Value *Id0 = ConstantInt::get(Type::getInt32Ty(Context), - std::distance(Outgoing.begin(), Succ0Iter)); - Value *Id1 = ConstantInt::get(Type::getInt32Ty(Context), - std::distance(Outgoing.begin(), Succ1Iter)); - IncomingId = SelectInst::Create(Condition, Id0, Id1, "target.bb.idx", - In->getTerminator()->getIterator()); - } else { - // Get the index of the non-null successor. - auto SuccIter = Succ0 ? find(Outgoing, Succ0) : find(Outgoing, Succ1); - IncomingId = ConstantInt::get(Type::getInt32Ty(Context), - std::distance(Outgoing.begin(), SuccIter)); - } - Phi->addIncoming(IncomingId, In); - } - - for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) { - auto Out = Outgoing[i]; - auto Cmp = ICmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_EQ, Phi, - ConstantInt::get(Type::getInt32Ty(Context), i), - Out->getName() + ".predicate", GuardBlocks[i]); - GuardPredicates[Out] = Cmp; - } -} - -/// We record the predicate of each outgoing block using a phi of boolean. 
-static void calcPredicateUsingBooleans( - const BBSetVector &Incoming, const BBSetVector &Outgoing, - SmallVectorImpl &GuardBlocks, BBPredicates &GuardPredicates, - SmallVectorImpl &DeletionCandidates) { - auto &Context = Incoming.front()->getContext(); - auto BoolTrue = ConstantInt::getTrue(Context); - auto BoolFalse = ConstantInt::getFalse(Context); - auto FirstGuardBlock = GuardBlocks.front(); - - // The predicate for the last outgoing is trivially true, and so we - // process only the first N-1 successors. - for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) { - auto Out = Outgoing[i]; - LLVM_DEBUG(dbgs() << "Creating guard for " << Out->getName() << "\n"); - - auto Phi = - PHINode::Create(Type::getInt1Ty(Context), Incoming.size(), - StringRef("Guard.") + Out->getName(), FirstGuardBlock); - GuardPredicates[Out] = Phi; - } - - for (auto *In : Incoming) { - Value *Condition; - BasicBlock *Succ0; - BasicBlock *Succ1; - std::tie(Condition, Succ0, Succ1) = - redirectToHub(In, FirstGuardBlock, Outgoing); - - // Optimization: Consider an incoming block A with both successors - // Succ0 and Succ1 in the set of outgoing blocks. The predicates - // for Succ0 and Succ1 complement each other. If Succ0 is visited - // first in the loop below, control will branch to Succ0 using the - // corresponding predicate. But if that branch is not taken, then - // control must reach Succ1, which means that the incoming value of - // the predicate from `In` is true for Succ1. - bool OneSuccessorDone = false; - for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) { - auto Out = Outgoing[i]; - PHINode *Phi = cast(GuardPredicates[Out]); - if (Out != Succ0 && Out != Succ1) { - Phi->addIncoming(BoolFalse, In); - } else if (!Succ0 || !Succ1 || OneSuccessorDone) { - // Optimization: When only one successor is an outgoing block, - // the incoming predicate from `In` is always true. 
- Phi->addIncoming(BoolTrue, In); - } else { - assert(Succ0 && Succ1); - if (Out == Succ0) { - Phi->addIncoming(Condition, In); - } else { - auto Inverted = invertCondition(Condition); - DeletionCandidates.push_back(Condition); - Phi->addIncoming(Inverted, In); - } - OneSuccessorDone = true; - } - } - } -} - -// Capture the existing control flow as guard predicates, and redirect -// control flow from \p Incoming block through the \p GuardBlocks to the -// \p Outgoing blocks. -// -// There is one guard predicate for each outgoing block OutBB. The -// predicate represents whether the hub should transfer control flow -// to OutBB. These predicates are NOT ORTHOGONAL. The Hub evaluates -// them in the same order as the Outgoing set-vector, and control -// branches to the first outgoing block whose predicate evaluates to true. -static void -convertToGuardPredicates(SmallVectorImpl &GuardBlocks, - SmallVectorImpl &DeletionCandidates, - const BBSetVector &Incoming, - const BBSetVector &Outgoing, const StringRef Prefix, - std::optional MaxControlFlowBooleans) { - BBPredicates GuardPredicates; - auto F = Incoming.front()->getParent(); - - for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) - GuardBlocks.push_back( - BasicBlock::Create(F->getContext(), Prefix + ".guard", F)); - - // When we are using an integer to record which target block to jump to, we - // are creating less live values, actually we are using one single integer to - // store the index of the target block. When we are using booleans to store - // the branching information, we need (N-1) boolean values, where N is the - // number of outgoing block. 
- if (!MaxControlFlowBooleans || Outgoing.size() <= *MaxControlFlowBooleans) - calcPredicateUsingBooleans(Incoming, Outgoing, GuardBlocks, GuardPredicates, - DeletionCandidates); - else - calcPredicateUsingInteger(Incoming, Outgoing, GuardBlocks, GuardPredicates); - - setupBranchForGuard(GuardBlocks, Outgoing, GuardPredicates); -} - -BasicBlock *llvm::CreateControlFlowHub( - DomTreeUpdater *DTU, SmallVectorImpl &GuardBlocks, - const BBSetVector &Incoming, const BBSetVector &Outgoing, - const StringRef Prefix, std::optional MaxControlFlowBooleans) { - if (Outgoing.size() < 2) - return Outgoing.front(); - - SmallVector Updates; - if (DTU) { - for (auto *In : Incoming) { - for (auto Succ : successors(In)) - if (Outgoing.count(Succ)) - Updates.push_back({DominatorTree::Delete, In, Succ}); - } - } - - SmallVector DeletionCandidates; - convertToGuardPredicates(GuardBlocks, DeletionCandidates, Incoming, Outgoing, - Prefix, MaxControlFlowBooleans); - auto FirstGuardBlock = GuardBlocks.front(); - - // Update the PHINodes in each outgoing block to match the new control flow. 
- for (int i = 0, e = GuardBlocks.size(); i != e; ++i) - reconnectPhis(Outgoing[i], GuardBlocks[i], Incoming, FirstGuardBlock); - - reconnectPhis(Outgoing.back(), GuardBlocks.back(), Incoming, FirstGuardBlock); - - if (DTU) { - int NumGuards = GuardBlocks.size(); - assert((int)Outgoing.size() == NumGuards + 1); - - for (auto In : Incoming) - Updates.push_back({DominatorTree::Insert, In, FirstGuardBlock}); - - for (int i = 0; i != NumGuards - 1; ++i) { - Updates.push_back({DominatorTree::Insert, GuardBlocks[i], Outgoing[i]}); - Updates.push_back( - {DominatorTree::Insert, GuardBlocks[i], GuardBlocks[i + 1]}); - } - Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1], - Outgoing[NumGuards - 1]}); - Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1], - Outgoing[NumGuards]}); - DTU->applyUpdates(Updates); - } - - for (auto I : DeletionCandidates) { - if (I->use_empty()) - if (auto Inst = dyn_cast_or_null(I)) - Inst->eraseFromParent(); - } - - return FirstGuardBlock; -} - void llvm::InvertBranch(BranchInst *PBI, IRBuilderBase &Builder) { Value *NewCond = PBI->getCondition(); // If this is a "cmp" instruction, only used for branching (and nowhere diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt index 51e8821773c3af..b5a7eedbac62b1 100644 --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -16,6 +16,7 @@ add_llvm_component_library(LLVMTransformUtils CodeExtractor.cpp CodeLayout.cpp CodeMoverUtils.cpp + ControlFlowUtils.cpp CtorUtils.cpp CountVisits.cpp Debugify.cpp diff --git a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp new file mode 100644 index 00000000000000..5ba626fa213ad2 --- /dev/null +++ b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp @@ -0,0 +1,342 @@ +//===- ControlFlowUtils.cpp - Control Flow Utilities -----------------------==// +// +// Part of the LLVM Project, under the Apache 
License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Utilities to manipulate the CFG and restore SSA for the new control flow.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ControlFlowUtils.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#define DEBUG_TYPE "control-flow-hub"
+
+using namespace llvm;
+
+using BBPredicates = DenseMap;
+using EdgeDescriptor = ControlFlowHub::BranchDescriptor;
+
+// Redirects the terminator of the incoming block to the first guard block in
+// the hub. Returns the branch condition from `BB` if it exists.
+// - If only one of Succ0 or Succ1 is not null, the corresponding branch
+// successor is redirected to the FirstGuardBlock.
+// - Else both are not null, and branch is replaced with an unconditional
+// branch to the FirstGuardBlock.
+static Value *redirectToHub(BasicBlock *BB, BasicBlock *Succ0,
+ BasicBlock *Succ1, BasicBlock *FirstGuardBlock) {
+ assert(isa(BB->getTerminator()) &&
+ "Only support branch terminator.");
+ auto *Branch = cast(BB->getTerminator());
+ auto *Condition = Branch->isConditional() ? 
Branch->getCondition() : nullptr; + + assert(Succ0 || Succ1); + + if (Branch->isUnconditional()) { + assert(Succ0 == Branch->getSuccessor(0)); + assert(!Succ1); + Branch->setSuccessor(0, FirstGuardBlock); + } else { + assert(!Succ1 || Succ1 == Branch->getSuccessor(1)); + if (Succ0 && !Succ1) { + Branch->setSuccessor(0, FirstGuardBlock); + } else if (Succ1 && !Succ0) { + Branch->setSuccessor(1, FirstGuardBlock); + } else { + Branch->eraseFromParent(); + BranchInst::Create(FirstGuardBlock, BB); + } + } + + return Condition; +} + +// Setup the branch instructions for guard blocks. +// +// Each guard block terminates in a conditional branch that transfers +// control to the corresponding outgoing block or the next guard +// block. The last guard block has two outgoing blocks as successors. +static void setupBranchForGuard(ArrayRef GuardBlocks, + ArrayRef Outgoing, + BBPredicates &GuardPredicates) { + assert(Outgoing.size() > 1); + assert(GuardBlocks.size() == Outgoing.size() - 1); + int I = 0; + for (int E = GuardBlocks.size() - 1; I != E; ++I) { + BasicBlock *Out = Outgoing[I]; + BranchInst::Create(Out, GuardBlocks[I + 1], GuardPredicates[Out], + GuardBlocks[I]); + } + BasicBlock *Out = Outgoing[I]; + BranchInst::Create(Out, Outgoing[I + 1], GuardPredicates[Out], + GuardBlocks[I]); +} + +// Assign an index to each outgoing block. At the corresponding guard +// block, compute the branch condition by comparing this index. 
+static void calcPredicateUsingInteger(ArrayRef Branches, + ArrayRef Outgoing, + ArrayRef GuardBlocks, + BBPredicates &GuardPredicates) { + LLVMContext &Context = GuardBlocks.front()->getContext(); + BasicBlock *FirstGuardBlock = GuardBlocks.front(); + Type *Int32Ty = Type::getInt32Ty(Context); + + auto *Phi = PHINode::Create(Int32Ty, Branches.size(), "merged.bb.idx", + FirstGuardBlock); + + for (auto [BB, Succ0, Succ1] : Branches) { + Value *Condition = redirectToHub(BB, Succ0, Succ1, FirstGuardBlock); + Value *IncomingId = nullptr; + if (Succ0 && Succ1) { + auto Succ0Iter = find(Outgoing, Succ0); + auto Succ1Iter = find(Outgoing, Succ1); + Value *Id0 = + ConstantInt::get(Int32Ty, std::distance(Outgoing.begin(), Succ0Iter)); + Value *Id1 = + ConstantInt::get(Int32Ty, std::distance(Outgoing.begin(), Succ1Iter)); + IncomingId = SelectInst::Create(Condition, Id0, Id1, "target.bb.idx", + BB->getTerminator()->getIterator()); + } else { + // Get the index of the non-null successor. + auto SuccIter = Succ0 ? find(Outgoing, Succ0) : find(Outgoing, Succ1); + IncomingId = + ConstantInt::get(Int32Ty, std::distance(Outgoing.begin(), SuccIter)); + } + Phi->addIncoming(IncomingId, BB); + } + + for (int I = 0, E = Outgoing.size() - 1; I != E; ++I) { + BasicBlock *Out = Outgoing[I]; + LLVM_DEBUG(dbgs() << "Creating integer guard for " << Out->getName() + << "\n"); + auto *Cmp = ICmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_EQ, Phi, + ConstantInt::get(Int32Ty, I), + Out->getName() + ".predicate", GuardBlocks[I]); + GuardPredicates[Out] = Cmp; + } +} + +// Determine the branch condition to be used at each guard block from the +// original boolean values. 
+static void calcPredicateUsingBooleans( + ArrayRef Branches, ArrayRef Outgoing, + SmallVectorImpl &GuardBlocks, BBPredicates &GuardPredicates, + SmallVectorImpl &DeletionCandidates) { + LLVMContext &Context = GuardBlocks.front()->getContext(); + auto *BoolTrue = ConstantInt::getTrue(Context); + auto *BoolFalse = ConstantInt::getFalse(Context); + BasicBlock *FirstGuardBlock = GuardBlocks.front(); + + // The predicate for the last outgoing is trivially true, and so we + // process only the first N-1 successors. + for (int I = 0, E = Outgoing.size() - 1; I != E; ++I) { + BasicBlock *Out = Outgoing[I]; + LLVM_DEBUG(dbgs() << "Creating boolean guard for " << Out->getName() + << "\n"); + + auto *Phi = + PHINode::Create(Type::getInt1Ty(Context), Branches.size(), + StringRef("Guard.") + Out->getName(), FirstGuardBlock); + GuardPredicates[Out] = Phi; + } + + for (auto [BB, Succ0, Succ1] : Branches) { + Value *Condition = redirectToHub(BB, Succ0, Succ1, FirstGuardBlock); + + // Optimization: Consider an incoming block A with both successors + // Succ0 and Succ1 in the set of outgoing blocks. The predicates + // for Succ0 and Succ1 complement each other. If Succ0 is visited + // first in the loop below, control will branch to Succ0 using the + // corresponding predicate. But if that branch is not taken, then + // control must reach Succ1, which means that the incoming value of + // the predicate from `BB` is true for Succ1. + bool OneSuccessorDone = false; + for (int I = 0, E = Outgoing.size() - 1; I != E; ++I) { + BasicBlock *Out = Outgoing[I]; + PHINode *Phi = cast(GuardPredicates[Out]); + if (Out != Succ0 && Out != Succ1) { + Phi->addIncoming(BoolFalse, BB); + } else if (!Succ0 || !Succ1 || OneSuccessorDone) { + // Optimization: When only one successor is an outgoing block, + // the incoming predicate from `BB` is always true. 
+ Phi->addIncoming(BoolTrue, BB); + } else { + assert(Succ0 && Succ1); + if (Out == Succ0) { + Phi->addIncoming(Condition, BB); + } else { + Value *Inverted = invertCondition(Condition); + DeletionCandidates.push_back(Condition); + Phi->addIncoming(Inverted, BB); + } + OneSuccessorDone = true; + } + } + } +} + +// Capture the existing control flow as guard predicates, and redirect +// control flow from \p Incoming block through the \p GuardBlocks to the +// \p Outgoing blocks. +// +// There is one guard predicate for each outgoing block OutBB. The +// predicate represents whether the hub should transfer control flow +// to OutBB. These predicates are NOT ORTHOGONAL. The Hub evaluates +// them in the same order as the Outgoing set-vector, and control +// branches to the first outgoing block whose predicate evaluates to true. +// +// The last guard block has two outgoing blocks as successors since the +// condition for the final outgoing block is trivially true. So we create one +// less block (including the first guard block) than the number of outgoing +// blocks. +static void convertToGuardPredicates( + ArrayRef Branches, ArrayRef Outgoing, + SmallVectorImpl &GuardBlocks, + SmallVectorImpl &DeletionCandidates, const StringRef Prefix, + std::optional MaxControlFlowBooleans) { + BBPredicates GuardPredicates; + Function *F = Outgoing.front()->getParent(); + + for (int I = 0, E = Outgoing.size() - 1; I != E; ++I) + GuardBlocks.push_back( + BasicBlock::Create(F->getContext(), Prefix + ".guard", F)); + + // When we are using an integer to record which target block to jump to, we + // are creating less live values, actually we are using one single integer to + // store the index of the target block. When we are using booleans to store + // the branching information, we need (N-1) boolean values, where N is the + // number of outgoing block. 
+ if (!MaxControlFlowBooleans || Outgoing.size() <= *MaxControlFlowBooleans) + calcPredicateUsingBooleans(Branches, Outgoing, GuardBlocks, GuardPredicates, + DeletionCandidates); + else + calcPredicateUsingInteger(Branches, Outgoing, GuardBlocks, GuardPredicates); + + setupBranchForGuard(GuardBlocks, Outgoing, GuardPredicates); +} + +// After creating a control flow hub, the operands of PHINodes in an outgoing +// block Out no longer match the predecessors of that block. Predecessors of Out +// that are incoming blocks to the hub are now replaced by just one edge from +// the hub. To match this new control flow, the corresponding values from each +// PHINode must now be moved a new PHINode in the first guard block of the hub. +// +// This operation cannot be performed with SSAUpdater, because it involves one +// new use: If the block Out is in the list of Incoming blocks, then the newly +// created PHI in the Hub will use itself along that edge from Out to Hub. +static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock, + ArrayRef Incoming, + BasicBlock *FirstGuardBlock) { + auto I = Out->begin(); + while (I != Out->end() && isa(I)) { + auto *Phi = cast(I); + auto *NewPhi = + PHINode::Create(Phi->getType(), Incoming.size(), + Phi->getName() + ".moved", FirstGuardBlock->begin()); + bool AllUndef = true; + for (auto [BB, Succ0, Succ1] : Incoming) { + Value *V = PoisonValue::get(Phi->getType()); + if (BB == Out) { + V = NewPhi; + } else if (Phi->getBasicBlockIndex(BB) != -1) { + V = Phi->removeIncomingValue(BB, false); + AllUndef &= isa(V); + } + NewPhi->addIncoming(V, BB); + } + assert(NewPhi->getNumIncomingValues() == Incoming.size()); + Value *NewV = NewPhi; + if (AllUndef) { + NewPhi->eraseFromParent(); + NewV = PoisonValue::get(Phi->getType()); + } + if (Phi->getNumOperands() == 0) { + Phi->replaceAllUsesWith(NewV); + I = Phi->eraseFromParent(); + continue; + } + Phi->addIncoming(NewV, GuardBlock); + ++I; + } +} + +BasicBlock *ControlFlowHub::finalize( + 
DomTreeUpdater *DTU, SmallVectorImpl &GuardBlocks, + const StringRef Prefix, std::optional MaxControlFlowBooleans) { +#ifndef NDEBUG + SmallSet Incoming; +#endif + SetVector Outgoing; + + for (auto [BB, Succ0, Succ1] : Branches) { +#ifndef NDEBUG + assert(Incoming.insert(BB).second && "Duplicate entry for incoming block."); +#endif + if (Succ0) + Outgoing.insert(Succ0); + if (Succ1) + Outgoing.insert(Succ1); + } + + if (Outgoing.size() < 2) + return Outgoing.front(); + + SmallVector Updates; + if (DTU) { + for (auto [BB, Succ0, Succ1] : Branches) { + if (Succ0) + Updates.push_back({DominatorTree::Delete, BB, Succ0}); + if (Succ1) + Updates.push_back({DominatorTree::Delete, BB, Succ1}); + } + } + + SmallVector DeletionCandidates; + convertToGuardPredicates(Branches, Outgoing.getArrayRef(), GuardBlocks, + DeletionCandidates, Prefix, MaxControlFlowBooleans); + BasicBlock *FirstGuardBlock = GuardBlocks.front(); + + // Update the PHINodes in each outgoing block to match the new control flow. + for (int I = 0, E = GuardBlocks.size(); I != E; ++I) + reconnectPhis(Outgoing[I], GuardBlocks[I], Branches, FirstGuardBlock); + // Process the Nth (last) outgoing block with the (N-1)th (last) guard block. + reconnectPhis(Outgoing.back(), GuardBlocks.back(), Branches, FirstGuardBlock); + + if (DTU) { + int NumGuards = GuardBlocks.size(); + + for (auto [BB, Succ0, Succ1] : Branches) + Updates.push_back({DominatorTree::Insert, BB, FirstGuardBlock}); + + for (int I = 0; I != NumGuards - 1; ++I) { + Updates.push_back({DominatorTree::Insert, GuardBlocks[I], Outgoing[I]}); + Updates.push_back( + {DominatorTree::Insert, GuardBlocks[I], GuardBlocks[I + 1]}); + } + // The second successor of the last guard block is an outgoing block instead + // of having a "next" guard block. 
+ Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1], + Outgoing[NumGuards - 1]}); + Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1], + Outgoing[NumGuards]}); + DTU->applyUpdates(Updates); + } + + for (auto I : DeletionCandidates) { + if (I->use_empty()) + if (auto *Inst = dyn_cast_or_null(I)) + Inst->eraseFromParent(); + } + + return FirstGuardBlock; +} diff --git a/llvm/lib/Transforms/Utils/FixIrreducible.cpp b/llvm/lib/Transforms/Utils/FixIrreducible.cpp index 11e24d0585be48..cdd4b36d2d9ebf 100644 --- a/llvm/lib/Transforms/Utils/FixIrreducible.cpp +++ b/llvm/lib/Transforms/Utils/FixIrreducible.cpp @@ -51,15 +51,14 @@ // including inside any newly created loops. This ensures that any SCC hidden // inside a maximal SCC is also transformed. // -// The actual transformation is handled by function CreateControlFlowHub, which -// takes a set of incoming blocks (the predecessors) and outgoing blocks (the -// headers). The function also moves every PHINode in an outgoing block to the -// hub. Since the hub dominates all the outgoing blocks, each such PHINode -// continues to dominate its uses. Since every header in an SCC has at least two -// predecessors, every value used in the header (or later) but defined in a -// predecessor (or earlier) is represented by a PHINode in a header. Hence the -// above handling of PHINodes is sufficient and no further processing is -// required to restore SSA. +// The actual transformation is handled by the ControlFlowHub, which redirects +// specified control flow edges through a set of guard blocks. This also moves +// every PHINode in an outgoing block to the hub. Since the hub dominates all +// the outgoing blocks, each such PHINode continues to dominate its uses. Since +// every header in an SCC has at least two predecessors, every value used in the +// header (or later) but defined in a predecessor (or earlier) is represented by +// a PHINode in a header. 
Hence the above handling of PHINodes is sufficient and +// no further processing is required to restore SSA. // // Limitation: The pass cannot handle switch statements and indirect // branches. Both must be lowered to plain branches first. @@ -74,6 +73,7 @@ #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ControlFlowUtils.h" #define DEBUG_TYPE "fix-irreducible" @@ -189,9 +189,24 @@ static void createNaturalLoopInternal(LoopInfo &LI, DominatorTree &DT, // Redirect all the backedges through a "hub" consisting of a series // of guard blocks that manage the flow of control from the // predecessors to the headers. + ControlFlowHub CHub; + for (BasicBlock *P : Predecessors) { + auto *Branch = cast(P->getTerminator()); + BasicBlock *Succ0 = Branch->getSuccessor(0); + Succ0 = Headers.count(Succ0) ? Succ0 : nullptr; + BasicBlock *Succ1 = + Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); + Succ1 = Succ1 && Headers.count(Succ1) ? Succ1 : nullptr; + CHub.addBranch(P, Succ0, Succ1); + + LLVM_DEBUG(dbgs() << "Added branch: " << P->getName() << " -> " + << (Succ0 ? Succ0->getName() : "") << " " + << (Succ1 ? 
Succ1->getName() : "") << "\n"); + } + SmallVector GuardBlocks; DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - CreateControlFlowHub(&DTU, GuardBlocks, Predecessors, Headers, "irr"); + CHub.finalize(&DTU, GuardBlocks, "irr"); #if defined(EXPENSIVE_CHECKS) assert(DT.verify(DominatorTree::VerificationLevel::Full)); #else diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 1d51f61351fe27..856f3c3ed3e131 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ControlFlowUtils.h" #define DEBUG_TYPE "unify-loop-exits" @@ -34,7 +35,7 @@ using namespace llvm; static cl::opt MaxBooleansInControlFlowHub( "max-booleans-in-control-flow-hub", cl::init(32), cl::Hidden, cl::desc("Set the maximum number of outgoing blocks for using a boolean " - "value to record the exiting block in CreateControlFlowHub.")); + "value to record the exiting block in the ControlFlowHub.")); namespace { struct UnifyLoopExitsLegacyPass : public FunctionPass { @@ -86,7 +87,7 @@ INITIALIZE_PASS_END(UnifyLoopExitsLegacyPass, "unify-loop-exits", // for creating the new PHI is well-known, and also the set of incoming blocks // to the new PHI. static void restoreSSA(const DominatorTree &DT, const Loop *L, - const SetVector &Incoming, + SmallVectorImpl &Incoming, BasicBlock *LoopExitBlock) { using InstVector = SmallVector; using IIMap = MapVector; @@ -146,47 +147,30 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { // traverse the entire loop body. It is more efficient to first // locate the exiting blocks and then examine their successors to // locate the exit blocks. 
- SetVector ExitingBlocks; - SetVector Exits; - - // We need SetVectors, but the Loop API takes a vector, so we use a temporary. - SmallVector Temp; - L->getExitingBlocks(Temp); - for (auto *BB : Temp) { - ExitingBlocks.insert(BB); - for (auto *S : successors(BB)) { - auto SL = LI.getLoopFor(S); - // A successor is not an exit if it is directly or indirectly in the - // current loop. - if (SL == L || L->contains(SL)) - continue; - Exits.insert(S); - } - } - - LLVM_DEBUG( - dbgs() << "Found exit blocks:"; - for (auto Exit : Exits) { - dbgs() << " " << Exit->getName(); - } - dbgs() << "\n"; - - dbgs() << "Found exiting blocks:"; - for (auto EB : ExitingBlocks) { - dbgs() << " " << EB->getName(); - } - dbgs() << "\n";); - - if (Exits.size() <= 1) { - LLVM_DEBUG(dbgs() << "loop does not have multiple exits; nothing to do\n"); - return false; + SmallVector ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + // Redirect exiting edges through a control flow hub. + ControlFlowHub CHub; + for (auto *BB : ExitingBlocks) { + auto *Branch = cast(BB->getTerminator()); + BasicBlock *Succ0 = Branch->getSuccessor(0); + Succ0 = L->contains(Succ0) ? nullptr : Succ0; + + BasicBlock *Succ1 = + Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); + Succ1 = L->contains(Succ1) ? nullptr : Succ1; + CHub.addBranch(BB, Succ0, Succ1); + + LLVM_DEBUG(dbgs() << "Added exiting branch: " << BB->getName() << " -> {" + << (Succ0 ? Succ0->getName() : "") << ", " + << (Succ1 ? 
Succ1->getName() : "") << "}\n"); } SmallVector GuardBlocks; DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - auto LoopExitBlock = - CreateControlFlowHub(&DTU, GuardBlocks, ExitingBlocks, Exits, "loop.exit", - MaxBooleansInControlFlowHub.getValue()); + BasicBlock *LoopExitBlock = CHub.finalize( + &DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue()); restoreSSA(DT, L, ExitingBlocks, LoopExitBlock); @@ -218,8 +202,7 @@ static bool runImpl(LoopInfo &LI, DominatorTree &DT) { bool Changed = false; auto Loops = LI.getLoopsInPreorder(); for (auto *L : Loops) { - LLVM_DEBUG(dbgs() << "Loop: " << L->getHeader()->getName() << " (depth: " - << LI.getLoopDepth(L->getHeader()) << ")\n"); + LLVM_DEBUG(dbgs() << "Processing loop:\n"; L->print(dbgs())); Changed |= unifyLoopExits(DT, LI, L); } return Changed; @@ -240,6 +223,8 @@ namespace llvm { PreservedAnalyses UnifyLoopExitsPass::run(Function &F, FunctionAnalysisManager &AM) { + LLVM_DEBUG(dbgs() << "===== Unifying loop exits in function " << F.getName() + << "\n"); auto &LI = AM.getResult(F); auto &DT = AM.getResult(F); diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 32b9f9cb97095f..85ed2914b8c7f5 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -74,18 +74,17 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: ; %bb.9: ; %bb13 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: s_mov_b64 s[16:17], 0 +; CHECK-NEXT: s_mov_b64 s[20:21], 0 ; CHECK-NEXT: s_mov_b64 vcc, s[6:7] ; CHECK-NEXT: s_cbranch_vccz .LBB0_11 ; CHECK-NEXT: ; %bb.10: ; %bb16 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[16:17], 0 +; 
CHECK-NEXT: s_mov_b64 s[20:21], -1 ; CHECK-NEXT: s_mov_b64 s[22:23], s[12:13] -; CHECK-NEXT: s_mov_b64 s[18:19], s[16:17] -; CHECK-NEXT: s_branch .LBB0_2 -; CHECK-NEXT: .LBB0_11: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[20:21], 0 -; CHECK-NEXT: ; implicit-def: $sgpr16_sgpr17 -; CHECK-NEXT: s_mov_b64 s[18:19], s[16:17] +; CHECK-NEXT: .LBB0_11: ; %Flow11 +; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: s_mov_b64 s[18:19], 0 ; CHECK-NEXT: s_branch .LBB0_2 ; CHECK-NEXT: .LBB0_12: ; %loop.exit.guard6 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 65614a17fc0114..0894d3251423d6 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -7644,9 +7644,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_2 -; GFX7-NEXT: ; %bb.3: ; %Flow22 +; GFX7-NEXT: ; %bb.3: ; %Flow23 ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: .LBB28_4: ; %Flow23 +; GFX7-NEXT: .LBB28_4: ; %Flow24 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_mov_b64 s[8:9], exec ; GFX7-NEXT: v_readfirstlane_b32 s10, v1 @@ -7674,7 +7674,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_6 -; GFX7-NEXT: .LBB28_7: ; %Flow21 +; GFX7-NEXT: .LBB28_7: ; %Flow22 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 @@ -7723,7 +7723,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_cbranch_execnz .LBB28_11 ; GFX7-NEXT: ; %bb.12: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 
-; GFX7-NEXT: .LBB28_13: ; %Flow19 +; GFX7-NEXT: .LBB28_13: ; %Flow20 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 @@ -7768,9 +7768,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_2 -; GFX6-NEXT: ; %bb.3: ; %Flow20 +; GFX6-NEXT: ; %bb.3: ; %Flow21 ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: .LBB28_4: ; %Flow21 +; GFX6-NEXT: .LBB28_4: ; %Flow22 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_mov_b64 s[8:9], exec ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 @@ -7798,7 +7798,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_6 -; GFX6-NEXT: .LBB28_7: ; %Flow19 +; GFX6-NEXT: .LBB28_7: ; %Flow20 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 @@ -7847,7 +7847,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_cbranch_execnz .LBB28_11 ; GFX6-NEXT: ; %bb.12: ; %Flow ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: .LBB28_13: ; %Flow17 +; GFX6-NEXT: .LBB28_13: ; %Flow18 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: v_readfirstlane_b32 s4, v2 @@ -8484,9 +8484,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_2 -; GFX7-NEXT: ; %bb.3: ; %Flow22 +; GFX7-NEXT: ; %bb.3: ; %Flow23 ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: .LBB29_4: ; %Flow23 +; GFX7-NEXT: .LBB29_4: ; %Flow24 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 
; GFX7-NEXT: s_mov_b64 s[8:9], exec ; GFX7-NEXT: v_readfirstlane_b32 s10, v1 @@ -8514,7 +8514,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_6 -; GFX7-NEXT: .LBB29_7: ; %Flow21 +; GFX7-NEXT: .LBB29_7: ; %Flow22 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 @@ -8563,7 +8563,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_cbranch_execnz .LBB29_11 ; GFX7-NEXT: ; %bb.12: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: .LBB29_13: ; %Flow19 +; GFX7-NEXT: .LBB29_13: ; %Flow20 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 @@ -8608,9 +8608,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_2 -; GFX6-NEXT: ; %bb.3: ; %Flow20 +; GFX6-NEXT: ; %bb.3: ; %Flow21 ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: .LBB29_4: ; %Flow21 +; GFX6-NEXT: .LBB29_4: ; %Flow22 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_mov_b64 s[8:9], exec ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 @@ -8638,7 +8638,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_6 -; GFX6-NEXT: .LBB29_7: ; %Flow19 +; GFX6-NEXT: .LBB29_7: ; %Flow20 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 @@ -8687,7 +8687,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_cbranch_execnz .LBB29_11 ; 
GFX6-NEXT: ; %bb.12: ; %Flow ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: .LBB29_13: ; %Flow17 +; GFX6-NEXT: .LBB29_13: ; %Flow18 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX6-NEXT: v_readfirstlane_b32 s4, v2 diff --git a/llvm/test/Transforms/FixIrreducible/basic.ll b/llvm/test/Transforms/FixIrreducible/basic.ll index 7ba1360160db20..30591c80c5559d 100644 --- a/llvm/test/Transforms/FixIrreducible/basic.ll +++ b/llvm/test/Transforms/FixIrreducible/basic.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -fix-irreducible -S | FileCheck %s -check-prefix=CHECK -; RUN: opt < %s -passes=fix-irreducible -S | FileCheck %s -check-prefix=CHECK +; RUN: opt < %s -fix-irreducible --verify-loop-info -S | FileCheck %s +; RUN: opt < %s -passes='fix-irreducible,verify' -S | FileCheck %s -check-prefix=CHECK define i32 @basic(i1 %PredEntry, i1 %PredLeft, i1 %PredRight, i32 %X, i32 %Y) { ; CHECK-LABEL: @basic( @@ -136,8 +136,8 @@ define i32 @separate_predecessors(i1 %PredEntry, i1 %PredA, i1 %PredB, i1 %PredC ; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[C_PHI_MOVED:%.*]], [[C:%.*]] ], [ [[D_INC]], [[D:%.*]] ] ; CHECK-NEXT: ret i32 [[RET]] ; CHECK: irr.guard: -; CHECK-NEXT: [[D_PHI_MOVED]] = phi i32 [ [[D_PHI_MOVED]], [[D]] ], [ undef, [[A]] ], [ [[C_PHI_MOVED]], [[C]] ], [ [[Y:%.*]], [[B]] ] -; CHECK-NEXT: [[C_PHI_MOVED]] = phi i32 [ [[D_INC]], [[D]] ], [ [[X]], [[A]] ], [ [[C_PHI_MOVED]], [[C]] ], [ undef, [[B]] ] +; CHECK-NEXT: [[D_PHI_MOVED]] = phi i32 [ [[D_PHI_MOVED]], [[D]] ], [ poison, [[A]] ], [ [[C_PHI_MOVED]], [[C]] ], [ [[Y:%.*]], [[B]] ] +; CHECK-NEXT: [[C_PHI_MOVED]] = phi i32 [ [[D_INC]], [[D]] ], [ [[X]], [[A]] ], [ [[C_PHI_MOVED]], [[C]] ], [ poison, [[B]] ] ; CHECK-NEXT: [[GUARD_C:%.*]] = phi i1 [ true, [[D]] ], [ true, [[A]] ], [ false, [[C]] ], [ false, [[B]] ] ; CHECK-NEXT: br i1 [[GUARD_C]], label [[C]], label [[D]] ; @@ -237,7 +237,7 @@ define i32 
@hidden_nodes(i1 %PredEntry, i1 %PredA, i1 %PredB, i1 %PredC, i1 %Pre ; CHECK: exit: ; CHECK-NEXT: ret i32 [[B_PHI_MOVED]] ; CHECK: irr.guard: -; CHECK-NEXT: [[B_PHI_MOVED]] = phi i32 [ undef, [[E]] ], [ [[Y:%.*]], [[ENTRY:%.*]] ], [ [[A_INC]], [[A:%.*]] ] +; CHECK-NEXT: [[B_PHI_MOVED]] = phi i32 [ poison, [[E]] ], [ [[Y:%.*]], [[ENTRY:%.*]] ], [ [[A_INC]], [[A:%.*]] ] ; CHECK-NEXT: [[A_PHI_MOVED]] = phi i32 [ [[C_INC]], [[E]] ], [ [[X:%.*]], [[ENTRY]] ], [ [[A_PHI_MOVED]], [[A]] ] ; CHECK-NEXT: [[GUARD_A:%.*]] = phi i1 [ true, [[E]] ], [ [[PREDENTRY:%.*]], [[ENTRY]] ], [ false, [[A]] ] ; CHECK-NEXT: br i1 [[GUARD_A]], label [[A]], label [[B:%.*]] diff --git a/llvm/test/Transforms/FixIrreducible/bug45623.ll b/llvm/test/Transforms/FixIrreducible/bug45623.ll index 89d31dd4fea6b8..c78b593e9319b1 100644 --- a/llvm/test/Transforms/FixIrreducible/bug45623.ll +++ b/llvm/test/Transforms/FixIrreducible/bug45623.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -fix-irreducible -S | FileCheck %s +; RUN: opt < %s -fix-irreducible --verify-loop-info -S | FileCheck %s +; RUN: opt < %s -passes='fix-irreducible,verify' -S | FileCheck %s define dso_local void @tre_tnfa_run_backtrack() { ; CHECK-LABEL: @tre_tnfa_run_backtrack( diff --git a/llvm/test/Transforms/FixIrreducible/nested.ll b/llvm/test/Transforms/FixIrreducible/nested.ll index 85f7d39920b9d6..1665bbf9930287 100644 --- a/llvm/test/Transforms/FixIrreducible/nested.ll +++ b/llvm/test/Transforms/FixIrreducible/nested.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -fix-irreducible -S | FileCheck %s -check-prefix=CHECK +; RUN: opt < %s -fix-irreducible --verify-loop-info -S | FileCheck %s +; RUN: opt < %s -passes='fix-irreducible,verify' -S | FileCheck %s define void @nested_irr_top_level(i1 %Pred0, i1 %Pred1, i1 %Pred2, i1 %Pred3, i1 %Pred4, i1 %Pred5) { ; CHECK-LABEL: @nested_irr_top_level( diff --git 
a/llvm/test/Transforms/FixIrreducible/switch.ll b/llvm/test/Transforms/FixIrreducible/switch.ll index 27b9f9bf3b53be..f648b597b84695 100644 --- a/llvm/test/Transforms/FixIrreducible/switch.ll +++ b/llvm/test/Transforms/FixIrreducible/switch.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes='lower-switch,fix-irreducible' -S | FileCheck %s +; RUN: opt < %s -lowerswitch -fix-irreducible --verify-loop-info -S | FileCheck %s +; RUN: opt < %s -passes='lower-switch,fix-irreducible,verify' -S | FileCheck %s define void @loop_1(i32 %Value, i1 %PredEntry, i1 %PredD) { ; CHECK-LABEL: @loop_1( diff --git a/llvm/test/Transforms/FixIrreducible/unreachable.ll b/llvm/test/Transforms/FixIrreducible/unreachable.ll index 71cd81e01953ea..e61bd2b5a0ae45 100644 --- a/llvm/test/Transforms/FixIrreducible/unreachable.ll +++ b/llvm/test/Transforms/FixIrreducible/unreachable.ll @@ -1,4 +1,6 @@ -; RUN: opt %s -fix-irreducible -S -o - | FileCheck %s +; NOTE: Do not autogenerate +; RUN: opt < %s -fix-irreducible --verify-loop-info -S | FileCheck %s +; RUN: opt < %s -passes='fix-irreducible,verify' -S | FileCheck %s ; CHECK-LABEL: @unreachable( ; CHECK: entry: diff --git a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-unified-loop-exits.ll b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-unified-loop-exits.ll index 4b3a43000f17a3..6f6fc4d0f4e646 100644 --- a/llvm/test/Transforms/StructurizeCFG/workarounds/needs-unified-loop-exits.ll +++ b/llvm/test/Transforms/StructurizeCFG/workarounds/needs-unified-loop-exits.ll @@ -37,9 +37,8 @@ define void @exiting-block(i1 %PredH1, i1 %PredB2, i1 %PredB1, i1 %PredH2) { ; CHECK-NEXT: br i1 [[PREDB2_INV]], label [[L2:%.*]], label [[FLOW2:%.*]] ; CHECK: Flow: ; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ false, [[FLOW2]] ], [ true, [[H2]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ false, [[FLOW2]] ], [ undef, [[H2]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP7:%.*]], [[FLOW2]] ], [ 
true, [[H2]] ] -; CHECK-NEXT: br i1 [[TMP4]], label [[LOOP_EXIT_GUARD1:%.*]], label [[H2]] +; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ [[TMP5:%.*]], [[FLOW2]] ], [ true, [[H2]] ] +; CHECK-NEXT: br i1 [[TMP3]], label [[LOOP_EXIT_GUARD1:%.*]], label [[H2]] ; CHECK: L2: ; CHECK-NEXT: br label [[FLOW2]] ; CHECK: L1: @@ -51,18 +50,17 @@ define void @exiting-block(i1 %PredH1, i1 %PredB2, i1 %PredB1, i1 %PredH2) { ; CHECK: exit: ; CHECK-NEXT: ret void ; CHECK: Flow5: -; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ undef, [[L1:%.*]] ], [ [[TMP3]], [[LOOP_EXIT_GUARD1]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ false, [[L1]] ], [ true, [[LOOP_EXIT_GUARD1]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ false, [[L1:%.*]] ], [ true, [[LOOP_EXIT_GUARD1]] ] ; CHECK-NEXT: br label [[FLOW4]] ; CHECK: loop.exit.guard: -; CHECK-NEXT: br i1 [[TMP8:%.*]], label [[C:%.*]], label [[EXIT]] +; CHECK-NEXT: br i1 [[TMP6:%.*]], label [[C:%.*]], label [[EXIT]] ; CHECK: Flow2: -; CHECK-NEXT: [[TMP7]] = phi i1 [ false, [[L2]] ], [ true, [[B2]] ] +; CHECK-NEXT: [[TMP5]] = phi i1 [ false, [[L2]] ], [ true, [[B2]] ] ; CHECK-NEXT: br label [[FLOW]] ; CHECK: Flow4: -; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP5]], [[FLOW5]] ], [ [[TMP0]], [[FLOW3]] ] -; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ [[TMP6]], [[FLOW5]] ], [ true, [[FLOW3]] ] -; CHECK-NEXT: br i1 [[TMP9]], label [[LOOP_EXIT_GUARD:%.*]], label [[H1]] +; CHECK-NEXT: [[TMP6]] = phi i1 [ false, [[FLOW5]] ], [ [[TMP0]], [[FLOW3]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi i1 [ [[TMP4]], [[FLOW5]] ], [ true, [[FLOW3]] ] +; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_EXIT_GUARD:%.*]], label [[H1]] ; CHECK: loop.exit.guard1: ; CHECK-NEXT: br i1 [[TMP2]], label [[L1]], label [[FLOW5]] ; @@ -115,22 +113,21 @@ define void @incorrect-backedge(i1 %PredH2, i1 %PredH3, i1 %PredL2, i1 %PredL13, ; CHECK: L2: ; CHECK-NEXT: br i1 [[PREDL2_INV]], label [[L13:%.*]], label [[FLOW3:%.*]] ; CHECK: Flow: -; CHECK-NEXT: [[TMP0:%.*]] = phi i1 [ [[TMP7:%.*]], [[FLOW3]] ], [ true, [[H3]] ] -; CHECK-NEXT: 
[[TMP1:%.*]] = phi i1 [ [[TMP8:%.*]], [[FLOW3]] ], [ false, [[H3]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ [[TMP8]], [[FLOW3]] ], [ true, [[H3]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ [[TMP9:%.*]], [[FLOW3]] ], [ true, [[H3]] ] -; CHECK-NEXT: br i1 [[TMP3]], label [[LOOP_EXIT_GUARD2:%.*]], label [[H3]] +; CHECK-NEXT: [[TMP0:%.*]] = phi i1 [ [[TMP6:%.*]], [[FLOW3]] ], [ true, [[H3]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, [[FLOW3]] ], [ true, [[H3]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ [[TMP7:%.*]], [[FLOW3]] ], [ true, [[H3]] ] +; CHECK-NEXT: br i1 [[TMP2]], label [[LOOP_EXIT_GUARD2:%.*]], label [[H3]] ; CHECK: L13: ; CHECK-NEXT: br label [[FLOW3]] ; CHECK: Flow5: -; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP10:%.*]], [[LOOP_EXIT_GUARD1:%.*]] ], [ true, [[LOOP_EXIT_GUARD:%.*]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ false, [[LOOP_EXIT_GUARD1]] ], [ true, [[LOOP_EXIT_GUARD]] ] -; CHECK-NEXT: br i1 [[TMP5]], label [[L1:%.*]], label [[FLOW6:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ [[TMP8:%.*]], [[LOOP_EXIT_GUARD1:%.*]] ], [ true, [[LOOP_EXIT_GUARD:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ false, [[LOOP_EXIT_GUARD1]] ], [ true, [[LOOP_EXIT_GUARD]] ] +; CHECK-NEXT: br i1 [[TMP4]], label [[L1:%.*]], label [[FLOW6:%.*]] ; CHECK: L1: ; CHECK-NEXT: br label [[FLOW6]] ; CHECK: Flow6: -; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[PREDL1:%.*]], [[L1]] ], [ [[TMP4]], [[FLOW5:%.*]] ] -; CHECK-NEXT: br i1 [[TMP6]], label [[EXIT:%.*]], label [[H1]] +; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[PREDL1:%.*]], [[L1]] ], [ [[TMP3]], [[FLOW5:%.*]] ] +; CHECK-NEXT: br i1 [[TMP5]], label [[EXIT:%.*]], label [[H1]] ; CHECK: exit: ; CHECK-NEXT: ret void ; CHECK: loop.exit.guard: @@ -138,16 +135,15 @@ define void @incorrect-backedge(i1 %PredH2, i1 %PredH3, i1 %PredL2, i1 %PredL13, ; CHECK: loop.exit.guard1: ; CHECK-NEXT: br label [[FLOW5]] ; CHECK: Flow3: -; CHECK-NEXT: [[TMP7]] = phi i1 [ true, [[L13]] ], [ false, [[L2]] ] -; CHECK-NEXT: [[TMP8]] = phi i1 [ false, [[L13]] ], 
[ undef, [[L2]] ] -; CHECK-NEXT: [[TMP9]] = phi i1 [ [[PREDL13_INV]], [[L13]] ], [ true, [[L2]] ] +; CHECK-NEXT: [[TMP6]] = phi i1 [ true, [[L13]] ], [ false, [[L2]] ] +; CHECK-NEXT: [[TMP7]] = phi i1 [ [[PREDL13_INV]], [[L13]] ], [ true, [[L2]] ] ; CHECK-NEXT: br label [[FLOW]] ; CHECK: Flow4: -; CHECK-NEXT: [[TMP10]] = phi i1 [ [[TMP2]], [[LOOP_EXIT_GUARD2]] ], [ false, [[H2]] ] -; CHECK-NEXT: [[TMP11:%.*]] = phi i1 [ [[TMP1]], [[LOOP_EXIT_GUARD2]] ], [ true, [[H2]] ] -; CHECK-NEXT: [[TMP12:%.*]] = phi i1 [ [[TMP0]], [[LOOP_EXIT_GUARD2]] ], [ true, [[H2]] ] -; CHECK-NEXT: [[DOTINV]] = xor i1 [[TMP11]], true -; CHECK-NEXT: br i1 [[TMP12]], label [[LOOP_EXIT_GUARD]], label [[H2]] +; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP1]], [[LOOP_EXIT_GUARD2]] ], [ false, [[H2]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ false, [[LOOP_EXIT_GUARD2]] ], [ true, [[H2]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi i1 [ [[TMP0]], [[LOOP_EXIT_GUARD2]] ], [ true, [[H2]] ] +; CHECK-NEXT: [[DOTINV]] = xor i1 [[TMP9]], true +; CHECK-NEXT: br i1 [[TMP10]], label [[LOOP_EXIT_GUARD]], label [[H2]] ; CHECK: loop.exit.guard2: ; CHECK-NEXT: br label [[FLOW4]] ; diff --git a/llvm/test/Transforms/UnifyLoopExits/integer_guards.ll b/llvm/test/Transforms/UnifyLoopExits/integer_guards.ll index 803f14b24a9767..f55639ff2db37b 100644 --- a/llvm/test/Transforms/UnifyLoopExits/integer_guards.ll +++ b/llvm/test/Transforms/UnifyLoopExits/integer_guards.ll @@ -111,7 +111,7 @@ define void @inner_loop(i1 %PredEntry, i1 %PredA, i1 %PredB) { ; CHECK-NEXT: [[E_PREDICATE:%.*]] = icmp eq i32 [[MERGED_BB_IDX]], 1 ; CHECK-NEXT: br i1 [[E_PREDICATE]], label [[E:%.*]], label [[I]] ; CHECK: loop.exit.guard2: -; CHECK-NEXT: [[MERGED_BB_IDX_MOVED]] = phi i32 [ 0, [[B]] ], [ 1, [[D]] ], [ undef, [[F]] ] +; CHECK-NEXT: [[MERGED_BB_IDX_MOVED]] = phi i32 [ 0, [[B]] ], [ 1, [[D]] ], [ poison, [[F]] ] ; CHECK-NEXT: [[MERGED_BB_IDX3:%.*]] = phi i32 [ 0, [[B]] ], [ 0, [[D]] ], [ 1, [[F]] ] ; CHECK-NEXT: [[LOOP_EXIT_GUARD_PREDICATE:%.*]] = 
icmp eq i32 [[MERGED_BB_IDX3]], 0 ; CHECK-NEXT: br i1 [[LOOP_EXIT_GUARD_PREDICATE]], label [[LOOP_EXIT_GUARD]], label [[G]] @@ -152,8 +152,8 @@ define void @inner_loop(i1 %PredEntry, i1 %PredA, i1 %PredB) { ; BOOLEAN: loop.exit.guard1: ; BOOLEAN-NEXT: br i1 [[GUARD_E]], label [[E:%.*]], label [[I]] ; BOOLEAN: loop.exit.guard2: -; BOOLEAN-NEXT: [[GUARD_E_MOVED]] = phi i1 [ false, [[B]] ], [ true, [[D]] ], [ undef, [[F]] ] -; BOOLEAN-NEXT: [[GUARD_C_MOVED]] = phi i1 [ true, [[B]] ], [ false, [[D]] ], [ undef, [[F]] ] +; BOOLEAN-NEXT: [[GUARD_E_MOVED]] = phi i1 [ false, [[B]] ], [ true, [[D]] ], [ poison, [[F]] ] +; BOOLEAN-NEXT: [[GUARD_C_MOVED]] = phi i1 [ true, [[B]] ], [ false, [[D]] ], [ poison, [[F]] ] ; BOOLEAN-NEXT: [[GUARD_LOOP_EXIT_GUARD:%.*]] = phi i1 [ true, [[B]] ], [ true, [[D]] ], [ false, [[F]] ] ; BOOLEAN-NEXT: br i1 [[GUARD_LOOP_EXIT_GUARD]], label [[LOOP_EXIT_GUARD]], label [[G]] ; diff --git a/llvm/test/Transforms/UnifyLoopExits/nested.ll b/llvm/test/Transforms/UnifyLoopExits/nested.ll index b02e353d186396..8fae2c4349a7b9 100644 --- a/llvm/test/Transforms/UnifyLoopExits/nested.ll +++ b/llvm/test/Transforms/UnifyLoopExits/nested.ll @@ -31,7 +31,7 @@ define void @nested(i1 %PredB3, i1 %PredB4, i1 %PredA4, i1 %PredA3, i32 %X, i32 ; CHECK-NEXT: [[EXIT_PHI:%.*]] = phi i32 [ [[Z:%.*]], [[C:%.*]] ], [ [[EXIT_PHI_MOVED:%.*]], [[LOOP_EXIT_GUARD]] ] ; CHECK-NEXT: ret void ; CHECK: loop.exit.guard: -; CHECK-NEXT: [[EXIT_PHI_MOVED]] = phi i32 [ undef, [[A4]] ], [ [[A4_PHI]], [[A5]] ] +; CHECK-NEXT: [[EXIT_PHI_MOVED]] = phi i32 [ poison, [[A4]] ], [ [[A4_PHI]], [[A5]] ] ; CHECK-NEXT: [[GUARD_C:%.*]] = phi i1 [ true, [[A4]] ], [ false, [[A5]] ] ; CHECK-NEXT: br i1 [[GUARD_C]], label [[C]], label [[EXIT]] ; CHECK: loop.exit.guard1: diff --git a/llvm/test/Transforms/UnifyLoopExits/restore-ssa.ll b/llvm/test/Transforms/UnifyLoopExits/restore-ssa.ll index 33dbc1be89ec55..3e68df3e79260a 100644 --- a/llvm/test/Transforms/UnifyLoopExits/restore-ssa.ll +++ 
b/llvm/test/Transforms/UnifyLoopExits/restore-ssa.ll @@ -31,7 +31,7 @@ define i32 @exiting-used-in-exit(ptr %arg1, ptr %arg2) local_unnamed_addr align ; CHECK-NEXT: ret i32 [[PHI]] ; CHECK: loop.exit.guard: ; CHECK-NEXT: [[MYTMP41_MOVED]] = phi i32 [ poison, [[A]] ], [ [[MYTMP41]], [[B]] ] -; CHECK-NEXT: [[PHI_MOVED]] = phi i32 [ [[MYTMP42]], [[A]] ], [ undef, [[B]] ] +; CHECK-NEXT: [[PHI_MOVED]] = phi i32 [ [[MYTMP42]], [[A]] ], [ poison, [[B]] ] ; CHECK-NEXT: [[GUARD_RETURN:%.*]] = phi i1 [ true, [[A]] ], [ false, [[B]] ] ; CHECK-NEXT: br i1 [[GUARD_RETURN]], label [[RETURN]], label [[C]] ; @@ -142,7 +142,7 @@ define i32 @mixed-use-in-exit(ptr %arg1, ptr %arg2) local_unnamed_addr align 2 { ; CHECK-NEXT: ret i32 [[PHI]] ; CHECK: loop.exit.guard: ; CHECK-NEXT: [[MYTMP41_MOVED]] = phi i32 [ poison, [[A]] ], [ [[MYTMP41]], [[C]] ] -; CHECK-NEXT: [[PHI_MOVED]] = phi i32 [ [[MYTMP43]], [[A]] ], [ undef, [[C]] ] +; CHECK-NEXT: [[PHI_MOVED]] = phi i32 [ [[MYTMP43]], [[A]] ], [ poison, [[C]] ] ; CHECK-NEXT: [[GUARD_RETURN:%.*]] = phi i1 [ true, [[A]] ], [ false, [[C]] ] ; CHECK-NEXT: br i1 [[GUARD_RETURN]], label [[RETURN]], label [[D]] ; diff --git a/llvm/test/Transforms/UnifyLoopExits/undef-phis.ll b/llvm/test/Transforms/UnifyLoopExits/undef-phis.ll index 722ccc519a7646..05f50fcc37d6e1 100644 --- a/llvm/test/Transforms/UnifyLoopExits/undef-phis.ll +++ b/llvm/test/Transforms/UnifyLoopExits/undef-phis.ll @@ -23,11 +23,11 @@ define fastcc void @undef_phi(i64 %i5247, i1 %i4530, i1 %i4936.not) { ; CHECK-NEXT: store volatile [2 x i32] [[I5293]], ptr addrspace(5) null, align 4 ; CHECK-NEXT: ret void ; CHECK: [[LOOP_EXIT_GUARD]]: -; CHECK-NEXT: [[DOTMOVED]] = phi i32 [ [[TMP0]], %[[MBB4321]] ], [ undef, %[[LOOP_EXIT_GUARD1]] ] +; CHECK-NEXT: [[DOTMOVED]] = phi i32 [ [[TMP0]], %[[MBB4321]] ], [ poison, %[[LOOP_EXIT_GUARD1]] ] ; CHECK-NEXT: [[GUARD_MBB4531:%.*]] = phi i1 [ false, %[[MBB4321]] ], [ [[GUARD_MBB4531_MOVED:%.*]], %[[LOOP_EXIT_GUARD1]] ] ; CHECK-NEXT: br i1 
[[GUARD_MBB4531]], label %[[MBB4531]], label %[[MBB5291]] ; CHECK: [[LOOP_EXIT_GUARD1]]: -; CHECK-NEXT: [[GUARD_MBB4531_MOVED]] = phi i1 [ true, %[[MBB4454]] ], [ undef, %[[MBB4535]] ] +; CHECK-NEXT: [[GUARD_MBB4531_MOVED]] = phi i1 [ true, %[[MBB4454]] ], [ poison, %[[MBB4535]] ] ; CHECK-NEXT: [[GUARD_LOOP_EXIT_GUARD:%.*]] = phi i1 [ true, %[[MBB4454]] ], [ false, %[[MBB4535]] ] ; CHECK-NEXT: br i1 [[GUARD_LOOP_EXIT_GUARD]], label %[[LOOP_EXIT_GUARD]], label %[[MBB4321]] ; From b4feb26606de84ff53d9b65a3b79c00a2b4d7c22 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 21 Aug 2024 23:53:36 -0700 Subject: [PATCH 154/426] [ELF] Move target to Ctx. NFC Ctx was introduced in March 2022 as a more suitable place for such singletons. Follow-up to driver (2022-10) and script (2024-08). --- lld/ELF/AArch64ErrataFix.cpp | 9 +-- lld/ELF/ARMErrataFix.cpp | 13 ++-- lld/ELF/Arch/AArch64.cpp | 14 ++-- lld/ELF/Arch/ARM.cpp | 6 +- lld/ELF/Arch/LoongArch.cpp | 7 +- lld/ELF/Arch/PPC64.cpp | 4 +- lld/ELF/Arch/RISCV.cpp | 6 +- lld/ELF/Config.h | 2 + lld/ELF/Driver.cpp | 17 ++--- lld/ELF/InputSection.cpp | 12 ++-- lld/ELF/LinkerScript.cpp | 2 +- lld/ELF/MarkLive.cpp | 4 +- lld/ELF/OutputSections.cpp | 8 +-- lld/ELF/Relocations.cpp | 125 +++++++++++++++++----------------- lld/ELF/ScriptParser.cpp | 2 +- lld/ELF/Symbols.cpp | 15 ++-- lld/ELF/SyntheticSections.cpp | 120 ++++++++++++++++---------------- lld/ELF/Target.cpp | 2 - lld/ELF/Target.h | 1 - lld/ELF/Thunks.cpp | 87 ++++++++++++----------- lld/ELF/Writer.cpp | 20 +++--- 21 files changed, 245 insertions(+), 231 deletions(-) diff --git a/lld/ELF/AArch64ErrataFix.cpp b/lld/ELF/AArch64ErrataFix.cpp index 6ca0a3038714b5..7a1477ebb79ad0 100644 --- a/lld/ELF/AArch64ErrataFix.cpp +++ b/lld/ELF/AArch64ErrataFix.cpp @@ -413,12 +413,12 @@ void Patch843419Section::writeTo(uint8_t *buf) { write32le(buf, read32le(patchee->content().begin() + patcheeOffset)); // Apply any relocation transferred from the original patchee section. 
- target->relocateAlloc(*this, buf); + ctx.target->relocateAlloc(*this, buf); // Return address is the next instruction after the one we have just copied. uint64_t s = getLDSTAddr() + 4; uint64_t p = patchSym->getVA() + 4; - target->relocateNoSym(buf + 4, R_AARCH64_JUMP26, s - p); + ctx.target->relocateNoSym(buf + 4, R_AARCH64_JUMP26, s - p); } void AArch64Err843419Patcher::init() { @@ -483,7 +483,8 @@ void AArch64Err843419Patcher::insertPatches( InputSectionDescription &isd, std::vector &patches) { uint64_t isecLimit; uint64_t prevIsecLimit = isd.sections.front()->outSecOff; - uint64_t patchUpperBound = prevIsecLimit + target->getThunkSectionSpacing(); + uint64_t patchUpperBound = + prevIsecLimit + ctx.target->getThunkSectionSpacing(); uint64_t outSecAddr = isd.sections.front()->getParent()->addr; // Set the outSecOff of patches to the place where we want to insert them. @@ -500,7 +501,7 @@ void AArch64Err843419Patcher::insertPatches( (*patchIt)->outSecOff = prevIsecLimit; ++patchIt; } - patchUpperBound = prevIsecLimit + target->getThunkSectionSpacing(); + patchUpperBound = prevIsecLimit + ctx.target->getThunkSectionSpacing(); } prevIsecLimit = isecLimit; } diff --git a/lld/ELF/ARMErrataFix.cpp b/lld/ELF/ARMErrataFix.cpp index 9fb791f4848a36..7068344a73b955 100644 --- a/lld/ELF/ARMErrataFix.cpp +++ b/lld/ELF/ARMErrataFix.cpp @@ -157,11 +157,11 @@ static uint64_t getThumbDestAddr(uint64_t sourceAddr, uint32_t instr) { write16le(buf + 2, instr & 0x0000ffff); int64_t offset; if (isBcc(instr)) - offset = target->getImplicitAddend(buf, R_ARM_THM_JUMP19); + offset = ctx.target->getImplicitAddend(buf, R_ARM_THM_JUMP19); else if (isB(instr)) - offset = target->getImplicitAddend(buf, R_ARM_THM_JUMP24); + offset = ctx.target->getImplicitAddend(buf, R_ARM_THM_JUMP24); else - offset = target->getImplicitAddend(buf, R_ARM_THM_CALL); + offset = ctx.target->getImplicitAddend(buf, R_ARM_THM_CALL); // A BLX instruction from Thumb to Arm may have an address that is // not 4-byte 
aligned. As Arm instructions are always 4-byte aligned // the instruction is calculated (from Arm ARM): @@ -182,7 +182,7 @@ void Patch657417Section::writeTo(uint8_t *buf) { write32le(buf, 0x9000f000); // If we have a relocation then apply it. if (!relocs().empty()) { - target->relocateAlloc(*this, buf); + ctx.target->relocateAlloc(*this, buf); return; } @@ -197,7 +197,8 @@ void Patch657417Section::writeTo(uint8_t *buf) { // state with a PC Bias of 4. uint64_t pcBias = isBLX(instr) ? 8 : 4; uint64_t p = getVA(pcBias); - target->relocateNoSym(buf, isARM ? R_ARM_JUMP24 : R_ARM_THM_JUMP24, s - p); + ctx.target->relocateNoSym(buf, isARM ? R_ARM_JUMP24 : R_ARM_THM_JUMP24, + s - p); } // Given a branch instruction spanning two 4KiB regions, at offset off from the @@ -233,7 +234,7 @@ static bool patchInRange(const InputSection *isec, uint64_t off, // after isec. As there can be more than one patch in the patch section we // add 0x100 as contingency to account for worst case of 1 branch every 4KiB // for a 1 MiB range. - return target->inBranchRange( + return ctx.target->inBranchRange( isBcc(instr) ? 
R_ARM_THM_JUMP19 : R_ARM_THM_JUMP24, isec->getVA(off), isec->getVA() + isec->getSize() + 0x100); } diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp index 0106349b2d2778..75d85d14bd62c3 100644 --- a/lld/ELF/Arch/AArch64.cpp +++ b/lld/ELF/Arch/AArch64.cpp @@ -784,7 +784,7 @@ bool AArch64Relaxer::tryRelaxAdrpAdd(const Relocation &adrpRel, write32le(buf + adrpRel.offset, 0xd503201f); // adr x_ write32le(buf + adrRel.offset, 0x10000000 | adrpDestReg); - target->relocate(buf + adrRel.offset, adrRel, val); + ctx.target->relocate(buf + adrRel.offset, adrRel, val); return true; } @@ -854,11 +854,13 @@ bool AArch64Relaxer::tryRelaxAdrpLdr(const Relocation &adrpRel, // add x_, x_ write32le(buf + addRel.offset, 0x91000000 | adrpDestReg | (adrpDestReg << 5)); - target->relocate(buf + adrpSymRel.offset, adrpSymRel, - SignExtend64(getAArch64Page(sym.getVA()) - - getAArch64Page(secAddr + adrpSymRel.offset), - 64)); - target->relocate(buf + addRel.offset, addRel, SignExtend64(sym.getVA(), 64)); + ctx.target->relocate( + buf + adrpSymRel.offset, adrpSymRel, + SignExtend64(getAArch64Page(sym.getVA()) - + getAArch64Page(secAddr + adrpSymRel.offset), + 64)); + ctx.target->relocate(buf + addRel.offset, addRel, + SignExtend64(sym.getVA(), 64)); tryRelaxAdrpAdd(adrpSymRel, addRel, secAddr, buf); return true; } diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 07a7535c4a231d..827ba3a6c68a14 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -1386,9 +1386,9 @@ void ArmCmseSGSection::writeTo(uint8_t *buf) { write16(p + 2, 0xe97f); write16(p + 4, 0xf000); // B.W S write16(p + 6, 0xb000); - target->relocateNoSym(p + 4, R_ARM_THM_JUMP24, - s->acleSeSym->getVA() - - (getVA() + s->offset + s->size)); + ctx.target->relocateNoSym(p + 4, R_ARM_THM_JUMP24, + s->acleSeSym->getVA() - + (getVA() + s->offset + s->size)); } } diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index 68defb750c538e..01e42a5867b7ef 100644 --- 
a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -349,7 +349,8 @@ void LoongArch::writePltHeader(uint8_t *buf) const { write32le(buf + 0, insn(PCADDU12I, R_T2, hi20(offset), 0)); write32le(buf + 4, insn(sub, R_T1, R_T1, R_T3)); write32le(buf + 8, insn(ld, R_T3, R_T2, lo12(offset))); - write32le(buf + 12, insn(addi, R_T1, R_T1, lo12(-target->pltHeaderSize - 12))); + write32le(buf + 12, + insn(addi, R_T1, R_T1, lo12(-ctx.target->pltHeaderSize - 12))); write32le(buf + 16, insn(addi, R_T0, R_T2, lo12(offset))); write32le(buf + 20, insn(srli, R_T1, R_T1, config->is64 ? 1 : 2)); write32le(buf + 24, insn(ld, R_T0, R_T0, config->wordsize)); @@ -374,8 +375,8 @@ void LoongArch::writePlt(uint8_t *buf, const Symbol &sym, } RelType LoongArch::getDynRel(RelType type) const { - return type == target->symbolicRel ? type - : static_cast(R_LARCH_NONE); + return type == ctx.target->symbolicRel ? type + : static_cast(R_LARCH_NONE); } RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s, diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index 753ced698a05c0..15abbfda664331 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -409,8 +409,8 @@ static bool tryRelaxPPC64TocIndirection(const Relocation &rel, return false; // Add PPC64TocOffset that will be subtracted by PPC64::relocate(). 
- static_cast(*target).relaxGot(bufLoc, rel, - tocRelative + ppc64TocOffset); + static_cast(*ctx.target) + .relaxGot(bufLoc, rel, tocRelative + ppc64TocOffset); return true; } diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index dc9e541d5d8bef..2435864ce5a7f0 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -235,7 +235,7 @@ void RISCV::writePltHeader(uint8_t *buf) const { write32le(buf + 0, utype(AUIPC, X_T2, hi20(offset))); write32le(buf + 4, rtype(SUB, X_T1, X_T1, X_T3)); write32le(buf + 8, itype(load, X_T3, X_T2, lo12(offset))); - write32le(buf + 12, itype(ADDI, X_T1, X_T1, -target->pltHeaderSize - 12)); + write32le(buf + 12, itype(ADDI, X_T1, X_T1, -ctx.target->pltHeaderSize - 12)); write32le(buf + 16, itype(ADDI, X_T0, X_T2, lo12(offset))); write32le(buf + 20, itype(SRLI, X_T1, X_T1, config->is64 ? 1 : 2)); write32le(buf + 24, itype(load, X_T0, X_T0, config->wordsize)); @@ -256,8 +256,8 @@ void RISCV::writePlt(uint8_t *buf, const Symbol &sym, } RelType RISCV::getDynRel(RelType type) const { - return type == target->symbolicRel ? type - : static_cast(R_RISCV_NONE); + return type == ctx.target->symbolicRel ? type + : static_cast(R_RISCV_NONE); } RelExpr RISCV::getRelExpr(const RelType type, const Symbol &s, diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 5987edee0e93e7..fd40ec9805aa2b 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -47,6 +47,7 @@ class Symbol; class BitcodeCompiler; class OutputSection; class LinkerScript; +class TargetInfo; struct Partition; struct PhdrEntry; @@ -485,6 +486,7 @@ struct DuplicateSymbol { struct Ctx { LinkerDriver driver; LinkerScript *script; + TargetInfo *target; // These variables are initialized by Writer and should not be used before // Writer is initialized. 
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 308fd86c29ba12..37460a7a6c8eb4 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -94,6 +94,7 @@ void elf::errorOrWarn(const Twine &msg) { void Ctx::reset() { driver = LinkerDriver(); script = nullptr; + target = nullptr; bufferStart = nullptr; mainPart = nullptr; @@ -2065,13 +2066,13 @@ void LinkerDriver::inferMachineType() { // each target. static uint64_t getMaxPageSize(opt::InputArgList &args) { uint64_t val = args::getZOptionValue(args, OPT_z, "max-page-size", - target->defaultMaxPageSize); + ctx.target->defaultMaxPageSize); if (!isPowerOf2_64(val)) { error("max-page-size: value isn't a power of 2"); - return target->defaultMaxPageSize; + return ctx.target->defaultMaxPageSize; } if (config->nmagic || config->omagic) { - if (val != target->defaultMaxPageSize) + if (val != ctx.target->defaultMaxPageSize) warn("-z max-page-size set, but paging disabled by omagic or nmagic"); return 1; } @@ -2082,13 +2083,13 @@ static uint64_t getMaxPageSize(opt::InputArgList &args) { // each target. static uint64_t getCommonPageSize(opt::InputArgList &args) { uint64_t val = args::getZOptionValue(args, OPT_z, "common-page-size", - target->defaultCommonPageSize); + ctx.target->defaultCommonPageSize); if (!isPowerOf2_64(val)) { error("common-page-size: value isn't a power of 2"); - return target->defaultCommonPageSize; + return ctx.target->defaultCommonPageSize; } if (config->nmagic || config->omagic) { - if (val != target->defaultCommonPageSize) + if (val != ctx.target->defaultCommonPageSize) warn("-z common-page-size set, but paging disabled by omagic or nmagic"); return 1; } @@ -3106,9 +3107,9 @@ template void LinkerDriver::link(opt::InputArgList &args) { // The Target instance handles target-specific stuff, such as applying // relocations or writing a PLT section. It also contains target-dependent // values such as a default image base address. 
- target = getTarget(); + ctx.target = getTarget(); - config->eflags = target->calcEFlags(); + config->eflags = ctx.target->calcEFlags(); // maxPageSize (sometimes called abi page size) is the maximum page size that // the output can be run on. For example if the OS can use 4k or 64k page // sizes then maxPageSize must be 64k for the output to be useable on both. diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index fd3e947428388b..03b91804c81543 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -434,7 +434,7 @@ void InputSection::copyRelocations(uint8_t *buf) { template void InputSection::copyRelocations(uint8_t *buf, llvm::iterator_range rels) { - const TargetInfo &target = *elf::target; + const TargetInfo &target = *elf::ctx.target; InputSectionBase *sec = getRelocatedSection(); (void)sec->contentMaybeDecompress(); // uncompress if needed @@ -950,7 +950,7 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type, template void InputSection::relocateNonAlloc(uint8_t *buf, Relocs rels) { const unsigned bits = sizeof(typename ELFT::uint) * 8; - const TargetInfo &target = *elf::target; + const TargetInfo &target = *elf::ctx.target; const auto emachine = config->emachine; const bool isDebug = isDebugSection(*this); const bool isDebugLine = isDebug && name == ".debug_line"; @@ -1103,7 +1103,7 @@ void InputSectionBase::relocate(uint8_t *buf, uint8_t *bufEnd) { adjustSplitStackFunctionPrologues(buf, bufEnd); if (flags & SHF_ALLOC) { - target->relocateAlloc(*this, buf); + ctx.target->relocateAlloc(*this, buf); return; } @@ -1198,8 +1198,8 @@ void InputSectionBase::adjustSplitStackFunctionPrologues(uint8_t *buf, if (Defined *f = getEnclosingFunction(rel.offset)) { prologues.insert(f); - if (target->adjustPrologueForCrossSplitStack(buf + f->value, end, - f->stOther)) + if (ctx.target->adjustPrologueForCrossSplitStack(buf + f->value, end, + f->stOther)) continue; if (!getFile()->someNoSplitStack) 
error(lld::toString(this) + ": " + f->getName() + @@ -1208,7 +1208,7 @@ void InputSectionBase::adjustSplitStackFunctionPrologues(uint8_t *buf, } } - if (target->needsMoreStackNonSplit) + if (ctx.target->needsMoreStackNonSplit) switchMorestackCallsToMorestackNonSplit(prologues, morestackCalls); } diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index 9ddda99d90f02d..8bab26cd3b0f07 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -1487,7 +1487,7 @@ LinkerScript::assignAddresses() { dot = config->imageBase.value_or(0); } else { // Assign addresses to headers right now. - dot = target->getImageBase(); + dot = ctx.target->getImageBase(); ctx.out.elfHeader->addr = dot; ctx.out.programHeaders->addr = dot + ctx.out.elfHeader->size; dot += getHeaderSize(); diff --git a/lld/ELF/MarkLive.cpp b/lld/ELF/MarkLive.cpp index b2558a20ba1a78..56ff53fc89bddf 100644 --- a/lld/ELF/MarkLive.cpp +++ b/lld/ELF/MarkLive.cpp @@ -75,8 +75,8 @@ template class MarkLive { template static uint64_t getAddend(InputSectionBase &sec, const typename ELFT::Rel &rel) { - return target->getImplicitAddend(sec.content().begin() + rel.r_offset, - rel.getType(config->isMips64EL)); + return ctx.target->getImplicitAddend(sec.content().begin() + rel.r_offset, + rel.getType(config->isMips64EL)); } template diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp index c076f442558fac..cb17e107d6dae2 100644 --- a/lld/ELF/OutputSections.cpp +++ b/lld/ELF/OutputSections.cpp @@ -278,7 +278,7 @@ static void nopInstrFill(uint8_t *buf, size_t size) { unsigned i = 0; if (size == 0) return; - std::vector> nopFiller = *target->nopInstrs; + std::vector> nopFiller = *ctx.target->nopInstrs; unsigned num = size / nopFiller.back().size(); for (unsigned c = 0; c < num; ++c) { memcpy(buf + i, nopFiller.back().data(), nopFiller.back().size()); @@ -541,7 +541,7 @@ void OutputSection::writeTo(uint8_t *buf, parallel::TaskGroup &tg) { else end = buf + sections[i + 1]->outSecOff; if 
(isec->nopFiller) { - assert(target->nopInstrs); + assert(ctx.target->nopInstrs); nopInstrFill(start, end - start); } else fill(start, end - start, filler); @@ -857,7 +857,7 @@ std::array OutputSection::getFiller() { if (filler) return *filler; if (flags & SHF_EXECINSTR) - return target->trapInstr; + return ctx.target->trapInstr; return {0, 0, 0, 0}; } @@ -890,7 +890,7 @@ void OutputSection::checkDynRelAddends(const uint8_t *bufStart) { int64_t writtenAddend = relOsec->type == SHT_NOBITS ? 0 - : target->getImplicitAddend(relocTarget, rel.type); + : ctx.target->getImplicitAddend(relocTarget, rel.type); if (addend != writtenAddend) internalLinkerError( getErrorLocation(relocTarget), diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index fa94842f3636b3..e5f58f1a7dd129 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -399,7 +399,7 @@ template static void addCopyRelSymbol(SharedSymbol &ss) { for (SharedSymbol *sym : getSymbolsAt(ss)) replaceWithDefined(*sym, *sec, 0, sym->size); - ctx.mainPart->relaDyn->addSymbolReloc(target->copyRel, *sec, 0, ss); + ctx.mainPart->relaDyn->addSymbolReloc(ctx.target->copyRel, *sec, 0, ss); } // .eh_frame sections are mergeable input sections, so their input @@ -511,7 +511,7 @@ int64_t RelocationScanner::computeMipsAddend(const RelTy &rel, RelExpr expr, for (const RelTy *ri = &rel; ri != static_cast(end); ++ri) if (ri->getType(config->isMips64EL) == pairTy && ri->getSymbol(config->isMips64EL) == symIndex) - return target->getImplicitAddend(buf + ri->r_offset, pairTy); + return ctx.target->getImplicitAddend(buf + ri->r_offset, pairTy); warn("can't find matching " + toString(pairTy) + " relocation for " + toString(type)); @@ -876,8 +876,8 @@ static void addRelativeReloc(InputSectionBase &isec, uint64_t offsetInSec, if (sym.isTagged()) { std::lock_guard lock(relocMutex); - part.relaDyn->addRelativeReloc(target->relativeRel, isec, offsetInSec, sym, - addend, type, expr); + 
part.relaDyn->addRelativeReloc(ctx.target->relativeRel, isec, offsetInSec, + sym, addend, type, expr); // With MTE globals, we always want to derive the address tag by `ldg`-ing // the symbol. When we have a RELATIVE relocation though, we no longer have // a reference to the symbol. Because of this, when we have an addend that @@ -906,8 +906,8 @@ static void addRelativeReloc(InputSectionBase &isec, uint64_t offsetInSec, part.relrDyn->relocs.push_back({&isec, isec.relocs().size() - 1}); return; } - part.relaDyn->addRelativeReloc(target->relativeRel, isec, offsetInSec, - sym, addend, type, expr); + part.relaDyn->addRelativeReloc(ctx.target->relativeRel, isec, + offsetInSec, sym, addend, type, expr); } template @@ -927,7 +927,7 @@ void elf::addGotEntry(Symbol &sym) { // If preemptible, emit a GLOB_DAT relocation. if (sym.isPreemptible) { - ctx.mainPart->relaDyn->addReloc({target->gotRel, in.got.get(), off, + ctx.mainPart->relaDyn->addReloc({ctx.target->gotRel, in.got.get(), off, DynamicReloc::AgainstSymbol, sym, 0, R_ABS}); return; @@ -936,20 +936,20 @@ void elf::addGotEntry(Symbol &sym) { // Otherwise, the value is either a link-time constant or the load base // plus a constant. 
if (!config->isPic || isAbsolute(sym)) - in.got->addConstant({R_ABS, target->symbolicRel, off, 0, &sym}); + in.got->addConstant({R_ABS, ctx.target->symbolicRel, off, 0, &sym}); else - addRelativeReloc(*in.got, off, sym, 0, R_ABS, target->symbolicRel); + addRelativeReloc(*in.got, off, sym, 0, R_ABS, ctx.target->symbolicRel); } static void addTpOffsetGotEntry(Symbol &sym) { in.got->addEntry(sym); uint64_t off = sym.getGotOffset(); if (!sym.isPreemptible && !config->shared) { - in.got->addConstant({R_TPREL, target->symbolicRel, off, 0, &sym}); + in.got->addConstant({R_TPREL, ctx.target->symbolicRel, off, 0, &sym}); return; } ctx.mainPart->relaDyn->addAddendOnlyRelocIfNonPreemptible( - target->tlsGotRel, *in.got, off, sym, target->symbolicRel); + ctx.target->tlsGotRel, *in.got, off, sym, ctx.target->symbolicRel); } // Return true if we can define a symbol in the executable that @@ -997,7 +997,7 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type, // These never do, except if the entire file is position dependent or if // only the low bits are used. if (e == R_GOT || e == R_PLT) - return target->usesOnlyLowPageBits(type) || !config->isPic; + return ctx.target->usesOnlyLowPageBits(type) || !config->isPic; // R_AARCH64_AUTH_ABS64 requires a dynamic relocation. 
if (sym.isPreemptible || e == R_AARCH64_AUTH) @@ -1018,7 +1018,7 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type, if (!absVal && relE) return true; if (!absVal && !relE) - return target->usesOnlyLowPageBits(type); + return ctx.target->usesOnlyLowPageBits(type); assert(absVal && relE); @@ -1072,8 +1072,8 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, type == R_HEX_GD_PLT_B32_PCREL_X))) expr = fromPlt(expr); } else if (!isAbsoluteValue(sym)) { - expr = - target->adjustGotPcExpr(type, addend, sec->content().data() + offset); + expr = ctx.target->adjustGotPcExpr(type, addend, + sec->content().data() + offset); // If the target adjusted the expression to R_RELAX_GOT_PC, we may end up // needing the GOT if we can't relax everything. if (expr == R_RELAX_GOT_PC) @@ -1142,15 +1142,15 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, !(config->zText || (isa(sec) && config->emachine != EM_MIPS)); if (canWrite) { - RelType rel = target->getDynRel(type); + RelType rel = ctx.target->getDynRel(type); if (oneof(expr) || - (rel == target->symbolicRel && !sym.isPreemptible)) { + (rel == ctx.target->symbolicRel && !sym.isPreemptible)) { addRelativeReloc(*sec, offset, sym, addend, expr, type); return; } if (rel != 0) { - if (config->emachine == EM_MIPS && rel == target->symbolicRel) - rel = target->relativeRel; + if (config->emachine == EM_MIPS && rel == ctx.target->symbolicRel) + rel = ctx.target->relativeRel; std::lock_guard lock(relocMutex); Partition &part = sec->getPartition(); if (config->emachine == EM_AARCH64 && type == R_AARCH64_AUTH_ABS64) { @@ -1363,9 +1363,9 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, if (oneof(expr)) { // Local-Dynamic relocs can be optimized to Local-Exec. 
if (execOptimize) { - c.addReloc({target->adjustTlsExpr(type, R_RELAX_TLS_LD_TO_LE), type, + c.addReloc({ctx.target->adjustTlsExpr(type, R_RELAX_TLS_LD_TO_LE), type, offset, addend, &sym}); - return target->getTlsGdRelaxSkip(type); + return ctx.target->getTlsGdRelaxSkip(type); } if (expr == R_TLSLD_HINT) return 1; @@ -1377,7 +1377,7 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, // Local-Dynamic relocs can be optimized to Local-Exec. if (expr == R_DTPREL) { if (execOptimize) - expr = target->adjustTlsExpr(type, R_RELAX_TLS_LD_TO_LE); + expr = ctx.target->adjustTlsExpr(type, R_RELAX_TLS_LD_TO_LE); c.addReloc({expr, type, offset, addend, &sym}); return 1; } @@ -1408,13 +1408,13 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym, // the categorization in RISCV::relocateAlloc. if (sym.isPreemptible) { sym.setFlags(NEEDS_TLSGD_TO_IE); - c.addReloc({target->adjustTlsExpr(type, R_RELAX_TLS_GD_TO_IE), type, + c.addReloc({ctx.target->adjustTlsExpr(type, R_RELAX_TLS_GD_TO_IE), type, offset, addend, &sym}); } else { - c.addReloc({target->adjustTlsExpr(type, R_RELAX_TLS_GD_TO_LE), type, + c.addReloc({ctx.target->adjustTlsExpr(type, R_RELAX_TLS_GD_TO_LE), type, offset, addend, &sym}); } - return target->getTlsGdRelaxSkip(type); + return ctx.target->getTlsGdRelaxSkip(type); } if (oneofisPic && !target->usesOnlyLowPageBits(type)) + if (expr == R_GOT && config->isPic && + !ctx.target->usesOnlyLowPageBits(type)) addRelativeReloc(c, offset, sym, addend, expr, type); else c.addReloc({expr, type, offset, addend, &sym}); @@ -1461,10 +1462,11 @@ void RelocationScanner::scanOne(typename Relocs::const_iterator &i) { if (offset == uint64_t(-1)) return; - RelExpr expr = target->getRelExpr(type, sym, sec->content().data() + offset); + RelExpr expr = + ctx.target->getRelExpr(type, sym, sec->content().data() + offset); int64_t addend = RelTy::HasAddend ? 
getAddend(rel) - : target->getImplicitAddend( + : ctx.target->getImplicitAddend( sec->content().data() + rel.r_offset, type); if (LLVM_UNLIKELY(config->emachine == EM_MIPS)) addend += computeMipsAddend(rel, expr, sym.isLocal()); @@ -1731,7 +1733,7 @@ static bool handleNonPreemptibleIfunc(Symbol &sym, uint16_t flags) { directSym->allocateAux(); auto &dyn = config->androidPackDynRelocs ? *in.relaPlt : *ctx.mainPart->relaDyn; - addPltEntry(*in.iplt, *in.igotPlt, dyn, target->iRelativeRel, *directSym); + addPltEntry(*in.iplt, *in.igotPlt, dyn, ctx.target->iRelativeRel, *directSym); sym.allocateAux(); ctx.symAux.back().pltIdx = ctx.symAux[directSym->auxIdx].pltIdx; @@ -1739,7 +1741,7 @@ static bool handleNonPreemptibleIfunc(Symbol &sym, uint16_t flags) { // Change the value to the IPLT and redirect all references to it. auto &d = cast(sym); d.section = in.iplt.get(); - d.value = d.getPltIdx() * target->ipltEntrySize; + d.value = d.getPltIdx() * ctx.target->ipltEntrySize; d.size = 0; // It's important to set the symbol type here so that dynamic loaders // don't try to call the PLT as if it were an ifunc resolver. 
@@ -1770,7 +1772,7 @@ void elf::postScanRelocations() { if (flags & NEEDS_GOT) addGotEntry(sym); if (flags & NEEDS_PLT) - addPltEntry(*in.plt, *in.gotPlt, *in.relaPlt, target->pltRel, sym); + addPltEntry(*in.plt, *in.gotPlt, *in.relaPlt, ctx.target->pltRel, sym); if (flags & NEEDS_COPY) { if (sym.isObject()) { invokeELFT(addCopyRelSymbol, cast(sym)); @@ -1781,8 +1783,8 @@ void elf::postScanRelocations() { assert(sym.isFunc() && sym.hasFlag(NEEDS_PLT)); if (!sym.isDefined()) { replaceWithDefined(sym, *in.plt, - target->pltHeaderSize + - target->pltEntrySize * sym.getPltIdx(), + ctx.target->pltHeaderSize + + ctx.target->pltEntrySize * sym.getPltIdx(), 0); sym.setFlags(NEEDS_COPY); if (config->emachine == EM_PPC) { @@ -1803,37 +1805,37 @@ void elf::postScanRelocations() { if (flags & NEEDS_TLSDESC) { got->addTlsDescEntry(sym); ctx.mainPart->relaDyn->addAddendOnlyRelocIfNonPreemptible( - target->tlsDescRel, *got, got->getTlsDescOffset(sym), sym, - target->tlsDescRel); + ctx.target->tlsDescRel, *got, got->getTlsDescOffset(sym), sym, + ctx.target->tlsDescRel); } if (flags & NEEDS_TLSGD) { got->addDynTlsEntry(sym); uint64_t off = got->getGlobalDynOffset(sym); if (isLocalInExecutable) // Write one to the GOT slot. - got->addConstant({R_ADDEND, target->symbolicRel, off, 1, &sym}); + got->addConstant({R_ADDEND, ctx.target->symbolicRel, off, 1, &sym}); else - ctx.mainPart->relaDyn->addSymbolReloc(target->tlsModuleIndexRel, *got, - off, sym); + ctx.mainPart->relaDyn->addSymbolReloc(ctx.target->tlsModuleIndexRel, + *got, off, sym); // If the symbol is preemptible we need the dynamic linker to write // the offset too. 
uint64_t offsetOff = off + config->wordsize; if (sym.isPreemptible) - ctx.mainPart->relaDyn->addSymbolReloc(target->tlsOffsetRel, *got, + ctx.mainPart->relaDyn->addSymbolReloc(ctx.target->tlsOffsetRel, *got, offsetOff, sym); else - got->addConstant({R_ABS, target->tlsOffsetRel, offsetOff, 0, &sym}); + got->addConstant({R_ABS, ctx.target->tlsOffsetRel, offsetOff, 0, &sym}); } if (flags & NEEDS_TLSGD_TO_IE) { got->addEntry(sym); - ctx.mainPart->relaDyn->addSymbolReloc(target->tlsGotRel, *got, + ctx.mainPart->relaDyn->addSymbolReloc(ctx.target->tlsGotRel, *got, sym.getGotOffset(), sym); } if (flags & NEEDS_GOT_DTPREL) { got->addEntry(sym); got->addConstant( - {R_ABS, target->tlsOffsetRel, sym.getGotOffset(), 0, &sym}); + {R_ABS, ctx.target->tlsOffsetRel, sym.getGotOffset(), 0, &sym}); } if ((flags & NEEDS_TLSIE) && !(flags & NEEDS_TLSGD_TO_IE)) @@ -1845,10 +1847,10 @@ void elf::postScanRelocations() { static Undefined dummy(ctx.internalFile, "", STB_LOCAL, 0, 0); if (config->shared) ctx.mainPart->relaDyn->addReloc( - {target->tlsModuleIndexRel, got, got->getTlsIndexOff()}); + {ctx.target->tlsModuleIndexRel, got, got->getTlsIndexOff()}); else - got->addConstant( - {R_ADDEND, target->symbolicRel, got->getTlsIndexOff(), 1, &dummy}); + got->addConstant({R_ADDEND, ctx.target->symbolicRel, + got->getTlsIndexOff(), 1, &dummy}); } assert(ctx.symAux.size() == 1); @@ -2054,8 +2056,8 @@ ThunkSection *ThunkCreator::getISDThunkSec(OutputSection *os, ThunkSection *ts = tp.first; uint64_t tsBase = os->addr + ts->outSecOff - pcBias; uint64_t tsLimit = tsBase + ts->getSize(); - if (target->inBranchRange(rel.type, src, - (src > tsLimit) ? tsBase : tsLimit)) + if (ctx.target->inBranchRange(rel.type, src, + (src > tsLimit) ? tsBase : tsLimit)) return ts; } @@ -2065,11 +2067,11 @@ ThunkSection *ThunkCreator::getISDThunkSec(OutputSection *os, // possible. Error if InputSection is so large we cannot place ThunkSection // anywhere in Range. 
uint64_t thunkSecOff = isec->outSecOff; - if (!target->inBranchRange(rel.type, src, - os->addr + thunkSecOff + rel.addend)) { + if (!ctx.target->inBranchRange(rel.type, src, + os->addr + thunkSecOff + rel.addend)) { thunkSecOff = isec->outSecOff + isec->getSize(); - if (!target->inBranchRange(rel.type, src, - os->addr + thunkSecOff + rel.addend)) + if (!ctx.target->inBranchRange(rel.type, src, + os->addr + thunkSecOff + rel.addend)) fatal("InputSection too large for range extension thunk " + isec->getObjMsg(src - (os->addr + isec->outSecOff))); } @@ -2123,8 +2125,7 @@ ThunkSection *ThunkCreator::getISThunkSec(InputSection *isec) { // allow for the creation of a short thunk. void ThunkCreator::createInitialThunkSections( ArrayRef outputSections) { - uint32_t thunkSectionSpacing = target->getThunkSectionSpacing(); - + uint32_t thunkSectionSpacing = ctx.target->getThunkSectionSpacing(); forEachInputSectionDescription( outputSections, [&](OutputSection *os, InputSectionDescription *isd) { if (isd->sections.empty()) @@ -2188,7 +2189,7 @@ ThunkSection *ThunkCreator::addThunkSection(OutputSection *os, uint64_t isdSize = isd->sections.back()->outSecOff + isd->sections.back()->getSize() - isd->sections.front()->outSecOff; - if (os->size > target->getThunkSectionSpacing() && isdSize > 4096) + if (os->size > ctx.target->getThunkSectionSpacing() && isdSize > 4096) ts->roundUpSizeForErrata = true; } isd->thunkSections.push_back({ts, pass}); @@ -2230,8 +2231,8 @@ std::pair ThunkCreator::getThunk(InputSection *isec, for (Thunk *t : *thunkVec) if (isThunkSectionCompatible(isec, t->getThunkTargetSym()->section) && t->isCompatibleWith(*isec, rel) && - target->inBranchRange(rel.type, src, - t->getThunkTargetSym()->getVA(-pcBias))) + ctx.target->inBranchRange(rel.type, src, + t->getThunkTargetSym()->getVA(-pcBias))) return std::make_pair(t, false); // No existing compatible Thunk in range, create a new one @@ -2246,7 +2247,7 @@ std::pair ThunkCreator::getThunk(InputSection *isec, // 
relocation back to its original non-Thunk target. bool ThunkCreator::normalizeExistingThunk(Relocation &rel, uint64_t src) { if (Thunk *t = thunks.lookup(rel.sym)) { - if (target->inBranchRange(rel.type, src, rel.sym->getVA(rel.addend))) + if (ctx.target->inBranchRange(rel.type, src, rel.sym->getVA(rel.addend))) return true; rel.sym = &t->destination; rel.addend = t->addend; @@ -2286,7 +2287,7 @@ bool ThunkCreator::createThunks(uint32_t pass, this->pass = pass; bool addressesChanged = false; - if (pass == 0 && target->getThunkSectionSpacing()) + if (pass == 0 && ctx.target->getThunkSectionSpacing()) createInitialThunkSections(outputSections); // Create all the Thunks and insert them into synthetic ThunkSections. The @@ -2306,8 +2307,8 @@ bool ThunkCreator::createThunks(uint32_t pass, if (pass > 0 && normalizeExistingThunk(rel, src)) continue; - if (!target->needsThunk(rel.expr, rel.type, isec->file, src, - *rel.sym, rel.addend)) + if (!ctx.target->needsThunk(rel.expr, rel.type, isec->file, src, + *rel.sym, rel.addend)) continue; Thunk *t; @@ -2378,8 +2379,8 @@ void elf::hexagonTLSSymbolUpdate(ArrayRef outputSections) { if (rel.sym->type == llvm::ELF::STT_TLS && rel.expr == R_PLT_PC) { if (needEntry) { sym->allocateAux(); - addPltEntry(*in.plt, *in.gotPlt, *in.relaPlt, target->pltRel, - *sym); + addPltEntry(*in.plt, *in.gotPlt, *in.relaPlt, + ctx.target->pltRel, *sym); needEntry = false; } rel.sym = sym; diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index 08773bfb6ffe07..819036a1ab1820 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -1326,7 +1326,7 @@ Expr ScriptParser::readExpr1(Expr lhs, int minPrec) { Expr ScriptParser::getPageSize() { std::string location = getCurrentLocation(); return [=]() -> uint64_t { - if (target) + if (ctx.target) return config->commonPageSize; error(location + ": unable to calculate page size"); return 4096; // Return a dummy value. 
diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp index 13fc6dc0dd572a..b08c679ab36850 100644 --- a/lld/ELF/Symbols.cpp +++ b/lld/ELF/Symbols.cpp @@ -152,7 +152,7 @@ uint64_t Symbol::getGotVA() const { } uint64_t Symbol::getGotOffset() const { - return getGotIdx() * target->gotEntrySize; + return getGotIdx() * ctx.target->gotEntrySize; } uint64_t Symbol::getGotPltVA() const { @@ -163,15 +163,16 @@ uint64_t Symbol::getGotPltVA() const { uint64_t Symbol::getGotPltOffset() const { if (isInIplt) - return getPltIdx() * target->gotEntrySize; - return (getPltIdx() + target->gotPltHeaderEntriesNum) * target->gotEntrySize; + return getPltIdx() * ctx.target->gotEntrySize; + return (getPltIdx() + ctx.target->gotPltHeaderEntriesNum) * + ctx.target->gotEntrySize; } uint64_t Symbol::getPltVA() const { - uint64_t outVA = isInIplt - ? in.iplt->getVA() + getPltIdx() * target->ipltEntrySize - : in.plt->getVA() + in.plt->headerSize + - getPltIdx() * target->pltEntrySize; + uint64_t outVA = + isInIplt ? in.iplt->getVA() + getPltIdx() * ctx.target->ipltEntrySize + : in.plt->getVA() + in.plt->headerSize + + getPltIdx() * ctx.target->pltEntrySize; // While linking microMIPS code PLT code are always microMIPS // code. Set the less-significant bit to track that fact. diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 4c2b6db08b99a2..df82e9ed0652ec 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -641,7 +641,7 @@ void EhFrameSection::writeTo(uint8_t *buf) { // in the output buffer, but relocateAlloc() still works because // getOffset() takes care of discontiguous section pieces. 
for (EhInputSection *s : sections) - target->relocateAlloc(*s, buf); + ctx.target->relocateAlloc(*s, buf); if (getPartition().ehFrameHdr && getPartition().ehFrameHdr->getParent()) getPartition().ehFrameHdr->write(); @@ -649,8 +649,8 @@ void EhFrameSection::writeTo(uint8_t *buf) { GotSection::GotSection() : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, - target->gotEntrySize, ".got") { - numEntries = target->gotHeaderEntriesNum; + ctx.target->gotEntrySize, ".got") { + numEntries = ctx.target->gotHeaderEntriesNum; } void GotSection::addConstant(const Relocation &r) { relocations.push_back(r); } @@ -702,7 +702,8 @@ uint64_t GotSection::getGlobalDynOffset(const Symbol &b) const { void GotSection::finalizeContents() { if (config->emachine == EM_PPC64 && - numEntries <= target->gotHeaderEntriesNum && !ctx.sym.globalOffsetTable) + numEntries <= ctx.target->gotHeaderEntriesNum && + !ctx.sym.globalOffsetTable) size = 0; else size = numEntries * config->wordsize; @@ -711,15 +712,15 @@ void GotSection::finalizeContents() { bool GotSection::isNeeded() const { // Needed if the GOT symbol is used or the number of entries is more than just // the header. A GOT with just the header may not be needed. - return hasGotOffRel || numEntries > target->gotHeaderEntriesNum; + return hasGotOffRel || numEntries > ctx.target->gotHeaderEntriesNum; } void GotSection::writeTo(uint8_t *buf) { // On PPC64 .got may be needed but empty. Skip the write. if (size == 0) return; - target->writeGotHeader(buf); - target->relocateAlloc(*this, buf); + ctx.target->writeGotHeader(buf); + ctx.target->relocateAlloc(*this, buf); } static uint64_t getMipsPageAddr(uint64_t addr) { @@ -1018,7 +1019,7 @@ void MipsGotSection::build() { // be allocated before us in the static TLS block. 
if (s->isPreemptible || config->shared) ctx.mainPart->relaDyn->addReloc( - {target->tlsGotRel, this, offset, + {ctx.target->tlsGotRel, this, offset, DynamicReloc::AgainstSymbolWithTargetVA, *s, 0, R_ABS}); } for (std::pair &p : got.dynTlsSymbols) { @@ -1028,7 +1029,7 @@ void MipsGotSection::build() { if (!config->shared) continue; ctx.mainPart->relaDyn->addReloc( - {target->tlsModuleIndexRel, this, offset}); + {ctx.target->tlsModuleIndexRel, this, offset}); } else { // When building a shared library we still need a dynamic relocation // for the module index. Therefore only checking for @@ -1036,14 +1037,14 @@ void MipsGotSection::build() { // thread-locals that have been marked as local through a linker script) if (!s->isPreemptible && !config->shared) continue; - ctx.mainPart->relaDyn->addSymbolReloc(target->tlsModuleIndexRel, *this, - offset, *s); + ctx.mainPart->relaDyn->addSymbolReloc(ctx.target->tlsModuleIndexRel, + *this, offset, *s); // However, we can skip writing the TLS offset reloc for non-preemptible // symbols since it is known even in shared libraries if (!s->isPreemptible) continue; offset += config->wordsize; - ctx.mainPart->relaDyn->addSymbolReloc(target->tlsOffsetRel, *this, + ctx.mainPart->relaDyn->addSymbolReloc(ctx.target->tlsOffsetRel, *this, offset, *s); } } @@ -1056,8 +1057,8 @@ void MipsGotSection::build() { // Dynamic relocations for "global" entries. 
for (const std::pair &p : got.global) { uint64_t offset = p.second * config->wordsize; - ctx.mainPart->relaDyn->addSymbolReloc(target->relativeRel, *this, offset, - *p.first); + ctx.mainPart->relaDyn->addSymbolReloc(ctx.target->relativeRel, *this, + offset, *p.first); } if (!config->isPic) continue; @@ -1067,13 +1068,13 @@ void MipsGotSection::build() { size_t pageCount = l.second.count; for (size_t pi = 0; pi < pageCount; ++pi) { uint64_t offset = (l.second.firstIndex + pi) * config->wordsize; - ctx.mainPart->relaDyn->addReloc({target->relativeRel, this, offset, + ctx.mainPart->relaDyn->addReloc({ctx.target->relativeRel, this, offset, l.first, int64_t(pi * 0x10000)}); } } for (const std::pair &p : got.local16) { uint64_t offset = p.second * config->wordsize; - ctx.mainPart->relaDyn->addReloc({target->relativeRel, this, offset, + ctx.mainPart->relaDyn->addReloc({ctx.target->relativeRel, this, offset, DynamicReloc::AddendOnlyWithTargetVA, *p.first.first, p.first.second, R_ABS}); } @@ -1180,16 +1181,16 @@ void GotPltSection::addEntry(Symbol &sym) { } size_t GotPltSection::getSize() const { - return (target->gotPltHeaderEntriesNum + entries.size()) * - target->gotEntrySize; + return (ctx.target->gotPltHeaderEntriesNum + entries.size()) * + ctx.target->gotEntrySize; } void GotPltSection::writeTo(uint8_t *buf) { - target->writeGotPltHeader(buf); - buf += target->gotPltHeaderEntriesNum * target->gotEntrySize; + ctx.target->writeGotPltHeader(buf); + buf += ctx.target->gotPltHeaderEntriesNum * ctx.target->gotEntrySize; for (const Symbol *b : entries) { - target->writeGotPlt(buf, *b); - buf += target->gotEntrySize; + ctx.target->writeGotPlt(buf, *b); + buf += ctx.target->gotEntrySize; } } @@ -1217,7 +1218,7 @@ static StringRef getIgotPltName() { IgotPltSection::IgotPltSection() : SyntheticSection(SHF_ALLOC | SHF_WRITE, config->emachine == EM_PPC64 ? 
SHT_NOBITS : SHT_PROGBITS, - target->gotEntrySize, getIgotPltName()) {} + ctx.target->gotEntrySize, getIgotPltName()) {} void IgotPltSection::addEntry(Symbol &sym) { assert(ctx.symAux.back().pltIdx == entries.size()); @@ -1225,13 +1226,13 @@ void IgotPltSection::addEntry(Symbol &sym) { } size_t IgotPltSection::getSize() const { - return entries.size() * target->gotEntrySize; + return entries.size() * ctx.target->gotEntrySize; } void IgotPltSection::writeTo(uint8_t *buf) { for (const Symbol *b : entries) { - target->writeIgotPlt(buf, *b); - buf += target->gotEntrySize; + ctx.target->writeIgotPlt(buf, *b); + buf += ctx.target->gotEntrySize; } } @@ -1444,15 +1445,15 @@ DynamicSection::computeContents() { break; case EM_AARCH64: if (llvm::find_if(in.relaPlt->relocs, [](const DynamicReloc &r) { - return r.type == target->pltRel && - r.sym->stOther & STO_AARCH64_VARIANT_PCS; + return r.type == ctx.target->pltRel && + r.sym->stOther & STO_AARCH64_VARIANT_PCS; }) != in.relaPlt->relocs.end()) addInt(DT_AARCH64_VARIANT_PCS, 0); addInSec(DT_PLTGOT, *in.gotPlt); break; case EM_RISCV: if (llvm::any_of(in.relaPlt->relocs, [](const DynamicReloc &r) { - return r.type == target->pltRel && + return r.type == ctx.target->pltRel && (r.sym->stOther & STO_RISCV_VARIANT_CC); })) addInt(DT_RISCV_VARIANT_CC, 0); @@ -1534,7 +1535,7 @@ DynamicSection::computeContents() { if (config->emachine == EM_MIPS) { addInt(DT_MIPS_RLD_VERSION, 1); addInt(DT_MIPS_FLAGS, RHF_NOTPOT); - addInt(DT_MIPS_BASE_ADDRESS, target->getImageBase()); + addInt(DT_MIPS_BASE_ADDRESS, ctx.target->getImageBase()); addInt(DT_MIPS_SYMTABNO, part.dynSymTab->getNumSymbols()); addInt(DT_MIPS_LOCAL_GOTNO, in.mipsGot->getLocalEntriesNum()); @@ -1562,7 +1563,7 @@ DynamicSection::computeContents() { if (config->emachine == EM_PPC64 && in.plt->isNeeded()) { // The Glink tag points to 32 bytes before the first lazy symbol resolution // stub, which starts directly after the header. 
- addInt(DT_PPC64_GLINK, in.plt->getVA() + target->pltHeaderSize - 32); + addInt(DT_PPC64_GLINK, in.plt->getVA() + ctx.target->pltHeaderSize - 32); } if (config->emachine == EM_PPC64) @@ -1618,7 +1619,8 @@ uint32_t DynamicReloc::getSymIndex(SymbolTableBaseSection *symTab) const { return 0; size_t index = symTab->getSymbolIndex(*sym); - assert((index != 0 || (type != target->gotRel && type != target->pltRel) || + assert((index != 0 || + (type != ctx.target->gotRel && type != ctx.target->pltRel) || !ctx.mainPart->dynSymTab->getParent()) && "GOT or PLT relocation must refer to symbol in dynamic symbol table"); return index; @@ -1637,7 +1639,7 @@ void RelocationBaseSection::addSymbolReloc( RelType dynType, InputSectionBase &isec, uint64_t offsetInSec, Symbol &sym, int64_t addend, std::optional addendRelType) { addReloc(DynamicReloc::AgainstSymbol, dynType, isec, offsetInSec, sym, addend, - R_ADDEND, addendRelType ? *addendRelType : target->noneRel); + R_ADDEND, addendRelType ? *addendRelType : ctx.target->noneRel); } void RelocationBaseSection::addAddendOnlyRelocIfNonPreemptible( @@ -1665,7 +1667,7 @@ void RelocationBaseSection::mergeRels() { void RelocationBaseSection::partitionRels() { if (!combreloc) return; - const RelType relativeRel = target->relativeRel; + const RelType relativeRel = ctx.target->relativeRel; numRelativeRelocs = std::stable_partition(relocs.begin(), relocs.end(), [=](auto &r) { return r.type == relativeRel; }) - @@ -1703,7 +1705,7 @@ void RelocationBaseSection::computeRels() { auto irelative = std::stable_partition( relocs.begin() + numRelativeRelocs, relocs.end(), - [t = target->iRelativeRel](auto &r) { return r.type != t; }); + [t = ctx.target->iRelativeRel](auto &r) { return r.type != t; }); // Sort by (!IsRelative,SymIndex,r_offset). DT_REL[A]COUNT requires us to // place R_*_RELATIVE first. 
SymIndex is to improve locality, while r_offset @@ -1839,7 +1841,7 @@ bool AndroidPackedRelocationSection::updateAllocSize() { rel.type, false); r.r_addend = config->isRela ? rel.computeAddend() : 0; - if (r.getType(config->isMips64EL) == target->relativeRel) + if (r.getType(config->isMips64EL) == ctx.target->relativeRel) relatives.push_back(r); else nonRelatives.push_back(r); @@ -1937,7 +1939,7 @@ bool AndroidPackedRelocationSection::updateAllocSize() { add(RELOCATION_GROUPED_BY_OFFSET_DELTA_FLAG | RELOCATION_GROUPED_BY_INFO_FLAG | hasAddendIfRela); add(g[0].r_offset - offset); - add(target->relativeRel); + add(ctx.target->relativeRel); if (config->isRela) { add(g[0].r_addend - addend); addend = g[0].r_addend; @@ -1948,7 +1950,7 @@ bool AndroidPackedRelocationSection::updateAllocSize() { add(RELOCATION_GROUPED_BY_OFFSET_DELTA_FLAG | RELOCATION_GROUPED_BY_INFO_FLAG | hasAddendIfRela); add(config->wordsize); - add(target->relativeRel); + add(ctx.target->relativeRel); if (config->isRela) { for (const auto &i : llvm::drop_begin(g)) { add(i.r_addend - addend); @@ -1963,7 +1965,7 @@ bool AndroidPackedRelocationSection::updateAllocSize() { if (!ungroupedRelatives.empty()) { add(ungroupedRelatives.size()); add(RELOCATION_GROUPED_BY_INFO_FLAG | hasAddendIfRela); - add(target->relativeRel); + add(ctx.target->relativeRel); for (Elf_Rela &r : ungroupedRelatives) { add(r.r_offset - offset); offset = r.r_offset; @@ -2538,7 +2540,7 @@ void HashTableSection::writeTo(uint8_t *buf) { PltSection::PltSection() : SyntheticSection(SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, ".plt"), - headerSize(target->pltHeaderSize) { + headerSize(ctx.target->pltHeaderSize) { // On PowerPC, this section contains lazy symbol resolvers. if (config->emachine == EM_PPC64) { name = ".glink"; @@ -2560,12 +2562,12 @@ PltSection::PltSection() void PltSection::writeTo(uint8_t *buf) { // At beginning of PLT, we have code to call the dynamic // linker to resolve dynsyms at runtime. Write such code. 
- target->writePltHeader(buf); + ctx.target->writePltHeader(buf); size_t off = headerSize; for (const Symbol *sym : entries) { - target->writePlt(buf + off, *sym, getVA() + off); - off += target->pltEntrySize; + ctx.target->writePlt(buf + off, *sym, getVA() + off); + off += ctx.target->pltEntrySize; } } @@ -2576,7 +2578,7 @@ void PltSection::addEntry(Symbol &sym) { } size_t PltSection::getSize() const { - return headerSize + entries.size() * target->pltEntrySize; + return headerSize + entries.size() * ctx.target->pltEntrySize; } bool PltSection::isNeeded() const { @@ -2587,12 +2589,12 @@ bool PltSection::isNeeded() const { // Used by ARM to add mapping symbols in the PLT section, which aid // disassembly. void PltSection::addSymbols() { - target->addPltHeaderSymbols(*this); + ctx.target->addPltHeaderSymbols(*this); size_t off = headerSize; for (size_t i = 0; i < entries.size(); ++i) { - target->addPltSymbols(*this, off); - off += target->pltEntrySize; + ctx.target->addPltSymbols(*this, off); + off += ctx.target->pltEntrySize; } } @@ -2607,13 +2609,13 @@ IpltSection::IpltSection() void IpltSection::writeTo(uint8_t *buf) { uint32_t off = 0; for (const Symbol *sym : entries) { - target->writeIplt(buf + off, *sym, getVA() + off); - off += target->ipltEntrySize; + ctx.target->writeIplt(buf + off, *sym, getVA() + off); + off += ctx.target->ipltEntrySize; } } size_t IpltSection::getSize() const { - return entries.size() * target->ipltEntrySize; + return entries.size() * ctx.target->ipltEntrySize; } void IpltSection::addEntry(Symbol &sym) { @@ -2626,8 +2628,8 @@ void IpltSection::addEntry(Symbol &sym) { void IpltSection::addSymbols() { size_t off = 0; for (size_t i = 0, e = entries.size(); i != e; ++i) { - target->addPltSymbols(*this, off); - off += target->pltEntrySize; + ctx.target->addPltSymbols(*this, off); + off += ctx.target->pltEntrySize; } } @@ -2641,7 +2643,7 @@ void PPC32GlinkSection::writeTo(uint8_t *buf) { } size_t PPC32GlinkSection::getSize() const { - return 
headerSize + entries.size() * target->pltEntrySize + footerSize; + return headerSize + entries.size() * ctx.target->pltEntrySize + footerSize; } // This is an x86-only extra PLT section and used only when a security @@ -2706,12 +2708,12 @@ IBTPltSection::IBTPltSection() : SyntheticSection(SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, ".plt") {} void IBTPltSection::writeTo(uint8_t *buf) { - target->writeIBTPlt(buf, in.plt->getNumEntries()); + ctx.target->writeIBTPlt(buf, in.plt->getNumEntries()); } size_t IBTPltSection::getSize() const { // 16 is the header size of .plt. - return 16 + in.plt->getNumEntries() * target->pltEntrySize; + return 16 + in.plt->getNumEntries() * ctx.target->pltEntrySize; } bool IBTPltSection::isNeeded() const { return in.plt->getNumEntries() > 0; } @@ -4185,7 +4187,7 @@ void ARMExidxSyntheticSection::writeTo(uint8_t *buf) { // Recalculate outSecOff as finalizeAddressDependentContent() // may have altered syntheticSection outSecOff. d->outSecOff = offset + outSecOff; - target->relocateAlloc(*d, buf + offset); + ctx.target->relocateAlloc(*d, buf + offset); offset += d->getSize(); } else { // A Linker generated CANTUNWIND section. @@ -4193,7 +4195,7 @@ void ARMExidxSyntheticSection::writeTo(uint8_t *buf) { write32(buf + offset + 4, 0x1); uint64_t s = isec->getVA(); uint64_t p = getVA() + offset; - target->relocateNoSym(buf + offset, R_ARM_PREL31, s - p); + ctx.target->relocateNoSym(buf + offset, R_ARM_PREL31, s - p); offset += 8; } } @@ -4202,7 +4204,7 @@ void ARMExidxSyntheticSection::writeTo(uint8_t *buf) { write32(buf + offset + 4, 0x1); uint64_t s = sentinel->getVA(sentinel->getSize()); uint64_t p = getVA() + offset; - target->relocateNoSym(buf + offset, R_ARM_PREL31, s - p); + ctx.target->relocateNoSym(buf + offset, R_ARM_PREL31, s - p); assert(size == offset + 8); } @@ -4866,7 +4868,7 @@ template void elf::createSyntheticSections() { // _GLOBAL_OFFSET_TABLE_ is defined relative to either .got.plt or .got. 
Treat // it as a relocation and ensure the referenced section is created. if (ctx.sym.globalOffsetTable && config->emachine != EM_MIPS) { - if (target->gotBaseSymInGotPlt) + if (ctx.target->gotBaseSymInGotPlt) in.gotPlt->hasGotPltOffRel = true; else in.got->hasGotOffRel = true; diff --git a/lld/ELF/Target.cpp b/lld/ELF/Target.cpp index 584e9270469d00..a1f2229ad131fe 100644 --- a/lld/ELF/Target.cpp +++ b/lld/ELF/Target.cpp @@ -38,8 +38,6 @@ using namespace llvm::ELF; using namespace lld; using namespace lld::elf; -const TargetInfo *elf::target; - std::string lld::toString(RelType type) { StringRef s = getELFRelocationTypeName(elf::config->emachine, type); if (s == "Unknown") diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h index 0cefa318135662..9894fb32c503c3 100644 --- a/lld/ELF/Target.h +++ b/lld/ELF/Target.h @@ -241,7 +241,6 @@ void convertArmInstructionstoBE8(InputSection *sec, uint8_t *buf); void createTaggedSymbols(const SmallVector &files); void initSymbolAnchors(); -LLVM_LIBRARY_VISIBILITY extern const TargetInfo *target; TargetInfo *getTarget(); template bool isMipsPIC(const Defined *sym); diff --git a/lld/ELF/Thunks.cpp b/lld/ELF/Thunks.cpp index 478d956f43d9b1..fe83c086d84322 100644 --- a/lld/ELF/Thunks.cpp +++ b/lld/ELF/Thunks.cpp @@ -475,9 +475,10 @@ class PPC64PILongBranchThunk final : public PPC64LongBranchThunk { if (std::optional index = in.ppc64LongBranchTarget->addEntry(&dest, addend)) { ctx.mainPart->relaDyn->addRelativeReloc( - target->relativeRel, *in.ppc64LongBranchTarget, *index * UINT64_C(8), - dest, addend + getPPC64GlobalEntryToLocalEntryOffset(dest.stOther), - target->symbolicRel, R_ABS); + ctx.target->relativeRel, *in.ppc64LongBranchTarget, + *index * UINT64_C(8), dest, + addend + getPPC64GlobalEntryToLocalEntryOffset(dest.stOther), + ctx.target->symbolicRel, R_ABS); } } }; @@ -528,7 +529,7 @@ void AArch64Thunk::writeTo(uint8_t *buf) { uint64_t s = getAArch64ThunkDestVA(destination, addend); uint64_t p = getThunkTargetSym()->getVA(); 
write32(buf, 0x14000000); // b S - target->relocateNoSym(buf, R_AARCH64_CALL26, s - p); + ctx.target->relocateNoSym(buf, R_AARCH64_CALL26, s - p); } // AArch64 long range Thunks. @@ -541,7 +542,7 @@ void AArch64ABSLongThunk::writeLong(uint8_t *buf) { }; uint64_t s = getAArch64ThunkDestVA(destination, addend); memcpy(buf, data, sizeof(data)); - target->relocateNoSym(buf + 8, R_AARCH64_ABS64, s); + ctx.target->relocateNoSym(buf + 8, R_AARCH64_ABS64, s); } void AArch64ABSLongThunk::addSymbols(ThunkSection &isec) { @@ -566,9 +567,9 @@ void AArch64ADRPThunk::writeLong(uint8_t *buf) { uint64_t s = getAArch64ThunkDestVA(destination, addend); uint64_t p = getThunkTargetSym()->getVA(); memcpy(buf, data, sizeof(data)); - target->relocateNoSym(buf, R_AARCH64_ADR_PREL_PG_HI21, - getAArch64Page(s) - getAArch64Page(p)); - target->relocateNoSym(buf + 4, R_AARCH64_ADD_ABS_LO12_NC, s); + ctx.target->relocateNoSym(buf, R_AARCH64_ADR_PREL_PG_HI21, + getAArch64Page(s) - getAArch64Page(p)); + ctx.target->relocateNoSym(buf + 4, R_AARCH64_ADD_ABS_LO12_NC, s); } void AArch64ADRPThunk::addSymbols(ThunkSection &isec) { @@ -609,7 +610,7 @@ void ARMThunk::writeTo(uint8_t *buf) { uint64_t p = getThunkTargetSym()->getVA(); int64_t offset = s - p - 8; write32(buf, 0xea000000); // b S - target->relocateNoSym(buf, R_ARM_JUMP24, offset); + ctx.target->relocateNoSym(buf, R_ARM_JUMP24, offset); } bool ARMThunk::isCompatibleWith(const InputSection &isec, @@ -653,7 +654,7 @@ void ThumbThunk::writeTo(uint8_t *buf) { int64_t offset = s - p - 4; write16(buf + 0, 0xf000); // b.w S write16(buf + 2, 0xb000); - target->relocateNoSym(buf, R_ARM_THM_JUMP24, offset); + ctx.target->relocateNoSym(buf, R_ARM_THM_JUMP24, offset); } bool ThumbThunk::isCompatibleWith(const InputSection &isec, @@ -671,8 +672,8 @@ void ARMV7ABSLongThunk::writeLong(uint8_t *buf) { write32(buf + 4, 0xe340c000); // movt ip,:upper16:S write32(buf + 8, 0xe12fff1c); // bx ip uint64_t s = getARMThunkDestVA(destination); - 
target->relocateNoSym(buf, R_ARM_MOVW_ABS_NC, s); - target->relocateNoSym(buf + 4, R_ARM_MOVT_ABS, s); + ctx.target->relocateNoSym(buf, R_ARM_MOVW_ABS_NC, s); + ctx.target->relocateNoSym(buf + 4, R_ARM_MOVT_ABS, s); } void ARMV7ABSLongThunk::addSymbols(ThunkSection &isec) { @@ -688,8 +689,8 @@ void ThumbV7ABSLongThunk::writeLong(uint8_t *buf) { write16(buf + 6, 0x0c00); write16(buf + 8, 0x4760); // bx ip uint64_t s = getARMThunkDestVA(destination); - target->relocateNoSym(buf, R_ARM_THM_MOVW_ABS_NC, s); - target->relocateNoSym(buf + 4, R_ARM_THM_MOVT_ABS, s); + ctx.target->relocateNoSym(buf, R_ARM_THM_MOVW_ABS_NC, s); + ctx.target->relocateNoSym(buf + 4, R_ARM_THM_MOVT_ABS, s); } void ThumbV7ABSLongThunk::addSymbols(ThunkSection &isec) { @@ -706,8 +707,8 @@ void ARMV7PILongThunk::writeLong(uint8_t *buf) { uint64_t s = getARMThunkDestVA(destination); uint64_t p = getThunkTargetSym()->getVA(); int64_t offset = s - p - 16; - target->relocateNoSym(buf, R_ARM_MOVW_PREL_NC, offset); - target->relocateNoSym(buf + 4, R_ARM_MOVT_PREL, offset); + ctx.target->relocateNoSym(buf, R_ARM_MOVW_PREL_NC, offset); + ctx.target->relocateNoSym(buf + 4, R_ARM_MOVT_PREL, offset); } void ARMV7PILongThunk::addSymbols(ThunkSection &isec) { @@ -726,8 +727,8 @@ void ThumbV7PILongThunk::writeLong(uint8_t *buf) { uint64_t s = getARMThunkDestVA(destination); uint64_t p = getThunkTargetSym()->getVA() & ~0x1; int64_t offset = s - p - 12; - target->relocateNoSym(buf, R_ARM_THM_MOVW_PREL_NC, offset); - target->relocateNoSym(buf + 4, R_ARM_THM_MOVT_PREL, offset); + ctx.target->relocateNoSym(buf, R_ARM_THM_MOVW_PREL_NC, offset); + ctx.target->relocateNoSym(buf + 4, R_ARM_THM_MOVT_PREL, offset); } void ThumbV7PILongThunk::addSymbols(ThunkSection &isec) { @@ -747,7 +748,7 @@ void ThumbV6MABSLongThunk::writeLong(uint8_t *buf) { write16(buf + 6, 0xbd01); // pop {r0, pc} ; restore r0 and branch to dest write32(buf + 8, 0x00000000); // L1: .word S uint64_t s = getARMThunkDestVA(destination); - 
target->relocateNoSym(buf + 8, R_ARM_ABS32, s); + ctx.target->relocateNoSym(buf + 8, R_ARM_ABS32, s); } void ThumbV6MABSLongThunk::addSymbols(ThunkSection &isec) { @@ -774,10 +775,10 @@ void ThumbV6MABSXOLongThunk::writeLong(uint8_t *buf) { write16(buf + 16, 0x9001); // str r0, [sp, #4] ; SP + 4 = S write16(buf + 18, 0xbd01); // pop {r0, pc} ; restore r0 and branch to dest uint64_t s = getARMThunkDestVA(destination); - target->relocateNoSym(buf + 2, R_ARM_THM_ALU_ABS_G3, s); - target->relocateNoSym(buf + 6, R_ARM_THM_ALU_ABS_G2_NC, s); - target->relocateNoSym(buf + 10, R_ARM_THM_ALU_ABS_G1_NC, s); - target->relocateNoSym(buf + 14, R_ARM_THM_ALU_ABS_G0_NC, s); + ctx.target->relocateNoSym(buf + 2, R_ARM_THM_ALU_ABS_G3, s); + ctx.target->relocateNoSym(buf + 6, R_ARM_THM_ALU_ABS_G2_NC, s); + ctx.target->relocateNoSym(buf + 10, R_ARM_THM_ALU_ABS_G1_NC, s); + ctx.target->relocateNoSym(buf + 14, R_ARM_THM_ALU_ABS_G0_NC, s); } void ThumbV6MABSXOLongThunk::addSymbols(ThunkSection &isec) { @@ -799,7 +800,7 @@ void ThumbV6MPILongThunk::writeLong(uint8_t *buf) { write32(buf + 12, 0x00000000); // L2: .word S - (P + (L1 - P) + 4) uint64_t s = getARMThunkDestVA(destination); uint64_t p = getThunkTargetSym()->getVA() & ~0x1; - target->relocateNoSym(buf + 12, R_ARM_REL32, s - p - 12); + ctx.target->relocateNoSym(buf + 12, R_ARM_REL32, s - p - 12); } void ThumbV6MPILongThunk::addSymbols(ThunkSection &isec) { @@ -813,7 +814,8 @@ void ThumbV6MPILongThunk::addSymbols(ThunkSection &isec) { void ARMV5LongLdrPcThunk::writeLong(uint8_t *buf) { write32(buf + 0, 0xe51ff004); // ldr pc, [pc,#-4] ; L1 write32(buf + 4, 0x00000000); // L1: .word S - target->relocateNoSym(buf + 4, R_ARM_ABS32, getARMThunkDestVA(destination)); + ctx.target->relocateNoSym(buf + 4, R_ARM_ABS32, + getARMThunkDestVA(destination)); } void ARMV5LongLdrPcThunk::addSymbols(ThunkSection &isec) { @@ -828,7 +830,8 @@ void ARMV4ABSLongBXThunk::writeLong(uint8_t *buf) { write32(buf + 0, 0xe59fc000); // ldr r12, [pc] ; L1 
write32(buf + 4, 0xe12fff1c); // bx r12 write32(buf + 8, 0x00000000); // L1: .word S - target->relocateNoSym(buf + 8, R_ARM_ABS32, getARMThunkDestVA(destination)); + ctx.target->relocateNoSym(buf + 8, R_ARM_ABS32, + getARMThunkDestVA(destination)); } void ARMV4ABSLongBXThunk::addSymbols(ThunkSection &isec) { @@ -844,7 +847,8 @@ void ThumbV4ABSLongBXThunk::writeLong(uint8_t *buf) { write16(buf + 2, 0xe7fd); // b #-6 ; Arm recommended sequence to follow bx pc write32(buf + 4, 0xe51ff004); // ldr pc, [pc, #-4] ; L1 write32(buf + 8, 0x00000000); // L1: .word S - target->relocateNoSym(buf + 8, R_ARM_ABS32, getARMThunkDestVA(destination)); + ctx.target->relocateNoSym(buf + 8, R_ARM_ABS32, + getARMThunkDestVA(destination)); } void ThumbV4ABSLongBXThunk::addSymbols(ThunkSection &isec) { @@ -862,7 +866,8 @@ void ThumbV4ABSLongThunk::writeLong(uint8_t *buf) { write32(buf + 4, 0xe59fc000); // ldr r12, [pc] ; L1 write32(buf + 8, 0xe12fff1c); // bx r12 write32(buf + 12, 0x00000000); // L1: .word S - target->relocateNoSym(buf + 12, R_ARM_ABS32, getARMThunkDestVA(destination)); + ctx.target->relocateNoSym(buf + 12, R_ARM_ABS32, + getARMThunkDestVA(destination)); } void ThumbV4ABSLongThunk::addSymbols(ThunkSection &isec) { @@ -881,7 +886,7 @@ void ARMV4PILongBXThunk::writeLong(uint8_t *buf) { write32(buf + 12, 0x00000000); // L2: .word S - (P + (L1 - P) + 8) uint64_t s = getARMThunkDestVA(destination); uint64_t p = getThunkTargetSym()->getVA() & ~0x1; - target->relocateNoSym(buf + 12, R_ARM_REL32, s - p - 12); + ctx.target->relocateNoSym(buf + 12, R_ARM_REL32, s - p - 12); } void ARMV4PILongBXThunk::addSymbols(ThunkSection &isec) { @@ -898,7 +903,7 @@ void ARMV4PILongThunk::writeLong(uint8_t *buf) { write32(buf + 8, 0x00000000); // L2: .word S - (P + (L1 - P) + 8) uint64_t s = getARMThunkDestVA(destination); uint64_t p = getThunkTargetSym()->getVA() & ~0x1; - target->relocateNoSym(buf + 8, R_ARM_REL32, s - p - 12); + ctx.target->relocateNoSym(buf + 8, R_ARM_REL32, s - p - 12); } 
void ARMV4PILongThunk::addSymbols(ThunkSection &isec) { @@ -917,7 +922,7 @@ void ThumbV4PILongBXThunk::writeLong(uint8_t *buf) { write32(buf + 12, 0x00000000); // L2: .word S - (P + (L1 - P) + 8) uint64_t s = getARMThunkDestVA(destination); uint64_t p = getThunkTargetSym()->getVA() & ~0x1; - target->relocateNoSym(buf + 12, R_ARM_REL32, s - p - 16); + ctx.target->relocateNoSym(buf + 12, R_ARM_REL32, s - p - 16); } void ThumbV4PILongBXThunk::addSymbols(ThunkSection &isec) { @@ -938,7 +943,7 @@ void ThumbV4PILongThunk::writeLong(uint8_t *buf) { write32(buf + 16, 0x00000000); // L2: .word S - (P + (L1 - P) + 8) uint64_t s = getARMThunkDestVA(destination); uint64_t p = getThunkTargetSym()->getVA() & ~0x1; - target->relocateNoSym(buf + 16, R_ARM_REL32, s - p - 16); + ctx.target->relocateNoSym(buf + 16, R_ARM_REL32, s - p - 16); } void ThumbV4PILongThunk::addSymbols(ThunkSection &isec) { @@ -953,7 +958,7 @@ void ThumbV4PILongThunk::addSymbols(ThunkSection &isec) { // Use the long jump which covers a range up to 8MiB. 
void AVRThunk::writeTo(uint8_t *buf) { write32(buf, 0x940c); // jmp func - target->relocateNoSym(buf, R_AVR_CALL, destination.getVA()); + ctx.target->relocateNoSym(buf, R_AVR_CALL, destination.getVA()); } void AVRThunk::addSymbols(ThunkSection &isec) { @@ -968,8 +973,8 @@ void MipsThunk::writeTo(uint8_t *buf) { write32(buf + 4, 0x08000000 | (s >> 2)); // j func write32(buf + 8, 0x27390000); // addiu $25, $25, %lo(func) write32(buf + 12, 0x00000000); // nop - target->relocateNoSym(buf, R_MIPS_HI16, s); - target->relocateNoSym(buf + 8, R_MIPS_LO16, s); + ctx.target->relocateNoSym(buf, R_MIPS_HI16, s); + ctx.target->relocateNoSym(buf + 8, R_MIPS_LO16, s); } void MipsThunk::addSymbols(ThunkSection &isec) { @@ -990,9 +995,9 @@ void MicroMipsThunk::writeTo(uint8_t *buf) { write16(buf + 4, 0xd400); // j func write16(buf + 8, 0x3339); // addiu $25, $25, %lo(func) write16(buf + 12, 0x0c00); // nop - target->relocateNoSym(buf, R_MICROMIPS_HI16, s); - target->relocateNoSym(buf + 4, R_MICROMIPS_26_S1, s); - target->relocateNoSym(buf + 8, R_MICROMIPS_LO16, s); + ctx.target->relocateNoSym(buf, R_MICROMIPS_HI16, s); + ctx.target->relocateNoSym(buf + 4, R_MICROMIPS_26_S1, s); + ctx.target->relocateNoSym(buf + 8, R_MICROMIPS_LO16, s); } void MicroMipsThunk::addSymbols(ThunkSection &isec) { @@ -1015,9 +1020,9 @@ void MicroMipsR6Thunk::writeTo(uint8_t *buf) { write16(buf, 0x1320); // lui $25, %hi(func) write16(buf + 4, 0x3339); // addiu $25, $25, %lo(func) write16(buf + 8, 0x9400); // bc func - target->relocateNoSym(buf, R_MICROMIPS_HI16, s); - target->relocateNoSym(buf + 4, R_MICROMIPS_LO16, s); - target->relocateNoSym(buf + 8, R_MICROMIPS_PC26_S1, s - p - 12); + ctx.target->relocateNoSym(buf, R_MICROMIPS_HI16, s); + ctx.target->relocateNoSym(buf + 4, R_MICROMIPS_LO16, s); + ctx.target->relocateNoSym(buf + 8, R_MICROMIPS_PC26_S1, s - p - 12); } void MicroMipsR6Thunk::addSymbols(ThunkSection &isec) { diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 
087804b43918ab..0165253551714c 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -826,7 +826,7 @@ template void Writer::setReservedSymbolSections() { // The _GLOBAL_OFFSET_TABLE_ symbol is defined by target convention usually // to the start of the .got or .got.plt section. InputSection *sec = in.gotPlt.get(); - if (!target->gotBaseSymInGotPlt) + if (!ctx.target->gotBaseSymInGotPlt) sec = in.mipsGot ? cast(in.mipsGot.get()) : cast(in.got.get()); ctx.sym.globalOffsetTable->section = sec; @@ -1177,8 +1177,8 @@ sortISDBySectionOrder(InputSectionDescription *isd, // cover most cases). size_t insPt = 0; if (executableOutputSection && !orderedSections.empty() && - target->getThunkSectionSpacing() && - totalSize >= target->getThunkSectionSpacing()) { + ctx.target->getThunkSectionSpacing() && + totalSize >= ctx.target->getThunkSectionSpacing()) { uint64_t unorderedPos = 0; for (; insPt != unorderedSections.size(); ++insPt) { unorderedPos += unorderedSections[insPt]->getSize(); @@ -1455,9 +1455,9 @@ template void Writer::finalizeAddressDependentContent() { uint32_t pass = 0, assignPasses = 0; for (;;) { - bool changed = target->needsThunks + bool changed = ctx.target->needsThunks ? tc.createThunks(pass, ctx.outputSections) - : target->relaxOnce(pass); + : ctx.target->relaxOnce(pass); bool spilled = ctx.script->spillSections(); changed |= spilled; ++pass; @@ -1465,8 +1465,8 @@ template void Writer::finalizeAddressDependentContent() { // With Thunk Size much smaller than branch range we expect to // converge quickly; if we get to 30 something has gone wrong. if (changed && pass >= 30) { - error(target->needsThunks ? "thunk creation not converged" - : "relaxation not converged"); + error(ctx.target->needsThunks ? 
"thunk creation not converged" + : "relaxation not converged"); break; } @@ -1541,7 +1541,7 @@ template void Writer::finalizeAddressDependentContent() { } } if (!config->relocatable) - target->finalizeRelax(pass); + ctx.target->finalizeRelax(pass); if (config->relocatable) for (OutputSection *sec : ctx.outputSections) @@ -1632,7 +1632,7 @@ template void Writer::optimizeBasicBlockJumps() { for (size_t i = 0, e = sections.size(); i != e; ++i) { InputSection *next = i + 1 < sections.size() ? sections[i + 1] : nullptr; InputSection &sec = *sections[i]; - numDeleted += target->deleteFallThruJmpInsn(sec, sec.file, next); + numDeleted += ctx.target->deleteFallThruJmpInsn(sec, sec.file, next); } if (numDeleted > 0) { ctx.script->assignAddresses(); @@ -2803,7 +2803,7 @@ template void Writer::writeSectionsBinary() { static void fillTrap(uint8_t *i, uint8_t *end) { for (; i + 4 <= end; i += 4) - memcpy(i, &target->trapInstr, 4); + memcpy(i, &ctx.target->trapInstr, 4); } // Fill the last page of executable segments with trap instructions From 67d3ef74b31e1517d4f679e754cc2b3041c95901 Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy Date: Thu, 22 Aug 2024 09:40:27 +0200 Subject: [PATCH 155/426] [SPIR-V] Rework usage of virtual registers' types and classes (#104104) This PR continues https://github.com/llvm/llvm-project/pull/101732 changes in virtual register processing aimed to improve correctness of emitted MIR between passes from the perspective of MachineVerifier. 
Namely, the following changes are introduced: * register classes (lib/Target/SPIRV/SPIRVRegisterInfo.td) and instruction patterns (lib/Target/SPIRV/SPIRVInstrInfo.td) are corrected and simplified (by removing unnecessary sophisticated options) -- e.g., this PR gets rid of duplicated 32/64-bit patterns, removes the ANYID register class and simplifies the definition of the rest of the register classes, * hardcoded LLT scalar types in passes before instruction selection are corrected -- the goal is to have the correct bit width before instruction selection, and to use 64-bit registers for pattern matching in the instruction selection pass; 32-bit registers remain where they are described in such terms by the SPIR-V specification (like, for example, creation of virtual registers for scope/mem semantics operands), * rework virtual register type/class assignment for calls/builtins lowering, * a series of minor changes to fix the validity of emitted code between passes: - ensure that the bitcast changes the type, - fix the pattern for instruction selection for OpExtInst, - simplify inline asm operands usage, - account for arbitrary integer sizes / update legalizer rules; * add '-verify-machineinstrs' to existing test cases. See also https://github.com/llvm/llvm-project/issues/88129 that this PR may resolve. This PR fixes a great number of issues reported by MachineVerifier and, as a result, reduces the number of failed test cases for the mode with expensive checks set on from ~200 to ~57. 
--- .../SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp | 3 +- llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 254 ++++++------------ llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 27 +- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 155 +++++++---- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h | 7 +- llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp | 10 +- llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp | 9 +- llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 95 ++++--- .../Target/SPIRV/SPIRVInstructionSelector.cpp | 66 +++-- llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 82 ++++-- llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp | 17 +- llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 175 ++++++------ llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td | 46 ++-- llvm/lib/Target/SPIRV/SPIRVSubtarget.h | 2 + .../SPIRV/constant/global-constants.ll | 2 +- .../constant/local-aggregate-constant.ll | 2 +- .../SPIRV/constant/local-bool-constants.ll | 2 +- .../constant/local-float-point-constants.ll | 2 +- .../constant/local-integers-constants.ll | 2 +- .../SPIRV/constant/local-null-constants.ll | 2 +- .../constant/local-vector-matrix-constants.ll | 2 +- .../SPV_INTEL_arbitrary_precision_integers.ll | 2 +- .../bfloat16-conv.ll | 2 +- .../decorate-prefetch-w-cache-controls.ll | 2 +- .../fp_two_calls.ll | 2 +- .../global-var-decorations.ll | 2 +- .../global-var-host-access.ll | 2 +- .../SPV_INTEL_inline_assembly/inline_asm.ll | 2 +- .../SPIRV/extensions/SPV_INTEL_optnone.ll | 4 +- .../builtin-op-wrappers.ll | 2 +- .../cl_intel_sub_groups.ll | 2 +- .../builtin_alloca.ll | 2 +- .../SPV_INTEL_variable_length_array/vararr.ll | 2 +- .../vararr_spec_const.ll | 2 +- .../extensions/SPV_KHR_bit_instructions.ll | 4 +- .../SPV_KHR_no_integer_wrap_decoration.ll | 2 +- .../SPV_KHR_shader_clock/shader_clock.ll | 2 +- .../subgroup-rotate.ll | 2 +- .../uniform-group-instructions.ll | 2 +- .../enable-all-extensions-but-one.ll | 2 +- .../SPIRV/extensions/enable-all-extensions.ll | 2 +- .../SPIRV/function/alloca-load-store.ll | 2 +- 
.../SPIRV/function/identity-function.ll | 2 +- .../function/multiple-anonymous-functions.ll | 2 +- .../function/trivial-function-definition.ll | 2 +- .../trivial-function-with-attributes.ll | 2 +- .../function/trivial-function-with-call.ll | 2 +- .../hlsl-intrinsics/SV_DispatchThreadID.ll | 2 +- .../SPIRV/hlsl-intrinsics/WaveGetLaneIndex.ll | 2 +- .../test/CodeGen/SPIRV/hlsl-intrinsics/abs.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/acos.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/asin.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/atan.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/ceil.ll | 2 +- .../test/CodeGen/SPIRV/hlsl-intrinsics/cos.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/cosh.ll | 2 +- .../test/CodeGen/SPIRV/hlsl-intrinsics/exp.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/exp2.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/floor.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/fmad.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/fmax.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/fmin.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/frac.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/lerp.ll | 2 +- .../test/CodeGen/SPIRV/hlsl-intrinsics/log.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/log10.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/log2.ll | 2 +- .../test/CodeGen/SPIRV/hlsl-intrinsics/pow.ll | 2 +- .../test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll | 2 +- .../SPIRV/hlsl-intrinsics/reversebits.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/round.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/rsqrt.ll | 2 +- .../test/CodeGen/SPIRV/hlsl-intrinsics/sin.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/sinh.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/smax.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/smin.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/sqrt.ll | 2 +- .../test/CodeGen/SPIRV/hlsl-intrinsics/tan.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/tanh.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/trunc.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/umax.ll | 2 +- .../CodeGen/SPIRV/hlsl-intrinsics/umin.ll | 2 
+- llvm/test/CodeGen/SPIRV/image/sampler.ll | 2 +- .../CodeGen/SPIRV/instructions/atomic-ptr.ll | 4 +- .../SPIRV/instructions/atomic_acqrel.ll | 2 +- .../CodeGen/SPIRV/instructions/atomic_seq.ll | 2 +- .../instructions/call-complex-function.ll | 2 +- .../instructions/call-trivial-function.ll | 2 +- llvm/test/CodeGen/SPIRV/instructions/fcmp.ll | 2 +- .../CodeGen/SPIRV/instructions/float-casts.ll | 2 +- .../SPIRV/instructions/float-fast-flags.ll | 2 +- llvm/test/CodeGen/SPIRV/instructions/icmp.ll | 2 +- .../CodeGen/SPIRV/instructions/intrinsics.ll | 2 +- .../SPIRV/instructions/nested-composites.ll | 2 +- .../instructions/scalar-bitwise-operations.ll | 2 +- .../scalar-floating-point-arithmetic.ll | 2 +- .../instructions/scalar-integer-arithmetic.ll | 2 +- .../SPIRV/instructions/select-ptr-load.ll | 4 +- .../test/CodeGen/SPIRV/instructions/select.ll | 4 +- .../undef-nested-composite-store.ll | 2 +- .../undef-simple-composite-store.ll | 2 +- .../CodeGen/SPIRV/instructions/unreachable.ll | 2 +- .../instructions/vector-bitwise-operations.ll | 2 +- .../vector-floating-point-arithmetic.ll | 2 +- .../instructions/vector-integer-arithmetic.ll | 2 +- .../test/CodeGen/SPIRV/llvm-intrinsics/abs.ll | 6 +- .../CodeGen/SPIRV/llvm-intrinsics/assume.ll | 2 +- .../CodeGen/SPIRV/llvm-intrinsics/bswap.ll | 2 +- .../CodeGen/SPIRV/llvm-intrinsics/ceil.ll | 2 +- .../CodeGen/SPIRV/llvm-intrinsics/ctlz.ll | 2 +- .../CodeGen/SPIRV/llvm-intrinsics/ctpop.ll | 2 +- .../CodeGen/SPIRV/llvm-intrinsics/cttz.ll | 2 +- .../CodeGen/SPIRV/llvm-intrinsics/fabs.ll | 2 +- .../SPIRV/llvm-intrinsics/fp-intrinsics.ll | 2 +- .../SPIRV/llvm-intrinsics/invariant.ll | 2 +- .../CodeGen/SPIRV/llvm-intrinsics/lifetime.ll | 4 +- .../llvm-intrinsics/llvm-vector-reduce/add.ll | 7 +- .../llvm-intrinsics/llvm-vector-reduce/and.ll | 7 +- .../llvm-vector-reduce/fadd.ll | 7 +- .../llvm-vector-reduce/fmax.ll | 2 +- .../llvm-vector-reduce/fmaximum.ll | 2 +- .../llvm-vector-reduce/fmin.ll | 2 +- 
.../llvm-vector-reduce/fminimum.ll | 2 +- .../llvm-vector-reduce/fmul.ll | 2 +- .../llvm-intrinsics/llvm-vector-reduce/mul.ll | 2 +- .../llvm-intrinsics/llvm-vector-reduce/or.ll | 2 +- .../llvm-vector-reduce/smax.ll | 2 +- .../llvm-vector-reduce/smin.ll | 2 +- .../llvm-vector-reduce/umax.ll | 2 +- .../llvm-vector-reduce/umin.ll | 2 +- .../llvm-intrinsics/llvm-vector-reduce/xor.ll | 2 +- .../CodeGen/SPIRV/llvm-intrinsics/maxnum.ll | 2 +- .../SPIRV/llvm-intrinsics/nearbyint.ll | 2 +- .../SPIRV/llvm-intrinsics/ptr-annotation.ll | 2 +- .../SPIRV/llvm-intrinsics/satur-arith.ll | 4 +- .../CodeGen/SPIRV/llvm-intrinsics/sqrt.ll | 2 +- .../llvm-intrinsics/umul.with.overflow.ll | 2 +- .../SPIRV/pointers/argument-ptr-to-struct.ll | 2 +- .../SPIRV/pointers/bitcast-fix-accesschain.ll | 5 +- .../SPIRV/pointers/bitcast-fix-load.ll | 2 +- .../SPIRV/pointers/bitcast-fix-store.ll | 2 +- llvm/test/CodeGen/SPIRV/pointers/complex.ll | 4 +- .../SPIRV/pointers/custom-kernel-arg-type.ll | 2 +- .../SPIRV/pointers/duplicate-type-ptr-def.ll | 2 +- .../pointers/getelementptr-addressspace.ll | 2 +- .../SPIRV/pointers/getelementptr-base-type.ll | 2 +- .../pointers/getelementptr-bitcast-load.ll | 2 +- .../pointers/getelementptr-kernel-arg-char.ll | 2 +- .../CodeGen/SPIRV/pointers/global-ptrtoint.ll | 4 +- .../SPIRV/pointers/global-zeroinitializer.ll | 4 +- ...argument-builtin-vload-type-discrapency.ll | 2 +- .../kernel-argument-pointer-addressspace.ll | 2 +- ...er-type-deduction-no-bitcast-to-generic.ll | 2 +- ...ment-pointer-type-deduction-no-metadata.ll | 2 +- .../pointers/kernel-argument-pointer-type.ll | 2 +- ...el-argument-ptr-i8-default-element-type.ll | 2 +- .../kernel-argument-ptr-no-bitcast.ll | 2 +- .../SPIRV/pointers/load-addressspace.ll | 2 +- .../pointers/nested-struct-opaque-pointers.ll | 2 +- .../SPIRV/pointers/ptr-argument-byref.ll | 2 +- .../SPIRV/pointers/ptr-argument-byval.ll | 2 +- ...tore-kernel-arg-i8-ptr-as-value-operand.ll | 2 +- 
.../store-kernel-arg-ptr-as-value-operand.ll | 2 +- .../pointers/store-operand-ptr-to-struct.ll | 2 +- .../SPIRV/pointers/struct-opaque-pointers.ll | 2 +- .../pointers/two-bitcast-or-param-users.ll | 2 +- .../SPIRV/pointers/two-subsequent-bitcasts.ll | 2 +- .../SPIRV/pointers/type-deduce-args-rev.ll | 2 +- .../SPIRV/pointers/type-deduce-args.ll | 2 +- .../pointers/type-deduce-by-call-chain.ll | 2 +- .../pointers/type-deduce-by-call-complex.ll | 2 +- .../SPIRV/pointers/type-deduce-by-call-rev.ll | 2 +- .../SPIRV/pointers/type-deduce-by-call.ll | 2 +- .../pointers/type-deduce-call-no-bitcast.ll | 2 +- .../CodeGen/SPIRV/pointers/typeof-ptr-int.ll | 2 +- .../SPIRV/pointers/variables-storage-class.ll | 2 +- .../SPIRV/transcoding/sub_group_ballot.ll | 2 +- 177 files changed, 647 insertions(+), 669 deletions(-) diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp index 6dd0df2a104c0f..42567f695395ef 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp @@ -67,7 +67,8 @@ static bool hasType(const MCInst &MI, const MCInstrInfo &MII) { // Check if we define an ID, and take a type as operand 1. 
auto &DefOpInfo = MCDesc.operands()[0]; auto &FirstArgOpInfo = MCDesc.operands()[1]; - return DefOpInfo.RegClass != SPIRV::TYPERegClassID && + return DefOpInfo.RegClass >= 0 && FirstArgOpInfo.RegClass >= 0 && + DefOpInfo.RegClass != SPIRV::TYPERegClassID && FirstArgOpInfo.RegClass == SPIRV::TYPERegClassID; } return false; diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 09f06728d2d10d..66cf163a1a0ac2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -408,7 +408,7 @@ buildBoolRegister(MachineIRBuilder &MIRBuilder, const SPIRVType *ResultType, Register ResultRegister = MIRBuilder.getMRI()->createGenericVirtualRegister(Type); - MIRBuilder.getMRI()->setRegClass(ResultRegister, &SPIRV::iIDRegClass); + MIRBuilder.getMRI()->setRegClass(ResultRegister, GR->getRegClass(ResultType)); GR->assignSPIRVTypeToVReg(BoolType, ResultRegister, MIRBuilder.getMF()); return std::make_tuple(ResultRegister, BoolType); } @@ -430,6 +430,7 @@ static bool buildSelectInst(MachineIRBuilder &MIRBuilder, TrueConst = GR->buildConstantInt(1, MIRBuilder, ReturnType); FalseConst = GR->buildConstantInt(0, MIRBuilder, ReturnType); } + return MIRBuilder.buildSelect(ReturnRegister, SourceRegister, TrueConst, FalseConst); } @@ -443,7 +444,7 @@ static Register buildLoadInst(SPIRVType *BaseType, Register PtrRegister, MachineRegisterInfo *MRI = MIRBuilder.getMRI(); if (!DestinationReg.isValid()) { DestinationReg = MRI->createVirtualRegister(&SPIRV::iIDRegClass); - MRI->setType(DestinationReg, LLT::scalar(32)); + MRI->setType(DestinationReg, LLT::scalar(64)); GR->assignSPIRVTypeToVReg(BaseType, DestinationReg, MIRBuilder.getMF()); } // TODO: consider using correct address space and alignment (p0 is canonical @@ -526,11 +527,11 @@ static SPIRV::Scope::Scope getSPIRVScope(SPIRV::CLMemoryScope ClScope) { report_fatal_error("Unknown CL memory scope"); } -static Register buildConstantIntReg(uint64_t Val, 
MachineIRBuilder &MIRBuilder, - SPIRVGlobalRegistry *GR, - unsigned BitWidth = 32) { - SPIRVType *IntType = GR->getOrCreateSPIRVIntegerType(BitWidth, MIRBuilder); - return GR->buildConstantInt(Val, MIRBuilder, IntType); +static Register buildConstantIntReg32(uint64_t Val, + MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + return GR->buildConstantInt(Val, MIRBuilder, + GR->getOrCreateSPIRVIntegerType(32, MIRBuilder)); } static Register buildScopeReg(Register CLScopeRegister, @@ -548,7 +549,7 @@ static Register buildScopeReg(Register CLScopeRegister, return CLScopeRegister; } } - return buildConstantIntReg(Scope, MIRBuilder, GR); + return buildConstantIntReg32(Scope, MIRBuilder, GR); } static Register buildMemSemanticsReg(Register SemanticsRegister, @@ -568,24 +569,19 @@ static Register buildMemSemanticsReg(Register SemanticsRegister, return SemanticsRegister; } } - return buildConstantIntReg(Semantics, MIRBuilder, GR); + return buildConstantIntReg32(Semantics, MIRBuilder, GR); } static bool buildOpFromWrapper(MachineIRBuilder &MIRBuilder, unsigned Opcode, const SPIRV::IncomingCall *Call, Register TypeReg, ArrayRef ImmArgs = {}) { - MachineRegisterInfo *MRI = MIRBuilder.getMRI(); auto MIB = MIRBuilder.buildInstr(Opcode); if (TypeReg.isValid()) MIB.addDef(Call->ReturnRegister).addUse(TypeReg); unsigned Sz = Call->Arguments.size() - ImmArgs.size(); - for (unsigned i = 0; i < Sz; ++i) { - Register ArgReg = Call->Arguments[i]; - if (!MRI->getRegClassOrNull(ArgReg)) - MRI->setRegClass(ArgReg, &SPIRV::iIDRegClass); - MIB.addUse(ArgReg); - } + for (unsigned i = 0; i < Sz; ++i) + MIB.addUse(Call->Arguments[i]); for (uint32_t ImmArg : ImmArgs) MIB.addImm(ImmArg); return true; @@ -599,8 +595,6 @@ static bool buildAtomicInitInst(const SPIRV::IncomingCall *Call, assert(Call->Arguments.size() == 2 && "Need 2 arguments for atomic init translation"); - MIRBuilder.getMRI()->setRegClass(Call->Arguments[0], &SPIRV::iIDRegClass); - 
MIRBuilder.getMRI()->setRegClass(Call->Arguments[1], &SPIRV::iIDRegClass); MIRBuilder.buildInstr(SPIRV::OpStore) .addUse(Call->Arguments[0]) .addUse(Call->Arguments[1]); @@ -616,27 +610,22 @@ static bool buildAtomicLoadInst(const SPIRV::IncomingCall *Call, return buildOpFromWrapper(MIRBuilder, SPIRV::OpAtomicLoad, Call, TypeReg); Register PtrRegister = Call->Arguments[0]; - MIRBuilder.getMRI()->setRegClass(PtrRegister, &SPIRV::iIDRegClass); // TODO: if true insert call to __translate_ocl_memory_sccope before // OpAtomicLoad and the function implementation. We can use Translator's // output for transcoding/atomic_explicit_arguments.cl as an example. - Register ScopeRegister; - if (Call->Arguments.size() > 1) { - ScopeRegister = Call->Arguments[1]; - MIRBuilder.getMRI()->setRegClass(ScopeRegister, &SPIRV::iIDRegClass); - } else - ScopeRegister = buildConstantIntReg(SPIRV::Scope::Device, MIRBuilder, GR); - + Register ScopeRegister = + Call->Arguments.size() > 1 + ? Call->Arguments[1] + : buildConstantIntReg32(SPIRV::Scope::Device, MIRBuilder, GR); Register MemSemanticsReg; if (Call->Arguments.size() > 2) { // TODO: Insert call to __translate_ocl_memory_order before OpAtomicLoad. 
MemSemanticsReg = Call->Arguments[2]; - MIRBuilder.getMRI()->setRegClass(MemSemanticsReg, &SPIRV::iIDRegClass); } else { int Semantics = SPIRV::MemorySemantics::SequentiallyConsistent | getMemSemanticsForStorageClass(GR->getPointerStorageClass(PtrRegister)); - MemSemanticsReg = buildConstantIntReg(Semantics, MIRBuilder, GR); + MemSemanticsReg = buildConstantIntReg32(Semantics, MIRBuilder, GR); } MIRBuilder.buildInstr(SPIRV::OpAtomicLoad) @@ -656,14 +645,12 @@ static bool buildAtomicStoreInst(const SPIRV::IncomingCall *Call, return buildOpFromWrapper(MIRBuilder, SPIRV::OpAtomicStore, Call, Register(0)); Register ScopeRegister = - buildConstantIntReg(SPIRV::Scope::Device, MIRBuilder, GR); + buildConstantIntReg32(SPIRV::Scope::Device, MIRBuilder, GR); Register PtrRegister = Call->Arguments[0]; - MIRBuilder.getMRI()->setRegClass(PtrRegister, &SPIRV::iIDRegClass); int Semantics = SPIRV::MemorySemantics::SequentiallyConsistent | getMemSemanticsForStorageClass(GR->getPointerStorageClass(PtrRegister)); - Register MemSemanticsReg = buildConstantIntReg(Semantics, MIRBuilder, GR); - MIRBuilder.getMRI()->setRegClass(Call->Arguments[1], &SPIRV::iIDRegClass); + Register MemSemanticsReg = buildConstantIntReg32(Semantics, MIRBuilder, GR); MIRBuilder.buildInstr(SPIRV::OpAtomicStore) .addUse(PtrRegister) .addUse(ScopeRegister) @@ -686,9 +673,6 @@ static bool buildAtomicCompareExchangeInst( Register ObjectPtr = Call->Arguments[0]; // Pointer (volatile A *object.) Register ExpectedArg = Call->Arguments[1]; // Comparator (C* expected). Register Desired = Call->Arguments[2]; // Value (C Desired). 
- MRI->setRegClass(ObjectPtr, &SPIRV::iIDRegClass); - MRI->setRegClass(ExpectedArg, &SPIRV::iIDRegClass); - MRI->setRegClass(Desired, &SPIRV::iIDRegClass); SPIRVType *SpvDesiredTy = GR->getSPIRVTypeForVReg(Desired); LLT DesiredLLT = MRI->getType(Desired); @@ -729,13 +713,11 @@ static bool buildAtomicCompareExchangeInst( MemSemEqualReg = Call->Arguments[3]; if (MemOrdNeq == MemSemEqual) MemSemUnequalReg = Call->Arguments[4]; - MRI->setRegClass(Call->Arguments[3], &SPIRV::iIDRegClass); - MRI->setRegClass(Call->Arguments[4], &SPIRV::iIDRegClass); } if (!MemSemEqualReg.isValid()) - MemSemEqualReg = buildConstantIntReg(MemSemEqual, MIRBuilder, GR); + MemSemEqualReg = buildConstantIntReg32(MemSemEqual, MIRBuilder, GR); if (!MemSemUnequalReg.isValid()) - MemSemUnequalReg = buildConstantIntReg(MemSemUnequal, MIRBuilder, GR); + MemSemUnequalReg = buildConstantIntReg32(MemSemUnequal, MIRBuilder, GR); Register ScopeReg; auto Scope = IsCmpxchg ? SPIRV::Scope::Workgroup : SPIRV::Scope::Device; @@ -747,20 +729,19 @@ static bool buildAtomicCompareExchangeInst( Scope = getSPIRVScope(ClScope); if (ClScope == static_cast(Scope)) ScopeReg = Call->Arguments[5]; - MRI->setRegClass(Call->Arguments[5], &SPIRV::iIDRegClass); } if (!ScopeReg.isValid()) - ScopeReg = buildConstantIntReg(Scope, MIRBuilder, GR); + ScopeReg = buildConstantIntReg32(Scope, MIRBuilder, GR); Register Expected = IsCmpxchg ? ExpectedArg : buildLoadInst(SpvDesiredTy, ExpectedArg, MIRBuilder, - GR, LLT::scalar(32)); + GR, LLT::scalar(64)); MRI->setType(Expected, DesiredLLT); Register Tmp = !IsCmpxchg ? 
MRI->createGenericVirtualRegister(DesiredLLT) : Call->ReturnRegister; if (!MRI->getRegClassOrNull(Tmp)) - MRI->setRegClass(Tmp, &SPIRV::iIDRegClass); + MRI->setRegClass(Tmp, GR->getRegClass(SpvDesiredTy)); GR->assignSPIRVTypeToVReg(SpvDesiredTy, Tmp, MIRBuilder.getMF()); SPIRVType *IntTy = GR->getOrCreateSPIRVIntegerType(32, MIRBuilder); @@ -799,12 +780,10 @@ static bool buildAtomicRMWInst(const SPIRV::IncomingCall *Call, unsigned Opcode, Register PtrRegister = Call->Arguments[0]; unsigned Semantics = SPIRV::MemorySemantics::None; - MRI->setRegClass(PtrRegister, &SPIRV::iIDRegClass); Register MemSemanticsReg = Call->Arguments.size() >= 3 ? Call->Arguments[2] : Register(); MemSemanticsReg = buildMemSemanticsReg(MemSemanticsReg, PtrRegister, Semantics, MIRBuilder, GR); - MRI->setRegClass(Call->Arguments[1], &SPIRV::iIDRegClass); Register ValueReg = Call->Arguments[1]; Register ValueTypeReg = GR->getSPIRVTypeID(Call->ReturnType); // support cl_ext_float_atomics @@ -817,7 +796,7 @@ static bool buildAtomicRMWInst(const SPIRV::IncomingCall *Call, unsigned Opcode, Opcode = SPIRV::OpAtomicFAddEXT; Register NegValueReg = MRI->createGenericVirtualRegister(MRI->getType(ValueReg)); - MRI->setRegClass(NegValueReg, &SPIRV::iIDRegClass); + MRI->setRegClass(NegValueReg, GR->getRegClass(Call->ReturnType)); GR->assignSPIRVTypeToVReg(Call->ReturnType, NegValueReg, MIRBuilder.getMF()); MIRBuilder.buildInstr(TargetOpcode::G_FNEG) @@ -845,21 +824,10 @@ static bool buildAtomicFloatingRMWInst(const SPIRV::IncomingCall *Call, SPIRVGlobalRegistry *GR) { assert(Call->Arguments.size() == 4 && "Wrong number of atomic floating-type builtin"); - - MachineRegisterInfo *MRI = MIRBuilder.getMRI(); - Register PtrReg = Call->Arguments[0]; - MRI->setRegClass(PtrReg, &SPIRV::iIDRegClass); - Register ScopeReg = Call->Arguments[1]; - MRI->setRegClass(ScopeReg, &SPIRV::iIDRegClass); - Register MemSemanticsReg = Call->Arguments[2]; - MRI->setRegClass(MemSemanticsReg, &SPIRV::iIDRegClass); - Register 
ValueReg = Call->Arguments[3]; - MRI->setRegClass(ValueReg, &SPIRV::iIDRegClass); - MIRBuilder.buildInstr(Opcode) .addDef(Call->ReturnRegister) .addUse(GR->getSPIRVTypeID(Call->ReturnType)) @@ -936,13 +904,10 @@ static bool buildBarrierInst(const SPIRV::IncomingCall *Call, unsigned Opcode, MemSemantics |= SPIRV::MemorySemantics::SequentiallyConsistent; } - Register MemSemanticsReg; - if (MemFlags == MemSemantics) { - MemSemanticsReg = Call->Arguments[0]; - MRI->setRegClass(MemSemanticsReg, &SPIRV::iIDRegClass); - } else - MemSemanticsReg = buildConstantIntReg(MemSemantics, MIRBuilder, GR); - + Register MemSemanticsReg = + MemFlags == MemSemantics + ? Call->Arguments[0] + : buildConstantIntReg32(MemSemantics, MIRBuilder, GR); Register ScopeReg; SPIRV::Scope::Scope Scope = SPIRV::Scope::Workgroup; SPIRV::Scope::Scope MemScope = Scope; @@ -959,19 +924,16 @@ static bool buildBarrierInst(const SPIRV::IncomingCall *Call, unsigned Opcode, if (!(MemFlags & SPIRV::CLK_LOCAL_MEM_FENCE) || (Opcode == SPIRV::OpMemoryBarrier)) Scope = MemScope; - - if (CLScope == static_cast(Scope)) { + if (CLScope == static_cast(Scope)) ScopeReg = Call->Arguments[1]; - MRI->setRegClass(ScopeReg, &SPIRV::iIDRegClass); - } } if (!ScopeReg.isValid()) - ScopeReg = buildConstantIntReg(Scope, MIRBuilder, GR); + ScopeReg = buildConstantIntReg32(Scope, MIRBuilder, GR); auto MIB = MIRBuilder.buildInstr(Opcode).addUse(ScopeReg); if (Opcode != SPIRV::OpMemoryBarrier) - MIB.addUse(buildConstantIntReg(MemScope, MIRBuilder, GR)); + MIB.addUse(buildConstantIntReg32(MemScope, MIRBuilder, GR)); MIB.addUse(MemSemanticsReg); return true; } @@ -1073,19 +1035,13 @@ static bool generateGroupInst(const SPIRV::IncomingCall *Call, "Group Operation parameter must be an integer constant"); uint64_t GrpOp = MI->getOperand(1).getCImm()->getValue().getZExtValue(); Register ScopeReg = Call->Arguments[0]; - if (!MRI->getRegClassOrNull(ScopeReg)) - MRI->setRegClass(ScopeReg, &SPIRV::iIDRegClass); auto MIB = 
MIRBuilder.buildInstr(GroupBuiltin->Opcode) .addDef(Call->ReturnRegister) .addUse(GR->getSPIRVTypeID(Call->ReturnType)) .addUse(ScopeReg) .addImm(GrpOp); - for (unsigned i = 2; i < Call->Arguments.size(); ++i) { - Register ArgReg = Call->Arguments[i]; - if (!MRI->getRegClassOrNull(ArgReg)) - MRI->setRegClass(ArgReg, &SPIRV::iIDRegClass); - MIB.addUse(ArgReg); - } + for (unsigned i = 2; i < Call->Arguments.size(); ++i) + MIB.addUse(Call->Arguments[i]); return true; } @@ -1104,7 +1060,7 @@ static bool generateGroupInst(const SPIRV::IncomingCall *Call, } else { if (BoolRegType->getOpcode() == SPIRV::OpTypeInt) { Arg0 = MRI->createGenericVirtualRegister(LLT::scalar(1)); - MRI->setRegClass(Arg0, &SPIRV::IDRegClass); + MRI->setRegClass(Arg0, &SPIRV::iIDRegClass); GR->assignSPIRVTypeToVReg(BoolType, Arg0, MIRBuilder.getMF()); MIRBuilder.buildICmp(CmpInst::ICMP_NE, Arg0, BoolReg, GR->buildConstantInt(0, MIRBuilder, BoolRegType)); @@ -1133,7 +1089,7 @@ static bool generateGroupInst(const SPIRV::IncomingCall *Call, auto Scope = Builtin->Name.starts_with("sub_group") ? SPIRV::Scope::Subgroup : SPIRV::Scope::Workgroup; - Register ScopeRegister = buildConstantIntReg(Scope, MIRBuilder, GR); + Register ScopeRegister = buildConstantIntReg32(Scope, MIRBuilder, GR); Register VecReg; if (GroupBuiltin->Opcode == SPIRV::OpGroupBroadcast && @@ -1178,10 +1134,8 @@ static bool generateGroupInst(const SPIRV::IncomingCall *Call, if (VecReg.isValid()) MIB.addUse(VecReg); else - for (unsigned i = 1; i < Call->Arguments.size(); i++) { + for (unsigned i = 1; i < Call->Arguments.size(); i++) MIB.addUse(Call->Arguments[i]); - MRI->setRegClass(Call->Arguments[i], &SPIRV::iIDRegClass); - } } // Build select instruction. 
@@ -1215,7 +1169,6 @@ static bool generateIntelSubgroupsInst(const SPIRV::IncomingCall *Call, : Register(0)); } - MachineRegisterInfo *MRI = MIRBuilder.getMRI(); if (IntelSubgroups->IsBlock) { // Minimal number or arguments set in TableGen records is 1 if (SPIRVType *Arg0Type = GR->getSPIRVTypeForVReg(Call->Arguments[0])) { @@ -1252,11 +1205,8 @@ static bool generateIntelSubgroupsInst(const SPIRV::IncomingCall *Call, : MIRBuilder.buildInstr(OpCode) .addDef(Call->ReturnRegister) .addUse(GR->getSPIRVTypeID(Call->ReturnType)); - for (size_t i = 0; i < Call->Arguments.size(); ++i) { + for (size_t i = 0; i < Call->Arguments.size(); ++i) MIB.addUse(Call->Arguments[i]); - MRI->setRegClass(Call->Arguments[i], &SPIRV::iIDRegClass); - } - return true; } @@ -1278,11 +1228,8 @@ static bool generateGroupUniformInst(const SPIRV::IncomingCall *Call, MachineRegisterInfo *MRI = MIRBuilder.getMRI(); Register GroupResultReg = Call->ReturnRegister; - MRI->setRegClass(GroupResultReg, &SPIRV::iIDRegClass); - - // Scope Register ScopeReg = Call->Arguments[0]; - MRI->setRegClass(ScopeReg, &SPIRV::iIDRegClass); + Register ValueReg = Call->Arguments[2]; // Group Operation Register ConstGroupOpReg = Call->Arguments[1]; @@ -1297,10 +1244,6 @@ static bool generateGroupUniformInst(const SPIRV::IncomingCall *Call, "integer constant", false); - // Value - Register ValueReg = Call->Arguments[2]; - MRI->setRegClass(ValueReg, &SPIRV::iIDRegClass); - auto MIB = MIRBuilder.buildInstr(GroupUniform->Opcode) .addDef(GroupResultReg) .addUse(GR->getSPIRVTypeID(Call->ReturnType)) @@ -1324,9 +1267,7 @@ static bool generateKernelClockInst(const SPIRV::IncomingCall *Call, report_fatal_error(DiagMsg.c_str(), false); } - MachineRegisterInfo *MRI = MIRBuilder.getMRI(); Register ResultReg = Call->ReturnRegister; - MRI->setRegClass(ResultReg, &SPIRV::iIDRegClass); // Deduce the `Scope` operand from the builtin function name. 
SPIRV::Scope::Scope ScopeArg = @@ -1334,7 +1275,7 @@ static bool generateKernelClockInst(const SPIRV::IncomingCall *Call, .EndsWith("device", SPIRV::Scope::Scope::Device) .EndsWith("work_group", SPIRV::Scope::Scope::Workgroup) .EndsWith("sub_group", SPIRV::Scope::Scope::Subgroup); - Register ScopeReg = buildConstantIntReg(ScopeArg, MIRBuilder, GR); + Register ScopeReg = buildConstantIntReg32(ScopeArg, MIRBuilder, GR); MIRBuilder.buildInstr(SPIRV::OpReadClockKHR) .addDef(ResultReg) @@ -1634,7 +1575,7 @@ static bool generateImageSizeQueryInst(const SPIRV::IncomingCall *Call, if (NumExpectedRetComponents != NumActualRetComponents) { QueryResult = MIRBuilder.getMRI()->createGenericVirtualRegister( LLT::fixed_vector(NumActualRetComponents, 32)); - MIRBuilder.getMRI()->setRegClass(QueryResult, &SPIRV::iIDRegClass); + MIRBuilder.getMRI()->setRegClass(QueryResult, &SPIRV::vIDRegClass); SPIRVType *IntTy = GR->getOrCreateSPIRVIntegerType(32, MIRBuilder); QueryResultType = GR->getOrCreateSPIRVVectorType( IntTy, NumActualRetComponents, MIRBuilder); @@ -1643,13 +1584,12 @@ static bool generateImageSizeQueryInst(const SPIRV::IncomingCall *Call, bool IsDimBuf = ImgType->getOperand(2).getImm() == SPIRV::Dim::DIM_Buffer; unsigned Opcode = IsDimBuf ? SPIRV::OpImageQuerySize : SPIRV::OpImageQuerySizeLod; - MIRBuilder.getMRI()->setRegClass(Call->Arguments[0], &SPIRV::iIDRegClass); auto MIB = MIRBuilder.buildInstr(Opcode) .addDef(QueryResult) .addUse(GR->getSPIRVTypeID(QueryResultType)) .addUse(Call->Arguments[0]); if (!IsDimBuf) - MIB.addUse(buildConstantIntReg(0, MIRBuilder, GR)); // Lod id. + MIB.addUse(buildConstantIntReg32(0, MIRBuilder, GR)); // Lod id. 
if (NumExpectedRetComponents == NumActualRetComponents) return true; if (NumExpectedRetComponents == 1) { @@ -1699,7 +1639,6 @@ static bool generateImageMiscQueryInst(const SPIRV::IncomingCall *Call, SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode; Register Image = Call->Arguments[0]; - MIRBuilder.getMRI()->setRegClass(Image, &SPIRV::iIDRegClass); SPIRV::Dim::Dim ImageDimensionality = static_cast( GR->getSPIRVTypeForVReg(Image)->getOperand(2).getImm()); (void)ImageDimensionality; @@ -1763,12 +1702,8 @@ static bool generateReadImageInst(const StringRef DemangledCall, SPIRVGlobalRegistry *GR) { Register Image = Call->Arguments[0]; MachineRegisterInfo *MRI = MIRBuilder.getMRI(); - MRI->setRegClass(Image, &SPIRV::iIDRegClass); - MRI->setRegClass(Call->Arguments[1], &SPIRV::iIDRegClass); bool HasOclSampler = DemangledCall.contains_insensitive("ocl_sampler"); bool HasMsaa = DemangledCall.contains_insensitive("msaa"); - if (HasOclSampler || HasMsaa) - MRI->setRegClass(Call->Arguments[2], &SPIRV::iIDRegClass); if (HasOclSampler) { Register Sampler = Call->Arguments[1]; @@ -1794,32 +1729,35 @@ static bool generateReadImageInst(const StringRef DemangledCall, Register Lod = GR->buildConstantFP(APFloat::getZero(APFloat::IEEEsingle()), MIRBuilder); - SPIRVType *TempType = Call->ReturnType; - bool NeedsExtraction = false; - if (TempType->getOpcode() != SPIRV::OpTypeVector) { - TempType = - GR->getOrCreateSPIRVVectorType(Call->ReturnType, 4, MIRBuilder); - NeedsExtraction = true; - } - LLT LLType = LLT::scalar(GR->getScalarOrVectorBitWidth(TempType)); - Register TempRegister = MRI->createGenericVirtualRegister(LLType); - MRI->setRegClass(TempRegister, &SPIRV::iIDRegClass); - GR->assignSPIRVTypeToVReg(TempType, TempRegister, MIRBuilder.getMF()); - - MIRBuilder.buildInstr(SPIRV::OpImageSampleExplicitLod) - .addDef(NeedsExtraction ? 
TempRegister : Call->ReturnRegister) - .addUse(GR->getSPIRVTypeID(TempType)) - .addUse(SampledImage) - .addUse(Call->Arguments[2]) // Coordinate. - .addImm(SPIRV::ImageOperand::Lod) - .addUse(Lod); - if (NeedsExtraction) + if (Call->ReturnType->getOpcode() != SPIRV::OpTypeVector) { + SPIRVType *TempType = + GR->getOrCreateSPIRVVectorType(Call->ReturnType, 4, MIRBuilder); + Register TempRegister = + MRI->createGenericVirtualRegister(GR->getRegType(TempType)); + MRI->setRegClass(TempRegister, GR->getRegClass(TempType)); + GR->assignSPIRVTypeToVReg(TempType, TempRegister, MIRBuilder.getMF()); + MIRBuilder.buildInstr(SPIRV::OpImageSampleExplicitLod) + .addDef(TempRegister) + .addUse(GR->getSPIRVTypeID(TempType)) + .addUse(SampledImage) + .addUse(Call->Arguments[2]) // Coordinate. + .addImm(SPIRV::ImageOperand::Lod) + .addUse(Lod); MIRBuilder.buildInstr(SPIRV::OpCompositeExtract) .addDef(Call->ReturnRegister) .addUse(GR->getSPIRVTypeID(Call->ReturnType)) .addUse(TempRegister) .addImm(0); + } else { + MIRBuilder.buildInstr(SPIRV::OpImageSampleExplicitLod) + .addDef(Call->ReturnRegister) + .addUse(GR->getSPIRVTypeID(Call->ReturnType)) + .addUse(SampledImage) + .addUse(Call->Arguments[2]) // Coordinate. + .addImm(SPIRV::ImageOperand::Lod) + .addUse(Lod); + } } else if (HasMsaa) { MIRBuilder.buildInstr(SPIRV::OpImageRead) .addDef(Call->ReturnRegister) @@ -1841,9 +1779,6 @@ static bool generateReadImageInst(const StringRef DemangledCall, static bool generateWriteImageInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { - MIRBuilder.getMRI()->setRegClass(Call->Arguments[0], &SPIRV::iIDRegClass); - MIRBuilder.getMRI()->setRegClass(Call->Arguments[1], &SPIRV::iIDRegClass); - MIRBuilder.getMRI()->setRegClass(Call->Arguments[2], &SPIRV::iIDRegClass); MIRBuilder.buildInstr(SPIRV::OpImageWrite) .addUse(Call->Arguments[0]) // Image. .addUse(Call->Arguments[1]) // Coordinate. 
@@ -1898,10 +1833,6 @@ static bool generateSampleImageInst(const StringRef DemangledCall, "Unable to recognize SPIRV type name: " + ReturnType; report_fatal_error(DiagMsg.c_str()); } - MRI->setRegClass(Call->Arguments[0], &SPIRV::iIDRegClass); - MRI->setRegClass(Call->Arguments[1], &SPIRV::iIDRegClass); - MRI->setRegClass(Call->Arguments[3], &SPIRV::iIDRegClass); - MIRBuilder.buildInstr(SPIRV::OpImageSampleExplicitLod) .addDef(Call->ReturnRegister) .addUse(GR->getSPIRVTypeID(Type)) @@ -2020,7 +1951,6 @@ static bool buildNDRange(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { MachineRegisterInfo *MRI = MIRBuilder.getMRI(); - MRI->setRegClass(Call->Arguments[0], &SPIRV::iIDRegClass); SPIRVType *PtrType = GR->getSPIRVTypeForVReg(Call->Arguments[0]); assert(PtrType->getOpcode() == SPIRV::OpTypePointer && PtrType->getOperand(2).isReg()); @@ -2034,14 +1964,9 @@ static bool buildNDRange(const SPIRV::IncomingCall *Call, unsigned NumArgs = Call->Arguments.size(); assert(NumArgs >= 2); Register GlobalWorkSize = Call->Arguments[NumArgs < 4 ? 1 : 2]; - MRI->setRegClass(GlobalWorkSize, &SPIRV::iIDRegClass); Register LocalWorkSize = NumArgs == 2 ? Register(0) : Call->Arguments[NumArgs < 4 ? 2 : 3]; - if (LocalWorkSize.isValid()) - MRI->setRegClass(LocalWorkSize, &SPIRV::iIDRegClass); Register GlobalWorkOffset = NumArgs <= 3 ? Register(0) : Call->Arguments[1]; - if (GlobalWorkOffset.isValid()) - MRI->setRegClass(GlobalWorkOffset, &SPIRV::iIDRegClass); if (NumArgs < 4) { Register Const; SPIRVType *SpvTy = GR->getSPIRVTypeForVReg(GlobalWorkSize); @@ -2050,8 +1975,6 @@ static bool buildNDRange(const SPIRV::IncomingCall *Call, assert(DefInstr && isSpvIntrinsic(*DefInstr, Intrinsic::spv_gep) && DefInstr->getOperand(3).isReg()); Register GWSPtr = DefInstr->getOperand(3).getReg(); - if (!MRI->getRegClassOrNull(GWSPtr)) - MRI->setRegClass(GWSPtr, &SPIRV::iIDRegClass); // TODO: Maybe simplify generation of the type of the fields. 
unsigned Size = Call->Builtin->Name == "ndrange_3D" ? 3 : 2; unsigned BitWidth = GR->getPointerSize() == 64 ? 64 : 32; @@ -2140,10 +2063,10 @@ static bool buildEnqueueKernel(const SPIRV::IncomingCall *Call, auto GEPInst = MIRBuilder.buildIntrinsic( Intrinsic::spv_gep, ArrayRef{Reg}, true, false); GEPInst - .addImm(GepMI->getOperand(2).getImm()) // In bound. - .addUse(ArrayMI->getOperand(0).getReg()) // Alloca. - .addUse(buildConstantIntReg(0, MIRBuilder, GR)) // Indices. - .addUse(buildConstantIntReg(I, MIRBuilder, GR)); + .addImm(GepMI->getOperand(2).getImm()) // In bound. + .addUse(ArrayMI->getOperand(0).getReg()) // Alloca. + .addUse(buildConstantIntReg32(0, MIRBuilder, GR)) // Indices. + .addUse(buildConstantIntReg32(I, MIRBuilder, GR)); LocalSizes.push_back(Reg); } } @@ -2160,7 +2083,7 @@ static bool buildEnqueueKernel(const SPIRV::IncomingCall *Call, // If there are no event arguments in the original call, add dummy ones. if (!HasEvents) { - MIB.addUse(buildConstantIntReg(0, MIRBuilder, GR)); // Dummy num events. + MIB.addUse(buildConstantIntReg32(0, MIRBuilder, GR)); // Dummy num events. Register NullPtr = GR->getOrCreateConstNullPtr( MIRBuilder, getOrCreateSPIRVDeviceEventPointer(MIRBuilder, GR)); MIB.addUse(NullPtr); // Dummy wait events. @@ -2179,10 +2102,10 @@ static bool buildEnqueueKernel(const SPIRV::IncomingCall *Call, Type *PType = const_cast(getBlockStructType(BlockLiteralReg, MRI)); // TODO: these numbers should be obtained from block literal structure. // Param Size: Size of block literal structure. - MIB.addUse(buildConstantIntReg(DL.getTypeStoreSize(PType), MIRBuilder, GR)); + MIB.addUse(buildConstantIntReg32(DL.getTypeStoreSize(PType), MIRBuilder, GR)); // Param Aligment: Aligment of block literal structure. 
- MIB.addUse( - buildConstantIntReg(DL.getPrefTypeAlign(PType).value(), MIRBuilder, GR)); + MIB.addUse(buildConstantIntReg32(DL.getPrefTypeAlign(PType).value(), + MIRBuilder, GR)); for (unsigned i = 0; i < LocalSizes.size(); i++) MIB.addUse(LocalSizes[i]); @@ -2200,7 +2123,6 @@ static bool generateEnqueueInst(const SPIRV::IncomingCall *Call, switch (Opcode) { case SPIRV::OpRetainEvent: case SPIRV::OpReleaseEvent: - MIRBuilder.getMRI()->setRegClass(Call->Arguments[0], &SPIRV::iIDRegClass); return MIRBuilder.buildInstr(Opcode).addUse(Call->Arguments[0]); case SPIRV::OpCreateUserEvent: case SPIRV::OpGetDefaultQueue: @@ -2208,21 +2130,15 @@ static bool generateEnqueueInst(const SPIRV::IncomingCall *Call, .addDef(Call->ReturnRegister) .addUse(GR->getSPIRVTypeID(Call->ReturnType)); case SPIRV::OpIsValidEvent: - MIRBuilder.getMRI()->setRegClass(Call->Arguments[0], &SPIRV::iIDRegClass); return MIRBuilder.buildInstr(Opcode) .addDef(Call->ReturnRegister) .addUse(GR->getSPIRVTypeID(Call->ReturnType)) .addUse(Call->Arguments[0]); case SPIRV::OpSetUserEventStatus: - MIRBuilder.getMRI()->setRegClass(Call->Arguments[0], &SPIRV::iIDRegClass); - MIRBuilder.getMRI()->setRegClass(Call->Arguments[1], &SPIRV::iIDRegClass); return MIRBuilder.buildInstr(Opcode) .addUse(Call->Arguments[0]) .addUse(Call->Arguments[1]); case SPIRV::OpCaptureEventProfilingInfo: - MIRBuilder.getMRI()->setRegClass(Call->Arguments[0], &SPIRV::iIDRegClass); - MIRBuilder.getMRI()->setRegClass(Call->Arguments[1], &SPIRV::iIDRegClass); - MIRBuilder.getMRI()->setRegClass(Call->Arguments[2], &SPIRV::iIDRegClass); return MIRBuilder.buildInstr(Opcode) .addUse(Call->Arguments[0]) .addUse(Call->Arguments[1]) @@ -2250,7 +2166,7 @@ static bool generateAsyncCopy(const SPIRV::IncomingCall *Call, return buildOpFromWrapper(MIRBuilder, Opcode, Call, IsSet ? 
TypeReg : Register(0)); - auto Scope = buildConstantIntReg(SPIRV::Scope::Workgroup, MIRBuilder, GR); + auto Scope = buildConstantIntReg32(SPIRV::Scope::Workgroup, MIRBuilder, GR); switch (Opcode) { case SPIRV::OpGroupAsyncCopy: { @@ -2270,7 +2186,7 @@ static bool generateAsyncCopy(const SPIRV::IncomingCall *Call, .addUse(Call->Arguments[2]) .addUse(Call->Arguments.size() > 4 ? Call->Arguments[3] - : buildConstantIntReg(1, MIRBuilder, GR)) + : buildConstantIntReg32(1, MIRBuilder, GR)) .addUse(EventReg); if (NewType != nullptr) insertAssignInstr(Call->ReturnRegister, nullptr, NewType, GR, MIRBuilder, @@ -2435,22 +2351,15 @@ static bool generateLoadStoreInst(const SPIRV::IncomingCall *Call, // Add a pointer to the value to load/store. MIB.addUse(Call->Arguments[0]); MachineRegisterInfo *MRI = MIRBuilder.getMRI(); - MRI->setRegClass(Call->Arguments[0], &SPIRV::iIDRegClass); // Add a value to store. - if (!IsLoad) { + if (!IsLoad) MIB.addUse(Call->Arguments[1]); - MRI->setRegClass(Call->Arguments[1], &SPIRV::iIDRegClass); - } // Add optional memory attributes and an alignment. unsigned NumArgs = Call->Arguments.size(); - if ((IsLoad && NumArgs >= 2) || NumArgs >= 3) { + if ((IsLoad && NumArgs >= 2) || NumArgs >= 3) MIB.addImm(getConstFromIntrinsic(Call->Arguments[IsLoad ? 1 : 2], MRI)); - MRI->setRegClass(Call->Arguments[IsLoad ? 1 : 2], &SPIRV::iIDRegClass); - } - if ((IsLoad && NumArgs >= 3) || NumArgs >= 4) { + if ((IsLoad && NumArgs >= 3) || NumArgs >= 4) MIB.addImm(getConstFromIntrinsic(Call->Arguments[IsLoad ? 2 : 3], MRI)); - MRI->setRegClass(Call->Arguments[IsLoad ? 
2 : 3], &SPIRV::iIDRegClass); - } return true; } @@ -2540,12 +2449,13 @@ std::optional lowerBuiltin(const StringRef DemangledCall, Register ReturnRegister = OrigRet; SPIRVType *ReturnType = nullptr; if (OrigRetTy && !OrigRetTy->isVoidTy()) { - ReturnType = GR->assignTypeToVReg(OrigRetTy, OrigRet, MIRBuilder); + ReturnType = GR->assignTypeToVReg(OrigRetTy, ReturnRegister, MIRBuilder); if (!MIRBuilder.getMRI()->getRegClassOrNull(ReturnRegister)) - MIRBuilder.getMRI()->setRegClass(ReturnRegister, &SPIRV::iIDRegClass); + MIRBuilder.getMRI()->setRegClass(ReturnRegister, + GR->getRegClass(ReturnType)); } else if (OrigRetTy && OrigRetTy->isVoidTy()) { ReturnRegister = MIRBuilder.getMRI()->createVirtualRegister(&IDRegClass); - MIRBuilder.getMRI()->setType(ReturnRegister, LLT::scalar(32)); + MIRBuilder.getMRI()->setType(ReturnRegister, LLT::scalar(64)); ReturnType = GR->assignTypeToVReg(OrigRetTy, ReturnRegister, MIRBuilder); } diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 316abe866a163c..27a9cb0ba9b8c0 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -371,7 +371,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, } auto MRI = MIRBuilder.getMRI(); - Register FuncVReg = MRI->createGenericVirtualRegister(LLT::scalar(32)); + Register FuncVReg = MRI->createGenericVirtualRegister(LLT::scalar(64)); MRI->setRegClass(FuncVReg, &SPIRV::iIDRegClass); if (F.isDeclaration()) GR->add(&F, &MIRBuilder.getMF(), FuncVReg); @@ -403,12 +403,14 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, int i = 0; for (const auto &Arg : F.args()) { assert(VRegs[i].size() == 1 && "Formal arg has multiple vregs"); - MRI->setRegClass(VRegs[i][0], &SPIRV::iIDRegClass); + Register ArgReg = VRegs[i][0]; + MRI->setRegClass(ArgReg, GR->getRegClass(ArgTypeVRegs[i])); + MRI->setType(ArgReg, GR->getRegType(ArgTypeVRegs[i])); 
MIRBuilder.buildInstr(SPIRV::OpFunctionParameter) - .addDef(VRegs[i][0]) + .addDef(ArgReg) .addUse(GR->getSPIRVTypeID(ArgTypeVRegs[i])); if (F.isDeclaration()) - GR->add(&Arg, &MIRBuilder.getMF(), VRegs[i][0]); + GR->add(&Arg, &MIRBuilder.getMF(), ArgReg); i++; } // Name the function. @@ -532,10 +534,17 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, SmallVector ArgVRegs; for (auto Arg : Info.OrigArgs) { assert(Arg.Regs.size() == 1 && "Call arg has multiple VRegs"); - ArgVRegs.push_back(Arg.Regs[0]); - SPIRVType *SPIRVTy = GR->getOrCreateSPIRVType(Arg.Ty, MIRBuilder); - if (!GR->getSPIRVTypeForVReg(Arg.Regs[0])) - GR->assignSPIRVTypeToVReg(SPIRVTy, Arg.Regs[0], MF); + Register ArgReg = Arg.Regs[0]; + ArgVRegs.push_back(ArgReg); + SPIRVType *SpvType = GR->getSPIRVTypeForVReg(ArgReg); + if (!SpvType) { + SpvType = GR->getOrCreateSPIRVType(Arg.Ty, MIRBuilder); + GR->assignSPIRVTypeToVReg(SpvType, ArgReg, MF); + } + if (!MRI->getRegClassOrNull(ArgReg)) { + MRI->setRegClass(ArgReg, GR->getRegClass(SpvType)); + MRI->setType(ArgReg, GR->getRegType(SpvType)); + } } auto instructionSet = canUseOpenCL ? SPIRV::InstructionSet::OpenCL_std : SPIRV::InstructionSet::GLSL_std_450; @@ -557,7 +566,7 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, for (const Argument &Arg : CF->args()) { if (MIRBuilder.getDataLayout().getTypeStoreSize(Arg.getType()).isZero()) continue; // Don't handle zero sized types. 
- Register Reg = MRI->createGenericVirtualRegister(LLT::scalar(32)); + Register Reg = MRI->createGenericVirtualRegister(LLT::scalar(64)); MRI->setRegClass(Reg, &SPIRV::iIDRegClass); ToInsert.push_back({Reg}); VRegArgs.push_back(ToInsert.back()); diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 6702a0efc638ae..54bcb96772e7ab 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -72,17 +72,14 @@ void SPIRVGlobalRegistry::assignSPIRVTypeToVReg(SPIRVType *SpirvType, VRegToTypeMap[&MF][VReg] = SpirvType; } -static Register createTypeVReg(MachineIRBuilder &MIRBuilder) { - auto &MRI = MIRBuilder.getMF().getRegInfo(); - auto Res = MRI.createGenericVirtualRegister(LLT::scalar(32)); +static Register createTypeVReg(MachineRegisterInfo &MRI) { + auto Res = MRI.createGenericVirtualRegister(LLT::scalar(64)); MRI.setRegClass(Res, &SPIRV::TYPERegClass); return Res; } -static Register createTypeVReg(MachineRegisterInfo &MRI) { - auto Res = MRI.createGenericVirtualRegister(LLT::scalar(32)); - MRI.setRegClass(Res, &SPIRV::TYPERegClass); - return Res; +inline Register createTypeVReg(MachineIRBuilder &MIRBuilder) { + return createTypeVReg(MIRBuilder.getMF().getRegInfo()); } SPIRVType *SPIRVGlobalRegistry::getOpTypeBool(MachineIRBuilder &MIRBuilder) { @@ -157,26 +154,24 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeVector(uint32_t NumElems, return MIB; } -std::tuple +std::tuple SPIRVGlobalRegistry::getOrCreateConstIntReg(uint64_t Val, SPIRVType *SpvType, MachineIRBuilder *MIRBuilder, MachineInstr *I, const SPIRVInstrInfo *TII) { - const IntegerType *LLVMIntTy; - if (SpvType) - LLVMIntTy = cast(getTypeForSPIRVType(SpvType)); - else - LLVMIntTy = IntegerType::getInt32Ty(CurMF->getFunction().getContext()); + assert(SpvType); + const IntegerType *LLVMIntTy = + cast(getTypeForSPIRVType(SpvType)); + unsigned BitWidth = getScalarOrVectorBitWidth(SpvType); bool NewInstr = 
false; // Find a constant in DT or build a new one. ConstantInt *CI = ConstantInt::get(const_cast(LLVMIntTy), Val); Register Res = DT.find(CI, CurMF); if (!Res.isValid()) { - unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32; // TODO: handle cases where the type is not 32bit wide // TODO: https://github.com/llvm/llvm-project/issues/88129 - LLT LLTy = LLT::scalar(32); - Res = CurMF->getRegInfo().createGenericVirtualRegister(LLTy); + Res = + CurMF->getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth)); CurMF->getRegInfo().setRegClass(Res, &SPIRV::iIDRegClass); if (MIRBuilder) assignTypeToVReg(LLVMIntTy, Res, *MIRBuilder); @@ -185,7 +180,7 @@ SPIRVGlobalRegistry::getOrCreateConstIntReg(uint64_t Val, SPIRVType *SpvType, DT.add(CI, CurMF, Res); NewInstr = true; } - return std::make_tuple(Res, CI, NewInstr); + return std::make_tuple(Res, CI, NewInstr, BitWidth); } std::tuple @@ -193,27 +188,19 @@ SPIRVGlobalRegistry::getOrCreateConstFloatReg(APFloat Val, SPIRVType *SpvType, MachineIRBuilder *MIRBuilder, MachineInstr *I, const SPIRVInstrInfo *TII) { - const Type *LLVMFloatTy; + assert(SpvType); LLVMContext &Ctx = CurMF->getFunction().getContext(); - unsigned BitWidth = 32; - if (SpvType) - LLVMFloatTy = getTypeForSPIRVType(SpvType); - else { - LLVMFloatTy = Type::getFloatTy(Ctx); - if (MIRBuilder) - SpvType = getOrCreateSPIRVType(LLVMFloatTy, *MIRBuilder); - } + const Type *LLVMFloatTy = getTypeForSPIRVType(SpvType); + unsigned BitWidth = getScalarOrVectorBitWidth(SpvType); bool NewInstr = false; // Find a constant in DT or build a new one. 
auto *const CI = ConstantFP::get(Ctx, Val); Register Res = DT.find(CI, CurMF); if (!Res.isValid()) { - if (SpvType) - BitWidth = getScalarOrVectorBitWidth(SpvType); // TODO: handle cases where the type is not 32bit wide // TODO: https://github.com/llvm/llvm-project/issues/88129 - LLT LLTy = LLT::scalar(32); - Res = CurMF->getRegInfo().createGenericVirtualRegister(LLTy); + Res = + CurMF->getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth)); CurMF->getRegInfo().setRegClass(Res, &SPIRV::fIDRegClass); if (MIRBuilder) assignTypeToVReg(LLVMFloatTy, Res, *MIRBuilder); @@ -269,7 +256,8 @@ Register SPIRVGlobalRegistry::getOrCreateConstInt(uint64_t Val, MachineInstr &I, ConstantInt *CI; Register Res; bool New; - std::tie(Res, CI, New) = + unsigned BitWidth; + std::tie(Res, CI, New, BitWidth) = getOrCreateConstIntReg(Val, SpvType, nullptr, &I, &TII); // If we have found Res register which is defined by the passed G_CONSTANT // machine instruction, a new constant instruction should be created. @@ -281,7 +269,7 @@ Register SPIRVGlobalRegistry::getOrCreateConstInt(uint64_t Val, MachineInstr &I, MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantI)) .addDef(Res) .addUse(getSPIRVTypeID(SpvType)); - addNumImm(APInt(getScalarOrVectorBitWidth(SpvType), Val), MIB); + addNumImm(APInt(BitWidth, Val), MIB); } else { MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull)) .addDef(Res) @@ -297,19 +285,17 @@ Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val, MachineIRBuilder &MIRBuilder, SPIRVType *SpvType, bool EmitIR) { + assert(SpvType); auto &MF = MIRBuilder.getMF(); - const IntegerType *LLVMIntTy; - if (SpvType) - LLVMIntTy = cast(getTypeForSPIRVType(SpvType)); - else - LLVMIntTy = IntegerType::getInt32Ty(MF.getFunction().getContext()); + const IntegerType *LLVMIntTy = + cast(getTypeForSPIRVType(SpvType)); // Find a constant in DT or build a new one. 
const auto ConstInt = ConstantInt::get(const_cast(LLVMIntTy), Val); Register Res = DT.find(ConstInt, &MF); if (!Res.isValid()) { - unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32; - LLT LLTy = LLT::scalar(EmitIR ? BitWidth : 32); + unsigned BitWidth = getScalarOrVectorBitWidth(SpvType); + LLT LLTy = LLT::scalar(BitWidth); Res = MF.getRegInfo().createGenericVirtualRegister(LLTy); MF.getRegInfo().setRegClass(Res, &SPIRV::iIDRegClass); assignTypeToVReg(LLVMIntTy, Res, MIRBuilder, @@ -318,18 +304,17 @@ Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val, if (EmitIR) { MIRBuilder.buildConstant(Res, *ConstInt); } else { - if (!SpvType) - SpvType = getOrCreateSPIRVIntegerType(BitWidth, MIRBuilder); + Register SpvTypeReg = getSPIRVTypeID(SpvType); MachineInstrBuilder MIB; if (Val) { MIB = MIRBuilder.buildInstr(SPIRV::OpConstantI) .addDef(Res) - .addUse(getSPIRVTypeID(SpvType)); + .addUse(SpvTypeReg); addNumImm(APInt(BitWidth, Val), MIB); } else { MIB = MIRBuilder.buildInstr(SPIRV::OpConstantNull) .addDef(Res) - .addUse(getSPIRVTypeID(SpvType)); + .addUse(SpvTypeReg); } const auto &Subtarget = CurMF->getSubtarget(); constrainSelectedInstRegOperands(*MIB, *Subtarget.getInstrInfo(), @@ -353,7 +338,8 @@ Register SPIRVGlobalRegistry::buildConstantFP(APFloat Val, const auto ConstFP = ConstantFP::get(Ctx, Val); Register Res = DT.find(ConstFP, &MF); if (!Res.isValid()) { - Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(32)); + Res = MF.getRegInfo().createGenericVirtualRegister( + LLT::scalar(getScalarOrVectorBitWidth(SpvType))); MF.getRegInfo().setRegClass(Res, &SPIRV::fIDRegClass); assignSPIRVTypeToVReg(SpvType, Res, MF); DT.add(ConstFP, &MF, Res); @@ -407,7 +393,7 @@ Register SPIRVGlobalRegistry::getOrCreateCompositeOrNull( // TODO: handle cases where the type is not 32bit wide // TODO: https://github.com/llvm/llvm-project/issues/88129 - LLT LLTy = LLT::scalar(32); + LLT LLTy = LLT::scalar(64); Register SpvVecConst = 
CurMF->getRegInfo().createGenericVirtualRegister(LLTy); CurMF->getRegInfo().setRegClass(SpvVecConst, &SPIRV::iIDRegClass); @@ -509,7 +495,7 @@ Register SPIRVGlobalRegistry::getOrCreateIntCompositeOrNull( getOrCreateSPIRVIntegerType(BitWidth, MIRBuilder); SpvScalConst = buildConstantInt(Val, MIRBuilder, SpvBaseType, EmitIR); } - LLT LLTy = EmitIR ? LLT::fixed_vector(ElemCnt, BitWidth) : LLT::scalar(32); + LLT LLTy = EmitIR ? LLT::fixed_vector(ElemCnt, BitWidth) : LLT::scalar(64); Register SpvVecConst = CurMF->getRegInfo().createGenericVirtualRegister(LLTy); CurMF->getRegInfo().setRegClass(SpvVecConst, &SPIRV::iIDRegClass); @@ -650,7 +636,6 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( // Set to Reg the same type as ResVReg has. auto MRI = MIRBuilder.getMRI(); - assert(MRI->getType(ResVReg).isPointer() && "Pointer type is expected"); if (Reg != ResVReg) { LLT RegLLTy = LLT::pointer(MRI->getType(ResVReg).getAddressSpace(), getPointerSize()); @@ -706,8 +691,9 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeArray(uint32_t NumElems, bool EmitIR) { assert((ElemType->getOpcode() != SPIRV::OpTypeVoid) && "Invalid array element type"); + SPIRVType *SpvTypeInt32 = getOrCreateSPIRVIntegerType(32, MIRBuilder); Register NumElementsVReg = - buildConstantInt(NumElems, MIRBuilder, nullptr, EmitIR); + buildConstantInt(NumElems, MIRBuilder, SpvTypeInt32, EmitIR); auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeArray) .addDef(createTypeVReg(MIRBuilder)) .addUse(getSPIRVTypeID(ElemType)) @@ -1188,14 +1174,15 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeCoopMatr( if (ResVReg.isValid()) return MIRBuilder.getMF().getRegInfo().getUniqueVRegDef(ResVReg); ResVReg = createTypeVReg(MIRBuilder); + SPIRVType *SpvTypeInt32 = getOrCreateSPIRVIntegerType(32, MIRBuilder); SPIRVType *SpirvTy = MIRBuilder.buildInstr(SPIRV::OpTypeCooperativeMatrixKHR) .addDef(ResVReg) .addUse(getSPIRVTypeID(ElemType)) - .addUse(buildConstantInt(Scope, MIRBuilder, nullptr, true)) - 
.addUse(buildConstantInt(Rows, MIRBuilder, nullptr, true)) - .addUse(buildConstantInt(Columns, MIRBuilder, nullptr, true)) - .addUse(buildConstantInt(Use, MIRBuilder, nullptr, true)); + .addUse(buildConstantInt(Scope, MIRBuilder, SpvTypeInt32, true)) + .addUse(buildConstantInt(Rows, MIRBuilder, SpvTypeInt32, true)) + .addUse(buildConstantInt(Columns, MIRBuilder, SpvTypeInt32, true)) + .addUse(buildConstantInt(Use, MIRBuilder, SpvTypeInt32, true)); DT.add(ExtensionType, &MIRBuilder.getMF(), ResVReg); return SpirvTy; } @@ -1386,8 +1373,8 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVArrayType( if (Reg.isValid()) return getSPIRVTypeForVReg(Reg); MachineBasicBlock &BB = *I.getParent(); - SPIRVType *SpirvType = getOrCreateSPIRVIntegerType(32, I, TII); - Register Len = getOrCreateConstInt(NumElements, I, SpirvType, TII); + SPIRVType *SpvTypeInt32 = getOrCreateSPIRVIntegerType(32, I, TII); + Register Len = getOrCreateConstInt(NumElements, I, SpvTypeInt32, TII); auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypeArray)) .addDef(createTypeVReg(CurMF->getRegInfo())) .addUse(getSPIRVTypeID(BaseType)) @@ -1436,7 +1423,7 @@ Register SPIRVGlobalRegistry::getOrCreateUndef(MachineInstr &I, Register Res = DT.find(UV, CurMF); if (Res.isValid()) return Res; - LLT LLTy = LLT::scalar(32); + LLT LLTy = LLT::scalar(64); Res = CurMF->getRegInfo().createGenericVirtualRegister(LLTy); CurMF->getRegInfo().setRegClass(Res, &SPIRV::iIDRegClass); assignSPIRVTypeToVReg(SpvType, Res, *CurMF); @@ -1451,3 +1438,61 @@ Register SPIRVGlobalRegistry::getOrCreateUndef(MachineInstr &I, *ST.getRegisterInfo(), *ST.getRegBankInfo()); return Res; } + +const TargetRegisterClass * +SPIRVGlobalRegistry::getRegClass(SPIRVType *SpvType) const { + unsigned Opcode = SpvType->getOpcode(); + switch (Opcode) { + case SPIRV::OpTypeFloat: + return &SPIRV::fIDRegClass; + case SPIRV::OpTypePointer: + return &SPIRV::pIDRegClass; + case SPIRV::OpTypeVector: { + SPIRVType *ElemType = 
getSPIRVTypeForVReg(SpvType->getOperand(1).getReg()); + unsigned ElemOpcode = ElemType ? ElemType->getOpcode() : 0; + if (ElemOpcode == SPIRV::OpTypeFloat) + return &SPIRV::vfIDRegClass; + if (ElemOpcode == SPIRV::OpTypePointer) + return &SPIRV::vpIDRegClass; + return &SPIRV::vIDRegClass; + } + } + return &SPIRV::iIDRegClass; +} + +inline unsigned getAS(SPIRVType *SpvType) { + return storageClassToAddressSpace( + static_cast( + SpvType->getOperand(1).getImm())); +} + +LLT SPIRVGlobalRegistry::getRegType(SPIRVType *SpvType) const { + unsigned Opcode = SpvType ? SpvType->getOpcode() : 0; + switch (Opcode) { + case SPIRV::OpTypeInt: + case SPIRV::OpTypeFloat: + case SPIRV::OpTypeBool: + return LLT::scalar(getScalarOrVectorBitWidth(SpvType)); + case SPIRV::OpTypePointer: + return LLT::pointer(getAS(SpvType), getPointerSize()); + case SPIRV::OpTypeVector: { + SPIRVType *ElemType = getSPIRVTypeForVReg(SpvType->getOperand(1).getReg()); + LLT ET; + switch (ElemType ? ElemType->getOpcode() : 0) { + case SPIRV::OpTypePointer: + ET = LLT::pointer(getAS(ElemType), getPointerSize()); + break; + case SPIRV::OpTypeInt: + case SPIRV::OpTypeFloat: + case SPIRV::OpTypeBool: + ET = LLT::scalar(getScalarOrVectorBitWidth(ElemType)); + break; + default: + ET = LLT::scalar(64); + } + return LLT::fixed_vector( + static_cast(SpvType->getOperand(2).getImm()), ET); + } + } + return LLT::scalar(64); +} diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index 821c1218fcb7f0..a5cb86f4f1c638 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -430,7 +430,7 @@ class SPIRVGlobalRegistry { getOrCreateSpecialType(const Type *Ty, MachineIRBuilder &MIRBuilder, SPIRV::AccessQualifier::AccessQualifier AccQual); - std::tuple getOrCreateConstIntReg( + std::tuple getOrCreateConstIntReg( uint64_t Val, SPIRVType *SpvType, MachineIRBuilder *MIRBuilder, MachineInstr *I = nullptr, const SPIRVInstrInfo 
*TII = nullptr); std::tuple getOrCreateConstFloatReg( @@ -455,7 +455,7 @@ class SPIRVGlobalRegistry { public: Register buildConstantInt(uint64_t Val, MachineIRBuilder &MIRBuilder, - SPIRVType *SpvType = nullptr, bool EmitIR = true); + SPIRVType *SpvType, bool EmitIR = true); Register getOrCreateConstInt(uint64_t Val, MachineInstr &I, SPIRVType *SpvType, const SPIRVInstrInfo &TII, bool ZeroAsNull = true); @@ -550,6 +550,9 @@ class SPIRVGlobalRegistry { SPIRVType *getOrCreateOpTypeByOpcode(const Type *Ty, MachineIRBuilder &MIRBuilder, unsigned Opcode); + + const TargetRegisterClass *getRegClass(SPIRVType *SpvType) const; + LLT getRegType(SPIRVType *SpvType) const; }; } // end namespace llvm #endif // LLLVM_LIB_TARGET_SPIRV_SPIRVTYPEMANAGER_H diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp index 76419faa38e090..8db9808bb87e1d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp @@ -91,13 +91,9 @@ SPIRVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0u, RC); if (VT.isFloatingPoint()) - RC = VT.isVector() ? &SPIRV::vfIDRegClass - : (VT.getScalarSizeInBits() > 32 ? &SPIRV::fID64RegClass - : &SPIRV::fIDRegClass); + RC = VT.isVector() ? &SPIRV::vfIDRegClass : &SPIRV::fIDRegClass; else if (VT.isInteger()) - RC = VT.isVector() ? &SPIRV::vIDRegClass - : (VT.getScalarSizeInBits() > 32 ? &SPIRV::iID64RegClass - : &SPIRV::iIDRegClass); + RC = VT.isVector() ? 
&SPIRV::vIDRegClass : &SPIRV::iIDRegClass; else RC = &SPIRV::iIDRegClass; @@ -115,7 +111,7 @@ static void doInsertBitcast(const SPIRVSubtarget &STI, MachineRegisterInfo *MRI, SPIRVGlobalRegistry &GR, MachineInstr &I, Register OpReg, unsigned OpIdx, SPIRVType *NewPtrType) { - Register NewReg = MRI->createGenericVirtualRegister(LLT::scalar(32)); + Register NewReg = MRI->createGenericVirtualRegister(LLT::scalar(64)); MachineIRBuilder MIB(I); bool Res = MIB.buildInstr(SPIRV::OpBitcast) .addDef(NewReg) diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp index 12cf7613a45cf3..dac7640cdddd69 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp @@ -256,12 +256,9 @@ void SPIRVInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } bool SPIRVInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { - if (MI.getOpcode() == SPIRV::GET_ID || MI.getOpcode() == SPIRV::GET_ID64 || - MI.getOpcode() == SPIRV::GET_fID || MI.getOpcode() == SPIRV::GET_fID64 || - MI.getOpcode() == SPIRV::GET_pID32 || - MI.getOpcode() == SPIRV::GET_pID64 || MI.getOpcode() == SPIRV::GET_vfID || - MI.getOpcode() == SPIRV::GET_vID || MI.getOpcode() == SPIRV::GET_vpID32 || - MI.getOpcode() == SPIRV::GET_vpID64) { + if (MI.getOpcode() == SPIRV::GET_ID || MI.getOpcode() == SPIRV::GET_fID || + MI.getOpcode() == SPIRV::GET_pID || MI.getOpcode() == SPIRV::GET_vfID || + MI.getOpcode() == SPIRV::GET_vID || MI.getOpcode() == SPIRV::GET_vpID) { auto &MRI = MI.getMF()->getRegInfo(); MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); MI.eraseFromParent(); diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index c4b09dd6bfe430..79b9bb87739fec 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -15,18 +15,14 @@ include "SPIRVSymbolicOperands.td" // Codegen only metadata instructions let isCodeGenOnly=1 in { - def 
ASSIGN_TYPE: Pseudo<(outs ANYID:$dst_id), (ins ANYID:$src_id, TYPE:$src_ty)>; - def DECL_TYPE: Pseudo<(outs ANYID:$dst_id), (ins ANYID:$src_id, TYPE:$src_ty)>; - def GET_ID: Pseudo<(outs iID:$dst_id), (ins ANYID:$src)>; - def GET_ID64: Pseudo<(outs iID64:$dst_id), (ins ANYID:$src)>; - def GET_fID: Pseudo<(outs fID:$dst_id), (ins ANYID:$src)>; - def GET_fID64: Pseudo<(outs fID64:$dst_id), (ins ANYID:$src)>; - def GET_pID32: Pseudo<(outs pID32:$dst_id), (ins ANYID:$src)>; - def GET_pID64: Pseudo<(outs pID64:$dst_id), (ins ANYID:$src)>; - def GET_vID: Pseudo<(outs vID:$dst_id), (ins ANYID:$src)>; - def GET_vfID: Pseudo<(outs vfID:$dst_id), (ins ANYID:$src)>; - def GET_vpID32: Pseudo<(outs vpID32:$dst_id), (ins ANYID:$src)>; - def GET_vpID64: Pseudo<(outs vpID64:$dst_id), (ins ANYID:$src)>; + def ASSIGN_TYPE: Pseudo<(outs ID:$dst_id), (ins ID:$src_id, TYPE:$src_ty)>; + def DECL_TYPE: Pseudo<(outs ID:$dst_id), (ins ID:$src_id, TYPE:$src_ty)>; + def GET_ID: Pseudo<(outs iID:$dst_id), (ins iID:$src)>; + def GET_fID: Pseudo<(outs fID:$dst_id), (ins fID:$src)>; + def GET_pID: Pseudo<(outs pID:$dst_id), (ins pID:$src)>; + def GET_vID: Pseudo<(outs vID:$dst_id), (ins vID:$src)>; + def GET_vfID: Pseudo<(outs vfID:$dst_id), (ins vfID:$src)>; + def GET_vpID: Pseudo<(outs vpID:$dst_id), (ins vpID:$src)>; } def SPVTypeBin : SDTypeProfile<1, 2, []>; @@ -36,16 +32,18 @@ def assigntype : SDNode<"SPIRVISD::AssignType", SPVTypeBin>; def : GINodeEquiv; class BinOp opCode, list pattern=[]> - : Op; class BinOpTyped opCode, RegisterClass CID, SDNode node> - : Op; + : Op; class TernOpTyped opCode, RegisterClass CCond, RegisterClass CID, SDNode node> - : Op; + : Op; multiclass BinOpTypedGen opCode, SDNode node, bit genF = 0, bit genV = 0> { if genF then @@ -61,44 +59,40 @@ multiclass BinOpTypedGen opCode, SDNode node, bit genF = 0 } multiclass TernOpTypedGen opCode, SDNode node, bit genP = 1, bit genI = 1, bit genF = 0, bit genV = 0> { - if genF then { - def SFSCond: TernOpTyped; - def 
SFVCond: TernOpTyped; + if genP then { + def SPSCond: TernOpTyped; + def SPVCond: TernOpTyped; } if genI then { def SISCond: TernOpTyped; def SIVCond: TernOpTyped; } - if genP then { - def SPSCond32: TernOpTyped; - def SPVCond32: TernOpTyped; - def SPSCond64: TernOpTyped; - def SPVCond64: TernOpTyped; + if genF then { + def SFSCond: TernOpTyped; + def SFVCond: TernOpTyped; } if genV then { - if genF then { - def VFSCond: TernOpTyped; - def VFVCond: TernOpTyped; + if genP then { + def VPSCond: TernOpTyped; + def VPVCond: TernOpTyped; } if genI then { def VISCond: TernOpTyped; def VIVCond: TernOpTyped; } - if genP then { - def VPSCond32: TernOpTyped; - def VPVCond32: TernOpTyped; - def VPSCond64: TernOpTyped; - def VPVCond64: TernOpTyped; + if genF then { + def VFSCond: TernOpTyped; + def VFVCond: TernOpTyped; } } } class UnOp opCode, list pattern=[]> - : Op; class UnOpTyped opCode, RegisterClass CID, SDNode node> - : Op; + : Op; class SimpleOp opCode>: Op; @@ -152,9 +146,12 @@ def OpMemberDecorateString: Op<5633, (outs), def OpExtension: Op<10, (outs), (ins StringImm:$name, variable_ops), "OpExtension $name">; def OpExtInstImport: Op<11, (outs ID:$res), (ins StringImm:$extInstsName, variable_ops), "$res = OpExtInstImport $extInstsName">; -def OpExtInst: Op<12, (outs ID:$res), (ins TYPE:$ty, ID:$set, Extension:$inst, variable_ops), +// $set should have been a register by the SPIR-V specification, +// however, OpExtInst and OpExtInstImport get its own special case treatment +// after instruction selection, so `i32imm` is the correct definition from the +// perspective of the instruction selection pass +def OpExtInst: Op<12, (outs ID:$res), (ins TYPE:$ty, i32imm:$set, Extension:$inst, variable_ops), "$res = OpExtInst $ty $set $inst">; - // 3.42.5 Mode-Setting Instructions def OpMemoryModel: Op<14, (outs), (ins AddressingModel:$addr, MemoryModel:$mem), @@ -222,21 +219,21 @@ return CurDAG->getTargetConstant( N->getValueAP().bitcastToAPInt().getZExtValue(), SDLoc(N), 
MVT::i32); }]>; -def fimm_to_i32 : SDNodeXFormgetTargetConstant( - N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32); + N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); }]>; -def gi_bitcast_fimm_to_i32 : GICustomOperandRenderer<"renderFImm32">, - GISDNodeXFormEquiv; +def gi_bitcast_fimm_to_i64 : GICustomOperandRenderer<"renderFImm64">, + GISDNodeXFormEquiv; def gi_bitcast_imm_to_i32 : GICustomOperandRenderer<"renderImm32">, GISDNodeXFormEquiv; -def PseudoConstI: IntImmLeaf; -def PseudoConstF: FPImmLeaf; -def ConstPseudoTrue: IntImmLeaf; -def ConstPseudoFalse: IntImmLeaf; +def PseudoConstI: IntImmLeaf; +def PseudoConstF: FPImmLeaf; +def ConstPseudoTrue: IntImmLeaf; +def ConstPseudoFalse: IntImmLeaf; def ConstPseudoNull: IntImmLeaf; multiclass IntFPImm opCode, string name> { @@ -634,7 +631,7 @@ let isTerminator=1 in { let isReturn = 1, hasDelaySlot=0, isBarrier = 0, isTerminator=1, isNotDuplicable = 1 in { def OpKill: SimpleOp<"OpKill", 252>; def OpReturn: SimpleOp<"OpReturn", 253>; - def OpReturnValue: Op<254, (outs), (ins ANYID:$ret), "OpReturnValue $ret">; + def OpReturnValue: Op<254, (outs), (ins ID:$ret), "OpReturnValue $ret">; def OpUnreachable: SimpleOp<"OpUnreachable", 255>; } def OpLifetimeStart: Op<256, (outs), (ins ID:$ptr, i32imm:$sz), "OpLifetimeStart $ptr, $sz">; @@ -862,9 +859,9 @@ def OpGroupLogicalXorKHR: Op<6408, (outs ID:$res), (ins TYPE:$type, ID:$scope, i "$res = OpGroupLogicalXorKHR $type $scope $groupOp $value">; // Inline Assembly Instructions -def OpAsmTargetINTEL: Op<5609, (outs ID:$res), (ins StringImm:$str), "$res = OpAsmTargetINTEL $str">; +def OpAsmTargetINTEL: Op<5609, (outs ID:$res), (ins StringImm:$str, variable_ops), "$res = OpAsmTargetINTEL $str">; def OpAsmINTEL: Op<5610, (outs ID:$res), (ins TYPE:$type, TYPE:$asm_type, ID:$target, - StringImm:$asm, StringImm:$constraints), + StringImm:$asm, StringImm:$constraints, variable_ops), "$res = OpAsmINTEL $type $asm_type $target $asm">; def 
OpAsmCallINTEL: Op<5611, (outs ID:$res), (ins TYPE:$type, ID:$asm, variable_ops), "$res = OpAsmCallINTEL $type $asm">; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index ecb3cee4e781af..1104b6a7212935 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -186,7 +186,7 @@ class SPIRVInstructionSelector : public InstructionSelector { void renderImm32(MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const; - void renderFImm32(MachineInstrBuilder &MIB, const MachineInstr &I, + void renderFImm64(MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const; bool selectConst(Register ResVReg, const SPIRVType *ResType, const APInt &Imm, @@ -307,11 +307,13 @@ void SPIRVInstructionSelector::resetVRegsType(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { Register Reg = Register::index2VirtReg(I); - LLT Ty = MRI.getType(Reg); - if (Ty.isScalar()) - MRI.setType(Reg, LLT::scalar(32)); - else if (Ty.isVector() && !Ty.isPointer()) - MRI.setType(Reg, LLT::scalar(32)); + LLT RegType = MRI.getType(Reg); + if (RegType.isScalar()) + MRI.setType(Reg, LLT::scalar(64)); + else if (RegType.isPointer()) + MRI.setType(Reg, LLT::pointer(0, 64)); + else if (RegType.isVector()) + MRI.setType(Reg, LLT::fixed_vector(2, LLT::scalar(64))); } for (const auto &MBB : MF) { for (const auto &MI : MBB) { @@ -351,20 +353,24 @@ bool SPIRVInstructionSelector::select(MachineInstr &I) { Register SrcReg = I.getOperand(1).getReg(); auto *Def = MRI->getVRegDef(SrcReg); if (isTypeFoldingSupported(Def->getOpcode())) { - if (MRI->getType(DstReg).isPointer()) - MRI->setType(DstReg, LLT::scalar(32)); bool Res = selectImpl(I, *CoverageInfo); + LLVM_DEBUG({ + if (!Res && Def->getOpcode() != TargetOpcode::G_CONSTANT) { + dbgs() << "Unexpected pattern in ASSIGN_TYPE.\nInstruction: "; + 
I.print(dbgs()); + } + }); assert(Res || Def->getOpcode() == TargetOpcode::G_CONSTANT); if (Res) return Res; } - MRI->setRegClass(DstReg, &SPIRV::iIDRegClass); + MRI->setRegClass(SrcReg, MRI->getRegClass(DstReg)); MRI->replaceRegWith(SrcReg, DstReg); I.removeFromParent(); return true; } else if (I.getNumDefs() == 1) { - // Make all vregs 32 bits (for SPIR-V IDs). - MRI->setType(I.getOperand(0).getReg(), LLT::scalar(32)); + // Make all vregs 64 bits (for SPIR-V IDs). + MRI->setType(I.getOperand(0).getReg(), LLT::scalar(64)); } return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } @@ -381,9 +387,9 @@ bool SPIRVInstructionSelector::select(MachineInstr &I) { SPIRVType *ResType = HasDefs ? GR.getSPIRVTypeForVReg(ResVReg) : nullptr; assert(!HasDefs || ResType || I.getOpcode() == TargetOpcode::G_GLOBAL_VALUE); if (spvSelect(ResVReg, ResType, I)) { - if (HasDefs) // Make all vregs 32 bits (for SPIR-V IDs). + if (HasDefs) // Make all vregs 64 bits (for SPIR-V IDs). for (unsigned i = 0; i < I.getNumDefs(); ++i) - MRI->setType(I.getOperand(i).getReg(), LLT::scalar(32)); + MRI->setType(I.getOperand(i).getReg(), LLT::scalar(64)); I.removeFromParent(); return true; } @@ -909,7 +915,7 @@ bool SPIRVInstructionSelector::selectMemOperation(Register ResVReg, GlobalVariable *GV = new GlobalVariable(*CurFunction.getParent(), LLVMArrTy, true, GlobalValue::InternalLinkage, Constant::getNullValue(LLVMArrTy)); - Register VarReg = MRI->createGenericVirtualRegister(LLT::scalar(32)); + Register VarReg = MRI->createGenericVirtualRegister(LLT::scalar(64)); GR.add(GV, GR.CurMF, VarReg); buildOpDecorate(VarReg, I, TII, SPIRV::Decoration::Constant, {}); @@ -921,7 +927,7 @@ bool SPIRVInstructionSelector::selectMemOperation(Register ResVReg, .constrainAllUses(TII, TRI, RBI); SPIRVType *SourceTy = GR.getOrCreateSPIRVPointerType( ValTy, I, TII, SPIRV::StorageClass::UniformConstant); - SrcReg = MRI->createGenericVirtualRegister(LLT::scalar(32)); + SrcReg = 
MRI->createGenericVirtualRegister(LLT::scalar(64)); selectUnOpWithSrc(SrcReg, SourceTy, I, VarReg, SPIRV::OpBitcast); } auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCopyMemorySized)) @@ -996,7 +1002,7 @@ bool SPIRVInstructionSelector::selectUnmergeValues(MachineInstr &I) const { if (!ResType) { // There was no "assign type" actions, let's fix this now ResType = ScalarType; - MRI->setRegClass(ResVReg, &SPIRV::iIDRegClass); + MRI->setRegClass(ResVReg, GR.getRegClass(ResType)); MRI->setType(ResVReg, LLT::scalar(GR.getScalarOrVectorBitWidth(ResType))); GR.assignSPIRVTypeToVReg(ResType, ResVReg, *GR.CurMF); } @@ -1716,7 +1722,7 @@ bool SPIRVInstructionSelector::selectICmp(Register ResVReg, return selectCmp(ResVReg, ResType, CmpOpc, I); } -void SPIRVInstructionSelector::renderFImm32(MachineInstrBuilder &MIB, +void SPIRVInstructionSelector::renderFImm64(MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const { assert(I.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 && @@ -1743,7 +1749,7 @@ SPIRVInstructionSelector::buildI32Constant(uint32_t Val, MachineInstr &I, auto ConstInt = ConstantInt::get(LLVMTy, Val); Register NewReg = GR.find(ConstInt, GR.CurMF); if (!NewReg.isValid()) { - NewReg = MRI->createGenericVirtualRegister(LLT::scalar(32)); + NewReg = MRI->createGenericVirtualRegister(LLT::scalar(64)); GR.add(ConstInt, GR.CurMF, NewReg); MachineInstr *MI; MachineBasicBlock &BB = *I.getParent(); @@ -1857,12 +1863,17 @@ bool SPIRVInstructionSelector::selectExt(Register ResVReg, return selectSelect(ResVReg, ResType, I, IsSigned); SPIRVType *SrcType = GR.getSPIRVTypeForVReg(SrcReg); - if (SrcType == ResType) + if (SrcType == ResType) { + const TargetRegisterClass *DstRC = MRI->getRegClassOrNull(ResVReg); + const TargetRegisterClass *SrcRC = MRI->getRegClassOrNull(SrcReg); + if (DstRC != SrcRC && SrcRC) + MRI->setRegClass(ResVReg, SrcRC); return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY)) .addDef(ResVReg) 
.addUse(SrcReg) .constrainAllUses(TII, TRI, RBI); + } unsigned Opcode = IsSigned ? SPIRV::OpSConvert : SPIRV::OpUConvert; return selectUnOp(ResVReg, ResType, I, Opcode); @@ -1901,12 +1912,17 @@ bool SPIRVInstructionSelector::selectTrunc(Register ResVReg, const SPIRVType *ArgType = GR.getSPIRVTypeForVReg(IntReg); if (GR.isScalarOrVectorOfType(ResVReg, SPIRV::OpTypeBool)) return selectIntToBool(IntReg, ResVReg, I, ArgType, ResType); - if (ArgType == ResType) + if (ArgType == ResType) { + const TargetRegisterClass *DstRC = MRI->getRegClassOrNull(ResVReg); + const TargetRegisterClass *SrcRC = MRI->getRegClassOrNull(IntReg); + if (DstRC != SrcRC && SrcRC) + MRI->setRegClass(ResVReg, SrcRC); return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY)) .addDef(ResVReg) .addUse(IntReg) .constrainAllUses(TII, TRI, RBI); + } bool IsSigned = GR.isScalarOrVectorSigned(ResType); unsigned Opcode = IsSigned ? SPIRV::OpSConvert : SPIRV::OpUConvert; return selectUnOp(ResVReg, ResType, I, Opcode); @@ -2089,7 +2105,7 @@ bool SPIRVInstructionSelector::wrapIntoSpecConstantOp( GR.add(OpDefine, MF, WrapReg); CompositeArgs.push_back(WrapReg); // Decorate the wrapper register and generate a new instruction - MRI->setType(WrapReg, LLT::pointer(0, 32)); + MRI->setType(WrapReg, LLT::pointer(0, 64)); GR.assignSPIRVTypeToVReg(OpType, WrapReg, *MF); MachineBasicBlock &BB = *I.getParent(); Result = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSpecConstantOp)) @@ -2402,7 +2418,7 @@ bool SPIRVInstructionSelector::selectGlobalValue( // registers without a definition. We will resolve it later, during // module analysis stage. 
MachineRegisterInfo *MRI = MIRBuilder.getMRI(); - Register FuncVReg = MRI->createGenericVirtualRegister(LLT::scalar(32)); + Register FuncVReg = MRI->createGenericVirtualRegister(LLT::scalar(64)); MRI->setRegClass(FuncVReg, &SPIRV::iIDRegClass); MachineInstrBuilder MB = BuildMI(BB, I, I.getDebugLoc(), @@ -2470,7 +2486,7 @@ bool SPIRVInstructionSelector::selectLog10(Register ResVReg, MachineBasicBlock &BB = *I.getParent(); // Build log2(x). - Register VarReg = MRI->createVirtualRegister(&SPIRV::iIDRegClass); + Register VarReg = MRI->createVirtualRegister(GR.getRegClass(ResType)); bool Result = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst)) .addDef(VarReg) @@ -2522,7 +2538,7 @@ bool SPIRVInstructionSelector::selectSpvThreadId(Register ResVReg, // Create new register for GlobalInvocationID builtin variable. Register NewRegister = MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::iIDRegClass); - MIRBuilder.getMRI()->setType(NewRegister, LLT::pointer(0, 32)); + MIRBuilder.getMRI()->setType(NewRegister, LLT::pointer(0, 64)); GR.assignSPIRVTypeToVReg(PtrType, NewRegister, MIRBuilder.getMF()); // Build GlobalInvocationID global variable with the necessary decorations. @@ -2535,7 +2551,7 @@ bool SPIRVInstructionSelector::selectSpvThreadId(Register ResVReg, // Create new register for loading value. MachineRegisterInfo *MRI = MIRBuilder.getMRI(); Register LoadedRegister = MRI->createVirtualRegister(&SPIRV::iIDRegClass); - MIRBuilder.getMRI()->setType(LoadedRegister, LLT::pointer(0, 32)); + MIRBuilder.getMRI()->setType(LoadedRegister, LLT::pointer(0, 64)); GR.assignSPIRVTypeToVReg(Vec3Ty, LoadedRegister, MIRBuilder.getMF()); // Load v3uint value from the global variable. 
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index e775f8c57b048e..9fe4d8a16bc32a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -54,6 +54,13 @@ bool isTypeFoldingSupported(unsigned Opcode) { return TypeFoldingSupportingOpcs.count(Opcode) > 0; } +LegalityPredicate typeOfExtendedScalars(unsigned TypeIdx, bool IsExtendedInts) { + return [IsExtendedInts, TypeIdx](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + return IsExtendedInts && Ty.isValid() && Ty.isScalar(); + }; +} + SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { using namespace TargetOpcode; @@ -142,7 +149,27 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { p2, p3, p4, p5, p6}; auto allPtrs = {p0, p1, p2, p3, p4, p5, p6}; - auto allWritablePtrs = {p0, p1, p3, p4, p5, p6}; + + bool IsExtendedInts = + ST.canUseExtension( + SPIRV::Extension::SPV_INTEL_arbitrary_precision_integers) || + ST.canUseExtension(SPIRV::Extension::SPV_KHR_bit_instructions); + auto extendedScalarsAndVectors = + [IsExtendedInts](const LegalityQuery &Query) { + const LLT Ty = Query.Types[0]; + return IsExtendedInts && Ty.isValid() && !Ty.isPointerOrPointerVector(); + }; + auto extendedScalarsAndVectorsProduct = [IsExtendedInts]( + const LegalityQuery &Query) { + const LLT Ty1 = Query.Types[0], Ty2 = Query.Types[1]; + return IsExtendedInts && Ty1.isValid() && Ty2.isValid() && + !Ty1.isPointerOrPointerVector() && !Ty2.isPointerOrPointerVector(); + }; + auto extendedPtrsScalarsAndVectors = + [IsExtendedInts](const LegalityQuery &Query) { + const LLT Ty = Query.Types[0]; + return IsExtendedInts && Ty.isValid(); + }; for (auto Opc : TypeFoldingSupportingOpcs) getActionDefinitionsBuilder(Opc).custom(); @@ -173,17 +200,21 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { getActionDefinitionsBuilder(G_UNMERGE_VALUES).alwaysLegal(); 
getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE}) - .legalIf(all(typeInSet(0, allWritablePtrs), typeInSet(1, allPtrs))); + .legalIf(all(typeInSet(0, allPtrs), typeInSet(1, allPtrs))); getActionDefinitionsBuilder(G_MEMSET).legalIf( - all(typeInSet(0, allWritablePtrs), typeInSet(1, allIntScalars))); + all(typeInSet(0, allPtrs), typeInSet(1, allIntScalars))); getActionDefinitionsBuilder(G_ADDRSPACE_CAST) .legalForCartesianProduct(allPtrs, allPtrs); getActionDefinitionsBuilder({G_LOAD, G_STORE}).legalIf(typeInSet(1, allPtrs)); - getActionDefinitionsBuilder(G_BITREVERSE).legalFor(allIntScalarsAndVectors); + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS, + G_BITREVERSE, G_SADDSAT, G_UADDSAT, G_SSUBSAT, + G_USUBSAT}) + .legalFor(allIntScalarsAndVectors) + .legalIf(extendedScalarsAndVectors); getActionDefinitionsBuilder(G_FMA).legalFor(allFloatScalarsAndVectors); @@ -195,13 +226,18 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { .legalForCartesianProduct(allFloatScalarsAndVectors, allScalarsAndVectors); - getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) - .legalFor(allIntScalarsAndVectors); + getActionDefinitionsBuilder(G_CTPOP) + .legalForCartesianProduct(allIntScalarsAndVectors) + .legalIf(extendedScalarsAndVectorsProduct); - getActionDefinitionsBuilder(G_CTPOP).legalForCartesianProduct( - allIntScalarsAndVectors, allIntScalarsAndVectors); + // Extensions. 
+ getActionDefinitionsBuilder({G_TRUNC, G_ZEXT, G_SEXT, G_ANYEXT}) + .legalForCartesianProduct(allScalarsAndVectors) + .legalIf(extendedScalarsAndVectorsProduct); - getActionDefinitionsBuilder(G_PHI).legalFor(allPtrsScalarsAndVectors); + getActionDefinitionsBuilder(G_PHI) + .legalFor(allPtrsScalarsAndVectors) + .legalIf(extendedPtrsScalarsAndVectors); getActionDefinitionsBuilder(G_BITCAST).legalIf( all(typeInSet(0, allPtrsScalarsAndVectors), @@ -212,11 +248,17 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).alwaysLegal(); getActionDefinitionsBuilder(G_INTTOPTR) - .legalForCartesianProduct(allPtrs, allIntScalars); + .legalForCartesianProduct(allPtrs, allIntScalars) + .legalIf( + all(typeInSet(0, allPtrs), typeOfExtendedScalars(1, IsExtendedInts))); getActionDefinitionsBuilder(G_PTRTOINT) - .legalForCartesianProduct(allIntScalars, allPtrs); - getActionDefinitionsBuilder(G_PTR_ADD).legalForCartesianProduct( - allPtrs, allIntScalars); + .legalForCartesianProduct(allIntScalars, allPtrs) + .legalIf( + all(typeOfExtendedScalars(0, IsExtendedInts), typeInSet(1, allPtrs))); + getActionDefinitionsBuilder(G_PTR_ADD) + .legalForCartesianProduct(allPtrs, allIntScalars) + .legalIf( + all(typeInSet(0, allPtrs), typeOfExtendedScalars(1, IsExtendedInts))); // ST.canDirectlyComparePointers() for pointer args is supported in // legalizeCustom(). 
@@ -232,14 +274,14 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_SUB, G_ATOMICRMW_XOR, G_ATOMICRMW_UMAX, G_ATOMICRMW_UMIN}) - .legalForCartesianProduct(allIntScalars, allWritablePtrs); + .legalForCartesianProduct(allIntScalars, allPtrs); getActionDefinitionsBuilder( {G_ATOMICRMW_FADD, G_ATOMICRMW_FSUB, G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX}) - .legalForCartesianProduct(allFloatScalars, allWritablePtrs); + .legalForCartesianProduct(allFloatScalars, allPtrs); getActionDefinitionsBuilder(G_ATOMICRMW_XCHG) - .legalForCartesianProduct(allFloatAndIntScalarsAndPtrs, allWritablePtrs); + .legalForCartesianProduct(allFloatAndIntScalarsAndPtrs, allPtrs); getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS).lower(); // TODO: add proper legalization rules. @@ -248,10 +290,6 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { getActionDefinitionsBuilder({G_UADDO, G_USUBO, G_SMULO, G_UMULO}) .alwaysLegal(); - // Extensions. - getActionDefinitionsBuilder({G_TRUNC, G_ZEXT, G_SEXT, G_ANYEXT}) - .legalForCartesianProduct(allScalarsAndVectors); - // FP conversions. getActionDefinitionsBuilder({G_FPTRUNC, G_FPEXT}) .legalForCartesianProduct(allFloatScalarsAndVectors); @@ -311,10 +349,6 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { // Struct return types become a single scalar, so cannot easily legalize. 
getActionDefinitionsBuilder({G_SMULH, G_UMULH}).alwaysLegal(); - - // supported saturation arithmetic - getActionDefinitionsBuilder({G_SADDSAT, G_UADDSAT, G_SSUBSAT, G_USUBSAT}) - .legalFor(allIntScalarsAndVectors); } getLegacyLegalizerInfo().computeTables(); diff --git a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp index 5ec228416a8886..4e903a705bc535 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp @@ -56,11 +56,9 @@ extern void processInstr(MachineInstr &MI, MachineIRBuilder &MIB, } // namespace llvm static bool isMetaInstrGET(unsigned Opcode) { - return Opcode == SPIRV::GET_ID || Opcode == SPIRV::GET_ID64 || - Opcode == SPIRV::GET_fID || Opcode == SPIRV::GET_fID64 || - Opcode == SPIRV::GET_pID32 || Opcode == SPIRV::GET_pID64 || - Opcode == SPIRV::GET_vID || Opcode == SPIRV::GET_vfID || - Opcode == SPIRV::GET_vpID32 || Opcode == SPIRV::GET_vpID64; + return Opcode == SPIRV::GET_ID || Opcode == SPIRV::GET_fID || + Opcode == SPIRV::GET_pID || Opcode == SPIRV::GET_vID || + Opcode == SPIRV::GET_vfID || Opcode == SPIRV::GET_vpID; } static bool mayBeInserted(unsigned Opcode) { @@ -126,9 +124,8 @@ static void processNewInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, if (!ResVType) continue; // Set type & class - MRI.setRegClass(ResVReg, &SPIRV::iIDRegClass); - MRI.setType(ResVReg, - LLT::scalar(GR->getScalarOrVectorBitWidth(ResVType))); + MRI.setRegClass(ResVReg, GR->getRegClass(ResVType)); + MRI.setType(ResVReg, GR->getRegType(ResVType)); GR->assignSPIRVTypeToVReg(ResVType, ResVReg, *GR->CurMF); } // If this is a simple operation that is to be reduced by TableGen @@ -140,10 +137,6 @@ static void processNewInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, continue; // Restore usual instructions pattern for the newly inserted // instruction - MRI.setRegClass(ResVReg, MRI.getType(ResVReg).isVector() - ? 
&SPIRV::iIDRegClass - : &SPIRV::ANYIDRegClass); - MRI.setType(ResVReg, LLT::scalar(32)); insertAssignInstr(ResVReg, nullptr, ResVType, GR, MIB, MRI); processInstr(I, MIB, MRI, GR); } diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 7c158abba3c28c..4e45aa4b888a92 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -193,14 +193,24 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, // If the ptrcast would be redundant, replace all uses with the source // register. + MachineRegisterInfo *MRI = MIB.getMRI(); if (GR->getSPIRVTypeForVReg(Source) == AssignedPtrType) { // Erase Def's assign type instruction if we are going to replace Def. - if (MachineInstr *AssignMI = findAssignTypeInstr(Def, MIB.getMRI())) + if (MachineInstr *AssignMI = findAssignTypeInstr(Def, MRI)) ToErase.push_back(AssignMI); - MIB.getMRI()->replaceRegWith(Def, Source); + MRI->replaceRegWith(Def, Source); } else { GR->assignSPIRVTypeToVReg(AssignedPtrType, Def, MF); MIB.buildBitcast(Def, Source); + // MachineVerifier requires that bitcast must change the type. + // Change AddressSpace if needed to hint that Def and Source points to + // different types: this doesn't change actual code generation. + LLT DefType = MRI->getType(Def); + if (DefType == MRI->getType(Source)) + MRI->setType(Def, + LLT::pointer((DefType.getAddressSpace() + 1) % + SPIRVSubtarget::MaxLegalAddressSpace, + GR->getPointerSize())); } } } @@ -287,7 +297,8 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR, if (SpvType) GR->assignSPIRVTypeToVReg(SpvType, Reg, MIB.getMF()); if (!MRI.getRegClassOrNull(Reg)) - MRI.setRegClass(Reg, &SPIRV::iIDRegClass); + MRI.setRegClass(Reg, SpvType ? 
GR->getRegClass(SpvType) + : &SPIRV::iIDRegClass); } } return SpvType; @@ -308,68 +319,26 @@ static void widenScalarLLTNextPow2(Register Reg, MachineRegisterInfo &MRI) { MRI.setType(Reg, LLT::scalar(NewSz)); } -inline bool getIsFloat(SPIRVType *SpvType, const SPIRVGlobalRegistry &GR) { - bool IsFloat = SpvType->getOpcode() == SPIRV::OpTypeFloat; - return IsFloat ? true - : SpvType->getOpcode() == SPIRV::OpTypeVector && - GR.getSPIRVTypeForVReg(SpvType->getOperand(1).getReg()) - ->getOpcode() == SPIRV::OpTypeFloat; -} - -static const TargetRegisterClass *getRegClass(SPIRVType *SpvType, - const SPIRVGlobalRegistry &GR) { - unsigned Opcode = SpvType->getOpcode(); - switch (Opcode) { - case SPIRV::OpTypeFloat: - return &SPIRV::fIDRegClass; - case SPIRV::OpTypePointer: - return GR.getPointerSize() == 64 ? &SPIRV::pID64RegClass - : &SPIRV::pID32RegClass; - case SPIRV::OpTypeVector: { - SPIRVType *ElemType = - GR.getSPIRVTypeForVReg(SpvType->getOperand(1).getReg()); - unsigned ElemOpcode = ElemType ? ElemType->getOpcode() : 0; - if (ElemOpcode == SPIRV::OpTypeFloat) - return &SPIRV::vfIDRegClass; - if (ElemOpcode == SPIRV::OpTypePointer) - return GR.getPointerSize() == 64 ? &SPIRV::vpID64RegClass - : &SPIRV::vpID32RegClass; - return &SPIRV::vIDRegClass; - } - } - return &SPIRV::iIDRegClass; -} - static std::pair createNewIdReg(SPIRVType *SpvType, Register SrcReg, MachineRegisterInfo &MRI, const SPIRVGlobalRegistry &GR) { if (!SpvType) SpvType = GR.getSPIRVTypeForVReg(SrcReg); - assert(SpvType && "VReg is expected to have SPIRV type"); - LLT NewT; - LLT SrcLLT = MRI.getType(SrcReg); - bool IsFloat = getIsFloat(SpvType, GR); - auto GetIdOp = IsFloat ? SPIRV::GET_fID : SPIRV::GET_ID; - if (SrcLLT.isPointer()) { - unsigned PtrSz = GR.getPointerSize(); - NewT = LLT::pointer(0, PtrSz); - bool IsVec = SrcLLT.isVector(); - if (IsVec) - NewT = LLT::fixed_vector(2, NewT); - if (PtrSz == 64) - GetIdOp = IsVec ? SPIRV::GET_vpID64 : SPIRV::GET_pID64; - else - GetIdOp = IsVec ? 
SPIRV::GET_vpID32 : SPIRV::GET_pID32; - } else if (SrcLLT.isVector()) { - NewT = LLT::scalar(GR.getScalarOrVectorBitWidth(SpvType)); - NewT = LLT::fixed_vector(2, NewT); - GetIdOp = IsFloat ? SPIRV::GET_vfID : SPIRV::GET_vID; - } else { - NewT = LLT::scalar(GR.getScalarOrVectorBitWidth(SpvType)); - } - Register IdReg = MRI.createGenericVirtualRegister(NewT); - MRI.setRegClass(IdReg, getRegClass(SpvType, GR)); - return {IdReg, GetIdOp}; + const TargetRegisterClass *RC = GR.getRegClass(SpvType); + Register Reg = MRI.createGenericVirtualRegister(GR.getRegType(SpvType)); + MRI.setRegClass(Reg, RC); + unsigned GetIdOp = SPIRV::GET_ID; + if (RC == &SPIRV::fIDRegClass) + GetIdOp = SPIRV::GET_fID; + else if (RC == &SPIRV::pIDRegClass) + GetIdOp = SPIRV::GET_pID; + else if (RC == &SPIRV::vfIDRegClass) + GetIdOp = SPIRV::GET_vfID; + else if (RC == &SPIRV::vpIDRegClass) + GetIdOp = SPIRV::GET_vpID; + else if (RC == &SPIRV::vIDRegClass) + GetIdOp = SPIRV::GET_vID; + return {Reg, GetIdOp}; } // Insert ASSIGN_TYPE instuction between Reg and its definition, set NewReg as @@ -391,7 +360,7 @@ Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpvType, if (auto *RC = MRI.getRegClassOrNull(Reg)) { MRI.setRegClass(NewReg, RC); } else { - auto RegClass = getRegClass(SpvType, *GR); + auto RegClass = GR->getRegClass(SpvType); MRI.setRegClass(NewReg, RegClass); MRI.setRegClass(Reg, RegClass); } @@ -445,6 +414,11 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, SmallVector ToErase; DenseMap RegsAlreadyAddedToDT; + bool IsExtendedInts = + ST->canUseExtension( + SPIRV::Extension::SPV_INTEL_arbitrary_precision_integers) || + ST->canUseExtension(SPIRV::Extension::SPV_KHR_bit_instructions); + for (MachineBasicBlock *MBB : post_order(&MF)) { if (MBB->empty()) continue; @@ -455,10 +429,12 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineInstr &MI = *MII; unsigned MIOp = MI.getOpcode(); - // validate bit width of scalar registers - for 
(const auto &MOP : MI.operands()) - if (MOP.isReg()) - widenScalarLLTNextPow2(MOP.getReg(), MRI); + if (!IsExtendedInts) { + // validate bit width of scalar registers + for (const auto &MOP : MI.operands()) + if (MOP.isReg()) + widenScalarLLTNextPow2(MOP.getReg(), MRI); + } if (isSpvIntrinsic(MI, Intrinsic::spv_assign_ptr_type)) { Register Reg = MI.getOperand(1).getReg(); @@ -501,6 +477,8 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, if (isSpvIntrinsic(UseMI, Intrinsic::spv_assign_type) || isSpvIntrinsic(UseMI, Intrinsic::spv_assign_name)) continue; + if (UseMI.getOpcode() == SPIRV::ASSIGN_TYPE) + NeedAssignType = false; } Type *Ty = nullptr; if (MIOp == TargetOpcode::G_CONSTANT) { @@ -619,12 +597,39 @@ static void processInstrsWithTypeFolding(MachineFunction &MF, if (UseMI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST) continue; } - if (MRI.getType(DstReg).isPointer()) - MRI.setType(DstReg, LLT::pointer(0, GR->getPointerSize())); } } } +static Register +collectInlineAsmInstrOperands(MachineInstr *MI, + SmallVector *Ops = nullptr) { + Register DefReg; + unsigned StartOp = InlineAsm::MIOp_FirstOperand, + AsmDescOp = InlineAsm::MIOp_FirstOperand; + for (unsigned Idx = StartOp, MISz = MI->getNumOperands(); Idx != MISz; + ++Idx) { + const MachineOperand &MO = MI->getOperand(Idx); + if (MO.isMetadata()) + continue; + if (Idx == AsmDescOp && MO.isImm()) { + // compute the index of the next operand descriptor + const InlineAsm::Flag F(MO.getImm()); + AsmDescOp += 1 + F.getNumOperandRegisters(); + continue; + } + if (MO.isReg() && MO.isDef()) { + if (!Ops) + return MO.getReg(); + else + DefReg = MO.getReg(); + } else if (Ops) { + Ops->push_back(Idx); + } + } + return DefReg; +} + static void insertInlineAsmProcess(MachineFunction &MF, SPIRVGlobalRegistry *GR, const SPIRVSubtarget &ST, MachineIRBuilder MIRBuilder, @@ -634,7 +639,7 @@ insertInlineAsmProcess(MachineFunction &MF, SPIRVGlobalRegistry *GR, for (unsigned i = 0, Sz = ToProcess.size(); i + 1 
< Sz; i += 2) { MachineInstr *I1 = ToProcess[i], *I2 = ToProcess[i + 1]; assert(isSpvIntrinsic(*I1, Intrinsic::spv_inline_asm) && I2->isInlineAsm()); - MIRBuilder.setInsertPt(*I1->getParent(), *I1); + MIRBuilder.setInsertPt(*I2->getParent(), *I2); if (!AsmTargetReg.isValid()) { // define vendor specific assembly target or dialect @@ -680,26 +685,8 @@ insertInlineAsmProcess(MachineFunction &MF, SPIRVGlobalRegistry *GR, MIRBuilder.buildInstr(SPIRV::OpDecorate) .addUse(AsmReg) .addImm(static_cast(SPIRV::Decoration::SideEffectsINTEL)); - Register DefReg; - SmallVector Ops; - unsigned StartOp = InlineAsm::MIOp_FirstOperand, - AsmDescOp = InlineAsm::MIOp_FirstOperand; - unsigned I2Sz = I2->getNumOperands(); - for (unsigned Idx = StartOp; Idx != I2Sz; ++Idx) { - const MachineOperand &MO = I2->getOperand(Idx); - if (MO.isMetadata()) - continue; - if (Idx == AsmDescOp && MO.isImm()) { - // compute the index of the next operand descriptor - const InlineAsm::Flag F(MO.getImm()); - AsmDescOp += 1 + F.getNumOperandRegisters(); - } else { - if (MO.isReg() && MO.isDef()) - DefReg = MO.getReg(); - else - Ops.push_back(Idx); - } - } + + Register DefReg = collectInlineAsmInstrOperands(I2); if (!DefReg.isValid()) { DefReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); MRI.setRegClass(DefReg, &SPIRV::iIDRegClass); @@ -707,19 +694,13 @@ insertInlineAsmProcess(MachineFunction &MF, SPIRVGlobalRegistry *GR, Type::getVoidTy(MF.getFunction().getContext()), MIRBuilder); GR->assignSPIRVTypeToVReg(VoidType, DefReg, MF); } + auto AsmCall = MIRBuilder.buildInstr(SPIRV::OpAsmCallINTEL) .addDef(DefReg) .addUse(GR->getSPIRVTypeID(RetType)) .addUse(AsmReg); - unsigned IntrIdx = 2; - for (unsigned Idx : Ops) { - ++IntrIdx; - const MachineOperand &MO = I2->getOperand(Idx); - if (MO.isReg()) - AsmCall.addUse(MO.getReg()); - else - AsmCall.addUse(I1->getOperand(IntrIdx).getReg()); - } + for (unsigned IntrIdx = 3; IntrIdx < I1->getNumOperands(); ++IntrIdx) + 
AsmCall.addUse(I1->getOperand(IntrIdx).getReg()); } for (MachineInstr *MI : ToProcess) MI->eraseFromParent(); diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td index 936ad8e684b3e2..1ef42b79f1a8ea 100644 --- a/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td @@ -12,7 +12,6 @@ let Namespace = "SPIRV" in { // Pointer types for patterns with the GlobalISelEmitter - def p32 : PtrValueType ; def p64 : PtrValueType ; class VTPtrVec @@ -20,50 +19,35 @@ let Namespace = "SPIRV" in { int isPointer = true; } - def v2p32 : VTPtrVec<2, p32>; def v2p64 : VTPtrVec<2, p64>; // Class for type registers def TYPE0 : Register<"TYPE0">; - def TYPE : RegisterClass<"SPIRV", [i32], 32, (add TYPE0)>; + def TYPE : RegisterClass<"SPIRV", [i64], 64, (add TYPE0)>; // Class for non-type registers def ID0 : Register<"ID0">; - def ID640 : Register<"ID640">; def fID0 : Register<"fID0">; - def fID640 : Register<"fID640">; - def pID320 : Register<"pID320">; - def pID640 : Register<"pID640">; + def pID0 : Register<"pID0">; def vID0 : Register<"vID0">; def vfID0 : Register<"vfID0">; - def vpID320 : Register<"vpID320">; - def vpID640 : Register<"vpID640">; - - def iID : RegisterClass<"SPIRV", [i32], 32, (add ID0)>; - def iID64 : RegisterClass<"SPIRV", [i64], 32, (add ID640)>; - def fID : RegisterClass<"SPIRV", [f32], 32, (add fID0)>; - def fID64 : RegisterClass<"SPIRV", [f64], 32, (add fID640)>; - def pID32 : RegisterClass<"SPIRV", [p32], 32, (add pID320)>; - def pID64 : RegisterClass<"SPIRV", [p64], 32, (add pID640)>; - def vID : RegisterClass<"SPIRV", [v2i32], 32, (add vID0)>; - def vfID : RegisterClass<"SPIRV", [v2f32], 32, (add vfID0)>; - def vpID32 : RegisterClass<"SPIRV", [v2p32], 32, (add vpID320)>; - def vpID64 : RegisterClass<"SPIRV", [v2p64], 32, (add vpID640)>; - + def vpID0 : Register<"vpID0">; + + def iID : RegisterClass<"SPIRV", [i64], 64, (add ID0)>; + def fID : RegisterClass<"SPIRV", [f64], 64, 
(add fID0)>; + def pID : RegisterClass<"SPIRV", [p64], 64, (add pID0)>; + def vID : RegisterClass<"SPIRV", [v2i64], 64, (add vID0)>; + def vfID : RegisterClass<"SPIRV", [v2f64], 64, (add vfID0)>; + def vpID : RegisterClass<"SPIRV", [v2p64], 64, (add vpID0)>; + def ID : RegisterClass< "SPIRV", - [i32, i64, f32, f64, p32, p64, v2i32, v2f32, v2p32, v2p64], - 32, - (add iID, iID64, fID, fID64, pID32, pID64, vID, vfID, vpID32, vpID64)>; - - def ANYID : RegisterClass< - "SPIRV", - [i32, i64, f32, f64, p32, p64, v2i32, v2f32, v2p32, v2p64], - 32, - (add ID0, ID640, fID0, fID640, pID320, pID640, vID0, vfID0, vpID320, vpID640)>; + [i64, f64, p64, v2i64, v2f64, v2p64], + 64, + (add iID, fID, pID, vID, vfID, vpID)>; // A few instructions like OpName can take ids from both type and non-type // instructions, so we need a super-class to allow for both to count as valid // arguments for these instructions. - def ANY : RegisterClass<"SPIRV", [i32], 32, (add TYPE, ID)>; + def ANY : RegisterClass<"SPIRV", [i64], 64, (add TYPE, ID)>; } diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.h b/llvm/lib/Target/SPIRV/SPIRVSubtarget.h index 211216488db799..82ec3cc95cdd3f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.h +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.h @@ -128,6 +128,8 @@ class SPIRVSubtarget : public SPIRVGenSubtargetInfo { static bool classof(const TargetSubtargetInfo *ST) { return ST->getTargetTriple().isSPIRV(); } + + static constexpr unsigned MaxLegalAddressSpace = 6; }; } // namespace llvm diff --git a/llvm/test/CodeGen/SPIRV/constant/global-constants.ll b/llvm/test/CodeGen/SPIRV/constant/global-constants.ll index 74e28cbe7acb17..43dbed8b044b5e 100644 --- a/llvm/test/CodeGen/SPIRV/constant/global-constants.ll +++ b/llvm/test/CodeGen/SPIRV/constant/global-constants.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ 
llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} @global = addrspace(1) constant i32 1 ; OpenCL global memory diff --git a/llvm/test/CodeGen/SPIRV/constant/local-aggregate-constant.ll b/llvm/test/CodeGen/SPIRV/constant/local-aggregate-constant.ll index 355bd32e261a17..447cad6528dd42 100644 --- a/llvm/test/CodeGen/SPIRV/constant/local-aggregate-constant.ll +++ b/llvm/test/CodeGen/SPIRV/constant/local-aggregate-constant.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s %aggregate = type { i8, i32 } diff --git a/llvm/test/CodeGen/SPIRV/constant/local-bool-constants.ll b/llvm/test/CodeGen/SPIRV/constant/local-bool-constants.ll index 73312c1e933727..c98f253aedfb4a 100644 --- a/llvm/test/CodeGen/SPIRV/constant/local-bool-constants.ll +++ b/llvm/test/CodeGen/SPIRV/constant/local-bool-constants.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s define i1 @getConstantTrue() { ret i1 true diff --git a/llvm/test/CodeGen/SPIRV/constant/local-float-point-constants.ll b/llvm/test/CodeGen/SPIRV/constant/local-float-point-constants.ll index 1bb795f14ab00a..5764e956342227 100644 --- a/llvm/test/CodeGen/SPIRV/constant/local-float-point-constants.ll +++ b/llvm/test/CodeGen/SPIRV/constant/local-float-point-constants.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s define half @getConstantFP16() { ret half 0x3ff1340000000000 ; 0x3c4d represented as double. 
diff --git a/llvm/test/CodeGen/SPIRV/constant/local-integers-constants.ll b/llvm/test/CodeGen/SPIRV/constant/local-integers-constants.ll index 5cfc0d2e9dc8f4..a7c04fa00e752b 100644 --- a/llvm/test/CodeGen/SPIRV/constant/local-integers-constants.ll +++ b/llvm/test/CodeGen/SPIRV/constant/local-integers-constants.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s define i8 @getConstantI8() { ret i8 2 diff --git a/llvm/test/CodeGen/SPIRV/constant/local-null-constants.ll b/llvm/test/CodeGen/SPIRV/constant/local-null-constants.ll index 8009f488f6dd2f..210679021532a3 100644 --- a/llvm/test/CodeGen/SPIRV/constant/local-null-constants.ll +++ b/llvm/test/CodeGen/SPIRV/constant/local-null-constants.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ;; OpenCL global memory define ptr addrspace(1) @getConstant1() { diff --git a/llvm/test/CodeGen/SPIRV/constant/local-vector-matrix-constants.ll b/llvm/test/CodeGen/SPIRV/constant/local-vector-matrix-constants.ll index 0e35588221a498..981ab08dbef338 100644 --- a/llvm/test/CodeGen/SPIRV/constant/local-vector-matrix-constants.ll +++ b/llvm/test/CodeGen/SPIRV/constant/local-vector-matrix-constants.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ;; TODO: Add test for matrix. But how are they represented in LLVM IR? 
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_arbitrary_precision_integers.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_arbitrary_precision_integers.ll index f49367c50e0ef0..41d4b58ed1157e 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_arbitrary_precision_integers.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_arbitrary_precision_integers.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_arbitrary_precision_integers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_arbitrary_precision_integers %s -o - | FileCheck %s define i6 @getConstantI6() { ret i6 2 diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_conversion/bfloat16-conv.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_conversion/bfloat16-conv.ll index 91fa340e461121..7bc6f5b9d56f5a 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_conversion/bfloat16-conv.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_conversion/bfloat16-conv.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_conversion %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_conversion %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_conversion %s -o - -filetype=obj | spirv-val %} ; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/decorate-prefetch-w-cache-controls.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/decorate-prefetch-w-cache-controls.ll index 9a13b720f61f74..4428a2049f9cef 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/decorate-prefetch-w-cache-controls.ll +++ 
b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/decorate-prefetch-w-cache-controls.ll @@ -1,6 +1,6 @@ ; Adapted from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/tree/main/test/extensions/INTEL/SPV_INTEL_cache_controls -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: Capability CacheControlsINTEL diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_two_calls.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_two_calls.ll index b7fecefe9a5812..c5a2918f92c29e 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_two_calls.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_two_calls.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: OpCapability Int8 diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_fpga_decorations/global-var-decorations.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_fpga_decorations/global-var-decorations.ll index 40008873bf19bf..ece9502450032d 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_fpga_decorations/global-var-decorations.ll +++ 
b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_fpga_decorations/global-var-decorations.ll @@ -1,6 +1,6 @@ ; Adapted from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/tree/main/test/extensions/INTEL/SPV_INTEL_global_variable_fpga_decorations -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_fpga_decorations %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_fpga_decorations %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_fpga_decorations %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: Capability GlobalVariableFPGADecorationsINTEL diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_host_access/global-var-host-access.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_host_access/global-var-host-access.ll index 1397435efb2d4f..4256d3d0ce7cf0 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_host_access/global-var-host-access.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_host_access/global-var-host-access.ll @@ -1,6 +1,6 @@ ; Adapted from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/tree/main/test/extensions/INTEL/SPV_INTEL_global_variable_host_access -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_host_access,+SPV_INTEL_global_variable_fpga_decorations %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_host_access,+SPV_INTEL_global_variable_fpga_decorations %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown 
--spirv-ext=+SPV_INTEL_global_variable_host_access,+SPV_INTEL_global_variable_fpga_decorations %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-DAG: Capability GlobalVariableHostAccessINTEL diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll index 449dd71954500b..e006651d49e4bd 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s --spirv-ext=+SPV_INTEL_inline_assembly -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s --spirv-ext=+SPV_INTEL_inline_assembly -o - | FileCheck %s ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s --spirv-ext=+SPV_INTEL_inline_assembly -o - -filetype=obj | spirv-val %} ; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_optnone.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_optnone.ll index a611be8eb6ee79..1744ec96804019 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_optnone.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_optnone.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_optnone %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-EXTENSION -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO-EXTENSION +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_optnone %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-EXTENSION +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | 
FileCheck %s --check-prefixes=CHECK,CHECK-NO-EXTENSION ; CHECK-EXTENSION: OpCapability OptNoneINTEL ; CHECK-EXTENSION: OpExtension "SPV_INTEL_optnone" diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroups/builtin-op-wrappers.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroups/builtin-op-wrappers.ll index b15a788be7d715..8c145a463ed274 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroups/builtin-op-wrappers.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroups/builtin-op-wrappers.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_subgroups %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_subgroups %s -o - | FileCheck %s ; CHECK-DAG: Capability SubgroupShuffleINTEL ; CHECK-DAG: Capability SubgroupBufferBlockIOINTEL diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroups/cl_intel_sub_groups.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroups/cl_intel_sub_groups.ll index df17ec435ad377..9374e154a0239f 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroups/cl_intel_sub_groups.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_subgroups/cl_intel_sub_groups.ll @@ -37,7 +37,7 @@ ; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_subgroups %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_subgroups %s -o - | FileCheck %s ; CHECK-ERROR: LLVM ERROR: intel_sub_group_shuffle: the builtin requires the following SPIR-V extension: SPV_INTEL_subgroups diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/builtin_alloca.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/builtin_alloca.ll index 4d6173e5b7232e..b3518f044a24fb 100644 --- 
a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/builtin_alloca.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/builtin_alloca.ll @@ -23,7 +23,7 @@ ; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_variable_length_array %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_variable_length_array %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_variable_length_array %s -o - -filetype=obj | spirv-val %} ; CHECK-ERROR: LLVM ERROR: array allocation: this instruction requires the following SPIR-V extension: SPV_INTEL_variable_length_array diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr.ll index 8a54d22a539db1..3668dfcb4af75a 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr.ll @@ -1,7 +1,7 @@ ; Modified from: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/test/extensions/INTEL/SPV_INTEL_variable_length_array/basic.ll ; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_variable_length_array %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_variable_length_array %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_variable_length_array %s -o - -filetype=obj | 
spirv-val %} ; CHECK-ERROR: LLVM ERROR: array allocation: this instruction requires the following SPIR-V extension: SPV_INTEL_variable_length_array diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr_spec_const.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr_spec_const.ll index 7b9f75d74db997..dea04c9a21a1ca 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr_spec_const.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr_spec_const.ll @@ -1,6 +1,6 @@ ; Modified from: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/test/extensions/INTEL/SPV_INTEL_variable_length_array/vla_spec_const.ll -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_variable_length_array %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_variable_length_array %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_variable_length_array %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: Capability VariableLengthArrayINTEL diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_bit_instructions.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_bit_instructions.ll index 100f02faba8575..40e2aff0d755a3 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_bit_instructions.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_bit_instructions.ll @@ -1,5 +1,5 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s --spirv-ext=+SPV_KHR_bit_instructions -o - | FileCheck %s --check-prefix=CHECK-EXTENSION -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-NO-EXTENSION +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s --spirv-ext=+SPV_KHR_bit_instructions -o - | FileCheck %s --check-prefix=CHECK-EXTENSION +; RUN: 
llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-NO-EXTENSION ; CHECK-EXTENSION: OpCapability BitInstructions ; CHECK-EXTENSION-NEXT: OpExtension "SPV_KHR_bit_instructions" diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_no_integer_wrap_decoration.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_no_integer_wrap_decoration.ll index 0d9ab4ab65ceaa..dac22c0d84c2e1 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_no_integer_wrap_decoration.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_no_integer_wrap_decoration.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_KHR_no_integer_wrap_decoration %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_KHR_no_integer_wrap_decoration %s -o - | FileCheck %s ; CHECK-DAG: OpExtension "SPV_KHR_no_integer_wrap_decoration" diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll index e219f61b5c6e34..8ecd0a2b25ebcf 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll @@ -1,5 +1,5 @@ ; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_shader_clock %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_shader_clock %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_shader_clock %s -o - -filetype=obj | spirv-val %} ; CHECK-ERROR: LLVM ERROR: clock_read_device: the builtin requires the following SPIR-V extension: SPV_KHR_shader_clock diff --git 
a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll index a38c9072ed1bd4..1391fddfcdb369 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll @@ -1,5 +1,5 @@ ; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_KHR_subgroup_rotate %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_KHR_subgroup_rotate %s -o - | FileCheck %s ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_subgroup_rotate %s -o - -filetype=obj | spirv-val %} ; CHECK-ERROR: LLVM ERROR: OpGroupNonUniformRotateKHR instruction requires the following SPIR-V extension: SPV_KHR_subgroup_rotate diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_uniform_group_instructions/uniform-group-instructions.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_uniform_group_instructions/uniform-group-instructions.ll index 0de654be8ed7d5..96e74149f44dbb 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_uniform_group_instructions/uniform-group-instructions.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_uniform_group_instructions/uniform-group-instructions.ll @@ -1,6 +1,6 @@ ; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_KHR_uniform_group_instructions %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_KHR_uniform_group_instructions %s -o - | FileCheck %s ; CHECK-ERROR: LLVM ERROR: __spirv_GroupBitwiseAndKHR: the builtin requires the following SPIR-V extension: 
SPV_KHR_uniform_group_instructions diff --git a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll index 973a5e6f60569f..02d21a1abafa5f 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=all,-SPV_INTEL_arbitrary_precision_integers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=all,-SPV_INTEL_arbitrary_precision_integers %s -o - | FileCheck %s define i6 @foo() { %call = tail call i32 @llvm.bitreverse.i32(i32 42) diff --git a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll index a5b979469b931d..f745794e11de13 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s define i6 @getConstantI6() { ret i6 2 diff --git a/llvm/test/CodeGen/SPIRV/function/alloca-load-store.ll b/llvm/test/CodeGen/SPIRV/function/alloca-load-store.ll index 5c06d65b6b4e61..55ab715feff3dd 100644 --- a/llvm/test/CodeGen/SPIRV/function/alloca-load-store.ll +++ b/llvm/test/CodeGen/SPIRV/function/alloca-load-store.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: OpName %[[#BAR:]] "bar" ; CHECK-DAG: OpName %[[#FOO:]] "foo" diff --git a/llvm/test/CodeGen/SPIRV/function/identity-function.ll b/llvm/test/CodeGen/SPIRV/function/identity-function.ll index 
005f1061f4fbb1..0acfa4666dfa09 100644 --- a/llvm/test/CodeGen/SPIRV/function/identity-function.ll +++ b/llvm/test/CodeGen/SPIRV/function/identity-function.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: OpName %[[#VALUE:]] "value" ; CHECK-DAG: OpName %[[#IDENTITY:]] "identity" diff --git a/llvm/test/CodeGen/SPIRV/function/multiple-anonymous-functions.ll b/llvm/test/CodeGen/SPIRV/function/multiple-anonymous-functions.ll index 93bdbe3c1c2117..1b21658af32aa1 100644 --- a/llvm/test/CodeGen/SPIRV/function/multiple-anonymous-functions.ll +++ b/llvm/test/CodeGen/SPIRV/function/multiple-anonymous-functions.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ;; Types: ; CHECK-DAG: %[[#I32:]] = OpTypeInt 32 diff --git a/llvm/test/CodeGen/SPIRV/function/trivial-function-definition.ll b/llvm/test/CodeGen/SPIRV/function/trivial-function-definition.ll index cbef5862e54a64..a0e304f175acdd 100644 --- a/llvm/test/CodeGen/SPIRV/function/trivial-function-definition.ll +++ b/llvm/test/CodeGen/SPIRV/function/trivial-function-definition.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ;; Debug info: ; CHECK: OpName %[[#FOO:]] "foo" diff --git a/llvm/test/CodeGen/SPIRV/function/trivial-function-with-attributes.ll b/llvm/test/CodeGen/SPIRV/function/trivial-function-with-attributes.ll index 6c11993bc6dcc6..924163232056af 100644 --- a/llvm/test/CodeGen/SPIRV/function/trivial-function-with-attributes.ll +++ b/llvm/test/CodeGen/SPIRV/function/trivial-function-with-attributes.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc 
-verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ;; FIXME: Are there any attributes that would make the IR invalid for SPIR-V? diff --git a/llvm/test/CodeGen/SPIRV/function/trivial-function-with-call.ll b/llvm/test/CodeGen/SPIRV/function/trivial-function-with-call.ll index 87f45ffb3435a1..d3e2b6b46b8fed 100644 --- a/llvm/test/CodeGen/SPIRV/function/trivial-function-with-call.ll +++ b/llvm/test/CodeGen/SPIRV/function/trivial-function-with-call.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ;; Debug info: ; CHECK: OpName %[[#FOO:]] "foo" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_DispatchThreadID.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_DispatchThreadID.ll index e93271b703f7b2..c84b1c4b06c199 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_DispatchThreadID.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_DispatchThreadID.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} ; This file generated from the following command: diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveGetLaneIndex.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveGetLaneIndex.ll index ec35690ac1547c..89a8575fa15991 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveGetLaneIndex.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveGetLaneIndex.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} ; This file generated from the following 
command: diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/abs.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/abs.ll index 38c033bdd4dd78..8f1092c2206ed8 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/abs.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/abs.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/acos.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/acos.ll index fb0ced342aba39..7c9450267cbe89 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/acos.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/acos.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/asin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/asin.ll index a8424170b26e3b..4d57c6fce77f70 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/asin.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/asin.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan.ll index 65975fc413e532..65e198d0e71a35 100644 --- 
a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ceil.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ceil.ll index 1b358ae54502ba..93677aadffa5e9 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ceil.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ceil.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cos.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cos.ll index 28675cf9f15412..e9e9642354f5a5 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cos.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cos.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cosh.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cosh.ll index 35ba1d69d7cbb1..1560f9b9bd7605 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cosh.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cosh.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 
-mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp.ll index ee230df41a6c7b..c1734a264ea042 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp2.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp2.ll index eeaca1b6560af0..4753b7bd9fe5bd 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp2.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp2.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/floor.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/floor.ll index 5b972104d50389..ea19fa94ea3265 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/floor.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/floor.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git 
a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmad.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmad.ll index ce9b8f09daead1..b1ca34dc504c03 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmad.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmad.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmax.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmax.ll index 159d4ac19c8cc8..ca0fcfe8d646b6 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmax.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmax.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmin.ll index 15946b5038eec3..adc563bcea5c6c 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmin.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmin.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/frac.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/frac.ll index 3c48782a185862..4c088b6b38103c 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/frac.ll +++ 
b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/frac.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll index 63547820c18c77..aa7ad8c74d336c 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; Make sure SPIRV operation function calls for lerp are generated as FMix diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log.ll index 5a09f32b83b2d4..f85b20324da515 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log10.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log10.ll index 52ca6812d5d63a..32d63a0c0f1d21 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log10.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log10.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 
-mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[#extinst:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log2.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log2.ll index 21f02a40fc089e..add7f77897f790 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log2.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log2.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/pow.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/pow.ll index 7fae9637946fa2..3ac98853b92fbc 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/pow.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/pow.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll index 34f3c610ca81da..6f91162a378c8a 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll @@ -1,4 +1,4 @@ - ; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s + ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: %[[#float_64:]] = OpTypeFloat 64 ; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reversebits.ll 
b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reversebits.ll index e58c9ab6dfb1c1..a23b15ab075d60 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reversebits.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reversebits.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpMemoryModel Logical GLSL450 diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/round.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/round.ll index baf20833840b65..1c7e78261ffefd 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/round.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/round.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rsqrt.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rsqrt.ll index 650b32910d65e6..91023a1e401e16 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rsqrt.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rsqrt.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sin.ll index 061af5b37345ad..a6ae70a48e5db4 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sin.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sin.ll 
@@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sinh.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sinh.ll index 55e050c5001bf4..3b8bdbed0041bb 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sinh.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sinh.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smax.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smax.ll index 6bbf10323faba2..901e4764e15f67 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smax.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smax.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smin.ll index 04ab9600c85b7c..c39c39f0455fad 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smin.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smin.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 
-mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sqrt.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sqrt.ll index 6882b77a427339..bb1f0346047e22 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sqrt.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sqrt.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tan.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tan.ll index 7bdce99dbfaa7e..b4a6e1574f732b 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tan.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tan.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tanh.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tanh.ll index 8cddc0fc090a7d..94fc3f0ec7abf5 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tanh.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tanh.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/trunc.ll 
b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/trunc.ll index d75b7fa5a381d8..2a308028a9b482 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/trunc.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/trunc.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umax.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umax.ll index 32677df3d51831..01606a38732772 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umax.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umax.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umin.ll index a91fb8096c2f0c..34185ad7143e32 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umin.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umin.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/image/sampler.ll b/llvm/test/CodeGen/SPIRV/image/sampler.ll index 7b45c95f5ed433..f6ac3510ab6751 100644 --- a/llvm/test/CodeGen/SPIRV/image/sampler.ll +++ b/llvm/test/CodeGen/SPIRV/image/sampler.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - 
| FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#i32:]] = OpTypeInt 32 0 diff --git a/llvm/test/CodeGen/SPIRV/instructions/atomic-ptr.ll b/llvm/test/CodeGen/SPIRV/instructions/atomic-ptr.ll index 86e9be15a7c08f..9469d24b20af26 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/atomic-ptr.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/atomic-ptr.ll @@ -3,8 +3,8 @@ ; of spirv-val in this case, because there is a difference of accepted return types ; between atomicrmw and OpAtomicExchange. -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: %[[#LongTy:]] = OpTypeInt 64 0 ; CHECK-DAG: %[[#PtrLongTy:]] = OpTypePointer CrossWorkgroup %[[#LongTy]] diff --git a/llvm/test/CodeGen/SPIRV/instructions/atomic_acqrel.ll b/llvm/test/CodeGen/SPIRV/instructions/atomic_acqrel.ll index d0c4531a75b65f..07d1a5cf662eca 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/atomic_acqrel.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/atomic_acqrel.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: OpName [[ADD:%.*]] "test_add" diff --git a/llvm/test/CodeGen/SPIRV/instructions/atomic_seq.ll b/llvm/test/CodeGen/SPIRV/instructions/atomic_seq.ll index fc1d6dafa1b08f..4078ffe1a10b87 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/atomic_seq.ll +++ 
b/llvm/test/CodeGen/SPIRV/instructions/atomic_seq.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: OpName [[ADD:%.*]] "test_add" diff --git a/llvm/test/CodeGen/SPIRV/instructions/call-complex-function.ll b/llvm/test/CodeGen/SPIRV/instructions/call-complex-function.ll index a5b40b2e72c8da..8b2f14288772ce 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/call-complex-function.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/call-complex-function.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: OpName [[FUN:%.+]] "fun" ; CHECK-DAG: OpName [[FOO:%.+]] "foo" diff --git a/llvm/test/CodeGen/SPIRV/instructions/call-trivial-function.ll b/llvm/test/CodeGen/SPIRV/instructions/call-trivial-function.ll index 6924b7006f8373..3777be6c9ebcf9 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/call-trivial-function.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/call-trivial-function.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: OpName [[VALUE:%.+]] "value" ; CHECK-DAG: OpName [[IDENTITY:%.+]] "identity" diff --git a/llvm/test/CodeGen/SPIRV/instructions/fcmp.ll b/llvm/test/CodeGen/SPIRV/instructions/fcmp.ll index 01d4fc44f83a56..eb3da9e33f1e64 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/fcmp.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/fcmp.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: 
OpName [[UEQ:%.*]] "test_ueq" ; CHECK-DAG: OpName [[OEQ:%.*]] "test_oeq" diff --git a/llvm/test/CodeGen/SPIRV/instructions/float-casts.ll b/llvm/test/CodeGen/SPIRV/instructions/float-casts.ll index 3b311d841623dd..8833d3bb084a92 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/float-casts.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/float-casts.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: OpName [[TRUNC32_16:%.*]] "f32tof16" ; CHECK-DAG: OpName [[EXT16_32:%.*]] "f16tof32" diff --git a/llvm/test/CodeGen/SPIRV/instructions/float-fast-flags.ll b/llvm/test/CodeGen/SPIRV/instructions/float-fast-flags.ll index 1db5f8bb0ee36f..43336db4e86fa8 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/float-fast-flags.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/float-fast-flags.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; DISABLED-CHECK-DAG: OpName [[FNEG:%.+]] "scalar_fneg" ; CHECK-DAG: OpName [[FADD:%.+]] "test_fadd" diff --git a/llvm/test/CodeGen/SPIRV/instructions/icmp.ll b/llvm/test/CodeGen/SPIRV/instructions/icmp.ll index 28c14a99d2a080..bbf947f84c09a7 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/icmp.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/icmp.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: OpName [[EQ:%.*]] "test_eq" ; CHECK-DAG: OpName [[NE:%.*]] "test_ne" diff --git a/llvm/test/CodeGen/SPIRV/instructions/intrinsics.ll b/llvm/test/CodeGen/SPIRV/instructions/intrinsics.ll index fe900d186a9f72..e859dee51a6c3d 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/intrinsics.ll +++ 
b/llvm/test/CodeGen/SPIRV/instructions/intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 %s -mtriple=spirv32-unknown-unknown -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 %s -mtriple=spirv32-unknown-unknown -o - | FileCheck %s declare float @llvm.fabs.f32(float) declare float @llvm.rint.f32(float) diff --git a/llvm/test/CodeGen/SPIRV/instructions/nested-composites.ll b/llvm/test/CodeGen/SPIRV/instructions/nested-composites.ll index b326a929e783ce..88e992f1834527 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/nested-composites.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/nested-composites.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: OpName [[FOOBAR:%.+]] "foobar" ; CHECK-DAG: OpName [[PRODUCER:%.+]] "producer" diff --git a/llvm/test/CodeGen/SPIRV/instructions/scalar-bitwise-operations.ll b/llvm/test/CodeGen/SPIRV/instructions/scalar-bitwise-operations.ll index 5424fb457bc7d2..81f19b7e8ad64e 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/scalar-bitwise-operations.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/scalar-bitwise-operations.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: OpName [[SCALAR_SHL:%.+]] "scalar_shl" ; CHECK-DAG: OpName [[SCALAR_LSHR:%.+]] "scalar_lshr" diff --git a/llvm/test/CodeGen/SPIRV/instructions/scalar-floating-point-arithmetic.ll b/llvm/test/CodeGen/SPIRV/instructions/scalar-floating-point-arithmetic.ll index 8aa0f05ccfc256..7ce43c2e1b05d2 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/scalar-floating-point-arithmetic.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/scalar-floating-point-arithmetic.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 
-mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; DISABLED-CHECK-DAG: OpName [[SCALAR_FNEG:%.+]] "scalar_fneg" ; CHECK-DAG: OpName [[SCALAR_FADD:%.+]] "scalar_fadd" diff --git a/llvm/test/CodeGen/SPIRV/instructions/scalar-integer-arithmetic.ll b/llvm/test/CodeGen/SPIRV/instructions/scalar-integer-arithmetic.ll index da32feecc6d6db..d222dfa570cf78 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/scalar-integer-arithmetic.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/scalar-integer-arithmetic.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: OpName [[SCALAR_ADD:%.+]] "scalar_add" ; CHECK-DAG: OpName [[SCALAR_SUB:%.+]] "scalar_sub" diff --git a/llvm/test/CodeGen/SPIRV/instructions/select-ptr-load.ll b/llvm/test/CodeGen/SPIRV/instructions/select-ptr-load.ll index 0ff28952f8081a..6e6cd2f68a9713 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/select-ptr-load.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/select-ptr-load.ll @@ -1,7 +1,7 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-DAG: %[[Float:.*]] = OpTypeFloat 32 diff --git a/llvm/test/CodeGen/SPIRV/instructions/select.ll b/llvm/test/CodeGen/SPIRV/instructions/select.ll index 9234b97157d9d8..91d5f12c4b7ba7 100644 --- 
a/llvm/test/CodeGen/SPIRV/instructions/select.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/select.ll @@ -1,7 +1,7 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: OpName [[SCALARi32:%.+]] "select_i32" diff --git a/llvm/test/CodeGen/SPIRV/instructions/undef-nested-composite-store.ll b/llvm/test/CodeGen/SPIRV/instructions/undef-nested-composite-store.ll index 7425b303f8a87c..98993ef3bced09 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/undef-nested-composite-store.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/undef-nested-composite-store.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: %[[#I32:]] = OpTypeInt 32 ; CHECK-DAG: %[[#I16:]] = OpTypeInt 16 diff --git a/llvm/test/CodeGen/SPIRV/instructions/undef-simple-composite-store.ll b/llvm/test/CodeGen/SPIRV/instructions/undef-simple-composite-store.ll index cb2c89be28f1f1..d03704bf30a81d 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/undef-simple-composite-store.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/undef-simple-composite-store.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: %[[#I32:]] = OpTypeInt 32 ; CHECK-DAG: %[[#I16:]] = OpTypeInt 16 diff --git a/llvm/test/CodeGen/SPIRV/instructions/unreachable.ll 
b/llvm/test/CodeGen/SPIRV/instructions/unreachable.ll index 0a4538c6de2804..f8949cdf9032ea 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/unreachable.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/unreachable.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK: OpUnreachable define void @test_unreachable() { diff --git a/llvm/test/CodeGen/SPIRV/instructions/vector-bitwise-operations.ll b/llvm/test/CodeGen/SPIRV/instructions/vector-bitwise-operations.ll index 664c42d805f63b..05d8042fe91214 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/vector-bitwise-operations.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/vector-bitwise-operations.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: OpName [[VECTOR_SHL:%.+]] "vector_shl" ; CHECK-DAG: OpName [[VECTOR_LSHR:%.+]] "vector_lshr" diff --git a/llvm/test/CodeGen/SPIRV/instructions/vector-floating-point-arithmetic.ll b/llvm/test/CodeGen/SPIRV/instructions/vector-floating-point-arithmetic.ll index 5513ddac1765ab..1823b4f4cf03b5 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/vector-floating-point-arithmetic.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/vector-floating-point-arithmetic.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: OpName [[VECTOR_FNEG:%.+]] "vector_fneg" ; CHECK-DAG: OpName [[VECTOR_FADD:%.+]] "vector_fadd" diff --git a/llvm/test/CodeGen/SPIRV/instructions/vector-integer-arithmetic.ll b/llvm/test/CodeGen/SPIRV/instructions/vector-integer-arithmetic.ll index 1e61c7ac1d4043..fc4113894dee13 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/vector-integer-arithmetic.ll +++ 
b/llvm/test/CodeGen/SPIRV/instructions/vector-integer-arithmetic.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: OpName [[VECTOR_ADD:%.+]] "vector_add" ; CHECK-DAG: OpName [[VECTOR_SUB:%.+]] "vector_sub" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/abs.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/abs.ll index d627a2fc7838a9..f74c6ef99b4559 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/abs.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/abs.ll @@ -1,4 +1,8 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-linux %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] s_abs ; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] s_abs diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/assume.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/assume.ll index 48c96fae8b03a0..3d2080e0050b7a 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/assume.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/assume.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV-NOT: OpCapability ExpectAssumeKHR ; CHECK-SPIRV-NOT: OpExtension "SPV_KHR_expect_assume" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/bswap.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/bswap.ll index 3f2ab9fa7190bc..0ec99a602e4b04 100644 --- 
a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/bswap.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/bswap.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV: OpName %[[#FuncNameInt16:]] "spirv.llvm_bswap_i16" ; CHECK-SPIRV: OpName %[[#FuncNameInt32:]] "spirv.llvm_bswap_i32" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ceil.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ceil.ll index 8411d90e23f0d1..a960c876cb5964 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ceil.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ceil.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; CHECK: %[[#extinst_id:]] = OpExtInstImport "OpenCL.std" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ctlz.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ctlz.ll index 147fd80462f9ad..480ea761a669cf 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ctlz.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ctlz.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; CHECK: %[[#extinst_id:]] = OpExtInstImport "OpenCL.std" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ctpop.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ctpop.ll index f2d881b083dd7e..21598d712f5c32 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ctpop.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ctpop.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s ; CHECK: %[[#]] = OpBitCount %[[#]] %[[#]] ; CHECK: %[[#]] = OpBitCount 
%[[#]] %[[#]] diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/cttz.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/cttz.ll index b9bdcfdb162f76..96abdfc30e73b6 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/cttz.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/cttz.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; CHECK: %[[#extinst_id:]] = OpExtInstImport "OpenCL.std" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fabs.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fabs.ll index 5de22021762927..fe170bb287cc9c 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fabs.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fabs.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; CHECK: %[[#extinst_id:]] = OpExtInstImport "OpenCL.std" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll index 8f14e8cf272ba7..3d46b527bf14f7 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; CHECK: %[[#extinst_id:]] = OpExtInstImport "OpenCL.std" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/invariant.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/invariant.ll index 5b700b7fcfee3e..bb6225f8ad4a6f 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/invariant.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/invariant.ll @@ -1,5 +1,5 @@ ;; Make sure the backend doesn't crash if the input LLVM IR contains llvm.invariant.* intrinsics -; RUN: llc -O0 
-mtriple=spirv64-unknown-linux %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s ; CHECK-NOT: OpFunctionParameter ; CHECK-NOT: OpFunctionCall diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/lifetime.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/lifetime.ll index 7fae8759c1f7d9..45683585d65146 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/lifetime.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/lifetime.ll @@ -1,7 +1,7 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#Char:]] = OpTypeInt 8 0 diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/add.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/add.ll index e0c84ee3a3f1c1..ef9719e64586d9 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/add.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/add.ll @@ -1,7 +1,8 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} -target triple = "spir64-unknown-unknown" +; RUN: llc -verify-machineinstrs -O0 
-mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[Char:.*]] = OpTypeInt 8 0 ; CHECK-DAG: %[[CharVec2:.*]] = OpTypeVector %[[Char]] 2 diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/and.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/and.ll index 12a4a86fa4a8be..e9d9adba8f3651 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/and.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/and.ll @@ -1,7 +1,8 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} -target triple = "spir64-unknown-unknown" +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[Char:.*]] = OpTypeInt 8 0 ; CHECK-DAG: %[[CharVec2:.*]] = OpTypeVector %[[Char]] 2 diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fadd.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fadd.ll index 459bc6bdcdaff8..38ec4db34c9bcf 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fadd.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fadd.ll @@ -1,7 +1,8 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 
-mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} -target triple = "spir64-unknown-unknown" +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[Half:.*]] = OpTypeFloat 16 ; CHECK-DAG: %[[HalfVec2:.*]] = OpTypeVector %[[Half]] 2 diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmax.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmax.ll index 4f9cd29cd05d2a..eb0edae9f0ab56 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmax.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmax.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} target triple = "spir64-unknown-unknown" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmaximum.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmaximum.ll index 837bea0fbe6247..418cafaf80a164 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmaximum.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmaximum.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown 
--spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} target triple = "spir64-unknown-unknown" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmin.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmin.ll index 475da2e1ec31ea..b17d577b57226d 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmin.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmin.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} target triple = "spir64-unknown-unknown" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fminimum.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fminimum.ll index b525c849c3b59a..26bb7796957eb0 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fminimum.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fminimum.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} target triple = "spir64-unknown-unknown" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmul.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmul.ll index 0985be992ca74e..25efce819ad47f 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmul.ll +++ 
b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/fmul.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} target triple = "spir64-unknown-unknown" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/mul.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/mul.ll index 1a700577e46b41..16455f2e21cb6f 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/mul.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/mul.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} target triple = "spir64-unknown-unknown" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/or.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/or.ll index 90c6cf5562a924..badfebc27072f6 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/or.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/or.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} target triple = "spir64-unknown-unknown" diff --git 
a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/smax.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/smax.ll index 4551fa31681d2b..54afb3a25b4d67 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/smax.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/smax.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} target triple = "spir64-unknown-unknown" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/smin.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/smin.ll index a0d257bb131804..a95c2ea0bbbf67 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/smin.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/smin.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} target triple = "spir64-unknown-unknown" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/umax.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/umax.ll index ba5dba76aeecca..a742009cef0111 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/umax.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/umax.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 
-mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} target triple = "spir64-unknown-unknown" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/umin.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/umin.ll index e16bde88ef5094..7844c205c7ab0e 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/umin.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/umin.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} target triple = "spir64-unknown-unknown" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/xor.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/xor.ll index cf887bb358aca6..22f45a2c0bd6cc 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/xor.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llvm-vector-reduce/xor.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} target triple = "spir64-unknown-unknown" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/maxnum.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/maxnum.ll index 7b8c05d9c98eda..e9254c27ab4042 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/maxnum.ll +++ 
b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/maxnum.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s define spir_func float @Test(float %x, float %y) { entry: diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/nearbyint.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/nearbyint.ll index 70194e21982dc9..7405ca4d64d196 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/nearbyint.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/nearbyint.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] rint diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ptr-annotation.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ptr-annotation.ll index 06f1d0bf7fd37c..556062a6fef4b4 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ptr-annotation.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ptr-annotation.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: OpName %[[#Foo:]] "foo" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/satur-arith.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/satur-arith.ll index 5b59206ff7f2d3..08f15c077fed92 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/satur-arith.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/satur-arith.ll @@ -1,7 +1,7 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | 
spirv-val %} -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "OpenCL.std" diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/sqrt.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/sqrt.ll index e47b7b84c2b20c..5eec92f978b587 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/sqrt.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/sqrt.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: %[[#ExtInstSetId:]] = OpExtInstImport "OpenCL.std" ; CHECK-DAG: %[[#Float:]] = OpTypeFloat 32 diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/umul.with.overflow.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/umul.with.overflow.ll index 406a23fa7d3df5..c34771bf381ea9 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/umul.with.overflow.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/umul.with.overflow.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV: OpName %[[#NAME_UMUL_FUNC_8:]] "spirv.llvm_umul_with_overflow_i8" ; CHECK-SPIRV: OpName %[[#NAME_UMUL_FUNC_32:]] "spirv.llvm_umul_with_overflow_i32" diff --git a/llvm/test/CodeGen/SPIRV/pointers/argument-ptr-to-struct.ll b/llvm/test/CodeGen/SPIRV/pointers/argument-ptr-to-struct.ll index ac72ec28c37d9d..df57e94a04c409 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/argument-ptr-to-struct.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/argument-ptr-to-struct.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc 
-verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#VOID:]] = OpTypeVoid diff --git a/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-accesschain.ll b/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-accesschain.ll index 7fae6ca2c48cf1..7db1eed84bf7d9 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-accesschain.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-accesschain.ll @@ -1,6 +1,9 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + ; CHECK-DAG: %[[#TYCHAR:]] = OpTypeInt 8 0 ; CHECK-DAG: %[[#TYCHARPTR:]] = OpTypePointer Function %[[#TYCHAR]] ; CHECK-DAG: %[[#TYINT32:]] = OpTypeInt 32 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-load.ll b/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-load.ll index 18752fdf843d20..d6a0071167cef2 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-load.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-load.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#TYLONG:]] = OpTypeInt 32 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-store.ll b/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-store.ll index 202bcfbf2599a9..02641a8e75a55e 100644 --- 
a/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-store.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/bitcast-fix-store.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#TYLONG:]] = OpTypeInt 32 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/complex.ll b/llvm/test/CodeGen/SPIRV/pointers/complex.ll index 6253ef24283b67..3b0974bc084974 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/complex.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/complex.ll @@ -1,7 +1,7 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: OpName %[[#Foo:]] "foo" diff --git a/llvm/test/CodeGen/SPIRV/pointers/custom-kernel-arg-type.ll b/llvm/test/CodeGen/SPIRV/pointers/custom-kernel-arg-type.ll index 4593fad783c60e..db804de26ecf1e 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/custom-kernel-arg-type.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/custom-kernel-arg-type.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[TyInt:.*]] = OpTypeInt 8 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/duplicate-type-ptr-def.ll 
b/llvm/test/CodeGen/SPIRV/pointers/duplicate-type-ptr-def.ll index 8e70bef3a5399a..2f7579aa11abb2 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/duplicate-type-ptr-def.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/duplicate-type-ptr-def.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[#Char:]] = OpTypeInt 8 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll index 7e9c6214c2818a..7a09ac973b590c 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[#INT8:]] = OpTypeInt 8 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll index fc999ba1a3cdac..c822dbc5d6c0ed 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[#FLOAT32:]] = OpTypeFloat 32 diff --git a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-bitcast-load.ll b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-bitcast-load.ll index 
132f10262432b2..1d846a35a65aac 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-bitcast-load.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-bitcast-load.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#INT8:]] = OpTypeInt 8 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-kernel-arg-char.ll b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-kernel-arg-char.ll index d2a65917bfd659..a5e891dae6f11d 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-kernel-arg-char.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-kernel-arg-char.ll @@ -1,5 +1,5 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#INT8:]] = OpTypeInt 8 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll b/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll index d0c64b4353ec68..7982893a0a9135 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll @@ -1,9 +1,9 @@ ; This test is to check that correct virtual register type is created after ptrtoint. 
-; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpName %[[GlobalValue:.*]] "dev_global" diff --git a/llvm/test/CodeGen/SPIRV/pointers/global-zeroinitializer.ll b/llvm/test/CodeGen/SPIRV/pointers/global-zeroinitializer.ll index 679b0e436afc59..74320b36fd597b 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/global-zeroinitializer.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/global-zeroinitializer.ll @@ -1,7 +1,7 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpName %[[#Var:]] "var" diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-builtin-vload-type-discrapency.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-builtin-vload-type-discrapency.ll index b4948b66aed86d..e0e4276bc876ab 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-builtin-vload-type-discrapency.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-builtin-vload-type-discrapency.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc 
-verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#INT8:]] = OpTypeInt 8 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll index a3a730ac67e782..2665923cc4e708 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#INT:]] = OpTypeInt 32 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll index b74a3449980d97..c61902831acf3f 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#IMAGE:]] = OpTypeImage %2 2D 0 0 0 0 Unknown ReadOnly diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-metadata.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-metadata.ll index a513d103970663..bc6b99e01ff212 100644 --- 
a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-metadata.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-metadata.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} define spir_kernel void @test(ptr addrspace(1) %srcimg) { diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll index b8f205a68e5616..e246dac1f5abb9 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#FLOAT32:]] = OpTypeFloat 32 diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-ptr-i8-default-element-type.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-ptr-i8-default-element-type.ll index 55bddfdad699b2..f4c0ca3c46599c 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-ptr-i8-default-element-type.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-ptr-i8-default-element-type.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#CHAR:]] = OpTypeInt 8 diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-ptr-no-bitcast.ll 
b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-ptr-no-bitcast.ll index 0d2a832c496b1b..f8af9ef763e156 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-ptr-no-bitcast.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-ptr-no-bitcast.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#CHAR:]] = OpTypeInt 8 diff --git a/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll b/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll index 1667abc51be9fc..b3c68d22f9bdd4 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[#INT8:]] = OpTypeInt 8 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/nested-struct-opaque-pointers.ll b/llvm/test/CodeGen/SPIRV/pointers/nested-struct-opaque-pointers.ll index 77b895c7762fba..2094f71f60531f 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/nested-struct-opaque-pointers.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/nested-struct-opaque-pointers.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-NOT: OpTypeInt 8 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/ptr-argument-byref.ll b/llvm/test/CodeGen/SPIRV/pointers/ptr-argument-byref.ll index 
639906af3a952f..043af4b5708b0c 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/ptr-argument-byref.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/ptr-argument-byref.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} target triple = "spirv64-unknown-unknown" diff --git a/llvm/test/CodeGen/SPIRV/pointers/ptr-argument-byval.ll b/llvm/test/CodeGen/SPIRV/pointers/ptr-argument-byval.ll index 6b684bf41bbb09..5219be2d4e231a 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/ptr-argument-byval.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/ptr-argument-byval.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#VOID:]] = OpTypeVoid diff --git a/llvm/test/CodeGen/SPIRV/pointers/store-kernel-arg-i8-ptr-as-value-operand.ll b/llvm/test/CodeGen/SPIRV/pointers/store-kernel-arg-i8-ptr-as-value-operand.ll index 5adaf6f65688df..463cc7098b0f3a 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/store-kernel-arg-i8-ptr-as-value-operand.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/store-kernel-arg-i8-ptr-as-value-operand.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#CHAR:]] = OpTypeInt 8 diff --git a/llvm/test/CodeGen/SPIRV/pointers/store-kernel-arg-ptr-as-value-operand.ll b/llvm/test/CodeGen/SPIRV/pointers/store-kernel-arg-ptr-as-value-operand.ll index 
e7ce3ef621e83a..f0821fc0c03124 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/store-kernel-arg-ptr-as-value-operand.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/store-kernel-arg-ptr-as-value-operand.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} define spir_kernel void @foo(ptr addrspace(1) %arg) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 { diff --git a/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll b/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll index 3a0d65e1e95f19..0a8321266eee1d 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; TODO: OpFunctionParameter should be a pointer of struct base type. 
diff --git a/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll b/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll index 6d4913f802c289..03ecf5e8d839a1 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[TyInt64:.*]] = OpTypeInt 64 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll b/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll index 23c3faaf88151f..d3b63ec9e1094c 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#INT:]] = OpTypeInt 32 diff --git a/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll b/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll index 83234e3986c84f..8c01df44563efe 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#float:]] = OpTypeFloat 32 diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-args-rev.ll 
b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-args-rev.ll index ae7fb99907b131..80b0b682266e1d 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-args-rev.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-args-rev.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-DAG: OpName %[[FooArg:.*]] "known_type_ptr" diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-args.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-args.ll index ee411f26466027..b25fe969b0579a 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-args.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-args.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-DAG: OpName %[[FooArg:.*]] "unknown_type_ptr" diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll index f060a97a57296b..dbc88cd1a78592 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-DAG: OpName 
%[[ArgCum:.*]] "_arg_cum" diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-complex.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-complex.ll index ea7a22c31d0e85..4802766ea2145b 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-complex.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-complex.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-DAG: %[[Long:.*]] = OpTypeInt 32 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-rev.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-rev.ll index 76769ab8743082..042909c291d139 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-rev.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-rev.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-DAG: OpName %[[FooArg:.*]] "known_type_ptr" diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call.ll index 8cbf360a2e38d4..c329f05be7627c 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: %if 
spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-DAG: OpName %[[FooArg:.*]] "known_type_ptr" diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll index edb31ffeee8e86..101116f4378114 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-DAG: OpName %[[Foo:.*]] "foo" diff --git a/llvm/test/CodeGen/SPIRV/pointers/typeof-ptr-int.ll b/llvm/test/CodeGen/SPIRV/pointers/typeof-ptr-int.ll index f144418cf54259..a6921a01d5d026 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/typeof-ptr-int.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/typeof-ptr-int.ll @@ -1,7 +1,7 @@ ; This test is to check that two functions have different SPIR-V type ; definitions, even though their LLVM function types are identical. 
-; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: OpName %[[Fun32:.*]] "tp_arg_i32" diff --git a/llvm/test/CodeGen/SPIRV/pointers/variables-storage-class.ll b/llvm/test/CodeGen/SPIRV/pointers/variables-storage-class.ll index 034feed72dc7bc..2d4c805ac9df15 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/variables-storage-class.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/variables-storage-class.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} @Ptr = addrspace(1) global ptr addrspace(1) null diff --git a/llvm/test/CodeGen/SPIRV/transcoding/sub_group_ballot.ll b/llvm/test/CodeGen/SPIRV/transcoding/sub_group_ballot.ll index c579859a3f5314..a815f5d44969c9 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/sub_group_ballot.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/sub_group_ballot.ll @@ -155,7 +155,7 @@ ;; dst[4] = get_sub_group_lt_mask(); ;; } -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV-DAG: OpCapability GroupNonUniformBallot From de2b6cb6ab6472a13c68ddcd963aa2f25e298772 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 22 Aug 2024 09:47:35 +0200 Subject: [PATCH 156/426] [InstCombine] Fold icmp over select of cmp more aggressively (#105536) When folding an icmp into a select, treat an icmp of a constant with a one-use ucmp/scmp intrinsic as a simplification. These comparisons will reduce down to an icmp. 
This addresses a regression seen in Rust and also in llvm-opt-benchmark. --- .../InstCombine/InstCombineCompares.cpp | 10 +++- .../test/Transforms/InstCombine/select-cmp.ll | 54 ++++++++++++++----- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 34c9e0fde4f428..8e8d472a5df1d3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -4209,6 +4209,14 @@ Instruction *InstCombinerImpl::foldSelectICmp(ICmpInst::Predicate Pred, if (Op2) CI = dyn_cast(Op2); + auto Simplifies = [&](Value *Op, unsigned Idx) { + // A comparison of ucmp/scmp with a constant will fold into an icmp. + const APInt *Dummy; + return Op || + (isa(SI->getOperand(Idx)) && + SI->getOperand(Idx)->hasOneUse() && match(RHS, m_APInt(Dummy))); + }; + // We only want to perform this transformation if it will not lead to // additional code. 
This is true if either both sides of the select // fold to a constant (in which case the icmp is replaced with a select @@ -4219,7 +4227,7 @@ Instruction *InstCombinerImpl::foldSelectICmp(ICmpInst::Predicate Pred, bool Transform = false; if (Op1 && Op2) Transform = true; - else if (Op1 || Op2) { + else if (Simplifies(Op1, 1) || Simplifies(Op2, 2)) { // Local case if (SI->hasOneUse()) Transform = true; diff --git a/llvm/test/Transforms/InstCombine/select-cmp.ll b/llvm/test/Transforms/InstCombine/select-cmp.ll index 697010b90db584..234815949d77d4 100644 --- a/llvm/test/Transforms/InstCombine/select-cmp.ll +++ b/llvm/test/Transforms/InstCombine/select-cmp.ll @@ -482,10 +482,9 @@ define i1 @test_select_inverse_nonconst4(i64 %x, i64 %y, i64 %z, i1 %cond) { define i1 @sel_icmp_two_cmp(i1 %c, i32 %a1, i32 %a2, i32 %a3, i32 %a4) { ; CHECK-LABEL: @sel_icmp_two_cmp( -; CHECK-NEXT: [[V1:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A1:%.*]], i32 [[A2:%.*]]) -; CHECK-NEXT: [[V2:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A3:%.*]], i32 [[A4:%.*]]) -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i8 [[V1]], i8 [[V2]] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SEL]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ule i32 [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[A3:%.*]], [[A4:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = select i1 [[C:%.*]], i1 [[CMP1]], i1 [[CMP2]] ; CHECK-NEXT: ret i1 [[CMP]] ; %v1 = call i8 @llvm.ucmp(i32 %a1, i32 %a2) @@ -498,10 +497,10 @@ define i1 @sel_icmp_two_cmp(i1 %c, i32 %a1, i32 %a2, i32 %a3, i32 %a4) { define i1 @sel_icmp_two_cmp_extra_use1(i1 %c, i32 %a1, i32 %a2, i32 %a3, i32 %a4) { ; CHECK-LABEL: @sel_icmp_two_cmp_extra_use1( ; CHECK-NEXT: [[V1:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A1:%.*]], i32 [[A2:%.*]]) -; CHECK-NEXT: [[V2:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A3:%.*]], i32 [[A4:%.*]]) ; CHECK-NEXT: call void @use.i8(i8 [[V1]]) -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i8 [[V1]], i8 [[V2]] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 
[[SEL]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ule i32 [[A1]], [[A2]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[A3:%.*]], [[A4:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = select i1 [[C:%.*]], i1 [[CMP1]], i1 [[CMP2]] ; CHECK-NEXT: ret i1 [[CMP]] ; %v1 = call i8 @llvm.ucmp(i32 %a1, i32 %a2) @@ -544,6 +543,35 @@ define i1 @sel_icmp_two_cmp_not_const(i1 %c, i32 %a1, i32 %a2, i32 %a3, i32 %a4, ret i1 %cmp } +define <2 x i1> @sel_icmp_two_cmp_vec(i1 %c, <2 x i32> %a1, <2 x i32> %a2, <2 x i32> %a3, <2 x i32> %a4) { +; CHECK-LABEL: @sel_icmp_two_cmp_vec( +; CHECK-NEXT: [[CMP1:%.*]] = icmp ule <2 x i32> [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp sle <2 x i32> [[A3:%.*]], [[A4:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = select i1 [[C:%.*]], <2 x i1> [[CMP1]], <2 x i1> [[CMP2]] +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %v1 = call <2 x i8> @llvm.ucmp(<2 x i32> %a1, <2 x i32> %a2) + %v2 = call <2 x i8> @llvm.scmp(<2 x i32> %a3, <2 x i32> %a4) + %sel = select i1 %c, <2 x i8> %v1, <2 x i8> %v2 + %cmp = icmp sle <2 x i8> %sel, zeroinitializer + ret <2 x i1> %cmp +} + +define <2 x i1> @sel_icmp_two_cmp_vec_nonsplat(i1 %c, <2 x i32> %a1, <2 x i32> %a2, <2 x i32> %a3, <2 x i32> %a4) { +; CHECK-LABEL: @sel_icmp_two_cmp_vec_nonsplat( +; CHECK-NEXT: [[V1:%.*]] = call <2 x i8> @llvm.ucmp.v2i8.v2i32(<2 x i32> [[A1:%.*]], <2 x i32> [[A2:%.*]]) +; CHECK-NEXT: [[V2:%.*]] = call <2 x i8> @llvm.scmp.v2i8.v2i32(<2 x i32> [[A3:%.*]], <2 x i32> [[A4:%.*]]) +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], <2 x i8> [[V1]], <2 x i8> [[V2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[SEL]], +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %v1 = call <2 x i8> @llvm.ucmp(<2 x i32> %a1, <2 x i32> %a2) + %v2 = call <2 x i8> @llvm.scmp(<2 x i32> %a3, <2 x i32> %a4) + %sel = select i1 %c, <2 x i8> %v1, <2 x i8> %v2 + %cmp = icmp sle <2 x i8> %sel, + ret <2 x i1> %cmp +} + define i1 @sel_icmp_cmp_and_simplify(i1 %c, i32 %a1, i32 %a2) { ; CHECK-LABEL: @sel_icmp_cmp_and_simplify( ; CHECK-NEXT: [[CMP1:%.*]] = 
icmp ule i32 [[A1:%.*]], [[A2:%.*]] @@ -559,9 +587,9 @@ define i1 @sel_icmp_cmp_and_simplify(i1 %c, i32 %a1, i32 %a2) { define i1 @sel_icmp_cmp_and_no_simplify(i1 %c, i32 %a1, i32 %a2, i8 %b) { ; CHECK-LABEL: @sel_icmp_cmp_and_no_simplify( -; CHECK-NEXT: [[V:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A1:%.*]], i32 [[A2:%.*]]) -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i8 [[V]], i8 [[B:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SEL]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ule i32 [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i8 [[B:%.*]], 1 +; CHECK-NEXT: [[CMP:%.*]] = select i1 [[C:%.*]], i1 [[CMP1]], i1 [[CMP2]] ; CHECK-NEXT: ret i1 [[CMP]] ; %v = call i8 @llvm.ucmp(i32 %a1, i32 %a2) @@ -572,9 +600,9 @@ define i1 @sel_icmp_cmp_and_no_simplify(i1 %c, i32 %a1, i32 %a2, i8 %b) { define i1 @sel_icmp_cmp_and_no_simplify_comm(i1 %c, i32 %a1, i32 %a2, i8 %b) { ; CHECK-LABEL: @sel_icmp_cmp_and_no_simplify_comm( -; CHECK-NEXT: [[V:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A1:%.*]], i32 [[A2:%.*]]) -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i8 [[B:%.*]], i8 [[V]] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SEL]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i8 [[B:%.*]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp ule i32 [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = select i1 [[C:%.*]], i1 [[CMP1]], i1 [[CMP2]] ; CHECK-NEXT: ret i1 [[CMP]] ; %v = call i8 @llvm.ucmp(i32 %a1, i32 %a2) From e3389365b5d62bc9781dc9a23b14d72e333018d7 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 22 Aug 2024 09:48:05 +0200 Subject: [PATCH 157/426] Build SanitizerCommon if ctx_profile enabled (#105495) ctx_profile has a dependency on SanitizerCommon, so make sure it is built even if we otherwise disable sanitizers. 
--- compiler-rt/lib/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/CMakeLists.txt b/compiler-rt/lib/CMakeLists.txt index 22f9b3ea8a0c35..e6158ec4088951 100644 --- a/compiler-rt/lib/CMakeLists.txt +++ b/compiler-rt/lib/CMakeLists.txt @@ -9,7 +9,7 @@ include(SanitizerUtils) # #TODO: Refactor sanitizer_common into smaller pieces (e.g. flag parsing, utils). if (COMPILER_RT_HAS_SANITIZER_COMMON AND - (COMPILER_RT_BUILD_SANITIZERS OR COMPILER_RT_BUILD_XRAY OR COMPILER_RT_BUILD_MEMPROF)) + (COMPILER_RT_BUILD_SANITIZERS OR COMPILER_RT_BUILD_XRAY OR COMPILER_RT_BUILD_MEMPROF OR COMPILER_RT_BUILD_CTX_PROFILE)) add_subdirectory(sanitizer_common) endif() From c79d1fa540390f6e37e1ea326153559eeadd0de6 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Thu, 22 Aug 2024 09:51:16 +0200 Subject: [PATCH 158/426] [clang][bytecode] Don't discard all void-typed expressions (#105625) For void-types InitListExprs, we need to diagnose them as invalid. But only if we are _not_ discarding. --- clang/lib/AST/ByteCode/Compiler.cpp | 23 +++++++++-------------- clang/test/AST/ByteCode/literals.cpp | 1 + 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 6d05f75131640a..10f3222726fd43 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -1318,15 +1318,6 @@ bool Compiler::VisitArraySubscriptExpr(const ArraySubscriptExpr *E) { template bool Compiler::visitInitList(ArrayRef Inits, const Expr *ArrayFiller, const Expr *E) { - - QualType QT = E->getType(); - - if (const auto *AT = QT->getAs()) - QT = AT->getValueType(); - - if (QT->isVoidType()) - return this->emitInvalid(E); - // Handle discarding first. 
if (DiscardResult) { for (const Expr *Init : Inits) { @@ -1336,6 +1327,13 @@ bool Compiler::visitInitList(ArrayRef Inits, return true; } + QualType QT = E->getType(); + if (const auto *AT = QT->getAs()) + QT = AT->getValueType(); + + if (QT->isVoidType()) + return this->emitInvalid(E); + // Primitive values. if (std::optional T = classify(QT)) { assert(!DiscardResult); @@ -3251,12 +3249,9 @@ template bool Compiler::visit(const Expr *E) { if (E->getType().isNull()) return false; - if (E->getType()->isVoidType()) - return this->discard(E); - // Create local variable to hold the return value. - if (!E->isGLValue() && !E->getType()->isAnyComplexType() && - !classify(E->getType())) { + if (!E->getType()->isVoidType() && !E->isGLValue() && + !E->getType()->isAnyComplexType() && !classify(E->getType())) { std::optional LocalIndex = allocateLocal(E); if (!LocalIndex) return false; diff --git a/clang/test/AST/ByteCode/literals.cpp b/clang/test/AST/ByteCode/literals.cpp index a46f6ed747ec2f..2329d4d973f01d 100644 --- a/clang/test/AST/ByteCode/literals.cpp +++ b/clang/test/AST/ByteCode/literals.cpp @@ -46,6 +46,7 @@ static_assert(Failed2 == 0, ""); // both-error {{not an integral constant expres // both-note {{initializer of 'Failed2' is not a constant expression}} const int x = *(volatile int*)0x1234; +static_assert((void{}, true), ""); namespace ScalarTypes { constexpr int ScalarInitInt = int(); From fab515ca7f3c64b47dd94a92156a4696771ee22a Mon Sep 17 00:00:00 2001 From: Andrei Safronov Date: Thu, 22 Aug 2024 11:34:21 +0300 Subject: [PATCH 159/426] [Xtensa] Implement lowering Mul/Div/Shift operations. (#99981) Implement lowering of the Mul/Div operations and also shift parts operations. Implement lowering of the bit manipulations, like ROT/SWAP/CTPOP/CTTZ/CTLZ. 
--- llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp | 80 ++- llvm/lib/Target/Xtensa/XtensaISelLowering.cpp | 172 ++++- llvm/lib/Target/Xtensa/XtensaISelLowering.h | 26 +- llvm/lib/Target/Xtensa/XtensaInstrInfo.td | 3 +- llvm/lib/Target/Xtensa/XtensaOperators.td | 13 + llvm/test/CodeGen/Xtensa/bswap.ll | 404 +++++++++++ llvm/test/CodeGen/Xtensa/ctlz-cttz-ctpop.ll | 176 +++++ llvm/test/CodeGen/Xtensa/div.ll | 486 +++++++++++++ llvm/test/CodeGen/Xtensa/mul.ll | 672 ++++++++++++++++++ llvm/test/CodeGen/Xtensa/rotl-rotr.ll | 501 +++++++++++++ llvm/test/CodeGen/Xtensa/shift.ll | 172 +++++ 11 files changed, 2697 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/Xtensa/bswap.ll create mode 100644 llvm/test/CodeGen/Xtensa/ctlz-cttz-ctpop.ll create mode 100644 llvm/test/CodeGen/Xtensa/div.ll create mode 100644 llvm/test/CodeGen/Xtensa/mul.ll create mode 100644 llvm/test/CodeGen/Xtensa/rotl-rotr.ll create mode 100644 llvm/test/CodeGen/Xtensa/shift.ll diff --git a/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp b/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp index 145f2850363589..6f6d3342fcd7f2 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp +++ b/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp @@ -67,7 +67,7 @@ class XtensaDAGToDAGISel : public SelectionDAGISel { Valid = isValidAddrOffset(Scale, OffsetVal); if (Valid) { - // If the first operand is a FI, get the TargetFI Node + // If the first operand is a FI, get the TargetFI Node. if (FrameIndexSDNode *FIN = dyn_cast(Addr.getOperand(0))) Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy); @@ -125,6 +125,7 @@ FunctionPass *llvm::createXtensaISelDag(XtensaTargetMachine &TM, void XtensaDAGToDAGISel::Select(SDNode *Node) { SDLoc DL(Node); + EVT VT = Node->getValueType(0); // If we have a custom node, we already have selected! 
if (Node->isMachineOpcode()) { @@ -132,5 +133,82 @@ void XtensaDAGToDAGISel::Select(SDNode *Node) { return; } + switch (Node->getOpcode()) { + case ISD::SHL: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + auto *C = dyn_cast(N1); + // If C is constant in range [1..31] then we can generate SLLI + // instruction using pattern matching, otherwise generate SLL. + if (!C || C->isZero()) { + SDNode *SSL = CurDAG->getMachineNode(Xtensa::SSL, DL, MVT::Glue, N1); + SDNode *SLL = + CurDAG->getMachineNode(Xtensa::SLL, DL, VT, N0, SDValue(SSL, 0)); + ReplaceNode(Node, SLL); + return; + } + break; + } + case ISD::SRL: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + auto *C = dyn_cast(N1); + + // If C is constant then we can generate SRLI + // instruction using pattern matching or EXTUI, otherwise generate SRL. + if (C) { + if (isUInt<4>(C->getZExtValue())) + break; + unsigned ShAmt = C->getZExtValue(); + SDNode *EXTUI = CurDAG->getMachineNode( + Xtensa::EXTUI, DL, VT, N0, CurDAG->getTargetConstant(ShAmt, DL, VT), + CurDAG->getTargetConstant(32 - ShAmt, DL, VT)); + ReplaceNode(Node, EXTUI); + return; + } + + SDNode *SSR = CurDAG->getMachineNode(Xtensa::SSR, DL, MVT::Glue, N1); + SDNode *SRL = + CurDAG->getMachineNode(Xtensa::SRL, DL, VT, N0, SDValue(SSR, 0)); + ReplaceNode(Node, SRL); + return; + } + case ISD::SRA: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + auto *C = dyn_cast(N1); + // If C is constant then we can generate SRAI + // instruction using pattern matching, otherwise generate SRA. 
+ if (!C) { + SDNode *SSR = CurDAG->getMachineNode(Xtensa::SSR, DL, MVT::Glue, N1); + SDNode *SRA = + CurDAG->getMachineNode(Xtensa::SRA, DL, VT, N0, SDValue(SSR, 0)); + ReplaceNode(Node, SRA); + return; + } + break; + } + case XtensaISD::SRCL: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + SDValue N2 = Node->getOperand(2); + SDNode *SSL = CurDAG->getMachineNode(Xtensa::SSL, DL, MVT::Glue, N2); + SDNode *SRC = + CurDAG->getMachineNode(Xtensa::SRC, DL, VT, N0, N1, SDValue(SSL, 0)); + ReplaceNode(Node, SRC); + return; + } + case XtensaISD::SRCR: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + SDValue N2 = Node->getOperand(2); + SDNode *SSR = CurDAG->getMachineNode(Xtensa::SSR, DL, MVT::Glue, N2); + SDNode *SRC = + CurDAG->getMachineNode(Xtensa::SRC, DL, VT, N0, N1, SDValue(SSR, 0)); + ReplaceNode(Node, SRC); + return; + } + } + SelectCode(Node); } diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp index 80d01d662a2217..c7675c2f501761 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -98,6 +99,32 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM, setCondCodeAction(ISD::SETUGT, MVT::i32, Expand); setCondCodeAction(ISD::SETULE, MVT::i32, Expand); + setOperationAction(ISD::MUL, MVT::i32, Expand); + setOperationAction(ISD::MULHU, MVT::i32, Expand); + setOperationAction(ISD::MULHS, MVT::i32, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + + setOperationAction(ISD::SDIV, MVT::i32, Expand); + 
setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); + + setOperationAction(ISD::BSWAP, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTR, MVT::i32, Expand); + setOperationAction(ISD::CTPOP, MVT::i32, Custom); + setOperationAction(ISD::CTTZ, MVT::i32, Expand); + setOperationAction(ISD::CTLZ, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + // Implement custom stack allocations setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); // Implement custom stack save and restore @@ -629,8 +656,12 @@ SDValue XtensaTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Table.getValueType(); unsigned EntrySize = MJTI->getEntrySize(TD); - Index = DAG.getNode(ISD::MUL, DL, Index.getValueType(), Index, - DAG.getConstant(EntrySize, DL, Index.getValueType())); + assert((MJTI->getEntrySize(TD) == 4) && "Unsupported jump-table entry size"); + + Index = DAG.getNode( + ISD::SHL, DL, Index.getValueType(), Index, + DAG.getConstant(Log2_32(EntrySize), DL, Index.getValueType())); + SDValue Addr = DAG.getNode(ISD::ADD, DL, Index.getValueType(), Index, Table); SDValue LD = DAG.getLoad(PtrVT, DL, Chain, Addr, @@ -662,10 +693,12 @@ SDValue XtensaTargetLowering::getAddrPCRel(SDValue Op, return DAG.getNode(XtensaISD::PCREL_WRAPPER, DL, Ty, Op); } -SDValue XtensaTargetLowering::LowerConstantPool(ConstantPoolSDNode *CP, +SDValue XtensaTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(DAG.getDataLayout()); 
+ EVT PtrVT = Op.getValueType(); + ConstantPoolSDNode *CP = cast(Op); SDValue Result; + if (!CP->isMachineConstantPoolEntry()) { Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset()); @@ -713,6 +746,119 @@ SDValue XtensaTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, return DAG.getMergeValues(Ops, DL); } +SDValue XtensaTargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MVT VT = MVT::i32; + SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1); + SDValue Shamt = Op.getOperand(2); + + // if Shamt - register size < 0: // Shamt < register size + // Lo = Lo << Shamt + // Hi = (Hi << Shamt) | (Lo >>u (register size - Shamt)) + // else: + // Lo = 0 + // Hi = Lo << (Shamt - register size) + + SDValue MinusRegisterSize = DAG.getConstant(-32, DL, VT); + SDValue ShamtMinusRegisterSize = + DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusRegisterSize); + + SDValue LoTrue = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt); + SDValue HiTrue = DAG.getNode(XtensaISD::SRCL, DL, VT, Hi, Lo, Shamt); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue HiFalse = DAG.getNode(ISD::SHL, DL, VT, Lo, ShamtMinusRegisterSize); + + SDValue Cond = DAG.getSetCC(DL, VT, ShamtMinusRegisterSize, Zero, ISD::SETLT); + Lo = DAG.getNode(ISD::SELECT, DL, VT, Cond, LoTrue, Zero); + Hi = DAG.getNode(ISD::SELECT, DL, VT, Cond, HiTrue, HiFalse); + + return DAG.getMergeValues({Lo, Hi}, DL); +} + +SDValue XtensaTargetLowering::LowerShiftRightParts(SDValue Op, + SelectionDAG &DAG, + bool IsSRA) const { + SDLoc DL(Op); + SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1); + SDValue Shamt = Op.getOperand(2); + MVT VT = MVT::i32; + + // SRA expansion: + // if Shamt - register size < 0: // Shamt < register size + // Lo = (Lo >>u Shamt) | (Hi << u (register size - Shamt)) + // Hi = Hi >>s Shamt + // else: + // Lo = Hi >>s (Shamt - register size); + // Hi = Hi >>s (register size - 1) + // + // SRL expansion: + // if Shamt - register size < 
0: // Shamt < register size + // Lo = (Lo >>u Shamt) | (Hi << u (register size - Shamt)) + // Hi = Hi >>u Shamt + // else: + // Lo = Hi >>u (Shamt - register size); + // Hi = 0; + + unsigned ShiftRightOp = IsSRA ? ISD::SRA : ISD::SRL; + SDValue MinusRegisterSize = DAG.getConstant(-32, DL, VT); + SDValue RegisterSizeMinus1 = DAG.getConstant(32 - 1, DL, VT); + SDValue ShamtMinusRegisterSize = + DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusRegisterSize); + + SDValue LoTrue = DAG.getNode(XtensaISD::SRCR, DL, VT, Hi, Lo, Shamt); + SDValue HiTrue = DAG.getNode(ShiftRightOp, DL, VT, Hi, Shamt); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue LoFalse = + DAG.getNode(ShiftRightOp, DL, VT, Hi, ShamtMinusRegisterSize); + SDValue HiFalse; + + if (IsSRA) { + HiFalse = DAG.getNode(ShiftRightOp, DL, VT, Hi, RegisterSizeMinus1); + } else { + HiFalse = Zero; + } + + SDValue Cond = DAG.getSetCC(DL, VT, ShamtMinusRegisterSize, Zero, ISD::SETLT); + Lo = DAG.getNode(ISD::SELECT, DL, VT, Cond, LoTrue, LoFalse); + Hi = DAG.getNode(ISD::SELECT, DL, VT, Cond, HiTrue, HiFalse); + + return DAG.getMergeValues({Lo, Hi}, DL); +} + +SDValue XtensaTargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { + auto &TLI = DAG.getTargetLoweringInfo(); + return TLI.expandCTPOP(Op.getNode(), DAG); +} + +bool XtensaTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, + SDValue C) const { + APInt Imm; + unsigned EltSizeInBits; + + if (ISD::isConstantSplatVector(C.getNode(), Imm)) { + EltSizeInBits = VT.getScalarSizeInBits(); + } else if (VT.isScalarInteger()) { + EltSizeInBits = VT.getSizeInBits(); + if (auto *ConstNode = dyn_cast(C.getNode())) + Imm = ConstNode->getAPIntValue(); + else + return false; + } else { + return false; + } + + // Omit if data size exceeds. + if (EltSizeInBits > 32) + return false; + + // Convert MULT to LSL. 
+ if (Imm.isPowerOf2() && Imm.isIntN(5)) + return true; + + return false; +} + SDValue XtensaTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -726,8 +872,10 @@ SDValue XtensaTargetLowering::LowerOperation(SDValue Op, return LowerBlockAddress(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); + case ISD::CTPOP: + return LowerCTPOP(Op, DAG); case ISD::ConstantPool: - return LowerConstantPool(cast(Op), DAG); + return LowerConstantPool(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::STACKSAVE: @@ -736,6 +884,12 @@ SDValue XtensaTargetLowering::LowerOperation(SDValue Op, return LowerSTACKRESTORE(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::SHL_PARTS: + return LowerShiftLeftParts(Op, DAG); + case ISD::SRA_PARTS: + return LowerShiftRightParts(Op, DAG, true); + case ISD::SRL_PARTS: + return LowerShiftRightParts(Op, DAG, false); default: report_fatal_error("Unexpected node to lower"); } @@ -747,12 +901,18 @@ const char *XtensaTargetLowering::getTargetNodeName(unsigned Opcode) const { return "XtensaISD::BR_JT"; case XtensaISD::CALL: return "XtensaISD::CALL"; + case XtensaISD::EXTUI: + return "XtensaISD::EXTUI"; case XtensaISD::PCREL_WRAPPER: return "XtensaISD::PCREL_WRAPPER"; case XtensaISD::RET: return "XtensaISD::RET"; case XtensaISD::SELECT_CC: return "XtensaISD::SELECT_CC"; + case XtensaISD::SRCL: + return "XtensaISD::SRCL"; + case XtensaISD::SRCR: + return "XtensaISD::SRCR"; } return nullptr; } @@ -827,6 +987,8 @@ XtensaTargetLowering::emitSelectCC(MachineInstr &MI, MachineBasicBlock *XtensaTargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *MBB) const { + DebugLoc DL = MI.getDebugLoc(); + switch (MI.getOpcode()) { case Xtensa::SELECT: return emitSelectCC(MI, MBB); diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.h b/llvm/lib/Target/Xtensa/XtensaISelLowering.h index dd811ae9f3a774..8e7346b40dfe59 
100644 --- a/llvm/lib/Target/Xtensa/XtensaISelLowering.h +++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.h @@ -30,6 +30,11 @@ enum { // There is an optional glue operand at the end. CALL, + // Extract unsigned immediate. Operand 0 is value, operand 1 + // is bit position of the field [0..31], operand 2 is bit size + // of the field [1..16] + EXTUI, + // Wraps a TargetGlobalAddress that should be loaded using PC-relative // accesses. Operand 0 is the address. PCREL_WRAPPER, @@ -40,6 +45,12 @@ enum { // the lhs and rhs (ops #0 and #1) of a conditional expression with the // condition code in op #4 SELECT_CC, + + // SRCL(R) performs shift left(right) of the concatenation of 2 registers + // and returns high(low) 32-bit part of 64-bit result + SRCL, + // Shift Right Combined + SRCR, }; } @@ -50,6 +61,10 @@ class XtensaTargetLowering : public TargetLowering { explicit XtensaTargetLowering(const TargetMachine &TM, const XtensaSubtarget &STI); + MVT getScalarShiftAmountTy(const DataLayout &, EVT LHSTy) const override { + return LHSTy.getSizeInBits() <= 32 ? 
MVT::i32 : MVT::i64; + } + EVT getSetCCResultType(const DataLayout &, LLVMContext &, EVT VT) const override { if (!VT.isVector()) @@ -82,6 +97,9 @@ class XtensaTargetLowering : public TargetLowering { const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; + bool decomposeMulByConstant(LLVMContext &Context, EVT VT, + SDValue C) const override; + const XtensaSubtarget &getSubtarget() const { return Subtarget; } MachineBasicBlock * @@ -101,7 +119,9 @@ class XtensaTargetLowering : public TargetLowering { SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerConstantPool(ConstantPoolSDNode *CP, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; @@ -111,6 +131,10 @@ class XtensaTargetLowering : public TargetLowering { SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const; + SDValue getAddrPCRel(SDValue Op, SelectionDAG &DAG) const; CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td index fc134e794153b6..0d01864b54bc38 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td @@ -138,7 +138,8 @@ let Defs = [SAR] in { } def EXTUI : RRR_Inst<0x00, 0x04, 0x00, (outs AR:$r), (ins AR:$t, uimm5:$imm1, imm1_16:$imm2), - "extui\t$r, $t, $imm1, $imm2", []> { + "extui\t$r, $t, $imm1, $imm2", + [(set AR:$r, (Xtensa_extui AR:$t, uimm5:$imm1, imm1_16:$imm2))]> { bits<5> imm1; bits<4> imm2; diff --git a/llvm/lib/Target/Xtensa/XtensaOperators.td b/llvm/lib/Target/Xtensa/XtensaOperators.td index 93cd1c933dbde6..3dd73b44f336a5 100644 --- 
a/llvm/lib/Target/Xtensa/XtensaOperators.td +++ b/llvm/lib/Target/Xtensa/XtensaOperators.td @@ -24,6 +24,13 @@ def SDT_XtensaSelectCC : SDTypeProfile<1, 5, [SDTCisSameAs<0, 1>, SDTCisSameAs<2, 3>, SDTCisVT<5, i32>]>; + +def SDT_XtensaSRC : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; + +def SDT_XtensaEXTUI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; + //===----------------------------------------------------------------------===// // Node definitions //===----------------------------------------------------------------------===// @@ -46,3 +53,9 @@ def Xtensa_brjt: SDNode<"XtensaISD::BR_JT", SDT_XtensaBrJT, [SDNPHasChain]>; def Xtensa_select_cc: SDNode<"XtensaISD::SELECT_CC", SDT_XtensaSelectCC, [SDNPInGlue]>; + +def Xtensa_srcl: SDNode<"XtensaISD::SRCL", SDT_XtensaSRC>; + +def Xtensa_srcr: SDNode<"XtensaISD::SRCR", SDT_XtensaSRC>; + +def Xtensa_extui: SDNode<"XtensaISD::EXTUI", SDT_XtensaEXTUI>; diff --git a/llvm/test/CodeGen/Xtensa/bswap.ll b/llvm/test/CodeGen/Xtensa/bswap.ll new file mode 100644 index 00000000000000..6a87aa84351cf2 --- /dev/null +++ b/llvm/test/CodeGen/Xtensa/bswap.ll @@ -0,0 +1,404 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=xtensa -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=XTENSA %s + +declare i16 @llvm.bswap.i16(i16) +declare i32 @llvm.bswap.i32(i32) +declare i64 @llvm.bswap.i64(i64) +declare i8 @llvm.bitreverse.i8(i8) +declare i16 @llvm.bitreverse.i16(i16) +declare i32 @llvm.bitreverse.i32(i32) +declare i64 @llvm.bitreverse.i64(i64) + +define i16 @test_bswap_i16(i16 %a) nounwind { +; XTENSA-LABEL: test_bswap_i16: +; XTENSA: l32r a8, .LCPI0_0 +; XTENSA-NEXT: and a8, a2, a8 +; XTENSA-NEXT: srli a8, a8, 8 +; XTENSA-NEXT: slli a9, a2, 8 +; XTENSA-NEXT: or a2, a9, a8 +; XTENSA-NEXT: ret + %tmp = call i16 @llvm.bswap.i16(i16 %a) + ret i16 %tmp 
+} + +define i32 @test_bswap_i32(i32 %a) nounwind { +; XTENSA-LABEL: test_bswap_i32: +; XTENSA: srli a8, a2, 8 +; XTENSA-NEXT: l32r a9, .LCPI1_0 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: extui a10, a2, 24, 8 +; XTENSA-NEXT: or a8, a8, a10 +; XTENSA-NEXT: and a9, a2, a9 +; XTENSA-NEXT: slli a9, a9, 8 +; XTENSA-NEXT: slli a10, a2, 24 +; XTENSA-NEXT: or a9, a10, a9 +; XTENSA-NEXT: or a2, a9, a8 +; XTENSA-NEXT: ret + %tmp = call i32 @llvm.bswap.i32(i32 %a) + ret i32 %tmp +} + +define i64 @test_bswap_i64(i64 %a) nounwind { +; XTENSA-LABEL: test_bswap_i64: +; XTENSA: srli a8, a3, 8 +; XTENSA-NEXT: l32r a9, .LCPI2_0 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: extui a10, a3, 24, 8 +; XTENSA-NEXT: or a8, a8, a10 +; XTENSA-NEXT: and a10, a3, a9 +; XTENSA-NEXT: slli a10, a10, 8 +; XTENSA-NEXT: slli a11, a3, 24 +; XTENSA-NEXT: or a10, a11, a10 +; XTENSA-NEXT: or a8, a10, a8 +; XTENSA-NEXT: srli a10, a2, 8 +; XTENSA-NEXT: and a10, a10, a9 +; XTENSA-NEXT: extui a11, a2, 24, 8 +; XTENSA-NEXT: or a10, a10, a11 +; XTENSA-NEXT: and a9, a2, a9 +; XTENSA-NEXT: slli a9, a9, 8 +; XTENSA-NEXT: slli a11, a2, 24 +; XTENSA-NEXT: or a9, a11, a9 +; XTENSA-NEXT: or a3, a9, a10 +; XTENSA-NEXT: or a2, a8, a8 +; XTENSA-NEXT: ret + %tmp = call i64 @llvm.bswap.i64(i64 %a) + ret i64 %tmp +} + +define i8 @test_bitreverse_i8(i8 %a) nounwind { +; XTENSA-LABEL: test_bitreverse_i8: +; XTENSA: movi a8, 15 +; XTENSA-NEXT: and a8, a2, a8 +; XTENSA-NEXT: slli a8, a8, 4 +; XTENSA-NEXT: movi a9, 240 +; XTENSA-NEXT: and a9, a2, a9 +; XTENSA-NEXT: srli a9, a9, 4 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: srli a9, a8, 2 +; XTENSA-NEXT: movi a10, 51 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: slli a8, a8, 2 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: srli a9, a8, 1 +; XTENSA-NEXT: movi a10, 85 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: slli a8, a8, 1 +; XTENSA-NEXT: or a2, a9, a8 +; XTENSA-NEXT: ret + %tmp = call i8 
@llvm.bitreverse.i8(i8 %a) + ret i8 %tmp +} + +define i16 @test_bitreverse_i16(i16 %a) nounwind { +; XTENSA-LABEL: test_bitreverse_i16: +; XTENSA: l32r a8, .LCPI4_0 +; XTENSA-NEXT: and a8, a2, a8 +; XTENSA-NEXT: srli a8, a8, 8 +; XTENSA-NEXT: slli a9, a2, 8 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: srli a9, a8, 4 +; XTENSA-NEXT: l32r a10, .LCPI4_1 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: slli a8, a8, 4 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: srli a9, a8, 2 +; XTENSA-NEXT: l32r a10, .LCPI4_2 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: slli a8, a8, 2 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: srli a9, a8, 1 +; XTENSA-NEXT: l32r a10, .LCPI4_3 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: slli a8, a8, 1 +; XTENSA-NEXT: or a2, a9, a8 +; XTENSA-NEXT: ret + %tmp = call i16 @llvm.bitreverse.i16(i16 %a) + ret i16 %tmp +} + +define i32 @test_bitreverse_i32(i32 %a) nounwind { +; XTENSA-LABEL: test_bitreverse_i32: +; XTENSA: srli a8, a2, 8 +; XTENSA-NEXT: l32r a9, .LCPI5_0 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: extui a10, a2, 24, 8 +; XTENSA-NEXT: or a8, a8, a10 +; XTENSA-NEXT: and a9, a2, a9 +; XTENSA-NEXT: slli a9, a9, 8 +; XTENSA-NEXT: slli a10, a2, 24 +; XTENSA-NEXT: or a9, a10, a9 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: srli a9, a8, 4 +; XTENSA-NEXT: l32r a10, .LCPI5_1 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: slli a8, a8, 4 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: srli a9, a8, 2 +; XTENSA-NEXT: l32r a10, .LCPI5_2 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: slli a8, a8, 2 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: srli a9, a8, 1 +; XTENSA-NEXT: l32r a10, .LCPI5_3 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: slli a8, a8, 1 +; XTENSA-NEXT: or a2, a9, a8 +; XTENSA-NEXT: ret + %tmp = call i32 @llvm.bitreverse.i32(i32 %a) + ret 
i32 %tmp +} + +define i64 @test_bitreverse_i64(i64 %a) nounwind { +; XTENSA-LABEL: test_bitreverse_i64: +; XTENSA: srli a8, a3, 8 +; XTENSA-NEXT: l32r a9, .LCPI6_0 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: extui a10, a3, 24, 8 +; XTENSA-NEXT: or a8, a8, a10 +; XTENSA-NEXT: and a10, a3, a9 +; XTENSA-NEXT: slli a10, a10, 8 +; XTENSA-NEXT: slli a11, a3, 24 +; XTENSA-NEXT: or a10, a11, a10 +; XTENSA-NEXT: or a8, a10, a8 +; XTENSA-NEXT: srli a10, a8, 4 +; XTENSA-NEXT: l32r a11, .LCPI6_1 +; XTENSA-NEXT: and a10, a10, a11 +; XTENSA-NEXT: and a8, a8, a11 +; XTENSA-NEXT: slli a8, a8, 4 +; XTENSA-NEXT: or a8, a10, a8 +; XTENSA-NEXT: srli a10, a8, 2 +; XTENSA-NEXT: l32r a7, .LCPI6_2 +; XTENSA-NEXT: and a10, a10, a7 +; XTENSA-NEXT: and a8, a8, a7 +; XTENSA-NEXT: slli a8, a8, 2 +; XTENSA-NEXT: or a8, a10, a8 +; XTENSA-NEXT: srli a10, a8, 1 +; XTENSA-NEXT: l32r a6, .LCPI6_3 +; XTENSA-NEXT: and a10, a10, a6 +; XTENSA-NEXT: and a8, a8, a6 +; XTENSA-NEXT: slli a8, a8, 1 +; XTENSA-NEXT: or a8, a10, a8 +; XTENSA-NEXT: srli a10, a2, 8 +; XTENSA-NEXT: and a10, a10, a9 +; XTENSA-NEXT: extui a5, a2, 24, 8 +; XTENSA-NEXT: or a10, a10, a5 +; XTENSA-NEXT: and a9, a2, a9 +; XTENSA-NEXT: slli a9, a9, 8 +; XTENSA-NEXT: slli a5, a2, 24 +; XTENSA-NEXT: or a9, a5, a9 +; XTENSA-NEXT: or a9, a9, a10 +; XTENSA-NEXT: srli a10, a9, 4 +; XTENSA-NEXT: and a10, a10, a11 +; XTENSA-NEXT: and a9, a9, a11 +; XTENSA-NEXT: slli a9, a9, 4 +; XTENSA-NEXT: or a9, a10, a9 +; XTENSA-NEXT: srli a10, a9, 2 +; XTENSA-NEXT: and a10, a10, a7 +; XTENSA-NEXT: and a9, a9, a7 +; XTENSA-NEXT: slli a9, a9, 2 +; XTENSA-NEXT: or a9, a10, a9 +; XTENSA-NEXT: srli a10, a9, 1 +; XTENSA-NEXT: and a10, a10, a6 +; XTENSA-NEXT: and a9, a9, a6 +; XTENSA-NEXT: slli a9, a9, 1 +; XTENSA-NEXT: or a3, a10, a9 +; XTENSA-NEXT: or a2, a8, a8 +; XTENSA-NEXT: ret + %tmp = call i64 @llvm.bitreverse.i64(i64 %a) + ret i64 %tmp +} + +define i16 @test_bswap_bitreverse_i16(i16 %a) nounwind { +; XTENSA-LABEL: test_bswap_bitreverse_i16: +; XTENSA: 
srli a8, a2, 4 +; XTENSA-NEXT: l32r a9, .LCPI7_0 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: and a9, a2, a9 +; XTENSA-NEXT: slli a9, a9, 4 +; XTENSA-NEXT: or a8, a8, a9 +; XTENSA-NEXT: srli a9, a8, 2 +; XTENSA-NEXT: l32r a10, .LCPI7_1 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: slli a8, a8, 2 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: srli a9, a8, 1 +; XTENSA-NEXT: l32r a10, .LCPI7_2 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: slli a8, a8, 1 +; XTENSA-NEXT: or a2, a9, a8 +; XTENSA-NEXT: ret + %tmp = call i16 @llvm.bswap.i16(i16 %a) + %tmp2 = call i16 @llvm.bitreverse.i16(i16 %tmp) + ret i16 %tmp2 +} + +define i32 @test_bswap_bitreverse_i32(i32 %a) nounwind { +; XTENSA-LABEL: test_bswap_bitreverse_i32: +; XTENSA: srli a8, a2, 4 +; XTENSA-NEXT: l32r a9, .LCPI8_0 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: and a9, a2, a9 +; XTENSA-NEXT: slli a9, a9, 4 +; XTENSA-NEXT: or a8, a8, a9 +; XTENSA-NEXT: srli a9, a8, 2 +; XTENSA-NEXT: l32r a10, .LCPI8_1 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: slli a8, a8, 2 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: srli a9, a8, 1 +; XTENSA-NEXT: l32r a10, .LCPI8_2 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: slli a8, a8, 1 +; XTENSA-NEXT: or a2, a9, a8 +; XTENSA-NEXT: ret + %tmp = call i32 @llvm.bswap.i32(i32 %a) + %tmp2 = call i32 @llvm.bitreverse.i32(i32 %tmp) + ret i32 %tmp2 +} + +define i64 @test_bswap_bitreverse_i64(i64 %a) nounwind { +; XTENSA-LABEL: test_bswap_bitreverse_i64: +; XTENSA: srli a8, a2, 4 +; XTENSA-NEXT: l32r a9, .LCPI9_0 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: and a10, a2, a9 +; XTENSA-NEXT: slli a10, a10, 4 +; XTENSA-NEXT: or a8, a8, a10 +; XTENSA-NEXT: srli a10, a8, 2 +; XTENSA-NEXT: l32r a11, .LCPI9_1 +; XTENSA-NEXT: and a10, a10, a11 +; XTENSA-NEXT: and a8, a8, a11 +; XTENSA-NEXT: slli a8, a8, 2 +; XTENSA-NEXT: or a8, a10, a8 +; XTENSA-NEXT: srli a10, 
a8, 1 +; XTENSA-NEXT: l32r a7, .LCPI9_2 +; XTENSA-NEXT: and a10, a10, a7 +; XTENSA-NEXT: and a8, a8, a7 +; XTENSA-NEXT: slli a8, a8, 1 +; XTENSA-NEXT: or a2, a10, a8 +; XTENSA-NEXT: srli a8, a3, 4 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: and a9, a3, a9 +; XTENSA-NEXT: slli a9, a9, 4 +; XTENSA-NEXT: or a8, a8, a9 +; XTENSA-NEXT: srli a9, a8, 2 +; XTENSA-NEXT: and a9, a9, a11 +; XTENSA-NEXT: and a8, a8, a11 +; XTENSA-NEXT: slli a8, a8, 2 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: srli a9, a8, 1 +; XTENSA-NEXT: and a9, a9, a7 +; XTENSA-NEXT: and a8, a8, a7 +; XTENSA-NEXT: slli a8, a8, 1 +; XTENSA-NEXT: or a3, a9, a8 +; XTENSA-NEXT: ret + %tmp = call i64 @llvm.bswap.i64(i64 %a) + %tmp2 = call i64 @llvm.bitreverse.i64(i64 %tmp) + ret i64 %tmp2 +} + +define i16 @test_bitreverse_bswap_i16(i16 %a) nounwind { +; XTENSA-LABEL: test_bitreverse_bswap_i16: +; XTENSA: srli a8, a2, 4 +; XTENSA-NEXT: l32r a9, .LCPI10_0 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: and a9, a2, a9 +; XTENSA-NEXT: slli a9, a9, 4 +; XTENSA-NEXT: or a8, a8, a9 +; XTENSA-NEXT: srli a9, a8, 2 +; XTENSA-NEXT: l32r a10, .LCPI10_1 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: slli a8, a8, 2 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: srli a9, a8, 1 +; XTENSA-NEXT: l32r a10, .LCPI10_2 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: slli a8, a8, 1 +; XTENSA-NEXT: or a2, a9, a8 +; XTENSA-NEXT: ret + %tmp = call i16 @llvm.bitreverse.i16(i16 %a) + %tmp2 = call i16 @llvm.bswap.i16(i16 %tmp) + ret i16 %tmp2 +} + +define i32 @test_bitreverse_bswap_i32(i32 %a) nounwind { +; XTENSA-LABEL: test_bitreverse_bswap_i32: +; XTENSA: srli a8, a2, 4 +; XTENSA-NEXT: l32r a9, .LCPI11_0 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: and a9, a2, a9 +; XTENSA-NEXT: slli a9, a9, 4 +; XTENSA-NEXT: or a8, a8, a9 +; XTENSA-NEXT: srli a9, a8, 2 +; XTENSA-NEXT: l32r a10, .LCPI11_1 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: 
slli a8, a8, 2 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: srli a9, a8, 1 +; XTENSA-NEXT: l32r a10, .LCPI11_2 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: and a8, a8, a10 +; XTENSA-NEXT: slli a8, a8, 1 +; XTENSA-NEXT: or a2, a9, a8 +; XTENSA-NEXT: ret + %tmp = call i32 @llvm.bitreverse.i32(i32 %a) + %tmp2 = call i32 @llvm.bswap.i32(i32 %tmp) + ret i32 %tmp2 +} + +define i64 @test_bitreverse_bswap_i64(i64 %a) nounwind { +; XTENSA-LABEL: test_bitreverse_bswap_i64: +; XTENSA: srli a8, a2, 4 +; XTENSA-NEXT: l32r a9, .LCPI12_0 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: and a10, a2, a9 +; XTENSA-NEXT: slli a10, a10, 4 +; XTENSA-NEXT: or a8, a8, a10 +; XTENSA-NEXT: srli a10, a8, 2 +; XTENSA-NEXT: l32r a11, .LCPI12_1 +; XTENSA-NEXT: and a10, a10, a11 +; XTENSA-NEXT: and a8, a8, a11 +; XTENSA-NEXT: slli a8, a8, 2 +; XTENSA-NEXT: or a8, a10, a8 +; XTENSA-NEXT: srli a10, a8, 1 +; XTENSA-NEXT: l32r a7, .LCPI12_2 +; XTENSA-NEXT: and a10, a10, a7 +; XTENSA-NEXT: and a8, a8, a7 +; XTENSA-NEXT: slli a8, a8, 1 +; XTENSA-NEXT: or a2, a10, a8 +; XTENSA-NEXT: srli a8, a3, 4 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: and a9, a3, a9 +; XTENSA-NEXT: slli a9, a9, 4 +; XTENSA-NEXT: or a8, a8, a9 +; XTENSA-NEXT: srli a9, a8, 2 +; XTENSA-NEXT: and a9, a9, a11 +; XTENSA-NEXT: and a8, a8, a11 +; XTENSA-NEXT: slli a8, a8, 2 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: srli a9, a8, 1 +; XTENSA-NEXT: and a9, a9, a7 +; XTENSA-NEXT: and a8, a8, a7 +; XTENSA-NEXT: slli a8, a8, 1 +; XTENSA-NEXT: or a3, a9, a8 +; XTENSA-NEXT: ret + %tmp = call i64 @llvm.bitreverse.i64(i64 %a) + %tmp2 = call i64 @llvm.bswap.i64(i64 %tmp) + ret i64 %tmp2 +} diff --git a/llvm/test/CodeGen/Xtensa/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/Xtensa/ctlz-cttz-ctpop.ll new file mode 100644 index 00000000000000..f58bed19d4ee77 --- /dev/null +++ b/llvm/test/CodeGen/Xtensa/ctlz-cttz-ctpop.ll @@ -0,0 +1,176 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc 
-mtriple=xtensa -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=XTENSA %s + +declare i32 @llvm.cttz.i32(i32, i1) +declare i32 @llvm.ctlz.i32(i32, i1) +declare i32 @llvm.ctpop.i32(i32) + +define i32 @test_cttz_i32(i32 %a) nounwind { +; XTENSA-LABEL: test_cttz_i32: +; XTENSA: movi a8, 32 +; XTENSA-NEXT: beqz a2, .LBB0_2 +; XTENSA-NEXT: j .LBB0_1 +; XTENSA-NEXT: .LBB0_1: # %cond.false +; XTENSA-NEXT: movi a8, -1 +; XTENSA-NEXT: xor a8, a2, a8 +; XTENSA-NEXT: addi a9, a2, -1 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: srli a9, a8, 1 +; XTENSA-NEXT: l32r a10, .LCPI0_0 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: sub a8, a8, a9 +; XTENSA-NEXT: l32r a9, .LCPI0_1 +; XTENSA-NEXT: and a10, a8, a9 +; XTENSA-NEXT: srli a8, a8, 2 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: add a8, a10, a8 +; XTENSA-NEXT: srli a9, a8, 4 +; XTENSA-NEXT: add a8, a8, a9 +; XTENSA-NEXT: l32r a9, .LCPI0_2 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: slli a9, a8, 8 +; XTENSA-NEXT: add a8, a8, a9 +; XTENSA-NEXT: slli a9, a8, 16 +; XTENSA-NEXT: add a8, a8, a9 +; XTENSA-NEXT: extui a8, a8, 24, 8 +; XTENSA-NEXT: .LBB0_2: # %cond.end +; XTENSA-NEXT: or a2, a8, a8 +; XTENSA-NEXT: ret + %tmp = call i32 @llvm.cttz.i32(i32 %a, i1 false) + ret i32 %tmp +} + +define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind { +; XTENSA-LABEL: test_cttz_i32_zero_undef: +; XTENSA: movi a8, -1 +; XTENSA-NEXT: xor a8, a2, a8 +; XTENSA-NEXT: addi a9, a2, -1 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: srli a9, a8, 1 +; XTENSA-NEXT: l32r a10, .LCPI1_0 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: sub a8, a8, a9 +; XTENSA-NEXT: l32r a9, .LCPI1_1 +; XTENSA-NEXT: and a10, a8, a9 +; XTENSA-NEXT: srli a8, a8, 2 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: add a8, a10, a8 +; XTENSA-NEXT: srli a9, a8, 4 +; XTENSA-NEXT: add a8, a8, a9 +; XTENSA-NEXT: l32r a9, .LCPI1_2 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: slli a9, a8, 8 +; XTENSA-NEXT: add a8, a8, a9 +; XTENSA-NEXT: slli a9, a8, 16 +; XTENSA-NEXT: 
add a8, a8, a9 +; XTENSA-NEXT: extui a2, a8, 24, 8 +; XTENSA-NEXT: ret + %tmp = call i32 @llvm.cttz.i32(i32 %a, i1 true) + ret i32 %tmp +} + +define i32 @test_ctlz_i32(i32 %a) nounwind { +; XTENSA-LABEL: test_ctlz_i32: +; XTENSA: or a8, a2, a2 +; XTENSA-NEXT: movi a2, 32 +; XTENSA-NEXT: beqz a8, .LBB2_2 +; XTENSA-NEXT: j .LBB2_1 +; XTENSA-NEXT: .LBB2_1: # %cond.false +; XTENSA-NEXT: srli a9, a8, 1 +; XTENSA-NEXT: or a8, a8, a9 +; XTENSA-NEXT: srli a9, a8, 2 +; XTENSA-NEXT: or a8, a8, a9 +; XTENSA-NEXT: srli a9, a8, 4 +; XTENSA-NEXT: or a8, a8, a9 +; XTENSA-NEXT: srli a9, a8, 8 +; XTENSA-NEXT: or a8, a8, a9 +; XTENSA-NEXT: extui a9, a8, 16, 16 +; XTENSA-NEXT: or a8, a8, a9 +; XTENSA-NEXT: movi a9, -1 +; XTENSA-NEXT: xor a8, a8, a9 +; XTENSA-NEXT: srli a9, a8, 1 +; XTENSA-NEXT: l32r a10, .LCPI2_0 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: sub a8, a8, a9 +; XTENSA-NEXT: l32r a9, .LCPI2_1 +; XTENSA-NEXT: and a10, a8, a9 +; XTENSA-NEXT: srli a8, a8, 2 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: add a8, a10, a8 +; XTENSA-NEXT: srli a9, a8, 4 +; XTENSA-NEXT: add a8, a8, a9 +; XTENSA-NEXT: l32r a9, .LCPI2_2 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: slli a9, a8, 8 +; XTENSA-NEXT: add a8, a8, a9 +; XTENSA-NEXT: slli a9, a8, 16 +; XTENSA-NEXT: add a8, a8, a9 +; XTENSA-NEXT: extui a2, a8, 24, 8 +; XTENSA-NEXT: .LBB2_2: # %cond.end +; XTENSA-NEXT: ret + %tmp = call i32 @llvm.ctlz.i32(i32 %a, i1 false) + ret i32 %tmp +} + +define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { +; XTENSA-LABEL: test_ctlz_i32_zero_undef: +; XTENSA: srli a8, a2, 1 +; XTENSA-NEXT: or a8, a2, a8 +; XTENSA-NEXT: srli a9, a8, 2 +; XTENSA-NEXT: or a8, a8, a9 +; XTENSA-NEXT: srli a9, a8, 4 +; XTENSA-NEXT: or a8, a8, a9 +; XTENSA-NEXT: srli a9, a8, 8 +; XTENSA-NEXT: or a8, a8, a9 +; XTENSA-NEXT: extui a9, a8, 16, 16 +; XTENSA-NEXT: or a8, a8, a9 +; XTENSA-NEXT: movi a9, -1 +; XTENSA-NEXT: xor a8, a8, a9 +; XTENSA-NEXT: srli a9, a8, 1 +; XTENSA-NEXT: l32r a10, .LCPI3_0 +; XTENSA-NEXT: and a9, 
a9, a10 +; XTENSA-NEXT: sub a8, a8, a9 +; XTENSA-NEXT: l32r a9, .LCPI3_1 +; XTENSA-NEXT: and a10, a8, a9 +; XTENSA-NEXT: srli a8, a8, 2 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: add a8, a10, a8 +; XTENSA-NEXT: srli a9, a8, 4 +; XTENSA-NEXT: add a8, a8, a9 +; XTENSA-NEXT: l32r a9, .LCPI3_2 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: slli a9, a8, 8 +; XTENSA-NEXT: add a8, a8, a9 +; XTENSA-NEXT: slli a9, a8, 16 +; XTENSA-NEXT: add a8, a8, a9 +; XTENSA-NEXT: extui a2, a8, 24, 8 +; XTENSA-NEXT: ret + %tmp = call i32 @llvm.ctlz.i32(i32 %a, i1 true) + ret i32 %tmp +} + +define i32 @test_ctpop_i32(i32 %a) nounwind { +; XTENSA-LABEL: test_ctpop_i32: +; XTENSA: srli a8, a2, 1 +; XTENSA-NEXT: l32r a9, .LCPI4_0 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: sub a8, a2, a8 +; XTENSA-NEXT: l32r a9, .LCPI4_1 +; XTENSA-NEXT: and a10, a8, a9 +; XTENSA-NEXT: srli a8, a8, 2 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: add a8, a10, a8 +; XTENSA-NEXT: srli a9, a8, 4 +; XTENSA-NEXT: add a8, a8, a9 +; XTENSA-NEXT: l32r a9, .LCPI4_2 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: slli a9, a8, 8 +; XTENSA-NEXT: add a8, a8, a9 +; XTENSA-NEXT: slli a9, a8, 16 +; XTENSA-NEXT: add a8, a8, a9 +; XTENSA-NEXT: extui a2, a8, 24, 8 +; XTENSA-NEXT: ret + %1 = call i32 @llvm.ctpop.i32(i32 %a) + ret i32 %1 +} diff --git a/llvm/test/CodeGen/Xtensa/div.ll b/llvm/test/CodeGen/Xtensa/div.ll new file mode 100644 index 00000000000000..e10e976fb1b386 --- /dev/null +++ b/llvm/test/CodeGen/Xtensa/div.ll @@ -0,0 +1,486 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=xtensa -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=XTENSA %s + +define i32 @udiv(i32 %a, i32 %b) nounwind { +; XTENSA-LABEL: udiv: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a8, .LCPI0_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded 
Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = udiv i32 %a, %b + ret i32 %1 +} + +define i32 @udiv_constant(i32 %a) nounwind { +; XTENSA-LABEL: udiv_constant: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a3, 5 +; XTENSA-NEXT: l32r a8, .LCPI1_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = udiv i32 %a, 5 + ret i32 %1 +} + +define i32 @udiv_pow2(i32 %a) nounwind { +; XTENSA-LABEL: udiv_pow2: +; XTENSA: srli a2, a2, 3 +; XTENSA-NEXT: ret + %1 = udiv i32 %a, 8 + ret i32 %1 +} + +define i32 @udiv_constant_lhs(i32 %a) nounwind { +; XTENSA-LABEL: udiv_constant_lhs: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: or a3, a2, a2 +; XTENSA-NEXT: movi a2, 10 +; XTENSA-NEXT: l32r a8, .LCPI3_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = udiv i32 10, %a + ret i32 %1 +} + +define i64 @udiv64(i64 %a, i64 %b) nounwind { +; XTENSA-LABEL: udiv64: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a8, .LCPI4_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = udiv i64 %a, %b + ret i64 %1 +} + +define i64 @udiv64_constant(i64 %a) nounwind { +; XTENSA-LABEL: udiv64_constant: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a4, 5 +; XTENSA-NEXT: movi a5, 0 +; XTENSA-NEXT: l32r a8, .LCPI5_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte 
Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = udiv i64 %a, 5 + ret i64 %1 +} + +define i64 @udiv64_constant_lhs(i64 %a) nounwind { +; XTENSA-LABEL: udiv64_constant_lhs: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: or a5, a3, a3 +; XTENSA-NEXT: or a4, a2, a2 +; XTENSA-NEXT: movi a2, 10 +; XTENSA-NEXT: movi a3, 0 +; XTENSA-NEXT: l32r a8, .LCPI6_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = udiv i64 10, %a + ret i64 %1 +} + +define i8 @udiv8(i8 %a, i8 %b) nounwind { +; XTENSA-LABEL: udiv8: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a8, 255 +; XTENSA-NEXT: and a2, a2, a8 +; XTENSA-NEXT: and a3, a3, a8 +; XTENSA-NEXT: l32r a8, .LCPI7_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = udiv i8 %a, %b + ret i8 %1 +} + +define i8 @udiv8_constant(i8 %a) nounwind { +; XTENSA-LABEL: udiv8_constant: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a8, 255 +; XTENSA-NEXT: and a2, a2, a8 +; XTENSA-NEXT: movi a3, 5 +; XTENSA-NEXT: l32r a8, .LCPI8_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = udiv i8 %a, 5 + ret i8 %1 +} + +define i8 @udiv8_pow2(i8 %a) nounwind { +; XTENSA-LABEL: udiv8_pow2: +; XTENSA: movi a8, 248 +; XTENSA-NEXT: and a8, a2, a8 +; XTENSA-NEXT: srli a2, a8, 3 +; XTENSA-NEXT: ret + %1 = udiv i8 %a, 8 + ret i8 %1 +} + +define i8 @udiv8_constant_lhs(i8 %a) nounwind { +; XTENSA-LABEL: 
udiv8_constant_lhs: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a8, 255 +; XTENSA-NEXT: and a3, a2, a8 +; XTENSA-NEXT: movi a2, 10 +; XTENSA-NEXT: l32r a8, .LCPI10_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = udiv i8 10, %a + ret i8 %1 +} + +define i16 @udiv16(i16 %a, i16 %b) nounwind { +; XTENSA-LABEL: udiv16: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a8, .LCPI11_0 +; XTENSA-NEXT: and a2, a2, a8 +; XTENSA-NEXT: and a3, a3, a8 +; XTENSA-NEXT: l32r a8, .LCPI11_1 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = udiv i16 %a, %b + ret i16 %1 +} + +define i16 @udiv16_constant(i16 %a) nounwind { +; XTENSA-LABEL: udiv16_constant: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a8, .LCPI12_0 +; XTENSA-NEXT: and a2, a2, a8 +; XTENSA-NEXT: movi a3, 5 +; XTENSA-NEXT: l32r a8, .LCPI12_1 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = udiv i16 %a, 5 + ret i16 %1 +} + +define i16 @udiv16_pow2(i16 %a) nounwind { +; XTENSA-LABEL: udiv16_pow2: +; XTENSA: l32r a8, .LCPI13_0 +; XTENSA-NEXT: and a8, a2, a8 +; XTENSA-NEXT: srli a2, a8, 3 +; XTENSA-NEXT: ret + %1 = udiv i16 %a, 8 + ret i16 %1 +} + +define i32 @sdiv(i32 %a, i32 %b) nounwind { +; XTENSA-LABEL: sdiv: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a8, .LCPI14_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 
4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = sdiv i32 %a, %b + ret i32 %1 +} + +define i32 @sdiv_constant_lhs(i32 %a) nounwind { +; XTENSA-LABEL: sdiv_constant_lhs: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: or a3, a2, a2 +; XTENSA-NEXT: movi a2, -10 +; XTENSA-NEXT: l32r a8, .LCPI15_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = sdiv i32 -10, %a + ret i32 %1 +} + +define i64 @sdiv64(i64 %a, i64 %b) nounwind { +; XTENSA-LABEL: sdiv64: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a8, .LCPI16_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = sdiv i64 %a, %b + ret i64 %1 +} + +define i64 @sdiv64_constant(i64 %a) nounwind { +; XTENSA-LABEL: sdiv64_constant: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a4, 5 +; XTENSA-NEXT: movi a5, 0 +; XTENSA-NEXT: l32r a8, .LCPI17_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = sdiv i64 %a, 5 + ret i64 %1 +} + +define i64 @sdiv64_constant_lhs(i64 %a) nounwind { +; XTENSA-LABEL: sdiv64_constant_lhs: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: or a5, a3, a3 +; XTENSA-NEXT: or a4, a2, a2 +; XTENSA-NEXT: movi a2, 10 +; XTENSA-NEXT: movi a3, 0 +; XTENSA-NEXT: l32r a8, .LCPI18_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, 
a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = sdiv i64 10, %a + ret i64 %1 +} + + +define i64 @sdiv64_sext_operands(i32 %a, i32 %b) nounwind { +; XTENSA-LABEL: sdiv64_sext_operands: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: or a4, a3, a3 +; XTENSA-NEXT: srai a3, a2, 31 +; XTENSA-NEXT: srai a5, a4, 31 +; XTENSA-NEXT: l32r a8, .LCPI19_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = sext i32 %a to i64 + %2 = sext i32 %b to i64 + %3 = sdiv i64 %1, %2 + ret i64 %3 +} + +define i8 @sdiv8(i8 %a, i8 %b) nounwind { +; XTENSA-LABEL: sdiv8: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: slli a8, a2, 24 +; XTENSA-NEXT: srai a2, a8, 24 +; XTENSA-NEXT: slli a8, a3, 24 +; XTENSA-NEXT: srai a3, a8, 24 +; XTENSA-NEXT: l32r a8, .LCPI20_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = sdiv i8 %a, %b + ret i8 %1 +} + +define i8 @sdiv8_constant(i8 %a) nounwind { +; XTENSA-LABEL: sdiv8_constant: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: slli a8, a2, 24 +; XTENSA-NEXT: srai a2, a8, 24 +; XTENSA-NEXT: movi a3, 5 +; XTENSA-NEXT: l32r a8, .LCPI21_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = sdiv i8 %a, 5 + ret i8 %1 +} + +define i8 @sdiv8_pow2(i8 %a) nounwind { +; XTENSA-LABEL: sdiv8_pow2: +; XTENSA: slli a8, a2, 24 +; XTENSA-NEXT: srai a8, a8, 24 +; XTENSA-NEXT: srli a8, a8, 12 +; XTENSA-NEXT: movi a9, 7 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: add a8, a2, 
a8 +; XTENSA-NEXT: slli a8, a8, 24 +; XTENSA-NEXT: srai a2, a8, 27 +; XTENSA-NEXT: ret + %1 = sdiv i8 %a, 8 + ret i8 %1 +} + +define i8 @sdiv8_constant_lhs(i8 %a) nounwind { +; XTENSA-LABEL: sdiv8_constant_lhs: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: slli a8, a2, 24 +; XTENSA-NEXT: srai a3, a8, 24 +; XTENSA-NEXT: movi a2, -10 +; XTENSA-NEXT: l32r a8, .LCPI23_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = sdiv i8 -10, %a + ret i8 %1 +} + +define i16 @sdiv16(i16 %a, i16 %b) nounwind { +; XTENSA-LABEL: sdiv16: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: slli a8, a2, 16 +; XTENSA-NEXT: srai a2, a8, 16 +; XTENSA-NEXT: slli a8, a3, 16 +; XTENSA-NEXT: srai a3, a8, 16 +; XTENSA-NEXT: l32r a8, .LCPI24_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = sdiv i16 %a, %b + ret i16 %1 +} + +define i16 @sdiv16_constant(i16 %a) nounwind { +; XTENSA-LABEL: sdiv16_constant: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: slli a8, a2, 16 +; XTENSA-NEXT: srai a2, a8, 16 +; XTENSA-NEXT: movi a3, 5 +; XTENSA-NEXT: l32r a8, .LCPI25_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = sdiv i16 %a, 5 + ret i16 %1 +} + +define i16 @sdiv16_constant_lhs(i16 %a) nounwind { +; XTENSA-LABEL: sdiv16_constant_lhs: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: slli a8, a2, 16 +; XTENSA-NEXT: srai a3, a8, 16 +; 
XTENSA-NEXT: movi a2, -10 +; XTENSA-NEXT: l32r a8, .LCPI26_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = sdiv i16 -10, %a + ret i16 %1 +} + +define i32 @sdiv_pow2(i32 %a) nounwind { +; XTENSA-LABEL: sdiv_pow2: +; XTENSA: srai a8, a2, 31 +; XTENSA-NEXT: extui a8, a8, 29, 3 +; XTENSA-NEXT: add a8, a2, a8 +; XTENSA-NEXT: srai a2, a8, 3 +; XTENSA-NEXT: ret + %1 = sdiv i32 %a, 8 + ret i32 %1 +} + +define i32 @sdiv_pow2_2(i32 %a) nounwind { +; XTENSA-LABEL: sdiv_pow2_2: +; XTENSA: srai a8, a2, 31 +; XTENSA-NEXT: extui a8, a8, 16, 16 +; XTENSA-NEXT: add a8, a2, a8 +; XTENSA-NEXT: srai a2, a8, 16 +; XTENSA-NEXT: ret + %1 = sdiv i32 %a, 65536 + ret i32 %1 +} + +define i16 @sdiv16_pow2(i16 %a) nounwind { +; XTENSA-LABEL: sdiv16_pow2: +; XTENSA: slli a8, a2, 16 +; XTENSA-NEXT: srai a8, a8, 16 +; XTENSA-NEXT: extui a8, a8, 28, 4 +; XTENSA-NEXT: movi a9, 7 +; XTENSA-NEXT: and a8, a8, a9 +; XTENSA-NEXT: add a8, a2, a8 +; XTENSA-NEXT: slli a8, a8, 16 +; XTENSA-NEXT: srai a2, a8, 19 +; XTENSA-NEXT: ret + %1 = sdiv i16 %a, 8 + ret i16 %1 +} diff --git a/llvm/test/CodeGen/Xtensa/mul.ll b/llvm/test/CodeGen/Xtensa/mul.ll new file mode 100644 index 00000000000000..9b13897293dc1b --- /dev/null +++ b/llvm/test/CodeGen/Xtensa/mul.ll @@ -0,0 +1,672 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=xtensa -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=XTENSA %s + +define signext i32 @square(i32 %a) nounwind { +; XTENSA-LABEL: square: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a8, .LCPI0_0 +; XTENSA-NEXT: or a3, a2, a2 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i32 
%a, %a + ret i32 %1 +} + +define signext i32 @mul(i32 %a, i32 %b) nounwind { +; XTENSA-LABEL: mul: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a8, .LCPI1_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i32 %a, %b + ret i32 %1 +} + +define signext i32 @mul_constant(i32 %a) nounwind { +; XTENSA-LABEL: mul_constant: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a3, 5 +; XTENSA-NEXT: l32r a8, .LCPI2_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i32 %a, 5 + ret i32 %1 +} + +define i32 @mul_pow2(i32 %a) nounwind { +; XTENSA-LABEL: mul_pow2: +; XTENSA: slli a2, a2, 3 +; XTENSA-NEXT: ret + %1 = mul i32 %a, 8 + ret i32 %1 +} + +define i64 @mul64(i64 %a, i64 %b) nounwind { +; XTENSA-LABEL: mul64: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a8, .LCPI4_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i64 %a, %b + ret i64 %1 +} + +define i64 @mul64_constant(i64 %a) nounwind { +; XTENSA-LABEL: mul64_constant: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a4, 5 +; XTENSA-NEXT: movi a5, 0 +; XTENSA-NEXT: l32r a8, .LCPI5_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i64 %a, 5 + ret i64 %1 +} + +define i32 @mulhs(i32 %a, i32 %b) nounwind { 
+; XTENSA-LABEL: mulhs: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: or a4, a3, a3 +; XTENSA-NEXT: srai a3, a2, 31 +; XTENSA-NEXT: srai a5, a4, 31 +; XTENSA-NEXT: l32r a8, .LCPI6_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: or a2, a3, a3 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = sext i32 %a to i64 + %2 = sext i32 %b to i64 + %3 = mul i64 %1, %2 + %4 = lshr i64 %3, 32 + %5 = trunc i64 %4 to i32 + ret i32 %5 +} + +define i32 @mulhs_positive_constant(i32 %a) nounwind { +; XTENSA-LABEL: mulhs_positive_constant: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: srai a3, a2, 31 +; XTENSA-NEXT: movi a4, 5 +; XTENSA-NEXT: movi a5, 0 +; XTENSA-NEXT: l32r a8, .LCPI7_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: or a2, a3, a3 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = sext i32 %a to i64 + %2 = mul i64 %1, 5 + %3 = lshr i64 %2, 32 + %4 = trunc i64 %3 to i32 + ret i32 %4 +} + +define i32 @mulhs_negative_constant(i32 %a) nounwind { +; XTENSA-LABEL: mulhs_negative_constant: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: srai a3, a2, 31 +; XTENSA-NEXT: movi a4, -5 +; XTENSA-NEXT: movi a5, -1 +; XTENSA-NEXT: l32r a8, .LCPI8_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: or a2, a3, a3 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = sext i32 %a to i64 + %2 = mul i64 %1, -5 + %3 = lshr i64 %2, 32 + %4 = trunc i64 %3 to i32 + ret i32 %4 +} + +define zeroext i32 @mulhu(i32 zeroext %a, i32 zeroext %b) nounwind { +; XTENSA-LABEL: mulhu: +; XTENSA: addi a8, a1, -16 +; 
XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: or a4, a3, a3 +; XTENSA-NEXT: movi a3, 0 +; XTENSA-NEXT: l32r a8, .LCPI9_0 +; XTENSA-NEXT: or a5, a3, a3 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: or a2, a3, a3 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = zext i32 %a to i64 + %2 = zext i32 %b to i64 + %3 = mul i64 %1, %2 + %4 = lshr i64 %3, 32 + %5 = trunc i64 %4 to i32 + ret i32 %5 +} + +define i32 @mulhsu(i32 %a, i32 %b) nounwind { +; XTENSA-LABEL: mulhsu: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: or a4, a3, a3 +; XTENSA-NEXT: srai a5, a4, 31 +; XTENSA-NEXT: movi a3, 0 +; XTENSA-NEXT: l32r a8, .LCPI10_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: or a2, a3, a3 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = zext i32 %a to i64 + %2 = sext i32 %b to i64 + %3 = mul i64 %1, %2 + %4 = lshr i64 %3, 32 + %5 = trunc i64 %4 to i32 + ret i32 %5 +} + +define i32 @mulhu_constant(i32 %a) nounwind { +; XTENSA-LABEL: mulhu_constant: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a4, 5 +; XTENSA-NEXT: movi a3, 0 +; XTENSA-NEXT: l32r a8, .LCPI11_0 +; XTENSA-NEXT: or a5, a3, a3 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: or a2, a3, a3 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = zext i32 %a to i64 + %2 = mul i64 %1, 5 + %3 = lshr i64 %2, 32 + %4 = trunc i64 %3 to i32 + ret i32 %4 +} + +define i32 @muli32_p65(i32 %a) nounwind { +; XTENSA-LABEL: muli32_p65: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi 
a3, 65 +; XTENSA-NEXT: l32r a8, .LCPI12_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i32 %a, 65 + ret i32 %1 +} + +define i32 @muli32_p63(i32 %a) nounwind { +; XTENSA-LABEL: muli32_p63: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a3, 63 +; XTENSA-NEXT: l32r a8, .LCPI13_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i32 %a, 63 + ret i32 %1 +} + +define i64 @muli64_p65(i64 %a) nounwind { +; XTENSA-LABEL: muli64_p65: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a4, 65 +; XTENSA-NEXT: movi a5, 0 +; XTENSA-NEXT: l32r a8, .LCPI14_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i64 %a, 65 + ret i64 %1 +} + +define i64 @muli64_p63(i64 %a) nounwind { +; XTENSA-LABEL: muli64_p63: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a4, 63 +; XTENSA-NEXT: movi a5, 0 +; XTENSA-NEXT: l32r a8, .LCPI15_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i64 %a, 63 + ret i64 %1 +} + +define i32 @muli32_m63(i32 %a) nounwind { +; XTENSA-LABEL: muli32_m63: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a3, -63 +; XTENSA-NEXT: l32r a8, .LCPI16_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 
16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i32 %a, -63 + ret i32 %1 +} + +define i32 @muli32_m65(i32 %a) nounwind { +; XTENSA-LABEL: muli32_m65: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a3, -65 +; XTENSA-NEXT: l32r a8, .LCPI17_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i32 %a, -65 + ret i32 %1 +} + +define i64 @muli64_m63(i64 %a) nounwind { +; XTENSA-LABEL: muli64_m63: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a4, -63 +; XTENSA-NEXT: movi a5, -1 +; XTENSA-NEXT: l32r a8, .LCPI18_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i64 %a, -63 + ret i64 %1 +} + +define i64 @muli64_m65(i64 %a) nounwind { +; XTENSA-LABEL: muli64_m65: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a4, -65 +; XTENSA-NEXT: movi a5, -1 +; XTENSA-NEXT: l32r a8, .LCPI19_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i64 %a, -65 + ret i64 %1 +} + +define i32 @muli32_p384(i32 %a) nounwind { +; XTENSA-LABEL: muli32_p384: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: movi a3, 384 +; XTENSA-NEXT: l32r a8, .LCPI20_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i32 %a, 384 + ret i32 %1 +} + +define i32 @muli32_p12288(i32 %a) nounwind { 
+; XTENSA-LABEL: muli32_p12288: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a3, .LCPI21_0 +; XTENSA-NEXT: l32r a8, .LCPI21_1 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i32 %a, 12288 + ret i32 %1 +} + +define i32 @muli32_p4352(i32 %a) nounwind { +; XTENSA-LABEL: muli32_p4352: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a3, .LCPI22_0 +; XTENSA-NEXT: l32r a8, .LCPI22_1 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i32 %a, 4352 + ret i32 %1 +} + +define i32 @muli32_p3840(i32 %a) nounwind { +; XTENSA-LABEL: muli32_p3840: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a3, .LCPI23_0 +; XTENSA-NEXT: l32r a8, .LCPI23_1 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i32 %a, 3840 + ret i32 %1 +} + +define i32 @muli32_m3840(i32 %a) nounwind { +; XTENSA-LABEL: muli32_m3840: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a3, .LCPI24_0 +; XTENSA-NEXT: l32r a8, .LCPI24_1 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i32 %a, -3840 + ret i32 %1 +} + +define i32 @muli32_m4352(i32 %a) nounwind { +; XTENSA-LABEL: muli32_m4352: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; 
XTENSA-NEXT: l32r a3, .LCPI25_0 +; XTENSA-NEXT: l32r a8, .LCPI25_1 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i32 %a, -4352 + ret i32 %1 +} + +define i64 @muli64_p4352(i64 %a) nounwind { +; XTENSA-LABEL: muli64_p4352: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a4, .LCPI26_0 +; XTENSA-NEXT: movi a5, 0 +; XTENSA-NEXT: l32r a8, .LCPI26_1 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i64 %a, 4352 + ret i64 %1 +} + +define i64 @muli64_p3840(i64 %a) nounwind { +; XTENSA-LABEL: muli64_p3840: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a4, .LCPI27_0 +; XTENSA-NEXT: movi a5, 0 +; XTENSA-NEXT: l32r a8, .LCPI27_1 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i64 %a, 3840 + ret i64 %1 +} + +define i64 @muli64_m4352(i64 %a) nounwind { +; XTENSA-LABEL: muli64_m4352: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a4, .LCPI28_0 +; XTENSA-NEXT: movi a5, -1 +; XTENSA-NEXT: l32r a8, .LCPI28_1 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i64 %a, -4352 + ret i64 %1 +} + +define i64 @muli64_m3840(i64 %a) nounwind { +; XTENSA-LABEL: muli64_m3840: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 0 # 4-byte Folded Spill +; XTENSA-NEXT: l32r a4, .LCPI29_0 +; XTENSA-NEXT: movi a5, -1 +; 
XTENSA-NEXT: l32r a8, .LCPI29_1 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 0 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i64 %a, -3840 + ret i64 %1 +} + +define i128 @muli128_m3840(i128 %a) nounwind { +; XTENSA-LABEL: muli128_m3840: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 8 # 4-byte Folded Spill +; XTENSA-NEXT: movi a7, -1 +; XTENSA-NEXT: s32i a7, a1, 4 +; XTENSA-NEXT: s32i a7, a1, 0 +; XTENSA-NEXT: l32r a6, .LCPI30_0 +; XTENSA-NEXT: l32r a8, .LCPI30_1 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 8 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i128 %a, -3840 + ret i128 %1 +} + +define i128 @muli128_m63(i128 %a) nounwind { +; XTENSA-LABEL: muli128_m63: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 8 # 4-byte Folded Spill +; XTENSA-NEXT: movi a7, -1 +; XTENSA-NEXT: s32i a7, a1, 4 +; XTENSA-NEXT: s32i a7, a1, 0 +; XTENSA-NEXT: movi a6, -63 +; XTENSA-NEXT: l32r a8, .LCPI31_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l32i a0, a1, 8 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %1 = mul i128 %a, -63 + ret i128 %1 +} + +define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind { +; XTENSA-LABEL: mulhsu_i64: +; XTENSA: addi a8, a1, -16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: s32i a0, a1, 8 # 4-byte Folded Spill +; XTENSA-NEXT: or a7, a5, a5 +; XTENSA-NEXT: or a6, a4, a4 +; XTENSA-NEXT: srai a8, a7, 31 +; XTENSA-NEXT: s32i a8, a1, 4 +; XTENSA-NEXT: s32i a8, a1, 0 +; XTENSA-NEXT: movi a4, 0 +; XTENSA-NEXT: l32r a8, .LCPI32_0 +; XTENSA-NEXT: or a5, a4, a4 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: or a2, a4, a4 +; XTENSA-NEXT: or a3, a5, a5 +; XTENSA-NEXT: l32i a0, a1, 8 # 4-byte Folded Reload +; XTENSA-NEXT: addi a8, a1, 16 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret 
+ %1 = zext i64 %a to i128 + %2 = sext i64 %b to i128 + %3 = mul i128 %1, %2 + %4 = lshr i128 %3, 64 + %5 = trunc i128 %4 to i64 + ret i64 %5 +} + +define i8 @muladd_demand(i8 %x, i8 %y) nounwind { +; XTENSA-LABEL: muladd_demand: +; XTENSA: slli a8, a2, 1 +; XTENSA-NEXT: sub a8, a3, a8 +; XTENSA-NEXT: movi a9, 15 +; XTENSA-NEXT: and a2, a8, a9 +; XTENSA-NEXT: ret + %m = mul i8 %x, 14 + %a = add i8 %y, %m + %r = and i8 %a, 15 + ret i8 %r +} + +define i8 @mulsub_demand(i8 %x, i8 %y) nounwind { +; XTENSA-LABEL: mulsub_demand: +; XTENSA: addx2 a8, a2, a3 +; XTENSA-NEXT: movi a9, 15 +; XTENSA-NEXT: and a2, a8, a9 +; XTENSA-NEXT: ret + %m = mul i8 %x, 14 + %a = sub i8 %y, %m + %r = and i8 %a, 15 + ret i8 %r +} + +define i8 @muladd_demand_2(i8 %x, i8 %y) nounwind { +; XTENSA-LABEL: muladd_demand_2: +; XTENSA: slli a8, a2, 1 +; XTENSA-NEXT: sub a8, a3, a8 +; XTENSA-NEXT: movi a9, -16 +; XTENSA-NEXT: or a2, a8, a9 +; XTENSA-NEXT: ret + %m = mul i8 %x, 14 + %a = add i8 %y, %m + %r = or i8 %a, 240 + ret i8 %r +} + +define i8 @mulsub_demand_2(i8 %x, i8 %y) nounwind { +; XTENSA-LABEL: mulsub_demand_2: +; XTENSA: addx2 a8, a2, a3 +; XTENSA-NEXT: movi a9, -16 +; XTENSA-NEXT: or a2, a8, a9 +; XTENSA-NEXT: ret + %m = mul i8 %x, 14 + %a = sub i8 %y, %m + %r = or i8 %a, 240 + ret i8 %r +} + +define signext i32 @mul_imm_2(i32 %a) nounwind { +; XTENSA-LABEL: mul_imm_2: +; XTENSA: slli a2, a2, 1 +; XTENSA-NEXT: ret + %1 = mul i32 %a, 2 + ret i32 %1 +} + +define signext i32 @mul_imm_1024(i32 %a) nounwind { +; XTENSA-LABEL: mul_imm_1024: +; XTENSA: slli a2, a2, 10 +; XTENSA-NEXT: ret + %1 = mul i32 %a, 1024 + ret i32 %1 +} + +define signext i32 @mul_imm_16384(i32 %a) nounwind { +; XTENSA-LABEL: mul_imm_16384: +; XTENSA: slli a2, a2, 14 +; XTENSA-NEXT: ret + %1 = mul i32 %a, 16384 + ret i32 %1 +} + +define <4 x i32> @mul_vec_splat_constant(<4 x i32> %a) { +; XTENSA-LABEL: mul_vec_splat_constant: +; XTENSA: slli a2, a2, 2 +; XTENSA-NEXT: slli a3, a3, 2 +; XTENSA-NEXT: slli a4, a4, 2 +; 
XTENSA-NEXT: slli a5, a5, 2 +; XTENSA-NEXT: ret + %mul = mul <4 x i32> %a, + ret <4 x i32> %mul +} diff --git a/llvm/test/CodeGen/Xtensa/rotl-rotr.ll b/llvm/test/CodeGen/Xtensa/rotl-rotr.ll new file mode 100644 index 00000000000000..350315e9aefdae --- /dev/null +++ b/llvm/test/CodeGen/Xtensa/rotl-rotr.ll @@ -0,0 +1,501 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=xtensa -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=XTENSA %s + +define i32 @rotl_32(i32 %x, i32 %y) nounwind { +; XTENSA-LABEL: rotl_32: +; XTENSA: ssl a3 +; XTENSA-NEXT: sll a8, a2 +; XTENSA-NEXT: movi a9, 32 +; XTENSA-NEXT: sub a9, a9, a3 +; XTENSA-NEXT: ssr a9 +; XTENSA-NEXT: srl a9, a2 +; XTENSA-NEXT: or a2, a8, a9 +; XTENSA-NEXT: ret + %z = sub i32 32, %y + %b = shl i32 %x, %y + %c = lshr i32 %x, %z + %d = or i32 %b, %c + ret i32 %d +} + +define i32 @rotr_32(i32 %x, i32 %y) nounwind { +; XTENSA-LABEL: rotr_32: +; XTENSA: ssr a3 +; XTENSA-NEXT: srl a8, a2 +; XTENSA-NEXT: movi a9, 32 +; XTENSA-NEXT: sub a9, a9, a3 +; XTENSA-NEXT: ssl a9 +; XTENSA-NEXT: sll a9, a2 +; XTENSA-NEXT: or a2, a8, a9 +; XTENSA-NEXT: ret + %z = sub i32 32, %y + %b = lshr i32 %x, %y + %c = shl i32 %x, %z + %d = or i32 %b, %c + ret i32 %d +} + +define i64 @rotl_64(i64 %x, i64 %y) nounwind { +; XTENSA-LABEL: rotl_64: +; XTENSA: movi a8, 64 +; XTENSA-NEXT: sub a8, a8, a4 +; XTENSA-NEXT: ssr a8 +; XTENSA-NEXT: src a11, a3, a2 +; XTENSA-NEXT: movi a9, 32 +; XTENSA-NEXT: sub a9, a9, a4 +; XTENSA-NEXT: ssr a9 +; XTENSA-NEXT: srl a7, a3 +; XTENSA-NEXT: movi a10, 0 +; XTENSA-NEXT: blt a9, a10, .LBB2_2 +; XTENSA-NEXT: # %bb.1: +; XTENSA-NEXT: or a11, a7, a7 +; XTENSA-NEXT: .LBB2_2: +; XTENSA-NEXT: ssl a4 +; XTENSA-NEXT: sll a7, a2 +; XTENSA-NEXT: addi a5, a4, -32 +; XTENSA-NEXT: blt a5, a10, .LBB2_4 +; XTENSA-NEXT: # %bb.3: +; XTENSA-NEXT: or a7, a10, a10 +; XTENSA-NEXT: .LBB2_4: +; XTENSA-NEXT: ssl a4 +; XTENSA-NEXT: src a6, a3, a2 +; 
XTENSA-NEXT: ssl a5 +; XTENSA-NEXT: sll a4, a2 +; XTENSA-NEXT: blt a5, a10, .LBB2_6 +; XTENSA-NEXT: # %bb.5: +; XTENSA-NEXT: or a6, a4, a4 +; XTENSA-NEXT: .LBB2_6: +; XTENSA-NEXT: or a2, a7, a11 +; XTENSA-NEXT: ssr a8 +; XTENSA-NEXT: srl a8, a3 +; XTENSA-NEXT: blt a9, a10, .LBB2_8 +; XTENSA-NEXT: # %bb.7: +; XTENSA-NEXT: or a8, a10, a10 +; XTENSA-NEXT: .LBB2_8: +; XTENSA-NEXT: or a3, a6, a8 +; XTENSA-NEXT: ret + %z = sub i64 64, %y + %b = shl i64 %x, %y + %c = lshr i64 %x, %z + %d = or i64 %b, %c + ret i64 %d +} + +define i64 @rotr_64(i64 %x, i64 %y) nounwind { +; XTENSA-LABEL: rotr_64: +; XTENSA: ssr a4 +; XTENSA-NEXT: src a10, a3, a2 +; XTENSA-NEXT: addi a8, a4, -32 +; XTENSA-NEXT: ssr a8 +; XTENSA-NEXT: srl a11, a3 +; XTENSA-NEXT: movi a9, 0 +; XTENSA-NEXT: blt a8, a9, .LBB3_2 +; XTENSA-NEXT: # %bb.1: +; XTENSA-NEXT: or a10, a11, a11 +; XTENSA-NEXT: .LBB3_2: +; XTENSA-NEXT: movi a11, 32 +; XTENSA-NEXT: sub a7, a11, a4 +; XTENSA-NEXT: movi a11, 64 +; XTENSA-NEXT: sub a11, a11, a4 +; XTENSA-NEXT: ssl a11 +; XTENSA-NEXT: sll a6, a2 +; XTENSA-NEXT: blt a7, a9, .LBB3_4 +; XTENSA-NEXT: # %bb.3: +; XTENSA-NEXT: or a6, a9, a9 +; XTENSA-NEXT: .LBB3_4: +; XTENSA-NEXT: ssl a11 +; XTENSA-NEXT: src a11, a3, a2 +; XTENSA-NEXT: ssl a7 +; XTENSA-NEXT: sll a5, a2 +; XTENSA-NEXT: blt a7, a9, .LBB3_6 +; XTENSA-NEXT: # %bb.5: +; XTENSA-NEXT: or a11, a5, a5 +; XTENSA-NEXT: .LBB3_6: +; XTENSA-NEXT: or a2, a10, a6 +; XTENSA-NEXT: ssr a4 +; XTENSA-NEXT: srl a10, a3 +; XTENSA-NEXT: blt a8, a9, .LBB3_8 +; XTENSA-NEXT: # %bb.7: +; XTENSA-NEXT: or a10, a9, a9 +; XTENSA-NEXT: .LBB3_8: +; XTENSA-NEXT: or a3, a10, a11 +; XTENSA-NEXT: ret + %z = sub i64 64, %y + %b = lshr i64 %x, %y + %c = shl i64 %x, %z + %d = or i64 %b, %c + ret i64 %d +} + +define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind { +; XTENSA-LABEL: rotl_32_mask: +; XTENSA: ssl a3 +; XTENSA-NEXT: sll a8, a2 +; XTENSA-NEXT: neg a9, a3 +; XTENSA-NEXT: movi a10, 31 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: ssr a9 +; XTENSA-NEXT: 
srl a9, a2 +; XTENSA-NEXT: or a2, a8, a9 +; XTENSA-NEXT: ret + %z = sub i32 0, %y + %and = and i32 %z, 31 + %b = shl i32 %x, %y + %c = lshr i32 %x, %and + %d = or i32 %b, %c + ret i32 %d +} + +define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { +; XTENSA-LABEL: rotl_32_mask_and_63_and_31: +; XTENSA: movi a8, 63 +; XTENSA-NEXT: and a8, a3, a8 +; XTENSA-NEXT: ssl a8 +; XTENSA-NEXT: sll a8, a2 +; XTENSA-NEXT: neg a9, a3 +; XTENSA-NEXT: movi a10, 31 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: ssr a9 +; XTENSA-NEXT: srl a9, a2 +; XTENSA-NEXT: or a2, a8, a9 +; XTENSA-NEXT: ret + %a = and i32 %y, 63 + %b = shl i32 %x, %a + %c = sub i32 0, %y + %d = and i32 %c, 31 + %e = lshr i32 %x, %d + %f = or i32 %b, %e + ret i32 %f +} + +define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind { +; XTENSA-LABEL: rotr_32_mask: +; XTENSA: ssr a3 +; XTENSA-NEXT: srl a8, a2 +; XTENSA-NEXT: neg a9, a3 +; XTENSA-NEXT: movi a10, 31 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: ssl a9 +; XTENSA-NEXT: sll a9, a2 +; XTENSA-NEXT: or a2, a8, a9 +; XTENSA-NEXT: ret + %z = sub i32 0, %y + %and = and i32 %z, 31 + %b = lshr i32 %x, %y + %c = shl i32 %x, %and + %d = or i32 %b, %c + ret i32 %d +} + +define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { +; XTENSA-LABEL: rotr_32_mask_and_63_and_31: +; XTENSA: movi a8, 63 +; XTENSA-NEXT: and a8, a3, a8 +; XTENSA-NEXT: ssr a8 +; XTENSA-NEXT: srl a8, a2 +; XTENSA-NEXT: neg a9, a3 +; XTENSA-NEXT: movi a10, 31 +; XTENSA-NEXT: and a9, a9, a10 +; XTENSA-NEXT: ssl a9 +; XTENSA-NEXT: sll a9, a2 +; XTENSA-NEXT: or a2, a8, a9 +; XTENSA-NEXT: ret + %a = and i32 %y, 63 + %b = lshr i32 %x, %a + %c = sub i32 0, %y + %d = and i32 %c, 31 + %e = shl i32 %x, %d + %f = or i32 %b, %e + ret i32 %f +} + +define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { +; XTENSA-LABEL: rotl_64_mask: +; XTENSA: ssl a4 +; XTENSA-NEXT: src a10, a3, a2 +; XTENSA-NEXT: addi a8, a4, -32 +; XTENSA-NEXT: ssl a8 +; XTENSA-NEXT: sll a11, a2 +; XTENSA-NEXT: movi a9, 0 +; 
XTENSA-NEXT: blt a8, a9, .LBB8_2 +; XTENSA-NEXT: # %bb.1: +; XTENSA-NEXT: or a10, a11, a11 +; XTENSA-NEXT: .LBB8_2: +; XTENSA-NEXT: neg a11, a4 +; XTENSA-NEXT: movi a7, 63 +; XTENSA-NEXT: and a7, a11, a7 +; XTENSA-NEXT: ssr a7 +; XTENSA-NEXT: srl a11, a3 +; XTENSA-NEXT: addi a6, a7, -32 +; XTENSA-NEXT: blt a6, a9, .LBB8_4 +; XTENSA-NEXT: # %bb.3: +; XTENSA-NEXT: or a11, a9, a9 +; XTENSA-NEXT: .LBB8_4: +; XTENSA-NEXT: ssr a7 +; XTENSA-NEXT: src a7, a3, a2 +; XTENSA-NEXT: ssr a6 +; XTENSA-NEXT: srl a5, a3 +; XTENSA-NEXT: blt a6, a9, .LBB8_6 +; XTENSA-NEXT: # %bb.5: +; XTENSA-NEXT: or a7, a5, a5 +; XTENSA-NEXT: .LBB8_6: +; XTENSA-NEXT: or a3, a10, a11 +; XTENSA-NEXT: ssl a4 +; XTENSA-NEXT: sll a10, a2 +; XTENSA-NEXT: blt a8, a9, .LBB8_8 +; XTENSA-NEXT: # %bb.7: +; XTENSA-NEXT: or a10, a9, a9 +; XTENSA-NEXT: .LBB8_8: +; XTENSA-NEXT: or a2, a10, a7 +; XTENSA-NEXT: ret + %z = sub i64 0, %y + %and = and i64 %z, 63 + %b = shl i64 %x, %y + %c = lshr i64 %x, %and + %d = or i64 %b, %c + ret i64 %d +} + +define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { +; XTENSA-LABEL: rotl_64_mask_and_127_and_63: +; XTENSA: movi a8, 127 +; XTENSA-NEXT: and a8, a4, a8 +; XTENSA-NEXT: ssl a8 +; XTENSA-NEXT: src a11, a3, a2 +; XTENSA-NEXT: addi a9, a8, -32 +; XTENSA-NEXT: ssl a9 +; XTENSA-NEXT: sll a7, a2 +; XTENSA-NEXT: movi a10, 0 +; XTENSA-NEXT: blt a9, a10, .LBB9_2 +; XTENSA-NEXT: # %bb.1: +; XTENSA-NEXT: or a11, a7, a7 +; XTENSA-NEXT: .LBB9_2: +; XTENSA-NEXT: neg a7, a4 +; XTENSA-NEXT: movi a6, 63 +; XTENSA-NEXT: and a6, a7, a6 +; XTENSA-NEXT: ssr a6 +; XTENSA-NEXT: srl a7, a3 +; XTENSA-NEXT: addi a5, a6, -32 +; XTENSA-NEXT: blt a5, a10, .LBB9_4 +; XTENSA-NEXT: # %bb.3: +; XTENSA-NEXT: or a7, a10, a10 +; XTENSA-NEXT: .LBB9_4: +; XTENSA-NEXT: ssr a6 +; XTENSA-NEXT: src a6, a3, a2 +; XTENSA-NEXT: ssr a5 +; XTENSA-NEXT: srl a4, a3 +; XTENSA-NEXT: blt a5, a10, .LBB9_6 +; XTENSA-NEXT: # %bb.5: +; XTENSA-NEXT: or a6, a4, a4 +; XTENSA-NEXT: .LBB9_6: +; XTENSA-NEXT: or a3, a11, a7 
+; XTENSA-NEXT: ssl a8 +; XTENSA-NEXT: sll a8, a2 +; XTENSA-NEXT: blt a9, a10, .LBB9_8 +; XTENSA-NEXT: # %bb.7: +; XTENSA-NEXT: or a8, a10, a10 +; XTENSA-NEXT: .LBB9_8: +; XTENSA-NEXT: or a2, a8, a6 +; XTENSA-NEXT: ret + %a = and i64 %y, 127 + %b = shl i64 %x, %a + %c = sub i64 0, %y + %d = and i64 %c, 63 + %e = lshr i64 %x, %d + %f = or i64 %b, %e + ret i64 %f +} + +define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { +; XTENSA-LABEL: rotr_64_mask: +; XTENSA: ssr a4 +; XTENSA-NEXT: src a10, a3, a2 +; XTENSA-NEXT: addi a8, a4, -32 +; XTENSA-NEXT: ssr a8 +; XTENSA-NEXT: srl a11, a3 +; XTENSA-NEXT: movi a9, 0 +; XTENSA-NEXT: blt a8, a9, .LBB10_2 +; XTENSA-NEXT: # %bb.1: +; XTENSA-NEXT: or a10, a11, a11 +; XTENSA-NEXT: .LBB10_2: +; XTENSA-NEXT: neg a11, a4 +; XTENSA-NEXT: movi a7, 63 +; XTENSA-NEXT: and a7, a11, a7 +; XTENSA-NEXT: ssl a7 +; XTENSA-NEXT: sll a11, a2 +; XTENSA-NEXT: addi a6, a7, -32 +; XTENSA-NEXT: blt a6, a9, .LBB10_4 +; XTENSA-NEXT: # %bb.3: +; XTENSA-NEXT: or a11, a9, a9 +; XTENSA-NEXT: .LBB10_4: +; XTENSA-NEXT: ssl a7 +; XTENSA-NEXT: src a7, a3, a2 +; XTENSA-NEXT: ssl a6 +; XTENSA-NEXT: sll a5, a2 +; XTENSA-NEXT: blt a6, a9, .LBB10_6 +; XTENSA-NEXT: # %bb.5: +; XTENSA-NEXT: or a7, a5, a5 +; XTENSA-NEXT: .LBB10_6: +; XTENSA-NEXT: or a2, a10, a11 +; XTENSA-NEXT: ssr a4 +; XTENSA-NEXT: srl a10, a3 +; XTENSA-NEXT: blt a8, a9, .LBB10_8 +; XTENSA-NEXT: # %bb.7: +; XTENSA-NEXT: or a10, a9, a9 +; XTENSA-NEXT: .LBB10_8: +; XTENSA-NEXT: or a3, a10, a7 +; XTENSA-NEXT: ret + %z = sub i64 0, %y + %and = and i64 %z, 63 + %b = lshr i64 %x, %y + %c = shl i64 %x, %and + %d = or i64 %b, %c + ret i64 %d +} + +define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { +; XTENSA-LABEL: rotr_64_mask_and_127_and_63: +; XTENSA: movi a8, 127 +; XTENSA-NEXT: and a8, a4, a8 +; XTENSA-NEXT: ssr a8 +; XTENSA-NEXT: src a11, a3, a2 +; XTENSA-NEXT: addi a9, a8, -32 +; XTENSA-NEXT: ssr a9 +; XTENSA-NEXT: srl a7, a3 +; XTENSA-NEXT: movi a10, 0 +; XTENSA-NEXT: blt a9, a10, 
.LBB11_2 +; XTENSA-NEXT: # %bb.1: +; XTENSA-NEXT: or a11, a7, a7 +; XTENSA-NEXT: .LBB11_2: +; XTENSA-NEXT: neg a7, a4 +; XTENSA-NEXT: movi a6, 63 +; XTENSA-NEXT: and a6, a7, a6 +; XTENSA-NEXT: ssl a6 +; XTENSA-NEXT: sll a7, a2 +; XTENSA-NEXT: addi a5, a6, -32 +; XTENSA-NEXT: blt a5, a10, .LBB11_4 +; XTENSA-NEXT: # %bb.3: +; XTENSA-NEXT: or a7, a10, a10 +; XTENSA-NEXT: .LBB11_4: +; XTENSA-NEXT: ssl a6 +; XTENSA-NEXT: src a6, a3, a2 +; XTENSA-NEXT: ssl a5 +; XTENSA-NEXT: sll a4, a2 +; XTENSA-NEXT: blt a5, a10, .LBB11_6 +; XTENSA-NEXT: # %bb.5: +; XTENSA-NEXT: or a6, a4, a4 +; XTENSA-NEXT: .LBB11_6: +; XTENSA-NEXT: or a2, a11, a7 +; XTENSA-NEXT: ssr a8 +; XTENSA-NEXT: srl a8, a3 +; XTENSA-NEXT: blt a9, a10, .LBB11_8 +; XTENSA-NEXT: # %bb.7: +; XTENSA-NEXT: or a8, a10, a10 +; XTENSA-NEXT: .LBB11_8: +; XTENSA-NEXT: or a3, a8, a6 +; XTENSA-NEXT: ret + %a = and i64 %y, 127 + %b = lshr i64 %x, %a + %c = sub i64 0, %y + %d = and i64 %c, 63 + %e = shl i64 %x, %d + %f = or i64 %b, %e + ret i64 %f +} + +define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind { +; XTENSA-LABEL: rotl_32_mask_shared: +; XTENSA: movi a8, 31 +; XTENSA-NEXT: and a9, a4, a8 +; XTENSA-NEXT: ssl a9 +; XTENSA-NEXT: sll a10, a2 +; XTENSA-NEXT: neg a11, a4 +; XTENSA-NEXT: and a8, a11, a8 +; XTENSA-NEXT: ssr a8 +; XTENSA-NEXT: srl a8, a2 +; XTENSA-NEXT: or a8, a10, a8 +; XTENSA-NEXT: ssl a9 +; XTENSA-NEXT: sll a9, a3 +; XTENSA-NEXT: add a2, a8, a9 +; XTENSA-NEXT: ret + %maskedamt = and i32 %amt, 31 + %1 = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 %maskedamt) + %2 = shl i32 %b, %maskedamt + %3 = add i32 %1, %2 + ret i32 %3 +} +declare i32 @llvm.fshl.i32(i32, i32, i32) + +define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind { +; XTENSA-LABEL: rotr_32_mask_shared: +; XTENSA: movi a8, 31 +; XTENSA-NEXT: and a9, a4, a8 +; XTENSA-NEXT: ssr a9 +; XTENSA-NEXT: srl a10, a2 +; XTENSA-NEXT: neg a11, a4 +; XTENSA-NEXT: and 
a8, a11, a8 +; XTENSA-NEXT: ssl a8 +; XTENSA-NEXT: sll a8, a2 +; XTENSA-NEXT: or a8, a10, a8 +; XTENSA-NEXT: ssl a9 +; XTENSA-NEXT: sll a9, a3 +; XTENSA-NEXT: add a2, a8, a9 +; XTENSA-NEXT: ret + %maskedamt = and i32 %amt, 31 + %1 = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 %maskedamt) + %2 = shl i32 %b, %maskedamt + %3 = add i32 %1, %2 + ret i32 %3 +} +declare i32 @llvm.fshr.i32(i32, i32, i32) + +define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind { +; XTENSA-LABEL: rotl_32_mask_multiple: +; XTENSA: movi a8, 31 +; XTENSA-NEXT: and a9, a4, a8 +; XTENSA-NEXT: ssl a9 +; XTENSA-NEXT: sll a10, a3 +; XTENSA-NEXT: neg a11, a4 +; XTENSA-NEXT: and a8, a11, a8 +; XTENSA-NEXT: ssr a8 +; XTENSA-NEXT: srl a11, a3 +; XTENSA-NEXT: or a10, a10, a11 +; XTENSA-NEXT: ssl a9 +; XTENSA-NEXT: sll a9, a2 +; XTENSA-NEXT: ssr a8 +; XTENSA-NEXT: srl a8, a2 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: add a2, a8, a10 +; XTENSA-NEXT: ret + %maskedamt = and i32 %amt, 31 + %1 = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 %maskedamt) + %2 = tail call i32 @llvm.fshl.i32(i32 %b, i32 %b, i32 %maskedamt) + %3 = add i32 %1, %2 + ret i32 %3 +} + +define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 signext %amt) nounwind { +; XTENSA-LABEL: rotr_32_mask_multiple: +; XTENSA: movi a8, 31 +; XTENSA-NEXT: and a9, a4, a8 +; XTENSA-NEXT: ssr a9 +; XTENSA-NEXT: srl a10, a3 +; XTENSA-NEXT: neg a11, a4 +; XTENSA-NEXT: and a8, a11, a8 +; XTENSA-NEXT: ssl a8 +; XTENSA-NEXT: sll a11, a3 +; XTENSA-NEXT: or a10, a10, a11 +; XTENSA-NEXT: ssr a9 +; XTENSA-NEXT: srl a9, a2 +; XTENSA-NEXT: ssl a8 +; XTENSA-NEXT: sll a8, a2 +; XTENSA-NEXT: or a8, a9, a8 +; XTENSA-NEXT: add a2, a8, a10 +; XTENSA-NEXT: ret + %maskedamt = and i32 %amt, 31 + %1 = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 %maskedamt) + %2 = tail call i32 @llvm.fshr.i32(i32 %b, i32 %b, i32 %maskedamt) + %3 = add i32 %1, %2 + ret i32 %3 +} diff --git 
a/llvm/test/CodeGen/Xtensa/shift.ll b/llvm/test/CodeGen/Xtensa/shift.ll new file mode 100644 index 00000000000000..87e847f65d8370 --- /dev/null +++ b/llvm/test/CodeGen/Xtensa/shift.ll @@ -0,0 +1,172 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=xtensa -verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +define i32 @lshl(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: lshl: +; CHECK: ssl a3 +; CHECK-NEXT: sll a2, a2 +; CHECK-NEXT: ret + %c = shl i32 %x, %y + ret i32 %c +} + +define i32 @lshl_imm_1(i32 %x) nounwind { +; CHECK-LABEL: lshl_imm_1: +; CHECK: slli a2, a2, 1 +; CHECK-NEXT: ret + %c = shl i32 %x, 1 + ret i32 %c +} + +define i32 @lshl_imm_10(i32 %x) nounwind { +; CHECK-LABEL: lshl_imm_10: +; CHECK: slli a2, a2, 10 +; CHECK-NEXT: ret + %c = shl i32 %x, 10 + ret i32 %c +} + +define i32 @lshl_imm_31(i32 %x) nounwind { +; CHECK-LABEL: lshl_imm_31: +; CHECK: slli a2, a2, 31 +; CHECK-NEXT: ret + %c = shl i32 %x, 31 + ret i32 %c +} + +define i32 @lshr(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: lshr: +; CHECK: ssr a3 +; CHECK-NEXT: srl a2, a2 +; CHECK-NEXT: ret + %c = lshr i32 %x, %y + ret i32 %c +} + +define i32 @lshr_imm_1(i32 %x) nounwind { +; CHECK-LABEL: lshr_imm_1: +; CHECK: srli a2, a2, 1 +; CHECK-NEXT: ret + %c = lshr i32 %x, 1 + ret i32 %c +} + +define i32 @lshr_imm_15(i32 %x) nounwind { +; CHECK-LABEL: lshr_imm_15: +; CHECK: srli a2, a2, 15 +; CHECK-NEXT: ret + %c = lshr i32 %x, 15 + ret i32 %c +} + +define i32 @lshr_imm_20(i32 %x) nounwind { +; CHECK-LABEL: lshr_imm_20: +; CHECK: extui a2, a2, 20, 12 +; CHECK-NEXT: ret + %c = lshr i32 %x, 20 + ret i32 %c +} + +define i32 @ashr(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: ashr: +; CHECK: ssr a3 +; CHECK-NEXT: sra a2, a2 +; CHECK-NEXT: ret + %c = ashr i32 %x, %y + ret i32 %c +} + +define i32 @ashr_imm_1(i32 %x) nounwind { +; CHECK-LABEL: ashr_imm_1: +; CHECK: srai a2, a2, 1 +; CHECK-NEXT: ret + %c = ashr i32 %x, 1 + ret i32 %c +} + 
+define i32 @ashr_imm_10(i32 %x) nounwind { +; CHECK-LABEL: ashr_imm_10: +; CHECK: srai a2, a2, 10 +; CHECK-NEXT: ret + %c = ashr i32 %x, 10 + ret i32 %c +} + +define i32 @ashr_imm_31(i32 %x) nounwind { +; CHECK-LABEL: ashr_imm_31: +; CHECK: srai a2, a2, 31 +; CHECK-NEXT: ret + %c = ashr i32 %x, 31 + ret i32 %c +} + +define i64 @lshl_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: lshl_64: +; CHECK: ssl a4 +; CHECK-NEXT: src a3, a3, a2 +; CHECK-NEXT: addi a8, a4, -32 +; CHECK-NEXT: ssl a8 +; CHECK-NEXT: sll a10, a2 +; CHECK-NEXT: movi a9, 0 +; CHECK-NEXT: blt a8, a9, .LBB12_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: or a3, a10, a10 +; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: ssl a4 +; CHECK-NEXT: sll a2, a2 +; CHECK-NEXT: blt a8, a9, .LBB12_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: or a2, a9, a9 +; CHECK-NEXT: .LBB12_4: +; CHECK-NEXT: ret + %c = shl i64 %x, %y + ret i64 %c +} + +define i64 @lshr_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: lshr_64: +; CHECK: ssr a4 +; CHECK-NEXT: src a2, a3, a2 +; CHECK-NEXT: addi a8, a4, -32 +; CHECK-NEXT: ssr a8 +; CHECK-NEXT: srl a10, a3 +; CHECK-NEXT: movi a9, 0 +; CHECK-NEXT: blt a8, a9, .LBB13_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: or a2, a10, a10 +; CHECK-NEXT: .LBB13_2: +; CHECK-NEXT: ssr a4 +; CHECK-NEXT: srl a3, a3 +; CHECK-NEXT: blt a8, a9, .LBB13_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: or a3, a9, a9 +; CHECK-NEXT: .LBB13_4: +; CHECK-NEXT: ret + %c = lshr i64 %x, %y + ret i64 %c +} + +define i64 @ashr_64(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ashr_64: +; CHECK: ssr a4 +; CHECK-NEXT: src a2, a3, a2 +; CHECK-NEXT: addi a9, a4, -32 +; CHECK-NEXT: ssr a9 +; CHECK-NEXT: sra a8, a3 +; CHECK-NEXT: movi a10, 0 +; CHECK-NEXT: blt a9, a10, .LBB14_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: or a2, a8, a8 +; CHECK-NEXT: .LBB14_2: +; CHECK-NEXT: ssr a4 +; CHECK-NEXT: sra a8, a3 +; CHECK-NEXT: blt a9, a10, .LBB14_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: srai a8, a3, 31 +; CHECK-NEXT: .LBB14_4: +; CHECK-NEXT: or a3, a8, a8 +; CHECK-NEXT: 
ret + %c = ashr i64 %x, %y + ret i64 %c +} From c368a720a0b40bb8fe4aff3971fe9a7009c85aa6 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Thu, 22 Aug 2024 10:35:49 +0200 Subject: [PATCH 160/426] [clang] Merge lifetimebound and GSL code paths for lifetime analysis (#104906) In the current lifetime analysis, we have two parallel code paths: one for lifetimebound and another for GSL. These paths perform the same logic, both determining whether to continue visiting subexpressions. This PR merges the two paths into a single code path. As a result, we'll reduce the overhead by eliminating a redundant visit to subexpressions. The change is mostly NFC (No Functional Change). The only notable difference is that when a subexpression is visited due to either lifetimebound or GSL, we will prioritize the lifetimebound path. This means the final diagnostic will be -Wdangling (rather than both `-Wdangling` and `-Wdangling-gsl`) This might cause a slight change in behavior if the -Wdangling diagnostic is disabled, but I think this is not a major concern since both diagnostics are enabled by default. Fixes #93386 --- clang/docs/ReleaseNotes.rst | 2 + clang/lib/Sema/CheckExprLifetime.cpp | 246 +++++++----------- .../Sema/warn-lifetime-analysis-nocfg.cpp | 13 + 3 files changed, 112 insertions(+), 149 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5c156a9c073a9c..bb47350f76b308 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -239,6 +239,8 @@ Improvements to Clang's diagnostics - Clang now diagnoses when the result of a [[nodiscard]] function is discarded after being cast in C. Fixes #GH104391. +- Don't emit duplicated dangling diagnostics. (#GH93386). 
+ Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 7389046eaddde1..7e23c08cc79ffb 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -237,13 +237,11 @@ static bool pathContainsInit(IndirectLocalPath &Path) { static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *Init, LocalVisitor Visit, - bool RevisitSubinits, - bool EnableLifetimeWarnings); + bool RevisitSubinits); static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, Expr *Init, ReferenceKind RK, - LocalVisitor Visit, - bool EnableLifetimeWarnings); + LocalVisitor Visit); template static bool isRecordWithAttr(QualType Type) { if (auto *RD = Type->getAsCXXRecordDecl()) @@ -326,66 +324,6 @@ static bool shouldTrackFirstArgument(const FunctionDecl *FD) { return false; } -static void handleGslAnnotatedTypes(IndirectLocalPath &Path, Expr *Call, - LocalVisitor Visit) { - auto VisitPointerArg = [&](const Decl *D, Expr *Arg, bool Value) { - // We are not interested in the temporary base objects of gsl Pointers: - // Temp().ptr; // Here ptr might not dangle. - if (isa(Arg->IgnoreImpCasts())) - return; - // Once we initialized a value with a reference, it can no longer dangle. - if (!Value) { - for (const IndirectLocalPathEntry &PE : llvm::reverse(Path)) { - if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit) - continue; - if (PE.Kind == IndirectLocalPathEntry::GslPointerInit || - PE.Kind == IndirectLocalPathEntry::GslPointerAssignment) - return; - break; - } - } - Path.push_back({Value ? 
IndirectLocalPathEntry::GslPointerInit - : IndirectLocalPathEntry::GslReferenceInit, - Arg, D}); - if (Arg->isGLValue()) - visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, - Visit, - /*EnableLifetimeWarnings=*/true); - else - visitLocalsRetainedByInitializer(Path, Arg, Visit, true, - /*EnableLifetimeWarnings=*/true); - Path.pop_back(); - }; - - if (auto *MCE = dyn_cast(Call)) { - const auto *MD = cast_or_null(MCE->getDirectCallee()); - if (MD && shouldTrackImplicitObjectArg(MD)) - VisitPointerArg(MD, MCE->getImplicitObjectArgument(), - !MD->getReturnType()->isReferenceType()); - return; - } else if (auto *OCE = dyn_cast(Call)) { - FunctionDecl *Callee = OCE->getDirectCallee(); - if (Callee && Callee->isCXXInstanceMember() && - shouldTrackImplicitObjectArg(cast(Callee))) - VisitPointerArg(Callee, OCE->getArg(0), - !Callee->getReturnType()->isReferenceType()); - return; - } else if (auto *CE = dyn_cast(Call)) { - FunctionDecl *Callee = CE->getDirectCallee(); - if (Callee && shouldTrackFirstArgument(Callee)) - VisitPointerArg(Callee, CE->getArg(0), - !Callee->getReturnType()->isReferenceType()); - return; - } - - if (auto *CCE = dyn_cast(Call)) { - const auto *Ctor = CCE->getConstructor(); - const CXXRecordDecl *RD = Ctor->getParent(); - if (CCE->getNumArgs() > 0 && RD->hasAttr()) - VisitPointerArg(Ctor->getParamDecl(0), CCE->getArgs()[0], true); - } -} - static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { const TypeSourceInfo *TSI = FD->getTypeSourceInfo(); if (!TSI) @@ -423,8 +361,9 @@ static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { return false; } -static void visitLifetimeBoundArguments(IndirectLocalPath &Path, Expr *Call, - LocalVisitor Visit) { +// Visit lifetimebound or gsl-pointer arguments. 
+static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, + LocalVisitor Visit) { const FunctionDecl *Callee; ArrayRef Args; @@ -439,6 +378,8 @@ static void visitLifetimeBoundArguments(IndirectLocalPath &Path, Expr *Call, if (!Callee) return; + bool EnableGSLAnalysis = !Callee->getASTContext().getDiagnostics().isIgnored( + diag::warn_dangling_lifetime_pointer, SourceLocation()); Expr *ObjectArg = nullptr; if (isa(Call) && Callee->isCXXInstanceMember()) { ObjectArg = Args[0]; @@ -451,11 +392,35 @@ static void visitLifetimeBoundArguments(IndirectLocalPath &Path, Expr *Call, Path.push_back({IndirectLocalPathEntry::LifetimeBoundCall, Arg, D}); if (Arg->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, - Visit, - /*EnableLifetimeWarnings=*/false); + Visit); else - visitLocalsRetainedByInitializer(Path, Arg, Visit, true, - /*EnableLifetimeWarnings=*/false); + visitLocalsRetainedByInitializer(Path, Arg, Visit, true); + Path.pop_back(); + }; + auto VisitGSLPointerArg = [&](const Decl *D, Expr *Arg, bool Value) { + // We are not interested in the temporary base objects of gsl Pointers: + // Temp().ptr; // Here ptr might not dangle. + if (isa(Arg->IgnoreImpCasts())) + return; + // Once we initialized a value with a reference, it can no longer dangle. + if (!Value) { + for (const IndirectLocalPathEntry &PE : llvm::reverse(Path)) { + if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit) + continue; + if (PE.Kind == IndirectLocalPathEntry::GslPointerInit || + PE.Kind == IndirectLocalPathEntry::GslPointerAssignment) + return; + break; + } + } + Path.push_back({Value ? 
IndirectLocalPathEntry::GslPointerInit + : IndirectLocalPathEntry::GslReferenceInit, + Arg, D}); + if (Arg->isGLValue()) + visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, + Visit); + else + visitLocalsRetainedByInitializer(Path, Arg, Visit, true); Path.pop_back(); }; @@ -478,6 +443,12 @@ static void visitLifetimeBoundArguments(IndirectLocalPath &Path, Expr *Call, CheckCoroObjArg = false; if (implicitObjectParamIsLifetimeBound(Callee) || CheckCoroObjArg) VisitLifetimeBoundArg(Callee, ObjectArg); + else if (EnableGSLAnalysis) { + if (auto *CME = dyn_cast(Callee); + CME && shouldTrackImplicitObjectArg(CME)) + VisitGSLPointerArg(Callee, ObjectArg, + !Callee->getReturnType()->isReferenceType()); + } } for (unsigned I = 0, @@ -485,6 +456,17 @@ static void visitLifetimeBoundArguments(IndirectLocalPath &Path, Expr *Call, I != N; ++I) { if (CheckCoroCall || Callee->getParamDecl(I)->hasAttr()) VisitLifetimeBoundArg(Callee->getParamDecl(I), Args[I]); + else if (EnableGSLAnalysis && I == 0) { // GSL + if (shouldTrackFirstArgument(Callee)) { + VisitGSLPointerArg(Callee, Args[0], + !Callee->getReturnType()->isReferenceType()); + } else if (auto *CCE = dyn_cast(Call); + CCE && + CCE->getConstructor()->getParent()->hasAttr()) { + VisitGSLPointerArg(CCE->getConstructor()->getParamDecl(0), Args[0], + true); + } + } } } @@ -492,8 +474,7 @@ static void visitLifetimeBoundArguments(IndirectLocalPath &Path, Expr *Call, /// glvalue expression \c Init. static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, Expr *Init, ReferenceKind RK, - LocalVisitor Visit, - bool EnableLifetimeWarnings) { + LocalVisitor Visit) { RevertToOldSizeRAII RAII(Path); // Walk past any constructs which we can lifetime-extend across. @@ -530,8 +511,7 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, else // We can't lifetime extend through this but we might still find some // retained temporaries. 
- return visitLocalsRetainedByInitializer(Path, Init, Visit, true, - EnableLifetimeWarnings); + return visitLocalsRetainedByInitializer(Path, Init, Visit, true); } // Step into CXXDefaultInitExprs so we can diagnose cases where a @@ -545,23 +525,18 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, if (auto *MTE = dyn_cast(Init)) { if (Visit(Path, Local(MTE), RK)) - visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), Visit, true); } if (auto *M = dyn_cast(Init)) { // Lifetime of a non-reference type field is same as base object. if (auto *F = dyn_cast(M->getMemberDecl()); F && !F->getType()->isReferenceType()) - visitLocalsRetainedByInitializer(Path, M->getBase(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, M->getBase(), Visit, true); } - if (isa(Init)) { - if (EnableLifetimeWarnings) - handleGslAnnotatedTypes(Path, Init, Visit); - return visitLifetimeBoundArguments(Path, Init, Visit); - } + if (isa(Init)) + return visitFunctionCallArguments(Path, Init, Visit); switch (Init->getStmtClass()) { case Stmt::DeclRefExprClass: { @@ -580,8 +555,7 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, } else if (VD->getInit() && !isVarOnPath(Path, VD)) { Path.push_back({IndirectLocalPathEntry::VarInit, DRE, VD}); visitLocalsRetainedByReferenceBinding(Path, VD->getInit(), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); + RK_ReferenceBinding, Visit); } } break; @@ -593,15 +567,13 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, // handling all sorts of rvalues passed to a unary operator. 
const UnaryOperator *U = cast(Init); if (U->getOpcode() == UO_Deref) - visitLocalsRetainedByInitializer(Path, U->getSubExpr(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, U->getSubExpr(), Visit, true); break; } case Stmt::ArraySectionExprClass: { - visitLocalsRetainedByInitializer(Path, - cast(Init)->getBase(), - Visit, true, EnableLifetimeWarnings); + visitLocalsRetainedByInitializer( + Path, cast(Init)->getBase(), Visit, true); break; } @@ -609,11 +581,9 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, case Stmt::BinaryConditionalOperatorClass: { auto *C = cast(Init); if (!C->getTrueExpr()->getType()->isVoidType()) - visitLocalsRetainedByReferenceBinding(Path, C->getTrueExpr(), RK, Visit, - EnableLifetimeWarnings); + visitLocalsRetainedByReferenceBinding(Path, C->getTrueExpr(), RK, Visit); if (!C->getFalseExpr()->getType()->isVoidType()) - visitLocalsRetainedByReferenceBinding(Path, C->getFalseExpr(), RK, Visit, - EnableLifetimeWarnings); + visitLocalsRetainedByReferenceBinding(Path, C->getFalseExpr(), RK, Visit); break; } @@ -636,8 +606,7 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, /// the prvalue expression \c Init. 
static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *Init, LocalVisitor Visit, - bool RevisitSubinits, - bool EnableLifetimeWarnings) { + bool RevisitSubinits) { RevertToOldSizeRAII RAII(Path); Expr *Old; @@ -678,18 +647,16 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, if (VD && VD->getType().isConstQualified() && VD->getInit() && !isVarOnPath(Path, VD)) { Path.push_back({IndirectLocalPathEntry::VarInit, DRE, VD}); - visitLocalsRetainedByInitializer( - Path, VD->getInit(), Visit, true, EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, VD->getInit(), Visit, + true); } } else if (auto *MTE = dyn_cast(L)) { if (MTE->getType().isConstQualified()) visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), - Visit, true, - EnableLifetimeWarnings); + Visit, true); } return false; - }, - EnableLifetimeWarnings); + }); // We assume that objects can be retained by pointers cast to integers, // but not if the integer is cast to floating-point type or to _Complex. @@ -718,9 +685,8 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, // Model array-to-pointer decay as taking the address of the array // lvalue. Path.push_back({IndirectLocalPathEntry::AddressOf, CE}); - return visitLocalsRetainedByReferenceBinding(Path, CE->getSubExpr(), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); + return visitLocalsRetainedByReferenceBinding( + Path, CE->getSubExpr(), RK_ReferenceBinding, Visit); default: return; @@ -735,8 +701,7 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, // lifetime of the array exactly like binding a reference to a temporary. 
if (auto *ILE = dyn_cast(Init)) return visitLocalsRetainedByReferenceBinding(Path, ILE->getSubExpr(), - RK_StdInitializerList, Visit, - EnableLifetimeWarnings); + RK_StdInitializerList, Visit); if (InitListExpr *ILE = dyn_cast(Init)) { // We already visited the elements of this initializer list while @@ -747,14 +712,12 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, if (ILE->isTransparent()) return visitLocalsRetainedByInitializer(Path, ILE->getInit(0), Visit, - RevisitSubinits, - EnableLifetimeWarnings); + RevisitSubinits); if (ILE->getType()->isArrayType()) { for (unsigned I = 0, N = ILE->getNumInits(); I != N; ++I) visitLocalsRetainedByInitializer(Path, ILE->getInit(I), Visit, - RevisitSubinits, - EnableLifetimeWarnings); + RevisitSubinits); return; } @@ -767,14 +730,12 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, if (RD->isUnion() && ILE->getInitializedFieldInUnion() && ILE->getInitializedFieldInUnion()->getType()->isReferenceType()) visitLocalsRetainedByReferenceBinding(Path, ILE->getInit(0), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); + RK_ReferenceBinding, Visit); else { unsigned Index = 0; for (; Index < RD->getNumBases() && Index < ILE->getNumInits(); ++Index) visitLocalsRetainedByInitializer(Path, ILE->getInit(Index), Visit, - RevisitSubinits, - EnableLifetimeWarnings); + RevisitSubinits); for (const auto *I : RD->fields()) { if (Index >= ILE->getNumInits()) break; @@ -783,14 +744,13 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *SubInit = ILE->getInit(Index); if (I->getType()->isReferenceType()) visitLocalsRetainedByReferenceBinding(Path, SubInit, - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); + RK_ReferenceBinding, Visit); else // This might be either aggregate-initialization of a member or // initialization of a std::initializer_list object. Regardless, // we should recursively lifetime-extend that initializer. 
- visitLocalsRetainedByInitializer( - Path, SubInit, Visit, RevisitSubinits, EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, SubInit, Visit, + RevisitSubinits); ++Index; } } @@ -811,10 +771,9 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Path.push_back({IndirectLocalPathEntry::LambdaCaptureInit, E, &Cap}); if (E->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, E, RK_ReferenceBinding, - Visit, EnableLifetimeWarnings); + Visit); else - visitLocalsRetainedByInitializer(Path, E, Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, E, Visit, true); if (Cap.capturesVariable()) Path.pop_back(); } @@ -828,18 +787,14 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *Arg = MTE->getSubExpr(); Path.push_back({IndirectLocalPathEntry::TemporaryCopy, Arg, CCE->getConstructor()}); - visitLocalsRetainedByInitializer(Path, Arg, Visit, true, - /*EnableLifetimeWarnings*/ false); + visitLocalsRetainedByInitializer(Path, Arg, Visit, true); Path.pop_back(); } } } - if (isa(Init) || isa(Init)) { - if (EnableLifetimeWarnings) - handleGslAnnotatedTypes(Path, Init, Visit); - return visitLifetimeBoundArguments(Path, Init, Visit); - } + if (isa(Init) || isa(Init)) + return visitFunctionCallArguments(Path, Init, Visit); switch (Init->getStmtClass()) { case Stmt::UnaryOperatorClass: { @@ -855,8 +810,7 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Path.push_back({IndirectLocalPathEntry::AddressOf, UO}); visitLocalsRetainedByReferenceBinding(Path, UO->getSubExpr(), - RK_ReferenceBinding, Visit, - EnableLifetimeWarnings); + RK_ReferenceBinding, Visit); } break; } @@ -869,11 +823,9 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, break; if (BO->getLHS()->getType()->isPointerType()) - visitLocalsRetainedByInitializer(Path, BO->getLHS(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, BO->getLHS(), Visit, 
true); else if (BO->getRHS()->getType()->isPointerType()) - visitLocalsRetainedByInitializer(Path, BO->getRHS(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, BO->getRHS(), Visit, true); break; } @@ -883,11 +835,9 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, // In C++, we can have a throw-expression operand, which has 'void' type // and isn't interesting from a lifetime perspective. if (!C->getTrueExpr()->getType()->isVoidType()) - visitLocalsRetainedByInitializer(Path, C->getTrueExpr(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, C->getTrueExpr(), Visit, true); if (!C->getFalseExpr()->getType()->isVoidType()) - visitLocalsRetainedByInitializer(Path, C->getFalseExpr(), Visit, true, - EnableLifetimeWarnings); + visitLocalsRetainedByInitializer(Path, C->getFalseExpr(), Visit, true); break; } @@ -989,8 +939,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef, const InitializedEntity *InitEntity, const InitializedEntity *ExtendingEntity, LifetimeKind LK, - const AssignedEntity *AEntity, Expr *Init, - bool EnableLifetimeWarnings) { + const AssignedEntity *AEntity, Expr *Init) { assert((AEntity && LK == LK_Assignment) || (InitEntity && LK != LK_Assignment)); // If this entity doesn't have an interesting lifetime, don't bother looking @@ -1284,19 +1233,20 @@ static void checkExprLifetimeImpl(Sema &SemaRef, }; llvm::SmallVector Path; - if (EnableLifetimeWarnings && LK == LK_Assignment && + if (!SemaRef.getDiagnostics().isIgnored(diag::warn_dangling_lifetime_pointer, + SourceLocation()) && + LK == LK_Assignment && isRecordWithAttr(AEntity->LHS->getType())) Path.push_back({IndirectLocalPathEntry::GslPointerAssignment, Init}); if (Init->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, Init, RK_ReferenceBinding, - TemporaryVisitor, - EnableLifetimeWarnings); + TemporaryVisitor); else visitLocalsRetainedByInitializer( Path, Init, TemporaryVisitor, // Don't revisit the 
sub inits for the intialization case. - /*RevisitSubinits=*/!InitEntity, EnableLifetimeWarnings); + /*RevisitSubinits=*/!InitEntity); } void checkExprLifetime(Sema &SemaRef, const InitializedEntity &Entity, @@ -1304,10 +1254,8 @@ void checkExprLifetime(Sema &SemaRef, const InitializedEntity &Entity, auto LTResult = getEntityLifetime(&Entity); LifetimeKind LK = LTResult.getInt(); const InitializedEntity *ExtendingEntity = LTResult.getPointer(); - bool EnableLifetimeWarnings = !SemaRef.getDiagnostics().isIgnored( - diag::warn_dangling_lifetime_pointer, SourceLocation()); checkExprLifetimeImpl(SemaRef, &Entity, ExtendingEntity, LK, - /*AEntity*/ nullptr, Init, EnableLifetimeWarnings); + /*AEntity*/ nullptr, Init); } void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity, @@ -1323,7 +1271,7 @@ void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity, checkExprLifetimeImpl(SemaRef, /*InitEntity=*/nullptr, /*ExtendingEntity=*/nullptr, LK_Assignment, &Entity, - Init, EnableLifetimeWarnings); + Init); } } // namespace clang::sema diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp index 09dfb2b5d96a89..86ee90ed6df8dd 100644 --- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp +++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp @@ -479,3 +479,16 @@ void testForBug49342() { auto it = std::iter{} - 2; // Used to be false positive. } + +namespace GH93386 { +// verify no duplicated diagnostics are emitted. 
+struct [[gsl::Pointer]] S { + S(const std::vector& abc [[clang::lifetimebound]]); +}; + +S test(std::vector a) { + return S(a); // expected-warning {{address of stack memory associated with}} +} + +auto s = S(std::vector()); // expected-warning {{temporary whose address is used as value of local variable}} +} From b4ac5c4b7cefae442fc8365586ff9d2d324380a8 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Thu, 22 Aug 2024 10:52:50 +0200 Subject: [PATCH 161/426] [mlir][cuda] NFC: Remove accidentally committed 'asd' file. (#105491) Co-authored-by: Christian Sigg --- mlir/test/Integration/GPU/CUDA/sm90/asd | 207 ------------------------ 1 file changed, 207 deletions(-) delete mode 100644 mlir/test/Integration/GPU/CUDA/sm90/asd diff --git a/mlir/test/Integration/GPU/CUDA/sm90/asd b/mlir/test/Integration/GPU/CUDA/sm90/asd deleted file mode 100644 index 353d8e7c16b741..00000000000000 --- a/mlir/test/Integration/GPU/CUDA/sm90/asd +++ /dev/null @@ -1,207 +0,0 @@ -module attributes {gpu.container_module} { - llvm.mlir.global private constant @vector_print_str_0(dense<[73, 110, 99, 111, 114, 114, 101, 99, 116, 32, 82, 101, 115, 117, 108, 116, 115, 32, 58, 10, 0]> : tensor<21xi8>) {addr_space = 0 : i32} : !llvm.array<21 x i8> - llvm.func @printNewline() - llvm.func @printI64(i64) - llvm.func @printString(!llvm.ptr) - llvm.mlir.global private constant @vector_print_str(dense<[67, 111, 114, 114, 101, 99, 116, 32, 82, 101, 115, 117, 108, 116, 115, 32, 58, 10, 0]> : tensor<19xi8>) {addr_space = 0 : i32} : !llvm.array<19 x i8> - llvm.func @malloc(i64) -> !llvm.ptr - llvm.mlir.global private @__mbarrier() {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<2 x i64> - llvm.func @printMemrefF32(i64, !llvm.ptr) attributes {sym_visibility = "private"} - llvm.mlir.global private @dynamicShmem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x f16> - llvm.mlir.global private @accShmem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x f32> - llvm.func 
@main() { - %0 = llvm.mlir.constant(2 : index) : i64 - %1 = llvm.mlir.constant(0 : i8) : i8 - %2 = llvm.mlir.constant(64 : index) : i64 - %3 = llvm.mlir.constant(65536 : i32) : i32 - %4 = llvm.mlir.constant(16 : index) : i64 - %5 = llvm.mlir.constant(8 : index) : i64 - %6 = llvm.mlir.constant(0.000000e+00 : f32) : f32 - %7 = llvm.mlir.constant(6 : i32) : i64 - %8 = llvm.mlir.constant(5 : i32) : i64 - %9 = llvm.mlir.constant(0 : i32) : i64 - %10 = llvm.mlir.constant(3 : i32) : i64 - %11 = llvm.mlir.constant(1 : i32) : i32 - %12 = llvm.mlir.constant(0 : i32) : i32 - %13 = llvm.mlir.constant(9.99999993E-9 : f32) : f32 - %14 = llvm.mlir.constant(1 : index) : i64 - %15 = llvm.mlir.constant(0 : index) : i64 - %16 = llvm.mlir.constant(128 : index) : i64 - %17 = llvm.mlir.zero : !llvm.ptr - %18 = llvm.getelementptr %17[16384] : (!llvm.ptr) -> !llvm.ptr, f16 - %19 = llvm.ptrtoint %18 : !llvm.ptr to i64 - %20 = llvm.call @malloc(%19) : (i64) -> !llvm.ptr - %21 = llvm.call @malloc(%19) : (i64) -> !llvm.ptr - %22 = llvm.getelementptr %17[16384] : (!llvm.ptr) -> !llvm.ptr, f32 - %23 = llvm.ptrtoint %22 : !llvm.ptr to i64 - %24 = llvm.call @malloc(%23) : (i64) -> !llvm.ptr - %25 = llvm.call @malloc(%23) : (i64) -> !llvm.ptr - llvm.br ^bb1(%15 : i64) - ^bb1(%26: i64): // 2 preds: ^bb0, ^bb5 - %27 = llvm.icmp "slt" %26, %16 : i64 - llvm.cond_br %27, ^bb2, ^bb6 - ^bb2: // pred: ^bb1 - llvm.br ^bb3(%15 : i64) - ^bb3(%28: i64): // 2 preds: ^bb2, ^bb4 - %29 = llvm.icmp "slt" %28, %16 : i64 - llvm.cond_br %29, ^bb4, ^bb5 - ^bb4: // pred: ^bb3 - %30 = llvm.mul %26, %16 : i64 - %31 = llvm.add %30, %28 : i64 - %32 = llvm.udiv %31, %5 : i64 - %33 = llvm.urem %32, %4 : i64 - %34 = llvm.trunc %33 : i64 to i32 - %35 = llvm.sitofp %34 : i32 to f16 - %36 = llvm.getelementptr %21[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %35, %36 : f16, !llvm.ptr - %37 = llvm.mul %28, %2 : i64 - %38 = llvm.add %37, %26 : i64 - %39 = llvm.udiv %38, %5 : i64 - %40 = llvm.urem %39, %4 : i64 - %41 = 
llvm.trunc %40 : i64 to i32 - %42 = llvm.sitofp %41 : i32 to f16 - %43 = llvm.mul %28, %16 : i64 - %44 = llvm.add %43, %26 : i64 - %45 = llvm.getelementptr %20[%44] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %42, %45 : f16, !llvm.ptr - %46 = llvm.getelementptr %24[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %6, %46 : f32, !llvm.ptr - %47 = llvm.getelementptr %25[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %6, %47 : f32, !llvm.ptr - %48 = llvm.add %28, %14 : i64 - llvm.br ^bb3(%48 : i64) - ^bb5: // pred: ^bb3 - %49 = llvm.add %26, %14 : i64 - llvm.br ^bb1(%49 : i64) - ^bb6: // pred: ^bb1 - %50 = llvm.call @mgpuStreamCreate() : () -> !llvm.ptr - %51 = llvm.call @mgpuMemAlloc(%19, %50, %1) : (i64, !llvm.ptr, i8) -> !llvm.ptr - %52 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %53 = llvm.insertvalue %51, %52[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %54 = llvm.insertvalue %51, %53[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %55 = llvm.insertvalue %15, %54[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %56 = llvm.insertvalue %16, %55[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %57 = llvm.insertvalue %16, %56[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %58 = llvm.insertvalue %16, %57[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %59 = llvm.insertvalue %14, %58[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %60 = llvm.call @mgpuMemAlloc(%19, %50, %1) : (i64, !llvm.ptr, i8) -> !llvm.ptr - %61 = llvm.insertvalue %60, %52[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %62 = llvm.insertvalue %60, %61[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %63 = llvm.insertvalue %15, %62[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %64 = llvm.insertvalue %16, %63[3, 0] : 
!llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %65 = llvm.insertvalue %16, %64[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %66 = llvm.insertvalue %16, %65[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %67 = llvm.insertvalue %14, %66[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %68 = llvm.call @mgpuMemAlloc(%23, %50, %1) : (i64, !llvm.ptr, i8) -> !llvm.ptr - llvm.call @mgpuMemcpy(%51, %20, %19, %50) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () - llvm.call @mgpuMemcpy(%60, %21, %19, %50) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () - %69 = llvm.alloca %14 x !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> : (i64) -> !llvm.ptr - llvm.store %59, %69 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>, !llvm.ptr - %70 = llvm.alloca %14 x !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> : (i64) -> !llvm.ptr - llvm.store %67, %70 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>, !llvm.ptr - %71 = llvm.alloca %8 x i64 : (i64) -> !llvm.ptr - llvm.store %16, %71 : i64, !llvm.ptr - %72 = llvm.getelementptr %71[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr - llvm.store %2, %72 : i64, !llvm.ptr - %73 = llvm.call @mgpuTensorMapEncodeTiledMemref(%0, %69, %7, %9, %10, %9, %9, %71) : (i64, !llvm.ptr, i64, i64, i64, i64, i64, !llvm.ptr) -> !llvm.ptr - %74 = llvm.alloca %8 x i64 : (i64) -> !llvm.ptr - llvm.store %2, %74 : i64, !llvm.ptr - %75 = llvm.getelementptr %74[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr - llvm.store %2, %75 : i64, !llvm.ptr - %76 = llvm.call @mgpuTensorMapEncodeTiledMemref(%0, %70, %7, %9, %10, %9, %9, %74) : (i64, !llvm.ptr, i64, i64, i64, i64, i64, !llvm.ptr) -> !llvm.ptr - gpu.launch_func @main_kernel::@main_kernel blocks in (%14, %14, %14) threads in (%16, %14, %14) : i64 dynamic_shared_memory_size %3 args(%68 : !llvm.ptr, %68 : !llvm.ptr, %15 : i64, %16 : i64, %16 : i64, %16 : i64, %14 : i64, %73 : !llvm.ptr, 
%76 : !llvm.ptr) - llvm.call @mgpuMemcpy(%24, %68, %23, %50) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () - llvm.br ^bb7(%15 : i64) - ^bb7(%77: i64): // 2 preds: ^bb6, ^bb14 - %78 = llvm.icmp "slt" %77, %16 : i64 - llvm.cond_br %78, ^bb8, ^bb15 - ^bb8: // pred: ^bb7 - llvm.br ^bb9(%15 : i64) - ^bb9(%79: i64): // 2 preds: ^bb8, ^bb13 - %80 = llvm.icmp "slt" %79, %16 : i64 - llvm.cond_br %80, ^bb10, ^bb14 - ^bb10: // pred: ^bb9 - llvm.br ^bb11(%15 : i64) - ^bb11(%81: i64): // 2 preds: ^bb10, ^bb12 - %82 = llvm.icmp "slt" %81, %16 : i64 - llvm.cond_br %82, ^bb12, ^bb13 - ^bb12: // pred: ^bb11 - %83 = llvm.mul %77, %16 : i64 - %84 = llvm.add %83, %81 : i64 - %85 = llvm.getelementptr %20[%84] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %86 = llvm.load %85 : !llvm.ptr -> f16 - %87 = llvm.mul %81, %16 : i64 - %88 = llvm.add %87, %79 : i64 - %89 = llvm.getelementptr %21[%88] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %90 = llvm.load %89 : !llvm.ptr -> f16 - %91 = llvm.add %83, %79 : i64 - %92 = llvm.getelementptr %25[%91] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %93 = llvm.load %92 : !llvm.ptr -> f32 - %94 = llvm.fpext %86 : f16 to f32 - %95 = llvm.fpext %90 : f16 to f32 - %96 = llvm.fmul %94, %95 : f32 - %97 = llvm.fadd %93, %96 : f32 - llvm.store %97, %92 : f32, !llvm.ptr - %98 = llvm.add %81, %14 : i64 - llvm.br ^bb11(%98 : i64) - ^bb13: // pred: ^bb11 - %99 = llvm.add %79, %14 : i64 - llvm.br ^bb9(%99 : i64) - ^bb14: // pred: ^bb9 - %100 = llvm.add %77, %14 : i64 - llvm.br ^bb7(%100 : i64) - ^bb15: // pred: ^bb7 - llvm.br ^bb16(%15, %12, %12 : i64, i32, i32) - ^bb16(%101: i64, %102: i32, %103: i32): // 2 preds: ^bb15, ^bb24 - %104 = llvm.icmp "slt" %101, %16 : i64 - llvm.cond_br %104, ^bb17, ^bb25 - ^bb17: // pred: ^bb16 - llvm.br ^bb18(%15, %102, %103 : i64, i32, i32) - ^bb18(%105: i64, %106: i32, %107: i32): // 2 preds: ^bb17, ^bb23 - %108 = llvm.icmp "slt" %105, %16 : i64 - llvm.cond_br %108, ^bb19, ^bb24 - ^bb19: // pred: ^bb18 - %109 = llvm.mul %101, %16 : i64 - %110 = 
llvm.add %109, %105 : i64 - %111 = llvm.getelementptr %25[%110] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %112 = llvm.load %111 : !llvm.ptr -> f32 - %113 = llvm.getelementptr %24[%110] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %114 = llvm.load %113 : !llvm.ptr -> f32 - %115 = llvm.fsub %112, %114 : f32 - %116 = llvm.intr.fabs(%115) : (f32) -> f32 - %117 = llvm.fcmp "ult" %13, %116 : f32 - llvm.cond_br %117, ^bb20, ^bb21 - ^bb20: // pred: ^bb19 - %118 = llvm.add %106, %11 : i32 - llvm.br ^bb22(%118, %107 : i32, i32) - ^bb21: // pred: ^bb19 - %119 = llvm.add %107, %11 : i32 - llvm.br ^bb22(%106, %119 : i32, i32) - ^bb22(%120: i32, %121: i32): // 2 preds: ^bb20, ^bb21 - llvm.br ^bb23 - ^bb23: // pred: ^bb22 - %122 = llvm.add %105, %14 : i64 - llvm.br ^bb18(%122, %120, %121 : i64, i32, i32) - ^bb24: // pred: ^bb18 - %123 = llvm.add %101, %14 : i64 - llvm.br ^bb16(%123, %106, %107 : i64, i32, i32) - ^bb25: // pred: ^bb16 - %124 = llvm.mlir.addressof @vector_print_str : !llvm.ptr - llvm.call @printString(%124) : (!llvm.ptr) -> () - %125 = llvm.sext %103 : i32 to i64 - llvm.call @printI64(%125) : (i64) -> () - llvm.call @printNewline() : () -> () - %126 = llvm.mlir.addressof @vector_print_str_0 : !llvm.ptr - llvm.call @printString(%126) : (!llvm.ptr) -> () - %127 = llvm.sext %102 : i32 to i64 - llvm.call @printI64(%127) : (i64) -> () - llvm.call @printNewline() : () -> () - llvm.return - } - gpu.binary @main_kernel [#gpu.object<#nvvm.target, "P\EDU\BA\01\00\10\00\A83\00\00\00\00\00\00\02\00\01\01@\00\00\00p$\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00Z\00\00\00\00\00\00\00\00\00\00\00\11\00\10\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00{\00\00\00\00\00\00\00\00\00\00\00X#\00\00\00\00\00\00X 
\00\00\00\00\00\00Z\0DZ\00@\008\00\05\00@\00\0C\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.uft.entry\00.nv.info\00.text.main_kernel\00.nv.info.main_kernel\00.nv.shared.main_kernel\00.rel.text.main_kernel\00.rela.text.main_kernel\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00.nv.constant0.main_kernel\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.uft.entry\00.nv.info\00main_kernel\00.text.main_kernel\00.nv.info.main_kernel\00.nv.shared.main_kernel\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00.rel.text.main_kernel\00.rela.text.main_kernel\00$__dynamicShmem__31\00$____mbarrier__33\00$__accShmem__35\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00.nv.constant0.main_kernel\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00L\00\00\00\03\00\09\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00s\00\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\02\01\00\00\03\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00@\00\00\00\12\10\09\00\00\00\00\00\00\00\00\00\80\18\00\00\00\00\00\002\01\00\00\03\00\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\FF\FF\FF\FF$\00\00\00\00\00\00\00\FF\FF\FF\FF\FF\FF\FF\FF\03\00\04|\FF\FF\FF\FF\0F\0C\81\80\80(\00\08\FF\81\80(\08\81\80\80(\00\00\00\FF\FF\FF\FF,\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\18\00\00\00\00\00\00\04T\00\00\00\0C\81\80\80(\00\04\A0\05\00\00\00\00\00\00\04/\08\00\05\00\00\00\9A\00\00\00\04#\08\00\05\00\00\00\00\00\00\00\04\12\08\00\05\00\00\00\00\00\00\00\04\11\08\00\05\00\00\00\00\00\00\00\047\04\00{\00\00\00\04\17\0C\00\00\00\00\00\08\00@\00\00\F0!\00\04\17\0C\00\00\00\00\00\07\008\00\00\F0!\00\04\17\0C\00\00\00\00\00\06\000\00\00\F0!\00\04\17\0C\00\00\00\00\00\05\
00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 \00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F0!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F0!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03\1B\FF\00\0490\00\C0\00\00\00\FF\00\00\00\00\00\00\00\00\01\09\00\00\01\00\00\FF\00\00\00\08\00\00\00\00\01\09\00p\09\00\00\00\00\00\00\00\00\00\00\0A\01?\00\038\02\00\04\1C\0C\00@\12\00\00\80\17\00\00\D0\17\00\00\04\1E\04\00\00\00\00\00\03\19H\00\04\0A\08\00\06\00\00\00\10\02H\00\00\00\00\00D\00\00\00\00\00\00\00\02\00\00\00\05\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\82{\01\FF\00\0A\00\00\00\08\00\00\00$\0E\00\C3y\04\00\00\00\00\00\00\88\00\00\00b\0E\00\19y\06\00\00\00\00\00\00!\00\00\00\A2\0E\00\82x\08\00\00\04\00\00\00\00\00\00\00\E2\0F\00\82x\0A\00\FE\FF\1F\00\00\00\00\00\00\E2\0F\00\90x\09\08\08\00\00\00?\E0\FF\0F\00\E2\0F\00Ey\00\00\A0\04\00\00\00\00\80\03\00\E2\0F\00\82x\0B\00\00\F8\FF\7F\00\00\00\00\00\E2\0F\00\B9z\06\00\00\92\00\00\00\0A\00\00\00\E2\0F\00\B9z\0C\00\00\94\00\00\00\0A\00\00\00\E2\0F\00\96x\09\04T\06\00\00\09\00\00\08\00\E4/\00\96x\08\04T\06\00\00\08\00\00\08\00\E2\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\E2\0F\00\0Cr\00\06\FF\00\00\00pR\F0\03\00\E2O\00\C6s\00\00\00\00\00\00\00\00\00\00\00n\0E\00\B2u?\09\0A\00\00\00\00\01\00\08\00b\02\00\18y\00\00\00\00\00\00\00\00\00\00\00\E2\0F\00\B2u?\09\0A\08\00\00\00\01\00\08\00b\02\00\B9y\00\06\00\00\00\00\00\00\04\08\00\E2\03\00\B9y\00\0C\00\00\00\00\00\00\04\08\00\E4\03\00G\09\EC\00\00\00\00\00\00\00\80\03\00\EA/\00$t\00\FF\00\80\00\00\FF\00\8E\07\00\E2\0F\00\1Cx\00\00\00\00\00\00p\F0\F0\03\00\E2\0F\00\82|\0A\00
?\00\00\00\00\00\00\08\00\E4\0F\00\A7y\FF\FF\00\00\00\00\09\00\00\08\00\F4\03\00/\08?\00\00\00\00\00\00\00\82\03\00\E2\0F\00\82|\0B\00\0A\00\00\00\00\00\00\08\00\E4\0F\00\B4u\00\06\08\00\00\00\00\80\00\08\00\F4\05\00\1C\18\00\00\00\00\00\00p\E1\F0\00\00\C4\0F\00\1Cx\00\00\00\00\00\00p\E1\F2\03\00\D6\0F\00G\09\E8\00\FD\FF\FF\FF\FF\FF\93\03\00\EAO\00\1Cx\00\00\00\00\00\00p\F0\F0\03\00\E2\0F\00\90x\10\08\00\80\00\00?\E0\FF\0F\00\D8\0F\00/\08?\00\00\00\00\00\00\00\82\03\00\E2\0F\00\82|\11\00\09\00\00\00\00\00\00\08\00\E2\0F\00\82|\12\00\0A\00\00\00\00\00\00\08\00\E2\0F\00\82|\13\00\0A\00\00\00\00\00\00\08\00\E4\0F\00\B4u\00\0C\10\00\00\00\00\80\00\08\00\F0\05\00\1C\18\00\00\00\00\00\00p\E1\F0\00\00\C4\0F\00\1Cx\00\00\00\00\00\00p\E1\F2\03\00\D6\0F\00G\09\E0\00\FD\FF\FF\FF\FF\FF\93\03\00\EAO\00\1Cx\00\00\00\00\00\00p\F0\F0\03\00\E2\0F\00\90x\10\08\00\A0\00\00?\E0\FF\0F\00\E2\0F\00\82x\12\00@\00\00\00\00\00\00\00\00\D6\0F\00/\08?\00\00\00\00\00\00\00\82\03\00\E2\0F\00\82|\11\00\09\00\00\00\00\00\00\08\00\E2\0F\00\82|\13\00\0A\00\00\00\00\00\00\08\00\E4\0F\00\B4u\00\0C\10\00\00\00\00\80\00\08\00\F2\05\00\1C\18\00\00\00\00\00\00p\E1\F0\00\00\C4\0F\00\1Cx\00\00\00\00\00\00p\E1\F2\03\00\D6\0F\00G\09\E4\00\FD\FF\FF\FF\FF\FF\93\03\00\EAO\00\A7y\FF\FF\00\08\00\00\09\00\00\08\00\E2\05\00\1Cx\00\00\00\00\00\00p\F0\F0\03\00\E2\0F\00\90x\10\08\00@\00\00?\E0\FF\0F\00\E4\0F\00\90x\11\09\08\00\00\00?\E0\FF\0F\00\D8\0F\00/\08?\00\00\00\00\00\00\00\82\03\00\E2\0F\00\82x\12\00@\00\00\00\00\00\00\00\00\E2\0F\00\82|\13\00\0A\00\00\00\00\00\00\08\00\E4\0F\00\B4u\00\06\10\00\00\00\00\80\00\08\00\F2\07\00\1C\18\00\00\00\00\00\00p\E1\F0\00\00\C4\0F\00\1Cx\00\00\00\00\00\00p\E1\F2\03\00\D6\0F\00G\09\E4\00\FD\FF\FF\FF\FF\FF\93\03\00\EA\8F\00\1Cx\00\00\00\00\00\00p\F0\F0\03\00\E2\0F\00\90x\10\08\00\C0\00\00?\E0\FF\0F\00\D8\0F\00/\08?\00\00\00\00\00\00\00\82\03\00\E2\0F\00\82|\12\00\0A\00\00\00\00\00\00\08\00\E2\0F\00\82x\13\00@\00\00\00\00\00\00\00\00\E4\0F\00\B4u\00\0C\10\00\00\00\00\80\00\08\00\
F2\07\00\1C\18\00\00\00\00\00\00p\E1\F0\00\00\C4\0F\00\1Cx\00\00\00\00\00\00p\E1\F2\03\00\D6\0F\00G\09\E4\00\FD\FF\FF\FF\FF\FF\93\03\00\EA\8F\00\1Cx\00\00\00\00\00\00p\F0\F0\03\00\E2\0F\00\90x\10\08\00\E0\00\00?\E0\FF\0F\00\D8\0F\00/\08?\00\00\00\00\00\00\00\82\03\00\E2\0F\00\82x\12\00@\00\00\00\00\00\00\00\00\E2\0F\00\82x\13\00@\00\00\00\00\00\00\00\00\E4\0F\00\B4u\00\0C\10\00\00\00\00\80\00\08\00\F2\07\00\1C\18\00\00\00\00\00\00p\E1\F0\00\00\C4\0F\00\1Cx\00\00\00\00\00\00p\E1\F2\03\00\D6\0F\00G\09\E4\00\FD\FF\FF\FF\FF\FF\93\03\00\EA\8F\00Ay\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00\05xX\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xZ\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x\\\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x^\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x`\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xb\00\00\00\00\00\00\FF\01\00\00\C4\0F\00\05xd\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xf\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xh\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xj\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xl\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xn\00\00\00\00\00\00\FF\01\00\00\C4\0F\00\05xp\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xr\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xt\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xv\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xx\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xz\00\00\00\00\00\00\FF\01\00\00\C4\0F\00\05x|\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x~\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x\80\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x\82\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x\84\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x\86\00\00\00\00\00\00\FF\01\00\00\C4\0F\00\05x\88\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x\8A\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x\8C\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x\8E\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x\90\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x\92\00\00\00\00\00\00\FF\01\00\00\C4\0F\00\05x\94\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x\96\00\00\00\00\00\
00\FF\01\00\00\E4\0F\00\05x\18\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x\1A\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x\1C\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x\1E\00\00\00\00\00\00\FF\01\00\00\C4\0F\00\05x \00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x\22\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x$\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x&\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x(\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x*\00\00\00\00\00\00\FF\01\00\00\C4\0F\00\05x,\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x.\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x0\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x2\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x4\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x6\00\00\00\00\00\00\FF\01\00\00\C4\0F\00\05x8\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x:\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x<\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x>\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05x@\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xB\00\00\00\00\00\00\FF\01\00\00\C4\0F\00\05xD\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xF\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xH\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xJ\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xL\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xN\00\00\00\00\00\00\FF\01\00\00\C4\0F\00\05xP\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xR\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xT\00\00\00\00\00\00\FF\01\00\00\E4\0F\00\05xV\00\00\00\00\00\00\FF\01\00\00\E2\0F\00\9Cx\00\00\00\00\00\00p\F0\F0\03\00\E2\0F\00\82|\05\00?\00\00\00\00\00\00\08\00\E2\0F\00\82|\04\00?\00\00\00\00\00\00\08\00\D2\0F\00\1Cx\00\00\00\00\00\00\08\F0\F0\03\00\E2\0F\00\91r\06\05\09\00\00\00?\18\8E\0F\00\D8\0F\00$~\00\FF\06\00\00\00\FF\00\8E\0F\00\C8o\00\A7u\00\00\FF\00\00\00\7F\01\02\08\00b\02\00\1Ay\00\00\00\90\00\00\00\00\00\00\00\C8\0F\00]\99\00\00\81\96\98\00\00\00\90\03\00\EA\0F\00\A7\95\00\00\FF\00\00\00\7F\00\02\08\00$\0E\00G\99\E8\00\FC\FF\FF\FF\FF\FF\83\03\00\EA\1F\00\91r\07\05\08\00\00\00?p\80\0F\00\E2\0F\00\C5y\00\00\00\00\00\00\00
\00\00\00\00\E2\0F\00\82x\19\00@\00\00@\00\00\00\00\00\E2\0F\00\82x\1B\00@\00\00@\00\00\00\00\00\E2\0F\00\91r\0A\05?\00\00\00\04t\0F\08\00\E4\0F\00\90x\05\07\00\80\00\00?\E0\F1\0F\00\E4\0F\00\99x\07\07\04\00\00\00\0A\12\00\08\00\E4\0F\00\90r\06?\0A\00\00\00?\E4\7F\08\00\C8\0F\00\99x\05\05\04\00\00\00\06\12\00\08\00\E4\0F\00\92x\06\07\FF?\00\00?\C0\8E\0F\00\E4\0F\00\92x\05\05\FF?\00\00?\C0\8E\0F\00\E4\0F\00\92x\18\06\00\00\00\04?\FC\8E\0F\00\E4\0F\00\92x\1A\05\00\00\00\02?\FC\8E\0F\00\E4\0F\00\90x\14\06\02\00\00\04?\E0\F1\0F\00\C4\0F\00\90x\16\05\80\00\00\02?\E0\F3\0F\00\E4\0F\00\90x\15?@\00\00@?\E4\7F\08\00\E4\0F\00\90x\17?@\00\00@?\E4\FF\08\00\E2\0F\00\F0yX\18\00\00\E0\01X\08p\08\00\E2\0F\00\90x\10\06\04\00\00\04?\E0\F1\0F\00\E4\0F\00\90x\12\05\00\01\00\02?\E0\F3\0F\00\E4\0F\00\90x\11?@\00\00@?\E4\7F\08\00\C4\0F\00\90x\13?@\00\00@?\E4\FF\08\00\E4\0F\00\90x\0C\06\06\00\00\04?\E0\F1\0F\00\E4\0F\00\90x\0E\05\80\01\00\02?\E0\F3\0F\00\E4\0F\00\90x\0D?@\00\00@?\E4\7F\08\00\E4\0F\00\90x\18\06\00\02\00\04?\E0\F1\0F\00\E4\0F\00\90x\0F?@\00\00@?\E4\FF\08\00\C4\0F\00\90x\19?@\00\00@?\E4\7F\08\00\E2\0F\00\82x\05\00\01\00\00\00\00\00\00\00\00\E2\0F\00\F0yX\14\00\00\E0\01X\08p\08\00\E2\0F\00\90x\14\06\02\02\00\04?\E0\F1\0F\00\C8\0F\00\90x\15?@\00\00@?\E4\7F\08\00\CE\0F\00\F0yX\10\00\00\E0\01X\08p\08\00\E2\0F\00\90x\10\06\04\02\00\04?\E0\F1\0F\00\C8\0F\00\90x\11?@\00\00@?\E4\7F\08\00\CE\0F\00\F0yX\0C\00\00\E0\01X\08p\08\00\E2\0F\00\90x\0C\06\06\02\00\04?\E0\F1\0F\00\C8\0F\00\90x\0D?@\00\00@?\E4\7F\08\00\E4\0F\00\9Cx\00\00\00\00\00\00p\E8\F0\03\00\CA\0F\00\F0y\18\18\00\00\E0\01\18\08p\08\00\D8\0F\00\F0y\18\14\00\00\E0\01\18\08p\08\00\D8\0F\00\F0y\18\10\00\00\E0\01\18\08p\08\00\D8\0F\00\F0y\18\0C\00\00\E0\01\18\08\00\08\00\E6\0F\00\C5y\00\00\00\80\00\00\00\01\01\00\00\E4\0F\00G\09,\00\FC\FF\FF\FF\FF\FF\83\03\00\EA\0F\00\C3y\07\00\00\00\00\00\00\88\00\00\00b\0E\00\19x\00\FF\02\00\00\00\06\16\01\00\00\E2\0F\10$x\02\06\02\00\00\00\FF\00\8E\07\00\E2\0F\00\19x\03\FF\01\00\00\00\06\16\01
\00\00\E2\0F\00\82x\06\00\00\04\00\00\00\00\00\00\00\E2\0F\00\1Ax\00\00\03\00\00\00\00\00\00\00\00\E2\0F\00\90x\06\06 \00\00\00?\E0\FF\0F\00\E2\0F\00\12x\02\02\06\00\00\00\FF\C0\8E\07\00\E2\0F\00\C5y\00\00\00\80\00\00\00\00\01\00\00\E4\0F\00\12x\00\00\F0\FF\FF\7F\03\F8\8E\07\00\C4\0F\00\0Cx\00\06\FF\0F\00\00p@\F0\03\00\E4\0F\00\12x\04\00\08\00\00\00\FF\FC\8E\07\00\E2\0F\04$x\03\00\80\00\00\00\FF\00\8E\07\00\E4\0F\006x\05\00@\00\00\00\00\00\00\00\00\E4\0F\006x\00\00H\00\00\00\00\00\00\00\00\E2\0F\00\12r\08\03\02\00\00\00\FF\FC\8E\07\00\E2\0F\08$x\03\04\80\00\00\00\FF\00\8E\07\00\E4\0F\00$x\05\05\80\00\00\00\FF\00\8E\07\00\E2\0F\00\96x\06\07T\06\00\00\06\00\00\08\00\E2/\00$x\07\00\80\00\00\00\FF\00\8E\07\00\E2\0F\00\12r\03\03\02\00\00\00\FF\FC\8E\07\00\C4\0F\00\12r\05\05\02\00\00\00\FF\FC\8E\07\00\E4\0F\08\12r\00\07\02\00\00\00\FF\FC\8E\07\00\E4\0F\00\11|\04\08\06\00\00\00\FF\10\8E\0F\00\E4\0F\00\11|\03\03\06\00\00\00\FF\10\8E\0F\00\E4\0F\00\11|\02\05\06\00\00\00\FF\10\8E\0F\00\E2\0F\00\88s\00\04X\00\00\00\00\0A\00\00\00\E2\03\00\11|\00\00\06\00\00\00\FF\10\8E\0F\00\C6\0F\00\88s\00\04\\ \00\00\00\0A\00\00\00\E8\03\00\88s\00\04`@\00\00\00\0A\00\00\00\E8\03\00\88s\00\04d`\00\00\00\0A\00\00\00\E8\03\00\88s\00\04h\80\00\00\00\0A\00\00\00\E8\03\00\88s\00\04l\A0\00\00\00\0A\00\00\00\E8\03\00\88s\00\04p\C0\00\00\00\0A\00\00\00\E8\03\00\88s\00\04t\E0\00\00\00\0A\00\00\00\E8\03\00\88s\00\04x\00\01\00\00\0A\00\00\00\E8\03\00\88s\00\04| \01\00\00\0A\00\00\00\E8\03\00\88s\00\04\80@\01\00\00\0A\00\00\00\E8\03\00\88s\00\04\84`\01\00\00\0A\00\00\00\E8\03\00\88s\00\04\88\80\01\00\00\0A\00\00\00\E8\03\00\88s\00\04\8C\A0\01\00\00\0A\00\00\00\E8\03\00\88s\00\04\90\C0\01\00\00\0A\00\00\00\E8\03\00\88s\00\04\94\E0\01\00\00\0A\00\00\00\E8\03\00\88s\00\03Z\00\00\00\00\0A\00\00\00\E8\03\00\88s\00\03^ 
\00\00\00\0A\00\00\00\E8\03\00\88s\00\03b@\00\00\00\0A\00\00\00\E8\03\00\88s\00\03f`\00\00\00\0A\00\00\00\E8\03\00\88s\00\03j\80\00\00\00\0A\00\00\00\E8\03\00\88s\00\03n\A0\00\00\00\0A\00\00\00\E8\03\00\88s\00\03r\C0\00\00\00\0A\00\00\00\E8\03\00\88s\00\03v\E0\00\00\00\0A\00\00\00\E8\03\00\88s\00\03z\00\01\00\00\0A\00\00\00\E8\03\00\88s\00\03~ \01\00\00\0A\00\00\00\E8\03\00\88s\00\03\82@\01\00\00\0A\00\00\00\E8\03\00\88s\00\03\86`\01\00\00\0A\00\00\00\E8\03\00\88s\00\03\8A\80\01\00\00\0A\00\00\00\E8\03\00\88s\00\03\8E\A0\01\00\00\0A\00\00\00\E8\03\00\88s\00\03\92\C0\01\00\00\0A\00\00\00\E8\03\00\88s\00\03\96\E0\01\00\00\0A\00\00\00\E8\03\00\88s\00\02\18\00\00\00\00\0A\00\00\00\E8\03\00\88s\00\02\1C \00\00\00\0A\00\00\00\E8\03\00\88s\00\02 @\00\00\00\0A\00\00\00\E8\03\00\88s\00\02$`\00\00\00\0A\00\00\00\E8\03\00\88s\00\02(\80\00\00\00\0A\00\00\00\E8\03\00\88s\00\02,\A0\00\00\00\0A\00\00\00\E8\03\00\88s\00\020\C0\00\00\00\0A\00\00\00\E8\03\00\88s\00\024\E0\00\00\00\0A\00\00\00\E8\03\00\88s\00\028\00\01\00\00\0A\00\00\00\E8\03\00\88s\00\02< \01\00\00\0A\00\00\00\E8\03\00\88s\00\02@@\01\00\00\0A\00\00\00\E8\03\00\88s\00\02D`\01\00\00\0A\00\00\00\E8\03\00\88s\00\02H\80\01\00\00\0A\00\00\00\E8\03\00\88s\00\02L\A0\01\00\00\0A\00\00\00\E8\03\00\88s\00\02P\C0\01\00\00\0A\00\00\00\E8\03\00\88s\00\02T\E0\01\00\00\0A\00\00\00\E8\03\00\88s\00\00\1A\00\00\00\00\0A\00\00\00\E8\03\00\88s\00\00\1E \00\00\00\0A\00\00\00\E8\03\00\88s\00\00\22@\00\00\00\0A\00\00\00\E8\03\00\88s\00\00&`\00\00\00\0A\00\00\00\E8\03\00\88s\00\00*\80\00\00\00\0A\00\00\00\E8\03\00\88s\00\00.\A0\00\00\00\0A\00\00\00\E8\03\00\88s\00\002\C0\00\00\00\0A\00\00\00\E8\03\00\88s\00\006\E0\00\00\00\0A\00\00\00\E8\03\00\88s\00\00:\00\01\00\00\0A\00\00\00\E8\03\00\88s\00\00> 
\01\00\00\0A\00\00\00\E8\03\00\88s\00\00B@\01\00\00\0A\00\00\00\E8\03\00\88s\00\00F`\01\00\00\0A\00\00\00\E8\03\00\88s\00\00J\80\01\00\00\0A\00\00\00\E8\03\00\88s\00\00N\A0\01\00\00\0A\00\00\00\E8\03\00\88s\00\00R\C0\01\00\00\0A\00\00\00\E8\03\00\88s\00\00V\E0\01\00\00\0A\00\00\00\E2\03\00M\09\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00\19x\03\FF\1F\00\00\00\06\14\01\00\00\E2/\00\B9z\04\00\00\86\00\00\00\0A\00\00\00\E2\0F\00\12x\02\06\1F\00\00\00\FF\C0\8E\07\00\E2\0F\04Ey\00\00\C0\01\00\00\00\00\80\03\00\E2\0F\00\19x\07\06\05\00\00\00\03\12\00\00\00\E4\0F\10\19x\00\FF\05\00\00\00\03\16\01\00\00\E2\0F\00%x\02\02\10\00\00\00\FF\00\8E\07\00\E2\0F\00\10x\16\07\FC\FF\FF\FF\FF\E0\F1\07\00\C6\0F\00$x\05\07\00\02\00\00\FF\00\8E\07\00\E2\0F\04\0Cx\00\16|\00\00\00p`\F2\03\00\E4\0F\00\10x\17\00\FF\FF\FF\FF\FF\E4\7F\00\00\E4\0F\00\19x\07\07\09\00\00\00\00\02\01\00\00\E4\0F\00\0Cr\00\17\FF\00\00\00\10a\F2\03\00\E4\0F\00\12r\00\05\02\00\00\00\FF\FC\8E\07\00\E4\0F\00\12r\02\07\03\00\00\00\FF\FC\8E\07\00\C4\0F\00\10|\14\00\04\00\00\00\FF\E0\F5\0F\00\E2\0F\006|\00\00\06\00\00\00\00\00\00\08\00\E2\0F\00\1Cx\00\00\00\00\00\00p\F0\F0\03\00\E4\0F\00\10|\15\02\05\00\00\00\FF\E4\7F\09\00\E2\0F\00\B9z\04\00\00\82\00\00\00\0A\00\00\00\E4\0F\00G\99(\00\00\00\00\00\00\00\80\03\00\F0\0F\00\84y\04\00\00\00\00\00\00\0C\00\00\00\A2\02\00$r\02\FF\FF\00\00\00\14\00\8E\07\00\E2\0F\00\10x\16\16\04\00\00\00\FF\E0\F3\07\00\E2\0F\00$r\03\FF\FF\00\00\00\15\00\8E\07\00\E2\0F\00\10x\14\14\00\08\00\00\FF\E0\F5\07\00\E4\0F\00\1Cx\00\00\00\00\00\00p\E1\F0\03\00\E2\0F\00$r\17\FF\FF\00\00\00\17\06\8E\00\00\E4\0F\006x\00\00\00\08\00\00\00\00\00\00\00\E4/\00$r\15\FF\FF\00\00\00\15\06\0E\01\00\E2\0F\00\86y\00\02\04\00\00\00\04\1D\10\0C\00\EEC\00Ay\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00\10x\02\16|\00\00\00\FF\E1\F7\07\00\E2/\00Ey\00\00\A0\01\00\00\00\00\80\03\00\E2\0F\00\0Cx\00\16|\00\00\00p`\F4\03\00\E4\0F\00\0Cx\00\02\0C\00\00\00p0\F2\03\00\E2\0F\00$r\02\FF\FF\00\00\00\17\0E\8E\01\00\E2\0F\00\0Cr\00\17\FF\00\00\
00 a\F4\03\00\C8\0F\00\0Cr\00\02\FF\00\00\00\105r\01\00\DA\0F\00G\19L\00\00\00\00\00\00\00\80\03\00\EA\0F\00\1Cx\00\00\00\00\00\00p\E1\F0\03\00\DA\0F\00\84y\04\00\00\00\00\00\00\0C\00\00\00b.\00$r\02\FF\FF\00\00\00\14\00\8E\07\00\E2\0F\00\10x\16\16\10\00\00\00\FF\E0\F3\07\00\E2\0F\00$r\03\FF\FF\00\00\00\15\00\8E\07\00\E2\0F\00\84y\08\00\00\00\08\00\00\0C\00\00\00\A4\0E\00\10x\14\02\00 \00\00\FF\E0\F5\07\00\E2\0F\00$r\17\FF\FF\00\00\00\17\06\8E\00\00\E2\0F\00\84y\0C\00\00\00\10\00\00\0C\00\00\00\E2\0E\00\0Cx\00\16p\00\00\00p`\F2\03\00\E4\0F\00$r\15\FF\FF\00\00\00\03\06\0E\01\00\E2\0F\00\84y\10\00\00\00\18\00\00\0C\00\00\00b\09\00\0Cr\00\17\FF\00\00\00\10a\F2\03\00\E2\0F\006x\00\00\00 \00\00\00\00\00\00\00\C4\0F\01\86y\00\02\04\00\00\00\04\1D\10\0C\00\E8#\00\86y\00\02\08\00\08\00\04\1D\10\0C\00\E8C\00\86y\00\02\0C\00\10\00\04\1D\10\0C\00\E8\83\00\86y\00\02\10\00\18\00\04\1D\10\0C\00\E2\03\02G\99\B8\00\FC\FF\FF\FF\FF\FF\83\03\00\EA\0F\00Ay\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00\10x\02\16|\00\00\00\FF\E1\F7\07\00\E2/\00Ey\00\000\01\00\00\00\00\80\03\00\E2\0F\00\0Cx\00\16|\00\00\00p`\F4\03\00\E4\0F\00\0Cx\00\02\04\00\00\00p0\F2\03\00\E2\0F\00$r\02\FF\FF\00\00\00\17\0E\8E\01\00\E2\0F\00\0Cr\00\17\FF\00\00\00 
a\F4\03\00\C8\0F\00\0Cr\00\02\FF\00\00\00\105r\01\00\DA\0F\00G\190\00\00\00\00\00\00\00\80\03\00\EA\0F\00\84y\04\00\00\00\00\00\00\0C\00\00\00b\0E\00$r\02\FF\FF\00\00\00\14\00\8E\07\00\E2\0F\00\10x\16\16\08\00\00\00\FF\E0\F3\07\00\E2\0F\00$r\03\FF\FF\00\00\00\15\00\8E\07\00\E2\0F\00\84y\08\00\00\00\08\00\00\0C\00\00\00\E2\04\00\10x\14\14\00\10\00\00\FF\E0\F5\07\00\E4\0F\00\1Cx\00\00\00\00\00\00p\E1\F0\03\00\E2\0F\00$r\17\FF\FF\00\00\00\17\06\8E\00\00\E4\0F\00$r\15\FF\FF\00\00\00\15\06\0E\01\00\C4\0F\006x\00\00\00\10\00\00\00\00\00\00\00\E2O\00\86y\00\02\04\00\00\00\04\1D\10\0C\00\E8#\00\86y\00\02\08\00\08\00\04\1D\10\0C\00\E6\83\00Ay\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00\0Cx\00\16|\00\00\00p\10\F2\03\00\C8\0F\00\0Cr\00\17\FF\00\00\00\10\15p\00\00\DA\0F\00M\89\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00\84y\04\00\00\00\00\00\00\0C\00\00\00\22.\00$r\02\FF\FF\00\00\00\14\00\8E\07\00\E4\0F\00$r\03\FF\FF\00\00\00\15\00\8E\07\00\CA\0F\00\86y\00\02\04\00\00\00\04\1D\10\0C\00\E2\1F\00My\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00Gy\FC\00\FC\FF\FF\FF\FF\FF\83\03\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\
00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00@\00\00\00\00\00\00\00\F5\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0B\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00`\01\00\00\00\00\00\00L\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\13\00\00\00\02\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\B0\02\00\00\00\00\00\00\A8\00\00\00\00\00\00\00\02\00\00\00\05
\00\00\00\08\00\00\00\00\00\00\00\18\00\00\00\00\00\00\00\AB\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00X\03\00\00\00\00\00\00h\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\007\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\C0\03\00\00\00\00\00\000\00\00\00\00\00\00\00\03\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00R\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\F0\03\00\00\00\00\00\00\FC\00\00\00\00\00\00\00\03\00\00\00\09\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\94\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\F0\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\00\00\00\09\00\00\00\08\00\00\00\00\00\00\00\18\00\00\00\00\00\00\00\C9\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\F0\04\00\00\00\00\00\00\18\00\00\00\00\00\00\00\03\00\00\00\04\00\00\00\08\00\00\00\00\00\00\00\18\00\00\00\00\00\00\00@\00\00\00\01\00\00\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\05\00\00\00\00\00\00\80\18\00\00\00\00\00\00\03\00\00\00\05\00\00\00\80\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00g\00\00\00\08\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\1E\00\00\00\00\00\00!\04\00\00\00\00\00\00\00\00\00\00\09\00\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\DB\00\00\00\01\00\00\00\02\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\1E\00\00\00\00\00\00X\02\00\00\00\00\00\00\00\00\00\00\09\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\06\00\00\00\04\00\00\00X#\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\18\01\00\00\00\00\00\00\18\01\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\04\00\00\00X#\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\18\01\00\00\00\00\00\00\18\01\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\05\00\00\00\80\05\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\1
8\00\00\00\00\00\00\80\18\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\06\00\00\00\00\1E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00!\04\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\04\00\00\00\00\1E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00X\02\00\00\00\00\00\00X\02\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\01\01H\00\00\00\B0\0E\00\00\00\00\00\00\AD\0E\00\00@\00\00\00\00\00\08\00Z\00\00\00\00\00\00\00\00\00\00\00\11 \10\00\00\00\00\00\00\00\00\00\00\00\00\00\F7=\00\00\00\00\00\00\00\00\00\00\00\00\00\00\F2\22\0A\0A\0A\0A.version 8.0\0A.target sm_90a\0A.address_size 64\0A\01\00\F8\19.visible .entry main_kernel(\0A.param .u64\19\00\11_\17\00?_0,!\00\0C\1F1!\00\0D\1F2!\00\0D\1F3!\00\0D\1F4!\00\0D\1F5!\00\0D\1F6!\00\0D\1F7!\00\0D\F3\088\0A)\0A{\0A.reg .pred %p<8>;\12\00\95b32 %r<47\12\00\10f\12\00ff<1669&\00\F0\0364 %rd<83>;\0A\0A\09.shaJ\00\FF\0B.align 16 .b8 dynamicShmem&\00\00\118%\00\EF__mbarrier[16]M\00\07$acI\00\22ld\E0\00\22.u\85\00_21, [\E7\00\00\1F]+\00\00\1F0+\00\02\1F7+\00\00/17+\00\02\911];\0Amov.u'\01\F0\051, %tid.x;\0Asetp.ne.s\19\002p2,\1E\00\130.\00\03T\00'8,\F0\00\02\1B\00\02p\01d2, 1;\0A\1A\00S.init\07\01\01k\01\11[=\00\10]U\00\832;\0Aadd.sR\00$9,Y\00\1F8@\00\0D\149@\00\F1\05\0A\09prefetch.tensormap#\00!20\81\01\0F \00\07\111 \00\07\D2\00;78,\E7\01\06\D4\00\116\FF\00\F0\08@%p2 bra $L__BB0_2;\0AcvtA\01\03E\00\133\C3\00\08<\00\814, 32768o\00\04\D6\00\00\07\00\C8ve.expect_tx\E2\00 _,\A2\00\113\E3\00\1046\00\0Ad\00\225,\AE\00\01\1A\00\C3p.async.bulk\E2\003.2dN\00\F4\02::cluster.global.\7F\00\F0\05::complete_tx::bytest\0035],=\010, {\F6\00c%r6} ]\90\00\00q\00\01\B0\01\01\D7\00\01\AE\01\175\DC\00\0F\8C\00<\149\8C\00\1F1\8C\00\0F#13\8D\00X40960}\01o14, 64\9F\00@*13\A0\00/14\A1\00\09222,\1E\02\0F\06\02\1D&22\07\02\07\F0\00\04}\01O1638\DE\00B\1A9\0A\02\0A\DE\00,22\DF\00\05\80\01O9152\90\00@.23\0E\02/14\90\00\07\137\90\00O5734 
\01A*27\90\00/14\91\00\00\05&\04\10:&\04Da.to_\00\02\C9\031d1,\06\00\127F\043s64\BF\00\12ds\05\02\C8\02\02\B3\060154\83\05#f0\01\00\09\8C\05%79\9E\05\03\05\07X7, -1\08\03o33, 10<\00\00\104F\02\09<\00\154M\00\05{\00#2,\83\00\0B\18\00\1F3\18\00\04\1F4\18\00\04\1F5\18\00\04\1F6\18\00\04\1F7\18\00\04\1F8\18\00\04\1F9\18\00\03/50\18\00\04\1F1\18\00\04\0F\F0\00\04\1F5\F0\00\04\1F5\F0\00\04\1F5\F0\00\04\1F5\F0\00\04\1F5\F0\00\04\1F5\F0\00\04\1F5\F0\00\04\1F6\F0\00\04\1F6\F0\00\04\1F6\F0\00\04\1F6\F0\00\04\1F6\F0\00\04\1F6\F0\00\04\1F6\F0\00\04\1F6\F0\00\04\1F6\F0\00\04\1F6\F0\00\04\1F7\F0\00\04\1F7\F0\00\04\1F7\F0\00\04\1F7\F0\00\04\1F7\F0\00\04\1F7\F0\00\04\1F7\F0\00\04\1F7\F0\00\04/78\18\00\04\0F\F0\00\04\1F8\F0\00\04\1F8\F0\00\04\1F8\F0\00\04\1F8\F0\00\04\1F8\F0\00\04\1F8\F0\00\04\1F8\F0\00\04\1F8\F0\00\04\1F8\F0\00\04\1F8\F0\00\04\1F9\F0\00\04\1F9\F0\00\04\1F9\F0\00\04\1F9\F0\00\04\1F9\F0\00\04\1F9\F0\00\04\1F9\F0\00\04\1F9\F0\00\04\1F9\F0\00\04\1F9\F0\00\03?600\18\00\04\1F1\18\00\04\1F2\18\00\04\1F3\18\00\04\1F4\18\00\04\1F5\18\00\04\1F6\18\00\04\1F7\18\00\04\1F8\18\00\04\0F\F0\00\04\1F1\F0\00\04\1F1\F0\00\04\1F1\F0\00\04/13\18\00\04\0F\F0\00\04\1F1\F0\00\04\1F1\F0\00\04/17\18\00\04\0F\F0\00\04/19\18\00\03/20\18\00\04\0F\F0\00\04/22\18\00\04\0F\F0\00\04\1F2\F0\00\04\1F2\F0\00\04\1F2\F0\00\04/27\18\00\04\0F\F0\00\04\1F2\F0\00\04\1F3\F0\00\04\1F3\F0\00\04\1F3\F0\00\04/33\18\00\04\0F\F0\00\04\1F3\F0\00\04\1F3\F0\00\04\1F3\F0\00\04\1F3\F0\00\04\1F3\F0\00\04\1F4\F0\00\04/41\18\00\04\0F\F0\00\04\0F`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\1F6`\09\04\186`\09\04\A5\0C(3:\04\0C\00\C9\00\93p7;\0Ashl.b)\0C\00e\00\02l\0C\193\B8\11\01.\02\02\B8\11\01'\00\0D\14\11\121\19\00\01\FB\0E\09\AD\13\F6\00P1; \0ALAB_WAIT: 
\F4\11\80try_wait\C7\128ity\17\0F!P1b\0D!31\18\0F\01\94\00\8033; \0A@P1\9B\11\C4.uni DONE; \0A\0F\00\04b\000; \0A\1A\00x: \0A}\0A\0A\09\DB\00\159\DB\00)14\DC\00\01\C2\02\02\1B\12\01(\00T;\0Abfe;\0D\01\C7\02\02$\00\124:\00$orS\00\01\0B\05\02#\00\FA\03461168629337240371\F8\12%53G\00\03L\11\08d\00$4,$\00\0Dd\00\01W\05\02#\00\07d\00\8038849280n\01\C1wgmma.fence.\FE\0E\01\87\14\22ed\1A\13\09\8C\01\16p\F7\13\002\0E2p, \F3\13\02C\00Bmma_@\0F\08G\00\B0.m64n128k16\\\02 .f\08\00B16 {]\02\1A,=\0E*3,\1D\0E*5,\FD\0D*7,\DD\0D)9,\BD\0D\00 \01\08\9D\0D\017\01\07}\0D\04\0D\04\04]\0D\04\ED\03\04=\0D\04\CD\03\04\1D\0D\04\AD\03\04\FD\0C\04\8D\03\04\DD\0C\04m\03\04\BD\0C\04M\03\04\9D\0C\04\8D\0C\04}\0C\04m\0C\04]\0C\04M\0C\04=\0C\04-\0C\04\1D\0C\04\0D\0C\04\FD\0B\00~\02\08\DD\0B\04\CD\0B\04\BD\0B\04\AD\0B\04\9D\0B\04\8D\0B\04}\0B\04m\0B\04]\0B\04M\0B\04=\0B\04-\0B\04\1D\0B\04\0D\0B\04\FD\0A\04\ED\0A\04\DD\0A\04\CD\0A\04\BD\0A\129P\00\04\9D\0A\04\8D\0A\04}\0A\04m\0AW604},\02\03\00\A5\02\02M\02\00P\02\01S\02$}\0A\AD\11\03\C8\02\1F1,\03\08\0A\90\03/32\F3\02\06.40a\04\0F\D5\02\FF\FF@\03\AB\02\00\87\02\0F\D5\02\0D\1F3\D5\02\08\1A6\D5\02\1F4\D5\02\06?536\D5\02\FF\FFR$3,\87\02\0F\D5\02\0D\1F5\D5\02\08\1A8\D5\02\1F6\D5\02\06?664\D5\02\FF\FFR$5,\87\02\0F\D5\02\0D\1F7\D5\02\05?422\AA\02O\04v\13\04f\13\04V\13\04F\13\046\13\04&\13\04\16\13\04\06\13\04\F6\12\04\E6\12\04\D6\12\04\C6\12\04\B6\12\04\A6\12\04\96\12\04\86\12\04v\12\04f\12\04V\12\04F\12\046\12\04&\12\04\16\12\04\06\12\04\F6\11\04\E6\11\04\D6\11\04\C6\11\04\B6\11\04\A6\11\04\96\11\04\86\11\04v\11\04f\11\04V\11\136F\11\046\11\04&\11\04\16\11\04\06\11\04\F6\10\04\E6\10\04\D6\10\04\C6\10\04\B6\10\04\A6\10\04\96\10\04\86\10\04v\10\04f\10\04V\10\04F\10\046\10\04&\10\04\16\10\04\06\10\04\F6\0F\04\E6\0F\04\D6\0F\04\C6\0F\04\B6\0F\04\A6\0F\04\96\0F3668\AA\02\1F7)\0B\16\1F9\AA\02\08\0F)\08O\0F\AA\02\FF\F0\1F9\FE\0A\15\01\0A\01\0F\AA\02\05\0F\A8\0DO\0F\AA\02\FF\EF/41\D3\0A\15\01\FA\00\0F\AA\02\04/30\AA\02\FF\FFR\1F3\A8\0A\0A\04\C4\15\CFcommit_group\
CB\15\00\02%\00/wa#\00\02\00\E0#*\0A\09\1E$=%p4C$\00\8E\00\115\E6(\191\E6(\1D3\D2\17\114-\00\01\E9%\0Fz\00\0B\01A\03\01\CF\16\00{\18\11r\FD\01\01?\00\112F\18\01_\18\02\1A\00\146\1A\00d1;\0Aand\17\00#7,\1D\00\106.\00\14rH\00\1F8.\00\03#9,\1D\00\A221474836321\17\01\1F\00\01S\02\04\84\00\139\D5\00#64T\00\22d5\19\00\B57;\0Amul.wide\1A\00#6,?\008128\82\17\01\1D\02\04 \003d55\D2\00\03\1D\00$8,$\00\192l\01\00C\02\06\03,\07\8C\04\01T\02\02 \00\00\07\00E8;\0As\FB* v2<\04! ['\00!],=\04\07\E5\0E\1F},\00\065+32/\00\05\F4\0E\0F/\00\08%64/\00\05\03\0F\0F/\00\08$96/\00\06\12\0F\0F/\00\0851280\00\05\22\0F\0F0\00\09\06\ED\00\062\0F\0F0\00\09\159\EE\00\06B\0F\0F0\00\08%22\EF\00\06R\0F\0F0\00\09\155\F0\00\06b\0F\0F0\00\09\158\F0\00\06r\0F\0F\AD\01\0A\05\F0\00\06\82\0F\0F0\00\09\155\F0\00\06\92\0F\0F0\00\09\158\F0\00\06\A2\0F\0F0\00\08%41\F0\00\06\B2\0F\0F0\00\09\154\F0\00\06\C2\0F\0F0\00\09\148\F0\00\07\D2\0F\18}\B5\03\141|\03\1E8\99\03\01\13\02\01#\00\0B\99\03\01\8E\05\04 \00\0C\99\03\01\A3\05\02$\00\0A\8A\1B\01\B4\05\05\80\03/63\B7\00\05\05C\01\06U\12\0F\E3\00\06\184\80\03\05d\12\0F/\00\08\06^\00\06s\12\0F/\00\08\07\80\03\05\82\12\0F/\00\08\08\80\03\05\92\12\0F0\00\09\07\80\03\05\A2\12\0F0\00\09\07\80\03\05\B2\12\0F0\00\08\07\80\03\06\C2\12\0F0\00\09\07\80\03\05\D2\12\0F0\00\09\07\80\03\05\E2\12\0F\AD\01\0A\05\F0\00\06\F2\12\0F0\00\09\07\80\03\05\02\13\0F0\00\09\06\80\03\06\12\13\0F0\00\08\08\80\03\05\22\13\0F0\00\09\07\80\03\052\13\0F0\00\09\07\80\03\06B\13&;\0AL-\01y\09\01\81\03\00H/\05\82\03\03\AA,\01e\05\01$\00\0C\82\03&6, 
\00\0D\82\03$7,$\00\0B\82\03\198\82\03\1F7\B9\00\05\04\E5\00\07?\0B\0F\E5\00\06\168\82\03\07N\0B\0F/\00\08\05\82\03\07]\0B\0F/\00\08\05\82\03\07l\0B\0F/\00\08\06\82\03\07|\0B\0F0\00\09\05\82\03\07\8C\0B\0F0\00\09\05\82\03\07\9C\0B\0F0\00\08\06\82\03\07\AC\0B\0F0\00\09\05\82\03\07\BC\0B\0F0\00\09\05\82\03\07\CC\0B\0F\AD\01\0A\05\F0\00\06\DC\0B\0F0\00\09\05\82\03\07\EC\0B\0F0\00\09\05\82\03\07\FC\0B\0F0\00\08\06\82\03\07\0C\0C\0F0\00\09\05\82\03\07\1C\0C\0F0\00\09\06\82\03\06,\0C\0A\82\03\143\82\03/72\82\03\00#9,$\00\0B\82\03\01\1A\18\04 \00\0C\82\03\01m\05\02$\00\0A\82\03\01@\18\05\82\03/71\B9\00\04\157u\01\06\B1\0E\1F},\00\06\07\82\03\06\C0\0E\0F/\00\08\07\82\03\05\CF\0E\0F/\00\08\07\82\03\05\DE\0E\0F/\00\08\08\82\03\05\EE\0E\0F0\00\09\07\82\03\05\FE\0E\0F0\00\09\06\82\03\06\0E\0F\0F0\00\08\08\82\03\05\1E\0F\0F0\00\09\07\82\03\05.\0F\0F0\00\09\07\82\03\05>\0F\0F\AD\01\0A\05\F0\00\06N\0F\0F0\00\09\06\82\03\06^\0F\0F0\00\09\07\82\03\05n\0F\0F0\00\08\08\82\03\05~\0F\0F0\00\09\07\82\03\05\8E\0F\0F0\00\09\07\82\03\06\9E\0F\03\E9\11\11g\F9\0ED %p5s\0E2409&\0F\195&\0F\137\A1\0E\02\E4\0D\01\1D\0C\01\C93\195_\03\01O\1B\02\1F\00*-4\92\03\154\1A\00\189\D5\0E\154y\00.31\02\0B\01\05\09\01$\00)16\FD\03(6,U\00\0B\85\00&1,\7F4\1A7\8D \01\00\1C\06\00\04\166\22( 6:Y:\04B\01\114B\01\005\01\135\07\063538\08\00\02\A5\01A540}\AC'Ad80]~\01\03\FE4%v4~\01#81z\01\0FE\00\09\0FA\01\01\00\07\00\1F4\D5\00\01\00\07\00O2048\D5\00\01$80\1C\00\01\DB\01\11l\FF\0FE %p6S\001124\DB\01\196\DB\01\07\05\01\C07:\0Aret;\0A\0A}\0A\00\00\00\00">] - llvm.func @mgpuTensorMapEncodeTiledMemref(i64, !llvm.ptr, i64, i64, i64, i64, i64, !llvm.ptr) -> !llvm.ptr - llvm.func @mgpuStreamCreate() -> !llvm.ptr - llvm.func @mgpuMemAlloc(i64, !llvm.ptr, i8) -> !llvm.ptr - llvm.func @mgpuMemcpy(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -} - From 1b664fe2548d4cd5ce7a495cde4a86b5531af123 Mon Sep 17 00:00:00 2001 From: Dhruv Srivastava Date: Thu, 22 Aug 2024 14:26:49 +0530 Subject: [PATCH 162/426] [lldb][AIX] Updating XCOFF,PPC 
entry in LLDB ArchSpec (#105523) This PR is in reference to porting LLDB on AIX. Link to discussions on llvm discourse and github: 1. https://discourse.llvm.org/t/port-lldb-to-ibm-aix/80640 2. #101657 The complete changes for porting are present in this draft PR: #102601 The changes in this PR are intended to update the Architecture entry for LLDB with XCOFF,PPC. 1. Added new ArchitectureType `eArchTypeXCOFF` 2. Added a new `ArchDefinitionEntry g_xcoff_arch_entries[]` 3. Added a new case for `XCOFF in ArchSpec::SetArchitecture(..)` 4. Updated `ArchDefinition *g_arch_definitions[]` --- lldb/include/lldb/lldb-private-enumerations.h | 1 + lldb/source/Utility/ArchSpec.cpp | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/lldb/include/lldb/lldb-private-enumerations.h b/lldb/include/lldb/lldb-private-enumerations.h index c24a3538f58dac..98c1e956bf8f7b 100644 --- a/lldb/include/lldb/lldb-private-enumerations.h +++ b/lldb/include/lldb/lldb-private-enumerations.h @@ -65,6 +65,7 @@ enum ArchitectureType { eArchTypeMachO, eArchTypeELF, eArchTypeCOFF, + eArchTypeXCOFF, kNumArchTypes }; diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index 07ef435ef451d2..4fd1a800023ce3 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -16,6 +16,7 @@ #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/BinaryFormat/MachO.h" +#include "llvm/BinaryFormat/XCOFF.h" #include "llvm/Support/Compiler.h" #include "llvm/TargetParser/ARMTargetParser.h" @@ -459,10 +460,23 @@ static const ArchDefinition g_coff_arch_def = { "pe-coff", }; +static const ArchDefinitionEntry g_xcoff_arch_entries[] = { + {ArchSpec::eCore_ppc_generic, llvm::XCOFF::TCPU_COM, LLDB_INVALID_CPUTYPE, + 0xFFFFFFFFu, 0xFFFFFFFFu}, + {ArchSpec::eCore_ppc64_generic, llvm::XCOFF::TCPU_PPC64, + LLDB_INVALID_CPUTYPE, 0xFFFFFFFFu, 0xFFFFFFFFu}}; + +static const ArchDefinition g_xcoff_arch_def = { + 
eArchTypeXCOFF, + std::size(g_xcoff_arch_entries), + g_xcoff_arch_entries, + "xcoff", +}; + //===----------------------------------------------------------------------===// // Table of all ArchDefinitions static const ArchDefinition *g_arch_definitions[] = { - &g_macho_arch_def, &g_elf_arch_def, &g_coff_arch_def}; + &g_macho_arch_def, &g_elf_arch_def, &g_coff_arch_def, &g_xcoff_arch_def}; //===----------------------------------------------------------------------===// // Static helper functions. @@ -903,6 +917,9 @@ bool ArchSpec::SetArchitecture(ArchitectureType arch_type, uint32_t cpu, } else if (arch_type == eArchTypeCOFF && os == llvm::Triple::Win32) { m_triple.setVendor(llvm::Triple::PC); m_triple.setOS(llvm::Triple::Win32); + } else if (arch_type == eArchTypeXCOFF && os == llvm::Triple::AIX) { + m_triple.setVendor(llvm::Triple::IBM); + m_triple.setOS(llvm::Triple::AIX); } else { m_triple.setVendor(llvm::Triple::UnknownVendor); m_triple.setOS(llvm::Triple::UnknownOS); From 1e44e7afd799f582171a79355ce353fde134e806 Mon Sep 17 00:00:00 2001 From: Daniel Cederman Date: Thu, 22 Aug 2024 10:57:55 +0200 Subject: [PATCH 163/426] [Sparc] Add flags to enable errata workaround pass for GR712RC and UT700 (#104742) This adds the flags -mfix-gr712rc and -mfix-ut700 which enables the necessary errata workarounds for the GR712RC and UT700 processors. The functionality enabled by the flags is the same as the functionality provided by the corresponding GCC flags. 
--- clang/include/clang/Driver/Options.td | 4 ++++ clang/lib/Driver/ToolChains/Arch/Sparc.cpp | 13 +++++++++++++ clang/test/Driver/sparc-fix.c | 5 +++++ 3 files changed, 22 insertions(+) create mode 100644 clang/test/Driver/sparc-fix.c diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index c204062b4f7353..5d8791727d2109 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6155,6 +6155,10 @@ def mv8plus : Flag<["-"], "mv8plus">, Group, HelpText<"Enable V8+ mode, allowing use of 64-bit V9 instructions in 32-bit code">; def mno_v8plus : Flag<["-"], "mno-v8plus">, Group, HelpText<"Disable V8+ mode">; +def mfix_gr712rc : Flag<["-"], "mfix-gr712rc">, Group, + HelpText<"Enable workarounds for GR712RC errata">; +def mfix_ut700 : Flag<["-"], "mfix-ut700">, Group, + HelpText<"Enable workarounds for UT700 errata">; foreach i = 1 ... 7 in def ffixed_g#i : Flag<["-"], "ffixed-g"#i>, Group, HelpText<"Reserve the G"#i#" register (SPARC only)">; diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp index 5a1fedbec06adf..f7f0a265fef68b 100644 --- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp @@ -264,4 +264,17 @@ void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args, if (Args.hasArg(options::OPT_ffixed_i5)) Features.push_back("+reserve-i5"); + + if (Args.hasArg(options::OPT_mfix_gr712rc)) { + Features.push_back("+fix-tn0009"); + Features.push_back("+fix-tn0011"); + Features.push_back("+fix-tn0012"); + Features.push_back("+fix-tn0013"); + } + + if (Args.hasArg(options::OPT_mfix_ut700)) { + Features.push_back("+fix-tn0009"); + Features.push_back("+fix-tn0010"); + Features.push_back("+fix-tn0013"); + } } diff --git a/clang/test/Driver/sparc-fix.c b/clang/test/Driver/sparc-fix.c new file mode 100644 index 00000000000000..1f034399ce2245 --- /dev/null +++ b/clang/test/Driver/sparc-fix.c @@ 
-0,0 +1,5 @@ +// RUN: %clang --target=sparc -mfix-gr712rc -### %s 2>&1 | FileCheck --check-prefix=GR712RC %s +// GR712RC: "-target-feature" "+fix-tn0009" "-target-feature" "+fix-tn0011" "-target-feature" "+fix-tn0012" "-target-feature" "+fix-tn0013" + +// RUN: %clang --target=sparc -mfix-ut700 -### %s 2>&1 | FileCheck --check-prefix=UT700 %s +// UT700: "-target-feature" "+fix-tn0009" "-target-feature" "+fix-tn0010" "-target-feature" "+fix-tn0013" From 00a1a45a7dcdcd8b1f969958a6d927b595567090 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Thu, 22 Aug 2024 10:08:04 +0100 Subject: [PATCH 164/426] [mlir][llvmir][debug] Correctly generate location for phi nodes. (#105534) In [convertBlockImpl](https://github.com/llvm/llvm-project/blob/87eeed1f0ebe57abffde560c25dd9829dc6038f3/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp#L959), the debug location is set on the builder before the op is processed. This results in correct location being given to corresponding llvm instructions. But same is not done when phi nodes are created a few lines above. This result is phi nodes getting whatever the current debug location of the builder is. It can be nothing or in worst case a stale location. Fixed by calling SetCurrentDebugLocation before generating phi nodes. 
--- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 2 ++ mlir/test/Target/LLVMIR/llvmir-phi-loc.mlir | 32 ++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 mlir/test/Target/LLVMIR/llvmir-phi-loc.mlir diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 930300d26c4479..2827713e2bf213 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -947,6 +947,8 @@ LogicalResult ModuleTranslation::convertBlockImpl(Block &bb, if (!isCompatibleType(wrappedType)) return emitError(bb.front().getLoc(), "block argument does not have an LLVM type"); + builder.SetCurrentDebugLocation( + debugTranslation->translateLoc(arg.getLoc(), subprogram)); llvm::Type *type = convertType(wrappedType); llvm::PHINode *phi = builder.CreatePHI(type, numPredecessors); mapValue(arg, phi); diff --git a/mlir/test/Target/LLVMIR/llvmir-phi-loc.mlir b/mlir/test/Target/LLVMIR/llvmir-phi-loc.mlir new file mode 100644 index 00000000000000..fd045026052848 --- /dev/null +++ b/mlir/test/Target/LLVMIR/llvmir-phi-loc.mlir @@ -0,0 +1,32 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +llvm.func @test_phi_locations(%arg0: !llvm.ptr) { + %0 = llvm.mlir.constant(1 : i64) : i64 loc(#loc1) + %1 = llvm.mlir.constant(100 : i32) : i32 loc(#loc1) + llvm.br ^bb1(%1, %0 : i32, i64) loc(#loc1) +^bb1(%2: i32 loc(#loc2), %3: i64 loc(#loc3)): + %4 = llvm.icmp "sgt" %3, %0 : i64 loc(#loc1) + llvm.cond_br %4, ^bb2, ^bb1(%2, %3 : i32, i64) loc(#loc1) +^bb2: + llvm.return loc(#loc1) +} loc(#loc4) + +#file = #llvm.di_file<"test.f90" in ""> +#cu = #llvm.di_compile_unit, + sourceLanguage = DW_LANG_Fortran95, file = #file, isOptimized = false, + emissionKind = Full> +#sp_ty = #llvm.di_subroutine_type +#sp = #llvm.di_subprogram, compileUnit = #cu, scope = #file, + name = "test_phi_locations", file = #file, subprogramFlags = Definition, + type = #sp_ty> + +#loc1 = loc("test.f90":15:22) 
+#loc2 = loc("test.f90":8:2) +#loc3 = loc("test.f90":9:5) +#loc4 = loc(fused<#sp>[#loc1]) + +// CHECK-LABEL: define void @test_phi_locations +// CHECK: phi i32{{.*}}!dbg ![[LOC1:[0-9]+]] +// CHECK: phi i64{{.*}}!dbg ![[LOC2:[0-9]+]] +// CHECK: ![[LOC1]] = !DILocation(line: 8, column: 2{{.*}}) +// CHECK: ![[LOC2]] = !DILocation(line: 9, column: 5{{.*}}) From 14c7e4a1844904f3db9b2dc93b722925a8c66b27 Mon Sep 17 00:00:00 2001 From: Matthew Devereau Date: Thu, 22 Aug 2024 10:12:59 +0100 Subject: [PATCH 165/426] Enable logf128 constant folding for hosts with 128bit long double (#104929) This is a reland of (#96287). This patch attempts to reduce the reverted patch's clang compile time by removing #includes of float128.h and inlining convertToQuad functions instead. --- llvm/CMakeLists.txt | 2 -- llvm/cmake/config-ix.cmake | 18 +++++++--------- llvm/include/llvm/ADT/APFloat.h | 15 +++----------- llvm/include/llvm/ADT/APInt.h | 8 ------- llvm/include/llvm/Support/float128.h | 14 ++++++------- llvm/lib/Analysis/CMakeLists.txt | 6 ------ llvm/lib/Analysis/ConstantFolding.cpp | 30 +++++++++++++++++++++------ llvm/lib/Support/APFloat.cpp | 24 ++------------------- 8 files changed, 42 insertions(+), 75 deletions(-) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index d681b1ccab6299..b03d89a43c34b0 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -560,8 +560,6 @@ set(LLVM_USE_STATIC_ZSTD FALSE CACHE BOOL "Use static version of zstd. Can be TR set(LLVM_ENABLE_CURL "OFF" CACHE STRING "Use libcurl for the HTTP client if available. Can be ON, OFF, or FORCE_ON") -set(LLVM_HAS_LOGF128 "OFF" CACHE STRING "Use logf128 to constant fold fp128 logarithm calls. Can be ON, OFF, or FORCE_ON") - set(LLVM_ENABLE_HTTPLIB "OFF" CACHE STRING "Use cpp-httplib HTTP server library if available. 
Can be ON, OFF, or FORCE_ON") set(LLVM_Z3_INSTALL_DIR "" CACHE STRING "Install directory of the Z3 solver.") diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index f76eacb9d51366..471dd1615c2e7b 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -246,17 +246,6 @@ else() set(HAVE_LIBEDIT 0) endif() -if(LLVM_HAS_LOGF128) - include(CheckCXXSymbolExists) - check_cxx_symbol_exists(logf128 math.h HAS_LOGF128) - - if(LLVM_HAS_LOGF128 STREQUAL FORCE_ON AND NOT HAS_LOGF128) - message(FATAL_ERROR "Failed to configure logf128") - endif() - - set(LLVM_HAS_LOGF128 "${HAS_LOGF128}") -endif() - # function checks check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM) find_package(Backtrace) @@ -270,6 +259,13 @@ if(C_SUPPORTS_WERROR_UNGUARDED_AVAILABILITY_NEW) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror=unguarded-availability-new") endif() +check_cxx_symbol_exists(logf128 cmath HAS_LOGF128) +check_symbol_exists(__powerpc64le__ "" __PPC64LE) +if(HAS_LOGF128 AND NOT __PPC64LE) + set(LLVM_HAS_LOGF128 On) + add_compile_definitions(HAS_LOGF128) +endif() + # Determine whether we can register EH tables. 
check_symbol_exists(__register_frame "${CMAKE_CURRENT_LIST_DIR}/unwind.h" HAVE_REGISTER_FRAME) check_symbol_exists(__deregister_frame "${CMAKE_CURRENT_LIST_DIR}/unwind.h" HAVE_DEREGISTER_FRAME) diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index 7039e961bff82d..925d03d4c06670 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -19,7 +19,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/FloatingPointMode.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/float128.h" #include #define APFLOAT_DISPATCH_ON_SEMANTICS(METHOD_CALL) \ @@ -378,9 +377,6 @@ class IEEEFloat final : public APFloatBase { Expected convertFromString(StringRef, roundingMode); APInt bitcastToAPInt() const; double convertToDouble() const; -#ifdef HAS_IEE754_FLOAT128 - float128 convertToQuad() const; -#endif float convertToFloat() const; /// @} @@ -1274,14 +1270,9 @@ class APFloat : public APFloatBase { /// shorter semantics, like IEEEsingle and others. double convertToDouble() const; - /// Converts this APFloat to host float value. - /// - /// \pre The APFloat must be built using semantics, that can be represented by - /// the host float type without loss of precision. It can be IEEEquad and - /// shorter semantics, like IEEEdouble and others. -#ifdef HAS_IEE754_FLOAT128 - float128 convertToQuad() const; -#endif + /// Return true if this APFloat has quadruple precision floating point + /// semantics + bool isValidIEEEQuad() const; /// Converts this APFloat to host float value. /// diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 65ba3f15305c78..13837413ae49fe 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -17,7 +17,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/float128.h" #include #include #include @@ -1677,13 +1676,6 @@ class [[nodiscard]] APInt { /// any bit width. 
Exactly 64 bits will be translated. double bitsToDouble() const { return llvm::bit_cast(getWord(0)); } -#ifdef HAS_IEE754_FLOAT128 - float128 bitsToQuad() const { - __uint128_t ul = ((__uint128_t)U.pVal[1] << 64) + U.pVal[0]; - return llvm::bit_cast(ul); - } -#endif - /// Converts APInt bits to a float /// /// The conversion does not do a translation from integer to float, it just diff --git a/llvm/include/llvm/Support/float128.h b/llvm/include/llvm/Support/float128.h index e15a98dc5a6779..618b320086ba59 100644 --- a/llvm/include/llvm/Support/float128.h +++ b/llvm/include/llvm/Support/float128.h @@ -9,18 +9,16 @@ #ifndef LLVM_FLOAT128 #define LLVM_FLOAT128 +#include + namespace llvm { -#if defined(__clang__) && defined(__FLOAT128__) && \ - defined(__SIZEOF_INT128__) && !defined(__LONG_DOUBLE_IBM128__) -#define HAS_IEE754_FLOAT128 -typedef __float128 float128; -#elif defined(__FLOAT128__) && defined(__SIZEOF_INT128__) && \ - !defined(__LONG_DOUBLE_IBM128__) && \ - (defined(__GNUC__) || defined(__GNUG__)) +#ifdef HAS_LOGF128 +#if !defined(__LONG_DOUBLE_IBM128__) && (__SIZEOF_INT128__ == 16) +typedef decltype(logf128(0.)) float128; #define HAS_IEE754_FLOAT128 -typedef _Float128 float128; #endif +#endif // HAS_LOGF128 } // namespace llvm #endif // LLVM_FLOAT128 diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index 393803fad89383..3127f45cc54cb1 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -162,9 +162,3 @@ add_llvm_component_library(LLVMAnalysis Support TargetParser ) - -include(CheckCXXSymbolExists) -check_cxx_symbol_exists(logf128 math.h HAS_LOGF128) -if(HAS_LOGF128) - target_compile_definitions(LLVMAnalysis PRIVATE HAS_LOGF128) -endif() diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index defcacdfa8b105..81c4d4ec5be412 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -54,6 +54,7 @@ #include 
"llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/float128.h" #include #include #include @@ -1741,7 +1742,7 @@ Constant *GetConstantFoldFPValue(double V, Type *Ty) { llvm_unreachable("Can only constant fold half/float/double"); } -#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128) +#if defined(HAS_IEE754_FLOAT128) Constant *GetConstantFoldFPValue128(float128 V, Type *Ty) { if (Ty->isFP128Ty()) return ConstantFP::get(Ty, V); @@ -1781,11 +1782,25 @@ Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V, return GetConstantFoldFPValue(Result, Ty); } -#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128) +#if defined(HAS_IEE754_FLOAT128) +float128 ConvertToQuad(const APFloat &Apf) { + APInt Api = Apf.bitcastToAPInt(); + __uint128_t Uint128 = + ((__uint128_t)Api.extractBitsAsZExtValue(64, 64) << 64) + + Api.extractBitsAsZExtValue(64, 0); + return llvm::bit_cast(Uint128); +} +#endif + +#if defined(HAS_IEE754_FLOAT128) Constant *ConstantFoldFP128(float128 (*NativeFP)(float128), const APFloat &V, Type *Ty) { llvm_fenv_clearexcept(); - float128 Result = NativeFP(V.convertToQuad()); + if (!V.isValidIEEEQuad()) + return nullptr; + + float128 Result = NativeFP(ConvertToQuad(V)); + if (llvm_fenv_testexcept()) { llvm_fenv_clearexcept(); return nullptr; @@ -2114,13 +2129,16 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, if (IntrinsicID == Intrinsic::canonicalize) return constantFoldCanonicalize(Ty, Call, U); -#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128) +#if defined(HAS_IEE754_FLOAT128) if (Ty->isFP128Ty()) { if (IntrinsicID == Intrinsic::log) { - float128 Result = logf128(Op->getValueAPF().convertToQuad()); + APFloat Value = Op->getValueAPF(); + if (!Value.isValidIEEEQuad()) + return nullptr; + + float128 Result = logf128(ConvertToQuad(Value)); return GetConstantFoldFPValue128(Result, Ty); } - LibFunc Fp128Func = NotLibFunc; if (TLI->getLibFunc(Name, 
Fp128Func) && TLI->has(Fp128Func) && Fp128Func == LibFunc_logl) diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 7f68c5ab9b7cf7..2ddf99f56f88d5 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -3749,15 +3749,6 @@ double IEEEFloat::convertToDouble() const { return api.bitsToDouble(); } -#ifdef HAS_IEE754_FLOAT128 -float128 IEEEFloat::convertToQuad() const { - assert(semantics == (const llvm::fltSemantics *)&semIEEEquad && - "Float semantics are not IEEEquads"); - APInt api = bitcastToAPInt(); - return api.bitsToQuad(); -} -#endif - /// Integer bit is explicit in this format. Intel hardware (387 and later) /// does not support these bit patterns: /// exponent = all 1's, integer bit 0, significand 0 ("pseudoinfinity") @@ -5406,20 +5397,9 @@ double APFloat::convertToDouble() const { return Temp.getIEEE().convertToDouble(); } -#ifdef HAS_IEE754_FLOAT128 -float128 APFloat::convertToQuad() const { - if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEquad) - return getIEEE().convertToQuad(); - assert(getSemantics().isRepresentableBy(semIEEEquad) && - "Float semantics is not representable by IEEEquad"); - APFloat Temp = *this; - bool LosesInfo; - opStatus St = Temp.convert(semIEEEquad, rmNearestTiesToEven, &LosesInfo); - assert(!(St & opInexact) && !LosesInfo && "Unexpected imprecision"); - (void)St; - return Temp.getIEEE().convertToQuad(); +bool APFloat::isValidIEEEQuad() const { + return (&getSemantics() == (const llvm::fltSemantics *)&semIEEEquad); } -#endif float APFloat::convertToFloat() const { if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEsingle) From 15e915a44f0d0bf092214586d3ec86e2bb7636d7 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Thu, 22 Aug 2024 12:16:03 +0300 Subject: [PATCH 166/426] [mlir][dataflow] Propagate errors from `visitOperation` (#105448) Base `DataFlowAnalysis::visit` returns `LogicalResult`, but wrappers's Sparse/Dense/Forward/Backward `visitOperation` doesn't. 
Sometimes it's needed to abort solver early if some unrecoverable condition detected inside analysis. Update `visitOperation` to return `LogicalResult` and propagate it to `solver.initializeAndRun()`. Only `visitOperation` is updated for now, it's possible to update other hooks like `visitNonControlFlowArguments`, bit it's not needed immediately and let's keep this PR small. Hijacked `UnderlyingValueAnalysis` test analysis to test it. --- .../lib/Optimizer/Transforms/StackArrays.cpp | 29 ++++---- .../DataFlow/ConstantPropagationAnalysis.h | 7 +- .../mlir/Analysis/DataFlow/DenseAnalysis.h | 42 ++++++------ .../Analysis/DataFlow/IntegerRangeAnalysis.h | 7 +- .../mlir/Analysis/DataFlow/LivenessAnalysis.h | 4 +- .../mlir/Analysis/DataFlow/SparseAnalysis.h | 26 +++---- .../DataFlow/ConstantPropagationAnalysis.cpp | 11 +-- mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp | 51 ++++++++------ .../DataFlow/IntegerRangeAnalysis.cpp | 9 ++- .../Analysis/DataFlow/LivenessAnalysis.cpp | 11 +-- mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp | 68 +++++++++++-------- .../DataFlow/test-last-modified-error.mlir | 8 +++ .../TestDenseBackwardDataFlowAnalysis.cpp | 36 ++++++---- .../DataFlow/TestDenseDataFlowAnalysis.h | 12 +++- .../TestDenseForwardDataFlowAnalysis.cpp | 35 ++++++---- .../TestSparseBackwardDataFlowAnalysis.cpp | 13 ++-- 16 files changed, 220 insertions(+), 149 deletions(-) create mode 100644 mlir/test/Analysis/DataFlow/test-last-modified-error.mlir diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp index a5a95138ac1281..6bd5724f52043c 100644 --- a/flang/lib/Optimizer/Transforms/StackArrays.cpp +++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp @@ -149,8 +149,9 @@ class AllocationAnalysis public: using DenseForwardDataFlowAnalysis::DenseForwardDataFlowAnalysis; - void visitOperation(mlir::Operation *op, const LatticePoint &before, - LatticePoint *after) override; + mlir::LogicalResult visitOperation(mlir::Operation 
*op, + const LatticePoint &before, + LatticePoint *after) override; /// At an entry point, the last modifications of all memory resources are /// yet to be determined @@ -159,7 +160,7 @@ class AllocationAnalysis protected: /// Visit control flow operations and decide whether to call visitOperation /// to apply the transfer function - void processOperation(mlir::Operation *op) override; + mlir::LogicalResult processOperation(mlir::Operation *op) override; }; /// Drives analysis to find candidate fir.allocmem operations which could be @@ -329,9 +330,8 @@ std::optional LatticePoint::get(mlir::Value val) const { return it->second; } -void AllocationAnalysis::visitOperation(mlir::Operation *op, - const LatticePoint &before, - LatticePoint *after) { +mlir::LogicalResult AllocationAnalysis::visitOperation( + mlir::Operation *op, const LatticePoint &before, LatticePoint *after) { LLVM_DEBUG(llvm::dbgs() << "StackArrays: Visiting operation: " << *op << "\n"); LLVM_DEBUG(llvm::dbgs() << "--Lattice in: " << before << "\n"); @@ -346,14 +346,14 @@ void AllocationAnalysis::visitOperation(mlir::Operation *op, if (attr && attr.getValue()) { LLVM_DEBUG(llvm::dbgs() << "--Found fir.must_be_heap: skipping\n"); // skip allocation marked not to be moved - return; + return mlir::success(); } auto retTy = allocmem.getAllocatedType(); if (!mlir::isa(retTy)) { LLVM_DEBUG(llvm::dbgs() << "--Allocation is not for an array: skipping\n"); - return; + return mlir::success(); } mlir::Value result = op->getResult(0); @@ -387,6 +387,7 @@ void AllocationAnalysis::visitOperation(mlir::Operation *op, LLVM_DEBUG(llvm::dbgs() << "--Lattice out: " << *after << "\n"); propagateIfChanged(after, changed); + return mlir::success(); } void AllocationAnalysis::setToEntryState(LatticePoint *lattice) { @@ -395,18 +396,20 @@ void AllocationAnalysis::setToEntryState(LatticePoint *lattice) { /// Mostly a copy of AbstractDenseLattice::processOperation - the difference /// being that call operations are passed 
through to the transfer function -void AllocationAnalysis::processOperation(mlir::Operation *op) { +mlir::LogicalResult AllocationAnalysis::processOperation(mlir::Operation *op) { // If the containing block is not executable, bail out. if (!getOrCreateFor(op, op->getBlock())->isLive()) - return; + return mlir::success(); // Get the dense lattice to update mlir::dataflow::AbstractDenseLattice *after = getLattice(op); // If this op implements region control-flow, then control-flow dictates its // transfer function. - if (auto branch = mlir::dyn_cast(op)) - return visitRegionBranchOperation(op, branch, after); + if (auto branch = mlir::dyn_cast(op)) { + visitRegionBranchOperation(op, branch, after); + return mlir::success(); + } // pass call operations through to the transfer function @@ -418,7 +421,7 @@ void AllocationAnalysis::processOperation(mlir::Operation *op) { before = getLatticeFor(op, op->getBlock()); /// Invoke the operation transfer function - visitOperationImpl(op, *before, after); + return visitOperationImpl(op, *before, after); } llvm::LogicalResult diff --git a/mlir/include/mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h index 1bf991dc193874..d2d4ff9960ea36 100644 --- a/mlir/include/mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h @@ -101,9 +101,10 @@ class SparseConstantPropagation public: using SparseForwardDataFlowAnalysis::SparseForwardDataFlowAnalysis; - void visitOperation(Operation *op, - ArrayRef *> operands, - ArrayRef *> results) override; + LogicalResult + visitOperation(Operation *op, + ArrayRef *> operands, + ArrayRef *> results) override; void setToEntryState(Lattice *lattice) override; }; diff --git a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h index 088b6cd7d698fc..4ad5f3fcd838c0 100644 --- 
a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h @@ -87,9 +87,9 @@ class AbstractDenseForwardDataFlowAnalysis : public DataFlowAnalysis { protected: /// Propagate the dense lattice before the execution of an operation to the /// lattice after its execution. - virtual void visitOperationImpl(Operation *op, - const AbstractDenseLattice &before, - AbstractDenseLattice *after) = 0; + virtual LogicalResult visitOperationImpl(Operation *op, + const AbstractDenseLattice &before, + AbstractDenseLattice *after) = 0; /// Get the dense lattice after the execution of the given program point. virtual AbstractDenseLattice *getLattice(ProgramPoint point) = 0; @@ -114,7 +114,7 @@ class AbstractDenseForwardDataFlowAnalysis : public DataFlowAnalysis { /// operation, then the state after the execution of the operation is set by /// control-flow or the callgraph. Otherwise, this function invokes the /// operation transfer function. - virtual void processOperation(Operation *op); + virtual LogicalResult processOperation(Operation *op); /// Propagate the dense lattice forward along the control flow edge from /// `regionFrom` to `regionTo` regions of the `branch` operation. `nullopt` @@ -191,8 +191,8 @@ class DenseForwardDataFlowAnalysis /// Visit an operation with the dense lattice before its execution. This /// function is expected to set the dense lattice after its execution and /// trigger change propagation in case of change. - virtual void visitOperation(Operation *op, const LatticeT &before, - LatticeT *after) = 0; + virtual LogicalResult visitOperation(Operation *op, const LatticeT &before, + LatticeT *after) = 0; /// Hook for customizing the behavior of lattice propagation along the call /// control flow edges. 
Two types of (forward) propagation are possible here: @@ -263,10 +263,11 @@ class DenseForwardDataFlowAnalysis /// Type-erased wrappers that convert the abstract dense lattice to a derived /// lattice and invoke the virtual hooks operating on the derived lattice. - void visitOperationImpl(Operation *op, const AbstractDenseLattice &before, - AbstractDenseLattice *after) final { - visitOperation(op, static_cast(before), - static_cast(after)); + LogicalResult visitOperationImpl(Operation *op, + const AbstractDenseLattice &before, + AbstractDenseLattice *after) final { + return visitOperation(op, static_cast(before), + static_cast(after)); } void visitCallControlFlowTransfer(CallOpInterface call, CallControlFlowAction action, @@ -326,9 +327,9 @@ class AbstractDenseBackwardDataFlowAnalysis : public DataFlowAnalysis { protected: /// Propagate the dense lattice after the execution of an operation to the /// lattice before its execution. - virtual void visitOperationImpl(Operation *op, - const AbstractDenseLattice &after, - AbstractDenseLattice *before) = 0; + virtual LogicalResult visitOperationImpl(Operation *op, + const AbstractDenseLattice &after, + AbstractDenseLattice *before) = 0; /// Get the dense lattice before the execution of the program point. That is, /// before the execution of the given operation or after the execution of the @@ -353,7 +354,7 @@ class AbstractDenseBackwardDataFlowAnalysis : public DataFlowAnalysis { /// Visit an operation. Dispatches to specialized methods for call or region /// control-flow operations. Otherwise, this function invokes the operation /// transfer function. - virtual void processOperation(Operation *op); + virtual LogicalResult processOperation(Operation *op); /// Propagate the dense lattice backwards along the control flow edge from /// `regionFrom` to `regionTo` regions of the `branch` operation. `nullopt` @@ -442,8 +443,8 @@ class DenseBackwardDataFlowAnalysis /// Transfer function. 
Visits an operation with the dense lattice after its /// execution. This function is expected to set the dense lattice before its /// execution and trigger propagation in case of change. - virtual void visitOperation(Operation *op, const LatticeT &after, - LatticeT *before) = 0; + virtual LogicalResult visitOperation(Operation *op, const LatticeT &after, + LatticeT *before) = 0; /// Hook for customizing the behavior of lattice propagation along the call /// control flow edges. Two types of (back) propagation are possible here: @@ -513,10 +514,11 @@ class DenseBackwardDataFlowAnalysis /// Type-erased wrappers that convert the abstract dense lattice to a derived /// lattice and invoke the virtual hooks operating on the derived lattice. - void visitOperationImpl(Operation *op, const AbstractDenseLattice &after, - AbstractDenseLattice *before) final { - visitOperation(op, static_cast(after), - static_cast(before)); + LogicalResult visitOperationImpl(Operation *op, + const AbstractDenseLattice &after, + AbstractDenseLattice *before) final { + return visitOperation(op, static_cast(after), + static_cast(before)); } void visitCallControlFlowTransfer(CallOpInterface call, CallControlFlowAction action, diff --git a/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h index 191c023fb642cb..d4a5472cfde868 100644 --- a/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h @@ -55,9 +55,10 @@ class IntegerRangeAnalysis /// Visit an operation. Invoke the transfer function on each operation that /// implements `InferIntRangeInterface`. 
- void visitOperation(Operation *op, - ArrayRef operands, - ArrayRef results) override; + LogicalResult + visitOperation(Operation *op, + ArrayRef operands, + ArrayRef results) override; /// Visit block arguments or operation results of an operation with region /// control-flow for which values are not defined by region control-flow. This diff --git a/mlir/include/mlir/Analysis/DataFlow/LivenessAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/LivenessAnalysis.h index caa03e26a3a423..cf1fd6e2d48caa 100644 --- a/mlir/include/mlir/Analysis/DataFlow/LivenessAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/LivenessAnalysis.h @@ -79,8 +79,8 @@ class LivenessAnalysis : public SparseBackwardDataFlowAnalysis { public: using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis; - void visitOperation(Operation *op, ArrayRef operands, - ArrayRef results) override; + LogicalResult visitOperation(Operation *op, ArrayRef operands, + ArrayRef results) override; void visitBranchOperand(OpOperand &operand) override; diff --git a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h index 7aadd5409cc695..89726ae3a855c8 100644 --- a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h @@ -197,7 +197,7 @@ class AbstractSparseForwardDataFlowAnalysis : public DataFlowAnalysis { /// The operation transfer function. Given the operand lattices, this /// function is expected to set the result lattices. - virtual void + virtual LogicalResult visitOperationImpl(Operation *op, ArrayRef operandLattices, ArrayRef resultLattices) = 0; @@ -238,7 +238,7 @@ class AbstractSparseForwardDataFlowAnalysis : public DataFlowAnalysis { /// Visit an operation. If this is a call operation or an operation with /// region control-flow, then its result lattices are set accordingly. /// Otherwise, the operation transfer function is invoked. 
- void visitOperation(Operation *op); + LogicalResult visitOperation(Operation *op); /// Visit a block to compute the lattice values of its arguments. If this is /// an entry block, then the argument values are determined from the block's @@ -277,8 +277,9 @@ class SparseForwardDataFlowAnalysis /// Visit an operation with the lattices of its operands. This function is /// expected to set the lattices of the operation's results. - virtual void visitOperation(Operation *op, ArrayRef operands, - ArrayRef results) = 0; + virtual LogicalResult visitOperation(Operation *op, + ArrayRef operands, + ArrayRef results) = 0; /// Visit a call operation to an externally defined function given the /// lattices of its arguments. @@ -328,10 +329,10 @@ class SparseForwardDataFlowAnalysis private: /// Type-erased wrappers that convert the abstract lattice operands to derived /// lattices and invoke the virtual hooks operating on the derived lattices. - void visitOperationImpl( + LogicalResult visitOperationImpl( Operation *op, ArrayRef operandLattices, ArrayRef resultLattices) override { - visitOperation( + return visitOperation( op, {reinterpret_cast(operandLattices.begin()), operandLattices.size()}, @@ -387,7 +388,7 @@ class AbstractSparseBackwardDataFlowAnalysis : public DataFlowAnalysis { /// The operation transfer function. Given the result lattices, this /// function is expected to set the operand lattices. - virtual void visitOperationImpl( + virtual LogicalResult visitOperationImpl( Operation *op, ArrayRef operandLattices, ArrayRef resultLattices) = 0; @@ -424,7 +425,7 @@ class AbstractSparseBackwardDataFlowAnalysis : public DataFlowAnalysis { /// Visit an operation. If this is a call operation or an operation with /// region control-flow, then its operand lattices are set accordingly. /// Otherwise, the operation transfer function is invoked. - void visitOperation(Operation *op); + LogicalResult visitOperation(Operation *op); /// Visit a block. 
void visitBlock(Block *block); @@ -474,8 +475,9 @@ class SparseBackwardDataFlowAnalysis /// Visit an operation with the lattices of its results. This function is /// expected to set the lattices of the operation's operands. - virtual void visitOperation(Operation *op, ArrayRef operands, - ArrayRef results) = 0; + virtual LogicalResult visitOperation(Operation *op, + ArrayRef operands, + ArrayRef results) = 0; /// Visit a call to an external function. This function is expected to set /// lattice values of the call operands. By default, calls `visitCallOperand` @@ -510,10 +512,10 @@ class SparseBackwardDataFlowAnalysis private: /// Type-erased wrappers that convert the abstract lattice operands to derived /// lattices and invoke the virtual hooks operating on the derived lattices. - void visitOperationImpl( + LogicalResult visitOperationImpl( Operation *op, ArrayRef operandLattices, ArrayRef resultLattices) override { - visitOperation( + return visitOperation( op, {reinterpret_cast(operandLattices.begin()), operandLattices.size()}, diff --git a/mlir/lib/Analysis/DataFlow/ConstantPropagationAnalysis.cpp b/mlir/lib/Analysis/DataFlow/ConstantPropagationAnalysis.cpp index 16799d3c82092e..56529acd71bbf8 100644 --- a/mlir/lib/Analysis/DataFlow/ConstantPropagationAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/ConstantPropagationAnalysis.cpp @@ -43,7 +43,7 @@ void ConstantValue::print(raw_ostream &os) const { // SparseConstantPropagation //===----------------------------------------------------------------------===// -void SparseConstantPropagation::visitOperation( +LogicalResult SparseConstantPropagation::visitOperation( Operation *op, ArrayRef *> operands, ArrayRef *> results) { LLVM_DEBUG(llvm::dbgs() << "SCP: Visiting operation: " << *op << "\n"); @@ -54,14 +54,14 @@ void SparseConstantPropagation::visitOperation( // folding. 
if (op->getNumRegions()) { setAllToEntryStates(results); - return; + return success(); } SmallVector constantOperands; constantOperands.reserve(op->getNumOperands()); for (auto *operandLattice : operands) { if (operandLattice->getValue().isUninitialized()) - return; + return success(); constantOperands.push_back(operandLattice->getValue().getConstantValue()); } @@ -77,7 +77,7 @@ void SparseConstantPropagation::visitOperation( foldResults.reserve(op->getNumResults()); if (failed(op->fold(constantOperands, foldResults))) { setAllToEntryStates(results); - return; + return success(); } // If the folding was in-place, mark the results as overdefined and reset @@ -87,7 +87,7 @@ void SparseConstantPropagation::visitOperation( op->setOperands(originalOperands); op->setAttrs(originalAttrs); setAllToEntryStates(results); - return; + return success(); } // Merge the fold results into the lattice for this operation. @@ -108,6 +108,7 @@ void SparseConstantPropagation::visitOperation( lattice, *getLatticeElement(foldResult.get())); } } + return success(); } void SparseConstantPropagation::setToEntryState( diff --git a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp index 9894810f0e04b3..33c877f78f4bf6 100644 --- a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp @@ -30,7 +30,9 @@ using namespace mlir::dataflow; LogicalResult AbstractDenseForwardDataFlowAnalysis::initialize(Operation *top) { // Visit every operation and block. 
- processOperation(top); + if (failed(processOperation(top))) + return failure(); + for (Region ®ion : top->getRegions()) { for (Block &block : region) { visitBlock(&block); @@ -44,7 +46,7 @@ LogicalResult AbstractDenseForwardDataFlowAnalysis::initialize(Operation *top) { LogicalResult AbstractDenseForwardDataFlowAnalysis::visit(ProgramPoint point) { if (auto *op = llvm::dyn_cast_if_present(point)) - processOperation(op); + return processOperation(op); else if (auto *block = llvm::dyn_cast_if_present(point)) visitBlock(block); else @@ -94,10 +96,11 @@ void AbstractDenseForwardDataFlowAnalysis::visitCallOperation( } } -void AbstractDenseForwardDataFlowAnalysis::processOperation(Operation *op) { +LogicalResult +AbstractDenseForwardDataFlowAnalysis::processOperation(Operation *op) { // If the containing block is not executable, bail out. if (!getOrCreateFor(op, op->getBlock())->isLive()) - return; + return success(); // Get the dense lattice to update. AbstractDenseLattice *after = getLattice(op); @@ -111,16 +114,20 @@ void AbstractDenseForwardDataFlowAnalysis::processOperation(Operation *op) { // If this op implements region control-flow, then control-flow dictates its // transfer function. - if (auto branch = dyn_cast(op)) - return visitRegionBranchOperation(op, branch, after); + if (auto branch = dyn_cast(op)) { + visitRegionBranchOperation(op, branch, after); + return success(); + } // If this is a call operation, then join its lattices across known return // sites. - if (auto call = dyn_cast(op)) - return visitCallOperation(call, *before, after); + if (auto call = dyn_cast(op)) { + visitCallOperation(call, *before, after); + return success(); + } // Invoke the operation transfer function. 
- visitOperationImpl(op, *before, after); + return visitOperationImpl(op, *before, after); } void AbstractDenseForwardDataFlowAnalysis::visitBlock(Block *block) { @@ -254,7 +261,9 @@ AbstractDenseForwardDataFlowAnalysis::getLatticeFor(ProgramPoint dependent, LogicalResult AbstractDenseBackwardDataFlowAnalysis::initialize(Operation *top) { // Visit every operation and block. - processOperation(top); + if (failed(processOperation(top))) + return failure(); + for (Region ®ion : top->getRegions()) { for (Block &block : region) { visitBlock(&block); @@ -269,7 +278,7 @@ AbstractDenseBackwardDataFlowAnalysis::initialize(Operation *top) { LogicalResult AbstractDenseBackwardDataFlowAnalysis::visit(ProgramPoint point) { if (auto *op = llvm::dyn_cast_if_present(point)) - processOperation(op); + return processOperation(op); else if (auto *block = llvm::dyn_cast_if_present(point)) visitBlock(block); else @@ -323,10 +332,11 @@ void AbstractDenseBackwardDataFlowAnalysis::visitCallOperation( latticeAtCalleeEntry, latticeBeforeCall); } -void AbstractDenseBackwardDataFlowAnalysis::processOperation(Operation *op) { +LogicalResult +AbstractDenseBackwardDataFlowAnalysis::processOperation(Operation *op) { // If the containing block is not executable, bail out. if (!getOrCreateFor(op, op->getBlock())->isLive()) - return; + return success(); // Get the dense lattice to update. AbstractDenseLattice *before = getLattice(op); @@ -339,14 +349,17 @@ void AbstractDenseBackwardDataFlowAnalysis::processOperation(Operation *op) { after = getLatticeFor(op, op->getBlock()); // Special cases where control flow may dictate data flow. 
- if (auto branch = dyn_cast(op)) - return visitRegionBranchOperation(op, branch, RegionBranchPoint::parent(), - before); - if (auto call = dyn_cast(op)) - return visitCallOperation(call, *after, before); + if (auto branch = dyn_cast(op)) { + visitRegionBranchOperation(op, branch, RegionBranchPoint::parent(), before); + return success(); + } + if (auto call = dyn_cast(op)) { + visitCallOperation(call, *after, before); + return success(); + } // Invoke the operation transfer function. - visitOperationImpl(op, *after, before); + return visitOperationImpl(op, *after, before); } void AbstractDenseBackwardDataFlowAnalysis::visitBlock(Block *block) { diff --git a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp index 244ce8b9c2ac63..35d38ea02d7162 100644 --- a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp @@ -58,12 +58,14 @@ void IntegerValueRangeLattice::onUpdate(DataFlowSolver *solver) const { dialect))); } -void IntegerRangeAnalysis::visitOperation( +LogicalResult IntegerRangeAnalysis::visitOperation( Operation *op, ArrayRef operands, ArrayRef results) { auto inferrable = dyn_cast(op); - if (!inferrable) - return setAllToEntryStates(results); + if (!inferrable) { + setAllToEntryStates(results); + return success(); + } LLVM_DEBUG(llvm::dbgs() << "Inferring ranges for " << *op << "\n"); auto argRanges = llvm::map_to_vector( @@ -99,6 +101,7 @@ void IntegerRangeAnalysis::visitOperation( }; inferrable.inferResultRangesFromOptional(argRanges, joinCallback); + return success(); } void IntegerRangeAnalysis::visitNonControlFlowArguments( diff --git a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp index 7875fa9d43d9e2..57a4d4a6800be0 100644 --- a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp @@ -68,9 +68,9 @@ ChangeResult Liveness::meet(const 
AbstractSparseLattice &other) { /// (3.b) `A` is used to compute some value `C` and `C` is used to compute /// `B`. -void LivenessAnalysis::visitOperation(Operation *op, - ArrayRef operands, - ArrayRef results) { +LogicalResult +LivenessAnalysis::visitOperation(Operation *op, ArrayRef operands, + ArrayRef results) { // This marks values of type (1.a) liveness as "live". if (!isMemoryEffectFree(op)) { for (auto *operand : operands) @@ -89,6 +89,7 @@ void LivenessAnalysis::visitOperation(Operation *op, } addDependency(const_cast(r), op); } + return success(); } void LivenessAnalysis::visitBranchOperand(OpOperand &operand) { @@ -158,7 +159,7 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) { SmallVector resultsLiveness; for (const Value result : op->getResults()) resultsLiveness.push_back(getLatticeElement(result)); - visitOperation(op, operandLiveness, resultsLiveness); + (void)visitOperation(op, operandLiveness, resultsLiveness); // We also visit the parent op with the parent's results and this operand if // `op` is a `RegionBranchTerminatorOpInterface` because its non-forwarded @@ -170,7 +171,7 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) { SmallVector parentResultsLiveness; for (const Value parentResult : parentOp->getResults()) parentResultsLiveness.push_back(getLatticeElement(parentResult)); - visitOperation(parentOp, operandLiveness, parentResultsLiveness); + (void)visitOperation(parentOp, operandLiveness, parentResultsLiveness); } void LivenessAnalysis::visitCallOperand(OpOperand &operand) { diff --git a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp index ad956b73e4b1d4..d47d5fec8a9a6a 100644 --- a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp @@ -67,7 +67,9 @@ LogicalResult AbstractSparseForwardDataFlowAnalysis::initializeRecursively(Operation *op) { // Initialize the analysis by visiting every owner of an SSA value (all // 
operations and blocks). - visitOperation(op); + if (failed(visitOperation(op))) + return failure(); + for (Region ®ion : op->getRegions()) { for (Block &block : region) { getOrCreate(&block)->blockContentSubscribe(this); @@ -83,7 +85,7 @@ AbstractSparseForwardDataFlowAnalysis::initializeRecursively(Operation *op) { LogicalResult AbstractSparseForwardDataFlowAnalysis::visit(ProgramPoint point) { if (Operation *op = llvm::dyn_cast_if_present(point)) - visitOperation(op); + return visitOperation(op); else if (Block *block = llvm::dyn_cast_if_present(point)) visitBlock(block); else @@ -91,14 +93,15 @@ LogicalResult AbstractSparseForwardDataFlowAnalysis::visit(ProgramPoint point) { return success(); } -void AbstractSparseForwardDataFlowAnalysis::visitOperation(Operation *op) { +LogicalResult +AbstractSparseForwardDataFlowAnalysis::visitOperation(Operation *op) { // Exit early on operations with no results. if (op->getNumResults() == 0) - return; + return success(); // If the containing block is not executable, bail out. if (!getOrCreate(op->getBlock())->isLive()) - return; + return success(); // Get the result lattices. SmallVector resultLattices; @@ -110,9 +113,10 @@ void AbstractSparseForwardDataFlowAnalysis::visitOperation(Operation *op) { // The results of a region branch operation are determined by control-flow. if (auto branch = dyn_cast(op)) { - return visitRegionSuccessors({branch}, branch, - /*successor=*/RegionBranchPoint::parent(), - resultLattices); + visitRegionSuccessors({branch}, branch, + /*successor=*/RegionBranchPoint::parent(), + resultLattices); + return success(); } // Grab the lattice elements of the operands. 
@@ -131,7 +135,8 @@ void AbstractSparseForwardDataFlowAnalysis::visitOperation(Operation *op) { dyn_cast_if_present(call.resolveCallable()); if (!getSolverConfig().isInterprocedural() || (callable && !callable.getCallableRegion())) { - return visitExternalCallImpl(call, operandLattices, resultLattices); + visitExternalCallImpl(call, operandLattices, resultLattices); + return success(); } // Otherwise, the results of a call operation are determined by the @@ -139,16 +144,19 @@ void AbstractSparseForwardDataFlowAnalysis::visitOperation(Operation *op) { const auto *predecessors = getOrCreateFor(op, call); // If not all return sites are known, then conservatively assume we can't // reason about the data-flow. - if (!predecessors->allPredecessorsKnown()) - return setAllToEntryStates(resultLattices); + if (!predecessors->allPredecessorsKnown()) { + setAllToEntryStates(resultLattices); + return success(); + } for (Operation *predecessor : predecessors->getKnownPredecessors()) - for (auto it : llvm::zip(predecessor->getOperands(), resultLattices)) - join(std::get<1>(it), *getLatticeElementFor(op, std::get<0>(it))); - return; + for (auto &&[operand, resLattice] : + llvm::zip(predecessor->getOperands(), resultLattices)) + join(resLattice, *getLatticeElementFor(op, operand)); + return success(); } // Invoke the operation transfer function. 
- visitOperationImpl(op, operandLattices, resultLattices); + return visitOperationImpl(op, operandLattices, resultLattices); } void AbstractSparseForwardDataFlowAnalysis::visitBlock(Block *block) { @@ -326,7 +334,9 @@ AbstractSparseBackwardDataFlowAnalysis::initialize(Operation *top) { LogicalResult AbstractSparseBackwardDataFlowAnalysis::initializeRecursively(Operation *op) { - visitOperation(op); + if (failed(visitOperation(op))) + return failure(); + for (Region ®ion : op->getRegions()) { for (Block &block : region) { getOrCreate(&block)->blockContentSubscribe(this); @@ -344,7 +354,7 @@ AbstractSparseBackwardDataFlowAnalysis::initializeRecursively(Operation *op) { LogicalResult AbstractSparseBackwardDataFlowAnalysis::visit(ProgramPoint point) { if (Operation *op = llvm::dyn_cast_if_present(point)) - visitOperation(op); + return visitOperation(op); else if (llvm::dyn_cast_if_present(point)) // For backward dataflow, we don't have to do any work for the blocks // themselves. CFG edges between blocks are processed by the BranchOp @@ -384,10 +394,11 @@ static MutableArrayRef operandsToOpOperands(OperandRange &operands) { return MutableArrayRef(operands.getBase(), operands.size()); } -void AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) { +LogicalResult +AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) { // If we're in a dead block, bail out. 
if (!getOrCreate(op->getBlock())->isLive()) - return; + return success(); SmallVector operandLattices = getLatticeElements(op->getOperands()); @@ -398,7 +409,7 @@ void AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) { // of the parent op if (auto branch = dyn_cast(op)) { visitRegionSuccessors(branch, operandLattices); - return; + return success(); } if (auto branch = dyn_cast(op)) { @@ -432,7 +443,7 @@ void AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) { OpOperand &operand = op->getOpOperand(index); visitBranchOperand(operand); } - return; + return success(); } // For function calls, connect the arguments of the entry blocks to the @@ -451,8 +462,11 @@ void AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) { MutableArrayRef argOpOperands = operandsToOpOperands(argOperands); Region *region = callable.getCallableRegion(); - if (!region || region->empty() || !getSolverConfig().isInterprocedural()) - return visitExternalCallImpl(call, operandLattices, resultLattices); + if (!region || region->empty() || + !getSolverConfig().isInterprocedural()) { + visitExternalCallImpl(call, operandLattices, resultLattices); + return success(); + } // Otherwise, propagate information from the entry point of the function // back to operands whenever possible. @@ -470,7 +484,7 @@ void AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) { OpOperand &opOperand = op->getOpOperand(index); visitCallOperand(opOperand); } - return; + return success(); } } @@ -487,7 +501,7 @@ void AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) { if (auto terminator = dyn_cast(op)) { if (auto branch = dyn_cast(op->getParentOp())) { visitRegionSuccessorsFromTerminator(terminator, branch); - return; + return success(); } } @@ -511,11 +525,11 @@ void AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) { // for the return ops of any public functions. 
setAllToExitStates(operandLattices); } - return; + return success(); } } - visitOperationImpl(op, operandLattices, resultLattices); + return visitOperationImpl(op, operandLattices, resultLattices); } void AbstractSparseBackwardDataFlowAnalysis::visitRegionSuccessors( diff --git a/mlir/test/Analysis/DataFlow/test-last-modified-error.mlir b/mlir/test/Analysis/DataFlow/test-last-modified-error.mlir new file mode 100644 index 00000000000000..57476e60d3c2a9 --- /dev/null +++ b/mlir/test/Analysis/DataFlow/test-last-modified-error.mlir @@ -0,0 +1,8 @@ +// RUN: not mlir-opt -test-last-modified %s 2>&1 | FileCheck %s + +// test error propagation from UnderlyingValueAnalysis::visitOperation +// CHECK: this op is always fails +func.func @test() { + %c0 = arith.constant { always_fail } 0 : i32 + return +} diff --git a/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp b/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp index 65592a5c5d698b..6794cbbbd89941 100644 --- a/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp +++ b/mlir/test/lib/Analysis/DataFlow/TestDenseBackwardDataFlowAnalysis.cpp @@ -55,8 +55,8 @@ class NextAccessAnalysis : public DenseBackwardDataFlowAnalysis { : DenseBackwardDataFlowAnalysis(solver, symbolTable), assumeFuncReads(assumeFuncReads) {} - void visitOperation(Operation *op, const NextAccess &after, - NextAccess *before) override; + LogicalResult visitOperation(Operation *op, const NextAccess &after, + NextAccess *before) override; void visitCallControlFlowTransfer(CallOpInterface call, CallControlFlowAction action, @@ -80,13 +80,16 @@ class NextAccessAnalysis : public DenseBackwardDataFlowAnalysis { }; } // namespace -void NextAccessAnalysis::visitOperation(Operation *op, const NextAccess &after, - NextAccess *before) { +LogicalResult NextAccessAnalysis::visitOperation(Operation *op, + const NextAccess &after, + NextAccess *before) { auto memory = dyn_cast(op); // If we can't reason about the 
memory effects, conservatively assume we can't // say anything about the next access. - if (!memory) - return setToExitState(before); + if (!memory) { + setToExitState(before); + return success(); + } SmallVector effects; memory.getEffects(effects); @@ -102,8 +105,10 @@ void NextAccessAnalysis::visitOperation(Operation *op, const NextAccess &after, // Effects with unspecified value are treated conservatively and we cannot // assume anything about the next access. - if (!value) - return setToExitState(before); + if (!value) { + setToExitState(before); + return success(); + } // If cannot find the most underlying value, we cannot assume anything about // the next accesses. @@ -115,7 +120,7 @@ void NextAccessAnalysis::visitOperation(Operation *op, const NextAccess &after, // If the underlying value is not known yet, don't propagate. if (!underlyingValue) - return; + return success(); underlyingValues.push_back(*underlyingValue); } @@ -124,12 +129,15 @@ void NextAccessAnalysis::visitOperation(Operation *op, const NextAccess &after, ChangeResult result = before->meet(after); for (const auto &[effect, value] : llvm::zip(effects, underlyingValues)) { // If the underlying value is known to be unknown, set to fixpoint. 
- if (!value) - return setToExitState(before); + if (!value) { + setToExitState(before); + return success(); + } result |= before->set(value, op); } propagateIfChanged(before, result); + return success(); } void NextAccessAnalysis::visitCallControlFlowTransfer( @@ -162,7 +170,7 @@ void NextAccessAnalysis::visitCallControlFlowTransfer( testCallAndStore.getStoreBeforeCall()) || (action == CallControlFlowAction::ExitCallee && !testCallAndStore.getStoreBeforeCall()))) { - visitOperation(call, after, before); + (void)visitOperation(call, after, before); } else { AbstractDenseBackwardDataFlowAnalysis::visitCallControlFlowTransfer( call, action, after, before); @@ -179,8 +187,8 @@ void NextAccessAnalysis::visitRegionBranchControlFlowTransfer( ((regionTo.isParent() && !testStoreWithARegion.getStoreBeforeRegion()) || (regionFrom.isParent() && testStoreWithARegion.getStoreBeforeRegion()))) { - visitOperation(branch, static_cast(after), - static_cast(before)); + (void)visitOperation(branch, static_cast(after), + static_cast(before)); } else { propagateIfChanged(before, before->meet(after)); } diff --git a/mlir/test/lib/Analysis/DataFlow/TestDenseDataFlowAnalysis.h b/mlir/test/lib/Analysis/DataFlow/TestDenseDataFlowAnalysis.h index 61ddc13f8a3d4a..57fe0ca458de21 100644 --- a/mlir/test/lib/Analysis/DataFlow/TestDenseDataFlowAnalysis.h +++ b/mlir/test/lib/Analysis/DataFlow/TestDenseDataFlowAnalysis.h @@ -191,10 +191,16 @@ class UnderlyingValueAnalysis using SparseForwardDataFlowAnalysis::SparseForwardDataFlowAnalysis; /// The underlying value of the results of an operation are not known. - void visitOperation(Operation *op, - ArrayRef operands, - ArrayRef results) override { + LogicalResult + visitOperation(Operation *op, + ArrayRef operands, + ArrayRef results) override { + // Hook to test error propagation from visitOperation. 
+ if (op->hasAttr("always_fail")) + return op->emitError("this op is always fails"); + setAllToEntryStates(results); + return success(); } /// At an entry point, the underlying value of a value is itself. diff --git a/mlir/test/lib/Analysis/DataFlow/TestDenseForwardDataFlowAnalysis.cpp b/mlir/test/lib/Analysis/DataFlow/TestDenseForwardDataFlowAnalysis.cpp index 3f9ce2dc0bc50a..301d2a20978c84 100644 --- a/mlir/test/lib/Analysis/DataFlow/TestDenseForwardDataFlowAnalysis.cpp +++ b/mlir/test/lib/Analysis/DataFlow/TestDenseForwardDataFlowAnalysis.cpp @@ -58,8 +58,8 @@ class LastModifiedAnalysis /// is propagated with no change. If the operation allocates a resource, then /// its reaching definitions is set to empty. If the operation writes to a /// resource, then its reaching definition is set to the written value. - void visitOperation(Operation *op, const LastModification &before, - LastModification *after) override; + LogicalResult visitOperation(Operation *op, const LastModification &before, + LastModification *after) override; void visitCallControlFlowTransfer(CallOpInterface call, CallControlFlowAction action, @@ -83,14 +83,15 @@ class LastModifiedAnalysis }; } // end anonymous namespace -void LastModifiedAnalysis::visitOperation(Operation *op, - const LastModification &before, - LastModification *after) { +LogicalResult LastModifiedAnalysis::visitOperation( + Operation *op, const LastModification &before, LastModification *after) { auto memory = dyn_cast(op); // If we can't reason about the memory effects, then conservatively assume we // can't deduce anything about the last modifications. - if (!memory) - return setToEntryState(after); + if (!memory) { + setToEntryState(after); + return success(); + } SmallVector effects; memory.getEffects(effects); @@ -106,8 +107,10 @@ void LastModifiedAnalysis::visitOperation(Operation *op, // If we see an effect on anything other than a value, assume we can't // deduce anything about the last modifications. 
- if (!value) - return setToEntryState(after); + if (!value) { + setToEntryState(after); + return success(); + } // If we cannot find the underlying value, we shouldn't just propagate the // effects through, return the pessimistic state. @@ -119,7 +122,7 @@ void LastModifiedAnalysis::visitOperation(Operation *op, // If the underlying value is not yet known, don't propagate yet. if (!underlyingValue) - return; + return success(); underlyingValues.push_back(*underlyingValue); } @@ -128,8 +131,10 @@ void LastModifiedAnalysis::visitOperation(Operation *op, ChangeResult result = after->join(before); for (const auto &[effect, value] : llvm::zip(effects, underlyingValues)) { // If the underlying value is known to be unknown, set to fixpoint state. - if (!value) - return setToEntryState(after); + if (!value) { + setToEntryState(after); + return success(); + } // Nothing to do for reads. if (isa(effect.getEffect())) @@ -138,6 +143,7 @@ void LastModifiedAnalysis::visitOperation(Operation *op, result |= after->set(value, op); } propagateIfChanged(after, result); + return success(); } void LastModifiedAnalysis::visitCallControlFlowTransfer( @@ -169,7 +175,8 @@ void LastModifiedAnalysis::visitCallControlFlowTransfer( testCallAndStore.getStoreBeforeCall()) || (action == CallControlFlowAction::ExitCallee && !testCallAndStore.getStoreBeforeCall()))) { - return visitOperation(call, before, after); + (void)visitOperation(call, before, after); + return; } AbstractDenseForwardDataFlowAnalysis::visitCallControlFlowTransfer( call, action, before, after); @@ -188,7 +195,7 @@ void LastModifiedAnalysis::visitRegionBranchControlFlowTransfer( [=](auto storeWithRegion) { if ((!regionTo && !storeWithRegion.getStoreBeforeRegion()) || (!regionFrom && storeWithRegion.getStoreBeforeRegion())) - visitOperation(branch, before, after); + (void)visitOperation(branch, before, after); defaultHandling(); }) .Default([=](auto) { defaultHandling(); }); diff --git 
a/mlir/test/lib/Analysis/DataFlow/TestSparseBackwardDataFlowAnalysis.cpp b/mlir/test/lib/Analysis/DataFlow/TestSparseBackwardDataFlowAnalysis.cpp index 30297380466442..2445b58452bd60 100644 --- a/mlir/test/lib/Analysis/DataFlow/TestSparseBackwardDataFlowAnalysis.cpp +++ b/mlir/test/lib/Analysis/DataFlow/TestSparseBackwardDataFlowAnalysis.cpp @@ -76,8 +76,8 @@ class WrittenToAnalysis : public SparseBackwardDataFlowAnalysis { : SparseBackwardDataFlowAnalysis(solver, symbolTable), assumeFuncWrites(assumeFuncWrites) {} - void visitOperation(Operation *op, ArrayRef operands, - ArrayRef results) override; + LogicalResult visitOperation(Operation *op, ArrayRef operands, + ArrayRef results) override; void visitBranchOperand(OpOperand &operand) override; @@ -94,15 +94,15 @@ class WrittenToAnalysis : public SparseBackwardDataFlowAnalysis { bool assumeFuncWrites; }; -void WrittenToAnalysis::visitOperation(Operation *op, - ArrayRef operands, - ArrayRef results) { +LogicalResult +WrittenToAnalysis::visitOperation(Operation *op, ArrayRef operands, + ArrayRef results) { if (auto store = dyn_cast(op)) { SetVector newWrites; newWrites.insert(op->getAttrOfType("tag_name")); propagateIfChanged(operands[0], operands[0]->getValue().addWrites(newWrites)); - return; + return success(); } // By default, every result of an op depends on every operand. for (const WrittenTo *r : results) { for (WrittenTo *operand : operands) { @@ -110,6 +110,7 @@ void WrittenToAnalysis::visitOperation(Operation *op, } addDependency(const_cast(r), op); } + return success(); } void WrittenToAnalysis::visitBranchOperand(OpOperand &operand) { From 378daa6c6fd10d3704be449f2fe9c55df522a6e9 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Thu, 22 Aug 2024 17:20:47 +0800 Subject: [PATCH 167/426] [MemCpyOpt] Avoid infinite loops in `MemCpyOptPass::processMemCpyMemCpyDependence` (#103218) Closes https://github.com/llvm/llvm-project/issues/102994. 
--- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 4 ++ llvm/test/Transforms/MemCpyOpt/pr102994.ll | 39 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 llvm/test/Transforms/MemCpyOpt/pr102994.ll diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 1d779128e454c1..3f15fa2163d270 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1193,6 +1193,10 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, CopySourceAlign = commonAlignment(*CopySourceAlign, MForwardOffset); } + // Avoid infinite loops + if (BAA.isMustAlias(M->getSource(), CopySource)) + return false; + // Verify that the copied-from memory doesn't change in between the two // transfers. For example, in: // memcpy(a <- b) diff --git a/llvm/test/Transforms/MemCpyOpt/pr102994.ll b/llvm/test/Transforms/MemCpyOpt/pr102994.ll new file mode 100644 index 00000000000000..9a782b9b3752a2 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/pr102994.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=memcpyopt < %s | FileCheck %s + +@g1 = external global i8 +@g2 = external global [64 x i8] +@g3 = global i8 0, align 1 + +define void @func() { +; CHECK-LABEL: define void @func() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr getelementptr inbounds (i8, ptr @g2, i64 16), ptr getelementptr inbounds nuw (i8, ptr @g2, i64 16), i64 20, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr @g1, ptr getelementptr inbounds (i8, ptr @g2, i64 24), i64 1, i1 false) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.memcpy.p0.p0.i64(ptr getelementptr inbounds (i8, ptr @g2, i64 16), ptr getelementptr inbounds nuw (i8, ptr @g2, i64 16), i64 20, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr @g1, ptr getelementptr inbounds (i8, ptr @g2, i64 
24), i64 1, i1 false) + ret void +} + +define void @func2(ptr %p) { +; CHECK-LABEL: define void @func2( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[P]], i64 32 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[P]], i64 34 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[P]], i64 32 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[GEP1]], ptr [[GEP3]], i64 32, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr @g3, ptr [[GEP2]], i64 1, i1 false) +; CHECK-NEXT: ret void +; +entry: + %gep1 = getelementptr i8, ptr %p, i64 32 + %gep2 = getelementptr i8, ptr %p, i64 34 + %gep3 = getelementptr i8, ptr %p, i64 32 + call void @llvm.memcpy.p0.p0.i64(ptr %gep1, ptr %gep3, i64 32, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr @g3, ptr %gep2, i64 1, i1 false) + ret void +} From ccb2b79655217587accfa592c575f9b7267308b9 Mon Sep 17 00:00:00 2001 From: Matt Devereau Date: Thu, 22 Aug 2024 09:42:02 +0000 Subject: [PATCH 168/426] Fix logf128 tests to allow negative NaNs from (#104929) --- llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll b/llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll index 1f8e1d377f93b3..82db5e4066cb1b 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll @@ -72,7 +72,7 @@ define fp128 @log_e_smallest_number_larger_than_one(){ define fp128 @log_e_negative_2(){ ; CHECK-LABEL: define fp128 @log_e_negative_2() { -; CHECK-NEXT: ret fp128 0xL00000000000000007FFF800000000000 +; CHECK-NEXT: ret fp128 0xL0000000000000000{{[7|F]}}FFF800000000000 ; %A = call fp128 @llvm.log.f128(fp128 noundef 0xL0000000000000000C000000000000000) ret fp128 %A @@ -104,7 +104,7 @@ define fp128 @log_e_infinity(){ define fp128 @log_e_negative_infinity(){ ; CHECK-LABEL: define fp128 
@log_e_negative_infinity() { -; CHECK-NEXT: ret fp128 0xL00000000000000007FFF800000000000 +; CHECK-NEXT: ret fp128 0xL0000000000000000{{[7|F]}}FFF800000000000 ; %A = call fp128 @llvm.log.f128(fp128 noundef 0xL0000000000000000FFFF000000000000) ret fp128 %A @@ -120,7 +120,7 @@ define fp128 @log_e_nan(){ define <2 x fp128> @log_e_negative_2_vector(){ ; CHECK-LABEL: define <2 x fp128> @log_e_negative_2_vector() { -; CHECK-NEXT: ret <2 x fp128> +; CHECK-NEXT: ret <2 x fp128> ; %A = call <2 x fp128> @llvm.log.v2f128(<2 x fp128> ) ret <2 x fp128> %A From 9ff0468436c957fadcd8926683696a879cbc78a0 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 22 Aug 2024 11:47:56 +0200 Subject: [PATCH 169/426] [libc++] Refactor the std::unique_lock tests (#102151) This makes some of the tests not flaky anymore, updates some tests to also work in C++03 and modernizes them in general. --- ...ass.cpp => implicit_ctad.compile.pass.cpp} | 14 +-- .../thread.lock.unique/mutex.pass.cpp | 69 +++++++++++++++ .../copy_assign.compile.pass.cpp | 4 +- .../copy_ctor.compile.pass.cpp | 4 +- .../thread.lock.unique.cons/default.pass.cpp | 9 +- .../move_assign.pass.cpp | 41 +++------ .../move_ctor.pass.cpp | 39 ++++---- .../thread.lock.unique.cons/mutex.pass.cpp | 44 ++-------- .../mutex_adopt_lock.pass.cpp | 30 ++----- .../mutex_defer_lock.pass.cpp | 31 +++---- .../mutex_duration.pass.cpp | 79 +++++------------ .../mutex_time_point.pass.cpp | 80 +++++------------ .../mutex_try_to_lock.pass.cpp | 72 +++++---------- .../thread.lock.unique.locking/lock.pass.cpp | 81 ++++++----------- .../try_lock.pass.cpp | 41 +++++---- .../try_lock_for.pass.cpp | 43 +++++---- .../try_lock_until.pass.cpp | 39 ++++---- .../unlock.pass.cpp | 26 ++++-- .../member_swap.pass.cpp | 26 ++++-- .../nonmember_swap.pass.cpp | 26 ++++-- .../thread.lock.unique.mod/release.pass.cpp | 33 +++---- .../thread.lock.unique.obs/mutex.pass.cpp | 21 +++-- .../thread.lock.unique.obs/op_bool.pass.cpp | 24 ++--- 
.../thread.lock.unique.obs/owns_lock.pass.cpp | 21 +++-- .../thread.lock.unique/types.compile.pass.cpp | 5 +- .../thread.lock/thread.lock.unique/types.h | 88 ------------------- libcxx/test/support/checking_mutex.h | 80 +++++++++++++++++ 27 files changed, 503 insertions(+), 567 deletions(-) rename libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/{implicit_ctad.pass.cpp => implicit_ctad.compile.pass.cpp} (71%) create mode 100644 libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/mutex.pass.cpp delete mode 100644 libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.h create mode 100644 libcxx/test/support/checking_mutex.h diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.compile.pass.cpp similarity index 71% rename from libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp rename to libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.compile.pass.cpp index 8c7ca4279eead0..cc94c5704327fe 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.compile.pass.cpp @@ -16,15 +16,7 @@ #include -#include "test_macros.h" -#include "types.h" +#include "checking_mutex.h" -int main(int, char**) { - MyMutex mutex; - { - std::unique_lock lock(mutex); - ASSERT_SAME_TYPE(decltype(lock), std::unique_lock); - } - - return 0; -} +checking_mutex mux; +static_assert(std::is_same_v, decltype(std::unique_lock{mux})>); diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/mutex.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/mutex.pass.cpp new file mode 100644 index 00000000000000..fec8740fbf4404 --- /dev/null +++ 
b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/mutex.pass.cpp @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-threads + +// + +// Make sure std::unique_lock works with std::mutex as expected. + +#include +#include +#include + +#include "make_test_thread.h" + +std::atomic keep_waiting; +std::atomic child_thread_locked; +std::mutex mux; +bool main_thread_unlocked = false; +bool child_thread_unlocked = false; + +void lock_thread() { + std::unique_lock lock(mux); + assert(main_thread_unlocked); + main_thread_unlocked = false; + child_thread_unlocked = true; +} + +void try_lock_thread() { + std::unique_lock lock(mux, std::try_to_lock_t()); + assert(lock.owns_lock()); + child_thread_locked = true; + + while (keep_waiting) + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + child_thread_unlocked = true; +} + +int main(int, char**) { + { + mux.lock(); + std::thread t = support::make_test_thread(lock_thread); + main_thread_unlocked = true; + mux.unlock(); + t.join(); + assert(child_thread_unlocked); + } + + { + child_thread_unlocked = false; + child_thread_locked = false; + keep_waiting = true; + std::thread t = support::make_test_thread(try_lock_thread); + while (!child_thread_locked) + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + assert(!mux.try_lock()); + keep_waiting = false; + t.join(); + assert(child_thread_unlocked); + } + + return 0; +} diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.pass.cpp 
b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.pass.cpp index 9ab8369637cdc5..c0cb7d4ddd27a6 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.pass.cpp @@ -14,6 +14,6 @@ #include -#include "../types.h" +#include "checking_mutex.h" -static_assert(!std::is_copy_assignable >::value, ""); +static_assert(!std::is_copy_assignable >::value, ""); diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.pass.cpp index e846061f5fbd08..2846b24125e784 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.pass.cpp @@ -14,6 +14,6 @@ #include -#include "../types.h" +#include "checking_mutex.h" -static_assert(!std::is_copy_constructible >::value, ""); +static_assert(!std::is_copy_constructible >::value, ""); diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp index 6fc4f7f23ced3a..f6ca534de42fe9 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp @@ -14,12 +14,17 @@ #include #include +#include +#include "checking_mutex.h" #include "test_macros.h" -#include "../types.h" + +#if TEST_STD_VER >= 11 
+static_assert(std::is_nothrow_default_constructible>::value, ""); +#endif int main(int, char**) { - std::unique_lock ul; + std::unique_lock ul; assert(!ul.owns_lock()); assert(ul.mutex() == nullptr); diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp index 9563fdebd3e060..588d8332c4164b 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp @@ -13,37 +13,24 @@ // unique_lock& operator=(unique_lock&& u); #include +#include #include -#include "nasty_containers.h" -#include "../types.h" -#include "test_macros.h" +#include "checking_mutex.h" int main(int, char**) { - { - typedef MyMutex M; - M m0; - M m1; - std::unique_lock lk0(m0); - std::unique_lock lk1(m1); - lk1 = std::move(lk0); - assert(lk1.mutex() == std::addressof(m0)); - assert(lk1.owns_lock() == true); - assert(lk0.mutex() == nullptr); - assert(lk0.owns_lock() == false); - } - { - typedef nasty_mutex M; - M m0; - M m1; - std::unique_lock lk0(m0); - std::unique_lock lk1(m1); - lk1 = std::move(lk0); - assert(lk1.mutex() == std::addressof(m0)); - assert(lk1.owns_lock() == true); - assert(lk0.mutex() == nullptr); - assert(lk0.owns_lock() == false); - } + checking_mutex m0; + checking_mutex m1; + std::unique_lock lk0(m0); + std::unique_lock lk1(m1); + + auto& result = (lk1 = std::move(lk0)); + + assert(&result == &lk1); + assert(lk1.mutex() == std::addressof(m0)); + assert(lk1.owns_lock()); + assert(lk0.mutex() == nullptr); + assert(!lk0.owns_lock()); return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp 
b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp index 08f6fc8410e25e..7dab92ab69d987 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp @@ -5,8 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: c++03 // @@ -15,33 +13,26 @@ // unique_lock(unique_lock&& u); #include +#include #include +#include -#include "nasty_containers.h" -#include "../types.h" +#include "checking_mutex.h" #include "test_macros.h" +#if TEST_STD_VER >= 11 +static_assert(std::is_nothrow_move_constructible>::value, ""); +#endif + int main(int, char**) { - { - typedef MyMutex M; - M m; - std::unique_lock lk0(m); - std::unique_lock lk = std::move(lk0); - assert(lk.mutex() == std::addressof(m)); - assert(lk.owns_lock() == true); - assert(lk0.mutex() == nullptr); - assert(lk0.owns_lock() == false); - } - { - typedef nasty_mutex M; - M m; - std::unique_lock lk0(m); - std::unique_lock lk = std::move(lk0); - assert(lk.mutex() == std::addressof(m)); - assert(lk.owns_lock() == true); - assert(lk0.mutex() == nullptr); - assert(lk0.owns_lock() == false); - } + checking_mutex m; + std::unique_lock lk0(m); + std::unique_lock lk = std::move(lk0); + + assert(lk.mutex() == std::addressof(m)); + assert(lk.owns_lock()); + assert(lk0.mutex() == nullptr); + assert(!lk0.owns_lock()); return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex.pass.cpp index 2be25748e903b0..31f15deec0cfaf 100644 --- 
a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex.pass.cpp @@ -5,9 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: no-threads -// ALLOW_RETRIES: 2 // @@ -19,45 +16,22 @@ // -> unique_lock<_Mutex>; // C++17 #include -#include -#include #include -#include -#include "make_test_thread.h" +#include "checking_mutex.h" #include "test_macros.h" -std::mutex m; - -typedef std::chrono::system_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; - -void f() -{ - time_point t0 = Clock::now(); - time_point t1; - { - std::unique_lock ul(m); - t1 = Clock::now(); - } - ns d = t1 - t0 - ms(250); - assert(d < ms(50)); // within 50ms -} +int main(int, char**) { + checking_mutex mux; -int main(int, char**) -{ - m.lock(); - std::thread t = support::make_test_thread(f); - std::this_thread::sleep_for(ms(250)); - m.unlock(); - t.join(); + { + std::unique_lock lock(mux); + assert(mux.current_state == checking_mutex::locked_via_lock); + } + assert(mux.current_state == checking_mutex::unlocked); #if TEST_STD_VER >= 17 - std::unique_lock ul(m); - static_assert((std::is_same>::value), "" ); + static_assert(std::is_same_v, decltype(std::unique_lock{mux})>, ""); #endif return 0; diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp index 28cc43853180e6..14db741fa4adc3 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp +++ 
b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp @@ -5,8 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: c++03 // @@ -15,29 +13,19 @@ // unique_lock(mutex_type& m, adopt_lock_t); #include +#include #include -#include "nasty_containers.h" -#include "../types.h" -#include "test_macros.h" +#include "checking_mutex.h" int main(int, char**) { - { - typedef MyMutex M; - M m; - m.lock(); - std::unique_lock lk(m, std::adopt_lock); - assert(lk.mutex() == std::addressof(m)); - assert(lk.owns_lock() == true); - } - { - typedef nasty_mutex M; - M m; - m.lock(); - std::unique_lock lk(m, std::adopt_lock); - assert(lk.mutex() == std::addressof(m)); - assert(lk.owns_lock() == true); - } + checking_mutex m; + m.lock(); + m.last_try = checking_mutex::none; + std::unique_lock lk(m, std::adopt_lock_t()); + assert(m.last_try == checking_mutex::none); + assert(lk.mutex() == std::addressof(m)); + assert(lk.owns_lock()); return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp index 96a9afbc9438c4..4335892dd28477 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp @@ -5,8 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: c++03 // @@ -15,27 +13,24 @@ // unique_lock(mutex_type& m, defer_lock_t); #include +#include #include +#include -#include "nasty_containers.h" -#include 
"../types.h" +#include "checking_mutex.h" #include "test_macros.h" +#if TEST_STD_VER >= 11 +static_assert( + std::is_nothrow_constructible, checking_mutex&, std::defer_lock_t>::value, ""); +#endif + int main(int, char**) { - { - typedef MyMutex M; - M m; - std::unique_lock lk(m, std::defer_lock); - assert(lk.mutex() == std::addressof(m)); - assert(lk.owns_lock() == false); - } - { - typedef nasty_mutex M; - M m; - std::unique_lock lk(m, std::defer_lock); - assert(lk.mutex() == std::addressof(m)); - assert(lk.owns_lock() == false); - } + checking_mutex m; + std::unique_lock lk(m, std::defer_lock_t()); + assert(m.last_try == checking_mutex::none); + assert(lk.mutex() == std::addressof(m)); + assert(lk.owns_lock() == false); return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_duration.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_duration.pass.cpp index 4bfabab919f177..624b99623d6be7 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_duration.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_duration.pass.cpp @@ -5,69 +5,36 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: no-threads -// ALLOW_RETRIES: 2 // -// class timed_mutex; - // template -// unique_lock(mutex_type& m, const chrono::duration& rel_time); +// unique_lock::unique_lock(mutex_type& m, const chrono::duration& rel_time); -#include -#include -#include #include +#include +#include -#include "make_test_thread.h" -#include "test_macros.h" - -std::timed_mutex m; - -typedef std::chrono::steady_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef 
std::chrono::nanoseconds ns; - -void f1() -{ - time_point t0 = Clock::now(); - std::unique_lock lk(m, ms(300)); - assert(lk.owns_lock() == true); - time_point t1 = Clock::now(); - ns d = t1 - t0 - ms(250); - assert(d < ms(50)); // within 50ms -} - -void f2() -{ - time_point t0 = Clock::now(); - std::unique_lock lk(m, ms(250)); - assert(lk.owns_lock() == false); - time_point t1 = Clock::now(); - ns d = t1 - t0 - ms(250); - assert(d < ms(50)); // within 50ms -} - -int main(int, char**) -{ - { - m.lock(); - std::thread t = support::make_test_thread(f1); - std::this_thread::sleep_for(ms(250)); - m.unlock(); - t.join(); - } - { - m.lock(); - std::thread t = support::make_test_thread(f2); - std::this_thread::sleep_for(ms(300)); - m.unlock(); - t.join(); - } +#include "checking_mutex.h" + +int main(int, char**) { + checking_mutex mux; + { // check successful lock + mux.reject = false; + std::unique_lock lock(mux, std::chrono::seconds()); + assert(mux.current_state == checking_mutex::locked_via_try_lock_for); + assert(lock.owns_lock()); + } + assert(mux.current_state == checking_mutex::unlocked); + + { // check unsuccessful lock + mux.reject = true; + std::unique_lock lock(mux, std::chrono::seconds()); + assert(mux.current_state == checking_mutex::unlocked); + assert(mux.last_try == checking_mutex::locked_via_try_lock_for); + assert(!lock.owns_lock()); + } + assert(mux.current_state == checking_mutex::unlocked); return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_time_point.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_time_point.pass.cpp index b85bbace3233c1..93d322050476f6 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_time_point.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_time_point.pass.cpp @@ -5,69 +5,37 @@ // 
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: no-threads -// ALLOW_RETRIES: 2 // -// class timed_mutex; - // template -// unique_lock(mutex_type& m, const chrono::time_point& abs_time); +// unique_lock::unique_lock(mutex_type& m, const chrono::time_point& abs_time); -#include -#include -#include #include +#include +#include -#include "make_test_thread.h" -#include "test_macros.h" - -std::timed_mutex m; - -typedef std::chrono::steady_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; - -void f1() -{ - time_point t0 = Clock::now(); - std::unique_lock lk(m, Clock::now() + ms(300)); - assert(lk.owns_lock() == true); - time_point t1 = Clock::now(); - ns d = t1 - t0 - ms(250); - assert(d < ns(50000000)); // within 50ms -} - -void f2() -{ - time_point t0 = Clock::now(); - std::unique_lock lk(m, Clock::now() + ms(250)); - assert(lk.owns_lock() == false); - time_point t1 = Clock::now(); - ns d = t1 - t0 - ms(250); - assert(d < ms(50)); // within 50ms -} - -int main(int, char**) -{ - { - m.lock(); - std::thread t = support::make_test_thread(f1); - std::this_thread::sleep_for(ms(250)); - m.unlock(); - t.join(); - } - { - m.lock(); - std::thread t = support::make_test_thread(f2); - std::this_thread::sleep_for(ms(300)); - m.unlock(); - t.join(); - } +#include "checking_mutex.h" + +int main(int, char**) { + checking_mutex mux; + + { // check successful lock + mux.reject = false; + std::unique_lock lock(mux, std::chrono::time_point()); + assert(mux.current_state == checking_mutex::locked_via_try_lock_until); + assert(lock.owns_lock()); + } + assert(mux.current_state == checking_mutex::unlocked); + + { // check unsuccessful lock + mux.reject = true; + std::unique_lock lock(mux, std::chrono::time_point()); + assert(mux.current_state == 
checking_mutex::unlocked); + assert(mux.last_try == checking_mutex::locked_via_try_lock_until); + assert(!lock.owns_lock()); + } + assert(mux.current_state == checking_mutex::unlocked); return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_try_to_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_try_to_lock.pass.cpp index 992d383dfa780d..e7af0fc34e7509 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_try_to_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_try_to_lock.pass.cpp @@ -6,10 +6,6 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: no-threads -// UNSUPPORTED: c++03 -// ALLOW_RETRIES: 2 - // // template class unique_lock; @@ -17,55 +13,29 @@ // unique_lock(mutex_type& m, try_to_lock_t); #include -#include -#include #include -#include - -#include "make_test_thread.h" -#include "test_macros.h" - -std::mutex m; - -typedef std::chrono::system_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; - -void f() -{ - time_point t0 = Clock::now(); - { - std::unique_lock lk(m, std::try_to_lock); - assert(lk.owns_lock() == false); - } - { - std::unique_lock lk(m, std::try_to_lock); - assert(lk.owns_lock() == false); - } - { - std::unique_lock lk(m, std::try_to_lock); - assert(lk.owns_lock() == false); - } - while (true) - { - std::unique_lock lk(m, std::try_to_lock); - if (lk.owns_lock()) - break; - } - time_point t1 = Clock::now(); - ns d = t1 - t0 - ms(250); - assert(d < ms(200)); // within 200ms -} -int main(int, char**) -{ - m.lock(); - std::thread t = support::make_test_thread(f); - std::this_thread::sleep_for(ms(250)); - m.unlock(); - t.join(); 
+#include "checking_mutex.h" + +int main(int, char**) { + checking_mutex mux; + + { // check successful lock + mux.reject = false; + std::unique_lock lock(mux, std::try_to_lock_t()); + assert(mux.current_state == checking_mutex::locked_via_try_lock); + assert(lock.owns_lock()); + } + assert(mux.current_state == checking_mutex::unlocked); + + { // check successful lock + mux.reject = true; + std::unique_lock lock(mux, std::try_to_lock_t()); + assert(mux.last_try == checking_mutex::locked_via_try_lock); + assert(mux.current_state == checking_mutex::unlocked); + assert(!lock.owns_lock()); + } + assert(mux.current_state == checking_mutex::unlocked); return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp index 6767e11a1f8b49..4be1eaa5e1b95f 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp @@ -5,10 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: no-threads -// UNSUPPORTED: c++03 -// ALLOW_RETRIES: 2 // @@ -17,65 +13,42 @@ // void lock(); #include -#include -#include #include #include -#include -#include "make_test_thread.h" +#include "checking_mutex.h" #include "test_macros.h" -std::mutex m; +int main(int, char**) { + checking_mutex mux; + std::unique_lock lk(mux, std::defer_lock_t()); + assert(mux.last_try == checking_mutex::none); + lk.lock(); + assert(mux.current_state == checking_mutex::locked_via_lock); + mux.last_try = checking_mutex::none; -typedef std::chrono::system_clock Clock; -typedef Clock::time_point time_point; -typedef Clock::duration duration; -typedef 
std::chrono::milliseconds ms; -typedef std::chrono::nanoseconds ns; +#ifndef TEST_HAS_NO_EXCEPTIONS + try { + mux.last_try = checking_mutex::none; + lk.lock(); + assert(false); + } catch (std::system_error& e) { + assert(mux.last_try == checking_mutex::none); + assert(e.code() == std::errc::resource_deadlock_would_occur); + } -void f() -{ - std::unique_lock lk(m, std::defer_lock); - time_point t0 = Clock::now(); + lk.unlock(); + lk.release(); + + try { + mux.last_try = checking_mutex::none; lk.lock(); - time_point t1 = Clock::now(); - assert(lk.owns_lock() == true); - ns d = t1 - t0 - ms(250); - assert(d < ms(25)); // within 25ms -#ifndef TEST_HAS_NO_EXCEPTIONS - try - { - lk.lock(); - assert(false); - } - catch (std::system_error& e) - { - assert(e.code() == std::errc::resource_deadlock_would_occur); - } + assert(false); + } catch (std::system_error& e) { + assert(mux.last_try == checking_mutex::none); + assert(e.code() == std::errc::operation_not_permitted); + } #endif - lk.unlock(); - lk.release(); -#ifndef TEST_HAS_NO_EXCEPTIONS - try - { - lk.lock(); - assert(false); - } - catch (std::system_error& e) - { - assert(e.code() == std::errc::operation_not_permitted); - } -#endif -} - -int main(int, char**) -{ - m.lock(); - std::thread t = support::make_test_thread(f); - std::this_thread::sleep_for(ms(250)); - m.unlock(); - t.join(); return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp index 2ee5d3766eb18e..41a5957480556b 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp @@ -5,9 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // 
//===----------------------------------------------------------------------===// -// -// UNSUPPORTED: c++03 -// ALLOW_RETRIES: 2 // @@ -20,33 +17,43 @@ #include #include "test_macros.h" -#include "../types.h" - -MyTimedMutex m; +#include "checking_mutex.h" int main(int, char**) { - std::unique_lock lk(m, std::defer_lock); - assert(lk.try_lock() == true); - assert(m.try_lock_called == true); - assert(lk.owns_lock() == true); + checking_mutex mux; + + std::unique_lock lock(mux, std::defer_lock_t()); + assert(lock.try_lock()); + assert(mux.current_state == checking_mutex::locked_via_try_lock); + assert(lock.owns_lock()); + #ifndef TEST_HAS_NO_EXCEPTIONS try { - TEST_IGNORE_NODISCARD lk.try_lock(); + mux.last_try = checking_mutex::none; + TEST_IGNORE_NODISCARD lock.try_lock(); assert(false); } catch (std::system_error& e) { + assert(mux.last_try == checking_mutex::none); assert(e.code() == std::errc::resource_deadlock_would_occur); } #endif - lk.unlock(); - assert(lk.try_lock() == false); - assert(m.try_lock_called == false); - assert(lk.owns_lock() == false); - lk.release(); + + lock.unlock(); + mux.reject = true; + + assert(!lock.try_lock()); + assert(mux.last_try == checking_mutex::locked_via_try_lock); + + assert(!lock.owns_lock()); + lock.release(); + #ifndef TEST_HAS_NO_EXCEPTIONS try { - TEST_IGNORE_NODISCARD lk.try_lock(); + mux.last_try = checking_mutex::none; + (void)lock.try_lock(); assert(false); } catch (std::system_error& e) { + assert(mux.last_try == checking_mutex::none); assert(e.code() == std::errc::operation_not_permitted); } #endif diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp index 603cc7b185620c..cfe81a8faf3386 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp +++ 
b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp @@ -5,8 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: c++03 // @@ -20,34 +18,47 @@ #include #include "test_macros.h" -#include "../types.h" - -MyTimedMutex m; +#include "checking_mutex.h" int main(int, char**) { using ms = std::chrono::milliseconds; - std::unique_lock lk(m, std::defer_lock); - assert(lk.try_lock_for(ms(5)) == true); - assert(m.try_lock_for_called == true); - assert(lk.owns_lock() == true); + + checking_mutex mux; + + std::unique_lock lock(mux, std::defer_lock_t()); + + assert(lock.try_lock_for(ms(5))); + assert(mux.current_state == checking_mutex::locked_via_try_lock_for); + assert(lock.owns_lock()); + #ifndef TEST_HAS_NO_EXCEPTIONS try { - TEST_IGNORE_NODISCARD lk.try_lock_for(ms(5)); + mux.last_try = checking_mutex::none; + (void)lock.try_lock_for(ms(5)); + assert(false); } catch (std::system_error& e) { + assert(mux.last_try == checking_mutex::none); assert(e.code() == std::errc::resource_deadlock_would_occur); } #endif - lk.unlock(); - assert(lk.try_lock_for(ms(5)) == false); - assert(m.try_lock_for_called == false); - assert(lk.owns_lock() == false); - lk.release(); + + lock.unlock(); + mux.reject = true; + assert(!lock.try_lock_for(ms(5))); + assert(mux.last_try == checking_mutex::locked_via_try_lock_for); + assert(!lock.owns_lock()); + + lock.release(); + #ifndef TEST_HAS_NO_EXCEPTIONS try { - TEST_IGNORE_NODISCARD lk.try_lock_for(ms(5)); + mux.last_try = checking_mutex::none; + (void)lock.try_lock_for(ms(5)); + assert(false); } catch (std::system_error& e) { + assert(mux.last_try == checking_mutex::none); assert(e.code() == std::errc::operation_not_permitted); } #endif diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp 
b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp index 46ab95197311c0..bc261f681020f3 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp @@ -5,8 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: c++03 // @@ -20,35 +18,44 @@ #include #include +#include "checking_mutex.h" #include "test_macros.h" -#include "../types.h" - -MyTimedMutex m; int main(int, char**) { typedef std::chrono::system_clock Clock; - std::unique_lock lk(m, std::defer_lock); - assert(lk.try_lock_until(Clock::now()) == true); - assert(m.try_lock_until_called == true); - assert(lk.owns_lock() == true); + checking_mutex mux; + + std::unique_lock lock(mux, std::defer_lock_t()); + + assert(lock.try_lock_until(Clock::now())); + assert(mux.current_state == checking_mutex::locked_via_try_lock_until); + assert(lock.owns_lock()); + #ifndef TEST_HAS_NO_EXCEPTIONS try { - TEST_IGNORE_NODISCARD lk.try_lock_until(Clock::now()); + mux.last_try = checking_mutex::none; + (void)lock.try_lock_until(Clock::now()); assert(false); } catch (std::system_error& e) { + assert(mux.last_try == checking_mutex::none); assert(e.code() == std::errc::resource_deadlock_would_occur); } #endif - lk.unlock(); - assert(lk.try_lock_until(Clock::now()) == false); - assert(m.try_lock_until_called == false); - assert(lk.owns_lock() == false); - lk.release(); + + lock.unlock(); + mux.reject = true; + assert(!lock.try_lock_until(Clock::now())); + assert(mux.last_try == checking_mutex::locked_via_try_lock_until); + assert(lock.owns_lock() == false); + lock.release(); + #ifndef TEST_HAS_NO_EXCEPTIONS try { - TEST_IGNORE_NODISCARD 
lk.try_lock_until(Clock::now()); + mux.last_try = checking_mutex::none; + (void)lock.try_lock_until(Clock::now()); assert(false); } catch (std::system_error& e) { + assert(mux.last_try == checking_mutex::none); assert(e.code() == std::errc::operation_not_permitted); } #endif diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp index 97808f60f2e552..cfc44a6cd5d254 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp @@ -17,28 +17,36 @@ #include #include "test_macros.h" -#include "../types.h" - -MyMutex m; +#include "checking_mutex.h" int main(int, char**) { - std::unique_lock lk(m); - lk.unlock(); - assert(lk.owns_lock() == false); + checking_mutex mux; + std::unique_lock lock(mux); + assert(mux.current_state == checking_mutex::locked_via_lock); + lock.unlock(); + assert(mux.current_state == checking_mutex::unlocked); + assert(!lock.owns_lock()); + #ifndef TEST_HAS_NO_EXCEPTIONS try { - lk.unlock(); + mux.last_try = checking_mutex::none; + lock.unlock(); assert(false); } catch (std::system_error& e) { + assert(mux.last_try == checking_mutex::none); assert(e.code() == std::errc::operation_not_permitted); } #endif - lk.release(); + + lock.release(); + #ifndef TEST_HAS_NO_EXCEPTIONS try { - lk.unlock(); + mux.last_try = checking_mutex::none; + lock.unlock(); assert(false); } catch (std::system_error& e) { + assert(mux.last_try == checking_mutex::none); assert(e.code() == std::errc::operation_not_permitted); } #endif diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp 
b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp index 361c85e0150597..e2ffbf4a23a9c5 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp @@ -13,21 +13,29 @@ // void swap(unique_lock& u); #include +#include #include +#include "checking_mutex.h" #include "test_macros.h" -#include "../types.h" -MyMutex m; +#if TEST_STD_VER >= 11 +static_assert( + noexcept(std::declval&>().swap(std::declval&>())), + ""); +#endif int main(int, char**) { - std::unique_lock lk1(m); - std::unique_lock lk2; - lk1.swap(lk2); - assert(lk1.mutex() == nullptr); - assert(lk1.owns_lock() == false); - assert(lk2.mutex() == &m); - assert(lk2.owns_lock() == true); + checking_mutex mux; + std::unique_lock lock1(mux); + std::unique_lock lock2; + + lock1.swap(lock2); + + assert(lock1.mutex() == nullptr); + assert(!lock1.owns_lock()); + assert(lock2.mutex() == std::addressof(mux)); + assert(lock2.owns_lock() == true); return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp index 5133032f6ae39e..3e89e6c66bf3e0 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp @@ -14,21 +14,29 @@ // void swap(unique_lock& x, unique_lock& y); #include +#include #include +#include "checking_mutex.h" #include "test_macros.h" -#include "../types.h" -MyMutex m; +#if TEST_STD_VER >= 11 +static_assert(noexcept(swap(std::declval&>(), + std::declval&>())), + ""); +#endif int main(int, char**) 
{ - std::unique_lock lk1(m); - std::unique_lock lk2; - swap(lk1, lk2); - assert(lk1.mutex() == nullptr); - assert(lk1.owns_lock() == false); - assert(lk2.mutex() == &m); - assert(lk2.owns_lock() == true); + checking_mutex mux; + std::unique_lock lock1(mux); + std::unique_lock lock2; + + swap(lock1, lock2); + + assert(lock1.mutex() == nullptr); + assert(!lock1.owns_lock()); + assert(lock2.mutex() == std::addressof(mux)); + assert(lock2.owns_lock() == true); return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp index a726c8ccc060a1..a7724504a667c1 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp @@ -13,27 +13,30 @@ // mutex_type* release() noexcept; #include +#include #include +#include "checking_mutex.h" #include "test_macros.h" -#include "../types.h" -int MyCountingMutex::lock_count = 0; -int MyCountingMutex::unlock_count = 0; - -MyCountingMutex m; +#if TEST_STD_VER >= 11 +static_assert(noexcept(std::declval&>().release()), ""); +#endif int main(int, char**) { - std::unique_lock lk(m); - assert(lk.mutex() == &m); - assert(lk.owns_lock() == true); - assert(MyCountingMutex::lock_count == 1); - assert(MyCountingMutex::unlock_count == 0); - assert(lk.release() == &m); - assert(lk.mutex() == nullptr); - assert(lk.owns_lock() == false); - assert(MyCountingMutex::lock_count == 1); - assert(MyCountingMutex::unlock_count == 0); + checking_mutex mux; + std::unique_lock lock(mux); + assert(lock.mutex() == std::addressof(mux)); + assert(lock.owns_lock()); + + assert(mux.current_state == checking_mutex::locked_via_lock); + + assert(lock.release() == std::addressof(mux)); + assert(lock.mutex() == nullptr); + 
assert(!lock.owns_lock()); + assert(mux.last_try == checking_mutex::locked_via_lock); + assert(mux.current_state == checking_mutex::locked_via_lock); + mux.unlock(); return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp index 72346e8c67e257..f00614015bbc3b 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp @@ -13,20 +13,25 @@ // mutex_type *mutex() const; #include +#include #include +#include "checking_mutex.h" #include "test_macros.h" -#include "../types.h" -MyMutex m; +#if TEST_STD_VER >= 11 +static_assert(noexcept(std::declval&>().mutex()), ""); +#endif int main(int, char**) { - std::unique_lock lk0; - assert(lk0.mutex() == nullptr); - std::unique_lock lk1(m); - assert(lk1.mutex() == &m); - lk1.unlock(); - assert(lk1.mutex() == &m); + checking_mutex mux; + const std::unique_lock lock0; // Make sure `mutex()` is `const` + static_assert(std::is_same::value, ""); + assert(lock0.mutex() == nullptr); + std::unique_lock lock1(mux); + assert(lock1.mutex() == std::addressof(mux)); + lock1.unlock(); + assert(lock1.mutex() == std::addressof(mux)); return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp index 3759302a483eb2..3542a40d25d39e 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp @@ -16,21 +16,25 @@ #include #include +#include "checking_mutex.h" 
#include "test_macros.h" -#include "../types.h" -MyMutex m; +#if TEST_STD_VER >= 11 +static_assert(noexcept(static_cast(std::declval&>())), ""); +#endif int main(int, char**) { - static_assert(std::is_constructible >::value, ""); - static_assert(!std::is_convertible, bool>::value, ""); - - std::unique_lock lk0; - assert(static_cast(lk0) == false); - std::unique_lock lk1(m); - assert(static_cast(lk1) == true); + static_assert(std::is_constructible >::value, ""); + static_assert(!std::is_convertible, bool>::value, ""); + + checking_mutex mux; + const std::unique_lock lk0; // Make sure `operator bool()` is `const` + assert(!static_cast(lk0)); + std::unique_lock lk1(mux); + assert(static_cast(lk1)); lk1.unlock(); - assert(static_cast(lk1) == false); + assert(!static_cast(lk1)); + ASSERT_NOEXCEPT(static_cast(lk0)); return 0; diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp index 163942786323af..11a674a55392fe 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp @@ -15,18 +15,23 @@ #include #include +#include "checking_mutex.h" #include "test_macros.h" -#include "../types.h" -MyMutex m; +#if TEST_STD_VER >= 11 +static_assert(noexcept(std::declval&>().owns_lock()), ""); +#endif int main(int, char**) { - std::unique_lock lk0; - assert(lk0.owns_lock() == false); - std::unique_lock lk1(m); - assert(lk1.owns_lock() == true); - lk1.unlock(); - assert(lk1.owns_lock() == false); + { + checking_mutex mux; + const std::unique_lock lock0; // Make sure `owns_lock()` is `const` + assert(!lock0.owns_lock()); + std::unique_lock lock1(mux); + assert(lock1.owns_lock()); + lock1.unlock(); + assert(!lock1.owns_lock()); + } 
return 0; } diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.compile.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.compile.pass.cpp index 312863ae8e743a..56055788965d53 100644 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.compile.pass.cpp +++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.compile.pass.cpp @@ -19,7 +19,6 @@ #include #include -#include "test_macros.h" -#include "types.h" +#include "checking_mutex.h" -static_assert((std::is_same::mutex_type, MyMutex>::value), ""); +static_assert(std::is_same::mutex_type, checking_mutex>::value, ""); diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.h b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.h deleted file mode 100644 index 15a1a531487f50..00000000000000 --- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.h +++ /dev/null @@ -1,88 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef TEST_STD_THREAD_THREAD_MUTEX_THREAD_LOCK_THREAD_LOCK_GUARD_TYPES_H -#define TEST_STD_THREAD_THREAD_MUTEX_THREAD_LOCK_THREAD_LOCK_GUARD_TYPES_H - -#include -#include - -struct MyMutex { - bool locked = false; - - MyMutex() = default; - ~MyMutex() { assert(!locked); } - - void lock() { - assert(!locked); - locked = true; - } - - void unlock() { - assert(locked); - locked = false; - } - - bool try_lock() { - if (locked) - return false; - lock(); - return true; - } - - template - bool try_lock_for(const std::chrono::duration& rel_time) { - using ms = std::chrono::milliseconds; - assert(rel_time == ms(5)); - if (locked) - return false; - lock(); - return true; - } - - MyMutex(MyMutex const&) = delete; - MyMutex& operator=(MyMutex const&) = delete; -}; - -struct MyTimedMutex { - using ms = std::chrono::milliseconds; - - bool try_lock_called = false; - bool try_lock_for_called = false; - bool try_lock_until_called = false; - - bool try_lock() { - try_lock_called = !try_lock_called; - return try_lock_called; - } - - template - bool try_lock_for(const std::chrono::duration& rel_time) { - assert(rel_time == ms(5)); - try_lock_for_called = !try_lock_for_called; - return try_lock_for_called; - } - - template - bool try_lock_until(const std::chrono::time_point& abs_time) { - assert(Clock::now() - abs_time < ms(5)); - try_lock_until_called = !try_lock_until_called; - return try_lock_until_called; - } - - void unlock() {} -}; - -struct MyCountingMutex { - static int lock_count; - static int unlock_count; - void lock() { ++lock_count; } - void unlock() { ++unlock_count; } -}; - -#endif // TEST_STD_THREAD_THREAD_MUTEX_THREAD_LOCK_THREAD_LOCK_GUARD_TYPES_H diff --git a/libcxx/test/support/checking_mutex.h b/libcxx/test/support/checking_mutex.h new file mode 100644 index 00000000000000..1a635c32f29a6b --- /dev/null +++ 
b/libcxx/test/support/checking_mutex.h @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef TEST_SUPPORT_CHECKING_MUTEX_H +#define TEST_SUPPORT_CHECKING_MUTEX_H + +#include +#include + +struct checking_mutex { + enum state { + locked_via_lock, + locked_via_try_lock, + locked_via_try_lock_for, + locked_via_try_lock_until, + unlocked, + none, + }; + + state current_state = unlocked; + state last_try = none; + bool reject = false; + + checking_mutex() = default; + checking_mutex(const checking_mutex&) = delete; + ~checking_mutex() { assert(current_state == unlocked); } + + void lock() { + assert(current_state == unlocked); + assert(!reject); + current_state = locked_via_lock; + last_try = locked_via_lock; + reject = true; + } + + void unlock() { + assert(current_state != unlocked && current_state != none); + last_try = unlocked; + current_state = unlocked; + reject = false; + } + + bool try_lock() { + last_try = locked_via_try_lock; + if (reject) + return false; + current_state = locked_via_try_lock; + return true; + } + + template + bool try_lock_for(const std::chrono::duration&) { + last_try = locked_via_try_lock_for; + if (reject) + return false; + current_state = locked_via_try_lock_for; + return true; + } + + template + bool try_lock_until(const std::chrono::time_point&) { + last_try = locked_via_try_lock_until; + if (reject) + return false; + current_state = locked_via_try_lock_until; + return true; + } + + checking_mutex* operator&() = delete; + + template + void operator,(const T&) = delete; +}; + +#endif // TEST_SUPPORT_CHECKING_MUTEX_H From 716f7e2d18d03039c13ad90d5b3cb4f65c413b74 Mon Sep 17 00:00:00 2001 From: 
Nikita Popov Date: Thu, 22 Aug 2024 11:51:28 +0200 Subject: [PATCH 170/426] [SimplifyCFG] Add tests for switch over cmp intrinsic (NFC) --- .../Transforms/SimplifyCFG/switch-on-cmp.ll | 384 ++++++++++++++++++ 1 file changed, 384 insertions(+) create mode 100644 llvm/test/Transforms/SimplifyCFG/switch-on-cmp.ll diff --git a/llvm/test/Transforms/SimplifyCFG/switch-on-cmp.ll b/llvm/test/Transforms/SimplifyCFG/switch-on-cmp.ll new file mode 100644 index 00000000000000..1ce18533d156d0 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/switch-on-cmp.ll @@ -0,0 +1,384 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s + +define void @ucmp_gt1(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_gt1( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ +; CHECK-NEXT: i8 -1, label %[[BB2:.*]] +; CHECK-NEXT: i8 0, label %[[BB2]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %bb1 [ + i8 -1, label %bb2 + i8 0, label %bb2 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void +} + +define void @ucmp_gt2(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_gt2( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ +; CHECK-NEXT: i8 0, label %[[BB2:.*]] +; CHECK-NEXT: i8 -1, label %[[BB2]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label 
%bb1 [ + i8 0, label %bb2 + i8 -1, label %bb2 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void +} + +define void @ucmp_lt1(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_lt1( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[BB2:.*]] [ +; CHECK-NEXT: i8 1, label %[[BB1:.*]] +; CHECK-NEXT: i8 0, label %[[BB1]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %bb2 [ + i8 1, label %bb1 + i8 0, label %bb1 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void +} + +define void @ucmp_lt2(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_lt2( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[BB2:.*]] [ +; CHECK-NEXT: i8 0, label %[[BB1:.*]] +; CHECK-NEXT: i8 1, label %[[BB1]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %bb2 [ + i8 0, label %bb1 + i8 1, label %bb1 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void +} + +define void @ucmp_eq1(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_eq1( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ +; CHECK-NEXT: i8 -1, label %[[BB2:.*]] +; CHECK-NEXT: i8 1, label %[[BB2]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch 
i8 %res, label %bb1 [ + i8 -1, label %bb2 + i8 1, label %bb2 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void +} + +define void @ucmp_eq2(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_eq2( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ +; CHECK-NEXT: i8 1, label %[[BB2:.*]] +; CHECK-NEXT: i8 -1, label %[[BB2]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %bb1 [ + i8 1, label %bb2 + i8 -1, label %bb2 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void +} + +define void @scmp_gt1(i32 %a, i32 %b) { +; CHECK-LABEL: define void @scmp_gt1( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ +; CHECK-NEXT: i8 -1, label %[[BB2:.*]] +; CHECK-NEXT: i8 0, label %[[BB2]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.scmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %bb1 [ + i8 -1, label %bb2 + i8 0, label %bb2 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void +} + +define void @scmp_gt2(i32 %a, i32 %b) { +; CHECK-LABEL: define void @scmp_gt2( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ +; CHECK-NEXT: i8 0, label %[[BB2:.*]] +; CHECK-NEXT: i8 -1, label %[[BB2]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.scmp.i8.i32(i32 
%a, i32 %b) + switch i8 %res, label %bb1 [ + i8 0, label %bb2 + i8 -1, label %bb2 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void +} + +define void @ucmp_gt_multiuse(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_gt_multiuse( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: call void @use(i8 [[RES]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ +; CHECK-NEXT: i8 -1, label %[[BB2:.*]] +; CHECK-NEXT: i8 0, label %[[BB2]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + call void @use(i8 %res) + switch i8 %res, label %bb1 [ + i8 -1, label %bb2 + i8 0, label %bb2 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void +} + +define i32 @ucmp_gt_phi(i32 %a, i32 %b) { +; CHECK-LABEL: define i32 @ucmp_gt_phi( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ +; CHECK-NEXT: i8 -1, label %[[BB2:.*]] +; CHECK-NEXT: i8 0, label %[[BB2]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB1]] ], [ 1, %[[ENTRY]] ], [ 1, %[[ENTRY]] ] +; CHECK-NEXT: ret i32 [[PHI]] +; +entry: + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %bb1 [ + i8 -1, label %bb2 + i8 0, label %bb2 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + %phi = phi i32 [ 0, %bb1 ], [ 1, %entry ], [ 1, %entry ] + ret i32 %phi +} + +define void @ucmp_gt_extra_case(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_gt_extra_case( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], 
i32 [[B]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ +; CHECK-NEXT: i8 -1, label %[[BB2:.*]] +; CHECK-NEXT: i8 0, label %[[BB2]] +; CHECK-NEXT: i8 1, label %[[BB2]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %bb1 [ + i8 -1, label %bb2 + i8 0, label %bb2 + i8 1, label %bb2 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void +} + +define void @ucmp_gt_wrong_case(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_gt_wrong_case( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ +; CHECK-NEXT: i8 -2, label %[[BB2:.*]] +; CHECK-NEXT: i8 0, label %[[BB2]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %bb1 [ + i8 -2, label %bb2 + i8 0, label %bb2 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void +} + +define void @ucmp_gt_not_same_succ(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_gt_not_same_succ( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ +; CHECK-NEXT: i8 -1, label %[[BB2:.*]] +; CHECK-NEXT: i8 0, label %[[BB3:.*]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %bb1 [ + i8 -1, label %bb2 + i8 0, label %bb3 + ] + +bb1: + call void @foo() + br label %bb2 + +bb3: + 
call void @foo() + br label %bb2 + +bb2: + ret void +} + +declare void @use(i8) +declare void @foo() From 57dc09341e5eef758b1abce78822c51069157869 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Thu, 22 Aug 2024 10:56:28 +0100 Subject: [PATCH 171/426] [Dexter] Sanitize user details from git repo URL in dexter --version (#105533) Currently the output of dexter --version contains the raw output of `git remote get-url origin`, which may contain a username and password. This patch adds a small change to remove these from the output string. A similar patch for LLVM's default version string* also removes the git URL altogether unless opted-in to; it's not clear whether this is a necessary or desirable step yet, but if so we can trivially remove the URL from Dexter as well. *PR here: https://github.com/llvm/llvm-project/pull/105220 --- .../debuginfo-tests/dexter/dex/utils/Version.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/utils/Version.py b/cross-project-tests/debuginfo-tests/dexter/dex/utils/Version.py index 505aebaebc4f42..89ab5c2dbd72a3 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/utils/Version.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/utils/Version.py @@ -9,10 +9,23 @@ import os from subprocess import CalledProcessError, check_output, STDOUT import sys +from urllib.parse import urlparse, urlunparse from dex import __version__ +def sanitize_repo_url(repo): + parsed = urlparse(repo) + # No username present, repo URL is fine. + if parsed.username is None: + return repo + # Otherwise, strip the login details from the URL by reconstructing the netloc from just `(:)?`. 
+ sanitized_netloc = parsed.hostname + if parsed.port: + sanitized_netloc = f"{sanitized_netloc}:{parsed.port}" + return urlunparse(parsed._replace(netloc=sanitized_netloc)) + + def _git_version(): dir_ = os.path.dirname(__file__) try: @@ -28,7 +41,7 @@ def _git_version(): .rstrip() .decode("utf-8") ) - repo = ( + repo = sanitize_repo_url( check_output( ["git", "remote", "get-url", "origin"], stderr=STDOUT, cwd=dir_ ) From 51ca2354d0a4083b9219df131ceff98bccb622b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 22 Aug 2024 12:57:39 +0300 Subject: [PATCH 172/426] [lit] Fix substitutions containing backslashes (#103042) Substitutions can be added in a couple different ways; they can be added via the calling python scripts by adding entries to the config.substitutions dictionary, or via DEFINE lines in the scripts themselves. The substitution strings passed to Python's re classes are interpreted so that backslashes expand to escape sequences, and literal backslashes need to be escaped. On Unix, the script defined substitutions don't (usually, so far) contain backslashes - but on Windows, they often do, due to paths containing backslashes. This lead to a Windows specific escaping of backslashes before doing Python re substitutions - since 7c9eab8fef0ed79a5911d21eb97b6b0fa9d39f82. There's nothing inherently Windows specific about this though - any intended literal backslashes in the substitution strings need to be escaped; this is how the Python re API works. The DEFINE lines were added later, and in order to cope with backslashes, escaping of backslashes was added in the SubstDirective class in TestRunner, applying to DEFINE lines in the tests only. The fact that the escaping right before passing to the Python re API was done conditionally on Windows led to two inconsistencies: - DEFINE lines in the tests that contain backslashes got double backslashes on Windows. 
(This was visible as a FIXME in llvm/utils/lit/tests/Inputs/shtest-define/value-escaped.txt.) - Script provided substitutions containing backslashes did not work on Unix, but they did work on Windows. By removing the escaping from SubstDirective and escaping it unconditionally in the processLine function, before feeding the substitutions to Python's re classes, we should have consistent behaviour across platforms, and get rid of the FIXME in the lit test. This fixes issues with substitutions containing backslashes on Unix platforms, as encountered in PR #86649. --- llvm/docs/TestingGuide.rst | 5 +++-- llvm/utils/lit/lit/TestRunner.py | 8 +++----- llvm/utils/lit/tests/Inputs/shtest-define/lit.cfg | 4 ++++ .../lit/tests/Inputs/shtest-define/value-escaped.txt | 10 +++++----- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/llvm/docs/TestingGuide.rst b/llvm/docs/TestingGuide.rst index c35e58bc53b671..08617933519fdb 100644 --- a/llvm/docs/TestingGuide.rst +++ b/llvm/docs/TestingGuide.rst @@ -864,8 +864,9 @@ Additional substitutions can be defined as follows: - Lit configuration files (e.g., ``lit.cfg`` or ``lit.local.cfg``) can define substitutions for all tests in a test directory. They do so by extending the substitution list, ``config.substitutions``. Each item in the list is a tuple - consisting of a pattern and its replacement, which lit applies using python's - ``re.sub`` function. + consisting of a pattern and its replacement, which lit applies as plain text + (even if it contains sequences that python's ``re.sub`` considers to be + escape sequences). - To define substitutions within a single test file, lit supports the ``DEFINE:`` and ``REDEFINE:`` directives, described in detail below. 
So that they have no effect on other test files, these directives modify a copy of the diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index cc903f9e3a1520..2d9af9fbbb3634 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -1591,7 +1591,6 @@ def adjust_substitutions(self, substitutions): assert ( not self.needs_continuation() ), "expected directive continuations to be parsed before applying" - value_repl = self.value.replace("\\", "\\\\") existing = [i for i, subst in enumerate(substitutions) if self.name in subst[0]] existing_res = "".join( "\nExisting pattern: " + substitutions[i][0] for i in existing @@ -1604,7 +1603,7 @@ def adjust_substitutions(self, substitutions): f"{self.get_location()}" f"{existing_res}" ) - substitutions.insert(0, (self.name, value_repl)) + substitutions.insert(0, (self.name, self.value)) return if len(existing) > 1: raise ValueError( @@ -1626,7 +1625,7 @@ def adjust_substitutions(self, substitutions): f"Expected pattern: {self.name}" f"{existing_res}" ) - substitutions[existing[0]] = (self.name, value_repl) + substitutions[existing[0]] = (self.name, self.value) def applySubstitutions(script, substitutions, conditions={}, recursion_limit=None): @@ -1742,8 +1741,7 @@ def processLine(ln): # Apply substitutions ln = substituteIfElse(escapePercents(ln)) for a, b in substitutions: - if kIsWindows: - b = b.replace("\\", "\\\\") + b = b.replace("\\", "\\\\") # re.compile() has a built-in LRU cache with 512 entries. In some # test suites lit ends up thrashing that cache, which made e.g. # check-llvm run 50% slower. 
Use an explicit, unbounded cache diff --git a/llvm/utils/lit/tests/Inputs/shtest-define/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-define/lit.cfg index a29755eb2b6007..ffe7cce8f03365 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-define/lit.cfg +++ b/llvm/utils/lit/tests/Inputs/shtest-define/lit.cfg @@ -23,6 +23,10 @@ config.substitutions.insert( 0, ("%{global:echo}", "echo GLOBAL: %{global:greeting} %{global:what}") ) +# This substitution includes an re.sub replacement string escape sequence, +# which lit should treat as plain text. +config.substitutions.insert(0, ("%{global:subst-with-escapes}", r"value-with-\g")) + # The following substitution definitions are confusing and should be avoided. # We define them here so we can test that 'DEFINE:' and 'REDEFINE:' directives # guard against the confusion they cause. diff --git a/llvm/utils/lit/tests/Inputs/shtest-define/value-escaped.txt b/llvm/utils/lit/tests/Inputs/shtest-define/value-escaped.txt index 68cf35825e2a64..92fe4c27664fac 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-define/value-escaped.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-define/value-escaped.txt @@ -1,16 +1,16 @@ -# FIXME: The doubled backslashes occur under windows. That's almost surely a -# lit issue beyond DEFINE/REDEFINE. - # Escape sequences that can appear in python re.sub replacement strings have no # special meaning in the value. 
# DEFINE: %{escape} = \g<0>\n # RUN: echo '%{escape}' -# CHECK:# | {{\\?}}\g<0>{{\\?}}\n +# CHECK:# | \g<0>\n # REDEFINE: %{escape} = \n \ # REDEFINE: \g # RUN: echo '%{escape}' -# CHECK:# | {{\\?}}\n {{\\?}}\g +# CHECK:# | \n \g + +# RUN: echo '%{global:subst-with-escapes}' +# CHECK:# | value-with-\g # CHECK: Passed: 1 {{\([0-9]*.[0-9]*%\)}} From f67388232384682fb442d6e5501d9259c41fd714 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 22 Aug 2024 11:11:00 +0100 Subject: [PATCH 173/426] [X86] Allow speculative BSR/BSF instructions on targets with CMOV (#102885) Currently targets without LZCNT/TZCNT won't speculate with BSR/BSF instructions in case they have a zero value input, meaning we always insert a test+branch for the zero-input case. This patch proposes we allow speculation if the target has CMOV, and perform a branchless select instead to handle the zero input case. This will predominately help x86-64 targets where we haven't set any particular cpu target. We already always perform BSR/BSF instructions if we were lowering a CTLZ/CTTZ_ZERO_UNDEF instruction. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 +- .../lib/Target/X86/X86TargetTransformInfo.cpp | 10 +- .../Analysis/CostModel/X86/ctlz-codesize.ll | 8 +- .../CostModel/X86/ctlz-sizelatency.ll | 8 +- llvm/test/Analysis/CostModel/X86/ctlz.ll | 8 +- .../Analysis/CostModel/X86/cttz-codesize.ll | 2 +- .../CostModel/X86/cttz-sizelatency.ll | 2 +- llvm/test/CodeGen/X86/atomic-bit-test.ll | 1 - llvm/test/CodeGen/X86/bit_ceil.ll | 53 +-- llvm/test/CodeGen/X86/combine-or.ll | 47 ++- llvm/test/CodeGen/X86/ctlo.ll | 161 ++++++---- llvm/test/CodeGen/X86/ctlz.ll | 304 +++++++++--------- llvm/test/CodeGen/X86/cttz.ll | 37 ++- llvm/test/CodeGen/X86/known-never-zero.ll | 269 +++++----------- llvm/test/CodeGen/X86/lzcnt-cmp.ll | 52 +-- llvm/test/CodeGen/X86/pr57673.ll | 50 +-- llvm/test/CodeGen/X86/pr89877.ll | 8 +- llvm/test/CodeGen/X86/pr92569.ll | 16 +- .../CodeGenPrepare/X86/cttz-ctlz.ll | 80 ++--- .../test/Transforms/SLPVectorizer/X86/ctlz.ll | 78 ++++- 20 files changed, 516 insertions(+), 682 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index da5ea50f80ce04..97775ce40aee4f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3239,14 +3239,14 @@ bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT, bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const { // Speculate cttz only if we can directly use TZCNT or can promote to i32/i64. - return Subtarget.hasBMI() || + return Subtarget.hasBMI() || Subtarget.canUseCMOV() || (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u)); } bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { // Speculate ctlz only if we can directly use LZCNT. 
- return Subtarget.hasLZCNT(); + return Subtarget.hasLZCNT() || Subtarget.canUseCMOV(); } bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 9a11c33386fd0b..cb9ee64a677a7e 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -4210,9 +4210,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } }, { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } }, - { ISD::CTLZ, MVT::i64, { 3, 2, 6, 6 } }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTLZ, MVT::i64, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR - { ISD::CTTZ, MVT::i64, { 2, 2, 5, 5 } }, // TEST+BSF+CMOV/BRANCH + { ISD::CTTZ, MVT::i64, { 2, 2, 3, 4 } }, // TEST+BSF+CMOV/BRANCH { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } }, { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } }, @@ -4241,9 +4241,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } }, { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } }, { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL - { ISD::CTLZ, MVT::i32, { 3, 2, 6, 6 } }, // BSR+XOR or BSR+XOR+CMOV - { ISD::CTLZ, MVT::i16, { 3, 2, 6, 6 } }, // BSR+XOR or BSR+XOR+CMOV - { ISD::CTLZ, MVT::i8, { 3, 2, 7, 7 } }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR diff --git 
a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll index ae0f1a3cfad307..da0f71c63ef80e 100644 --- a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll @@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1) define i64 @var_ctlz_i64(i64 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i64' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i64' @@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) { define i32 @var_ctlz_i32(i32 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i32' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i32' @@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) { define i16 @var_ctlz_i16(i16 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i16' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i16' @@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) { define i8 @var_ctlz_i8(i8 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i8' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: 
Found an estimated cost of 5 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i8' diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll index 8c6c3228d8fc6e..2425e7286265b0 100644 --- a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll @@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1) define i64 @var_ctlz_i64(i64 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i64' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i64' @@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) { define i32 @var_ctlz_i32(i32 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i32' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i32' @@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) { define i16 @var_ctlz_i16(i16 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i16' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i16' @@ -95,7 +95,7 @@ define i16 
@var_ctlz_i16u(i16 %a) { define i8 @var_ctlz_i8(i8 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i8' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i8' diff --git a/llvm/test/Analysis/CostModel/X86/ctlz.ll b/llvm/test/Analysis/CostModel/X86/ctlz.ll index 99e682b8e17826..fa7982ce09e9ce 100644 --- a/llvm/test/Analysis/CostModel/X86/ctlz.ll +++ b/llvm/test/Analysis/CostModel/X86/ctlz.ll @@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1) define i64 @var_ctlz_i64(i64 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i64' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i64' @@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) { define i32 @var_ctlz_i32(i32 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i32' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i32' @@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) { define i16 @var_ctlz_i16(i16 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i16' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call 
i16 @llvm.ctlz.i16(i16 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i16' @@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) { define i8 @var_ctlz_i8(i8 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i8' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i8' diff --git a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll index 1d40debb7ab816..07bf1dd7a2ff6c 100644 --- a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll @@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1) define i64 @var_cttz_i64(i64 %a) { ; NOBMI-LABEL: 'var_cttz_i64' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz ; ; BMI-LABEL: 'var_cttz_i64' diff --git a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll index 351e863f132067..afe5cb8c55fe65 100644 --- a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll @@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1) define i64 @var_cttz_i64(i64 %a) { ; NOBMI-LABEL: 'var_cttz_i64' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 
@llvm.cttz.i64(i64 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz ; ; BMI-LABEL: 'var_cttz_i64' diff --git a/llvm/test/CodeGen/X86/atomic-bit-test.ll b/llvm/test/CodeGen/X86/atomic-bit-test.ll index f39c4b5e620d0e..10b6605c3fb05e 100644 --- a/llvm/test/CodeGen/X86/atomic-bit-test.ll +++ b/llvm/test/CodeGen/X86/atomic-bit-test.ll @@ -582,7 +582,6 @@ define i32 @split_hoist_and(i32 %0) nounwind { ; X64-NEXT: lock btsl $3, v32(%rip) ; X64-NEXT: setb %al ; X64-NEXT: shll $3, %eax -; X64-NEXT: testl %edi, %edi ; X64-NEXT: retq %2 = atomicrmw or ptr @v32, i32 8 monotonic, align 4 %3 = tail call i32 @llvm.ctlz.i32(i32 %0, i1 false) diff --git a/llvm/test/CodeGen/X86/bit_ceil.ll b/llvm/test/CodeGen/X86/bit_ceil.ll index 4641c114238f8f..823453087f6180 100644 --- a/llvm/test/CodeGen/X86/bit_ceil.ll +++ b/llvm/test/CodeGen/X86/bit_ceil.ll @@ -8,16 +8,12 @@ define i32 @bit_ceil_i32(i32 %x) { ; NOBMI-LABEL: bit_ceil_i32: ; NOBMI: # %bb.0: -; NOBMI-NEXT: movl %edi, %eax -; NOBMI-NEXT: decl %eax -; NOBMI-NEXT: je .LBB0_1 -; NOBMI-NEXT: # %bb.2: # %cond.false -; NOBMI-NEXT: bsrl %eax, %ecx +; NOBMI-NEXT: # kill: def $edi killed $edi def $rdi +; NOBMI-NEXT: leal -1(%rdi), %eax +; NOBMI-NEXT: bsrl %eax, %eax +; NOBMI-NEXT: movl $63, %ecx +; NOBMI-NEXT: cmovnel %eax, %ecx ; NOBMI-NEXT: xorl $31, %ecx -; NOBMI-NEXT: jmp .LBB0_3 -; NOBMI-NEXT: .LBB0_1: -; NOBMI-NEXT: movl $32, %ecx -; NOBMI-NEXT: .LBB0_3: # %cond.end ; NOBMI-NEXT: negb %cl ; NOBMI-NEXT: movl $1, %edx ; NOBMI-NEXT: movl $1, %eax @@ -51,15 +47,10 @@ define i32 @bit_ceil_i32(i32 %x) { define i32 @bit_ceil_i32_plus1(i32 noundef %x) { ; NOBMI-LABEL: bit_ceil_i32_plus1: ; NOBMI: # %bb.0: # %entry -; NOBMI-NEXT: testl %edi, %edi -; NOBMI-NEXT: je .LBB1_1 -; NOBMI-NEXT: # %bb.2: # %cond.false -; NOBMI-NEXT: bsrl %edi, %ecx +; NOBMI-NEXT: bsrl %edi, %eax +; NOBMI-NEXT: movl $63, %ecx +; NOBMI-NEXT: cmovnel %eax, %ecx ; NOBMI-NEXT: xorl $31, %ecx -; NOBMI-NEXT: jmp .LBB1_3 -; 
NOBMI-NEXT: .LBB1_1: -; NOBMI-NEXT: movl $32, %ecx -; NOBMI-NEXT: .LBB1_3: # %cond.end ; NOBMI-NEXT: negb %cl ; NOBMI-NEXT: movl $1, %edx ; NOBMI-NEXT: movl $1, %eax @@ -94,16 +85,11 @@ entry: define i64 @bit_ceil_i64(i64 %x) { ; NOBMI-LABEL: bit_ceil_i64: ; NOBMI: # %bb.0: -; NOBMI-NEXT: movq %rdi, %rax -; NOBMI-NEXT: decq %rax -; NOBMI-NEXT: je .LBB2_1 -; NOBMI-NEXT: # %bb.2: # %cond.false -; NOBMI-NEXT: bsrq %rax, %rcx -; NOBMI-NEXT: xorq $63, %rcx -; NOBMI-NEXT: jmp .LBB2_3 -; NOBMI-NEXT: .LBB2_1: -; NOBMI-NEXT: movl $64, %ecx -; NOBMI-NEXT: .LBB2_3: # %cond.end +; NOBMI-NEXT: leaq -1(%rdi), %rax +; NOBMI-NEXT: bsrq %rax, %rax +; NOBMI-NEXT: movl $127, %ecx +; NOBMI-NEXT: cmovneq %rax, %rcx +; NOBMI-NEXT: xorl $63, %ecx ; NOBMI-NEXT: negb %cl ; NOBMI-NEXT: movl $1, %edx ; NOBMI-NEXT: movl $1, %eax @@ -136,15 +122,10 @@ define i64 @bit_ceil_i64(i64 %x) { define i64 @bit_ceil_i64_plus1(i64 noundef %x) { ; NOBMI-LABEL: bit_ceil_i64_plus1: ; NOBMI: # %bb.0: # %entry -; NOBMI-NEXT: testq %rdi, %rdi -; NOBMI-NEXT: je .LBB3_1 -; NOBMI-NEXT: # %bb.2: # %cond.false -; NOBMI-NEXT: bsrq %rdi, %rcx -; NOBMI-NEXT: xorq $63, %rcx -; NOBMI-NEXT: jmp .LBB3_3 -; NOBMI-NEXT: .LBB3_1: -; NOBMI-NEXT: movl $64, %ecx -; NOBMI-NEXT: .LBB3_3: # %cond.end +; NOBMI-NEXT: bsrq %rdi, %rax +; NOBMI-NEXT: movl $127, %ecx +; NOBMI-NEXT: cmovneq %rax, %rcx +; NOBMI-NEXT: xorl $63, %ecx ; NOBMI-NEXT: negb %cl ; NOBMI-NEXT: movl $1, %edx ; NOBMI-NEXT: movl $1, %eax diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll index 3b2102f46a297a..4060355495eb3b 100644 --- a/llvm/test/CodeGen/X86/combine-or.ll +++ b/llvm/test/CodeGen/X86/combine-or.ll @@ -213,21 +213,18 @@ define i64 @PR89533(<64 x i8> %a0) { ; SSE-NEXT: shll $16, %ecx ; SSE-NEXT: orl %eax, %ecx ; SSE-NEXT: pcmpeqb %xmm4, %xmm2 -; SSE-NEXT: pmovmskb %xmm2, %edx -; SSE-NEXT: xorl $65535, %edx # imm = 0xFFFF +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: 
pcmpeqb %xmm4, %xmm3 -; SSE-NEXT: pmovmskb %xmm3, %eax -; SSE-NEXT: notl %eax -; SSE-NEXT: shll $16, %eax -; SSE-NEXT: orl %edx, %eax -; SSE-NEXT: shlq $32, %rax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: je .LBB11_2 -; SSE-NEXT: # %bb.1: # %cond.false -; SSE-NEXT: rep bsfq %rax, %rax -; SSE-NEXT: retq -; SSE-NEXT: .LBB11_2: # %cond.end +; SSE-NEXT: pmovmskb %xmm3, %edx +; SSE-NEXT: notl %edx +; SSE-NEXT: shll $16, %edx +; SSE-NEXT: orl %eax, %edx +; SSE-NEXT: shlq $32, %rdx +; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: bsfq %rdx, %rcx ; SSE-NEXT: movl $64, %eax +; SSE-NEXT: cmovneq %rcx, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: PR89533: @@ -243,23 +240,19 @@ define i64 @PR89533(<64 x i8> %a0) { ; AVX1-NEXT: shll $16, %ecx ; AVX1-NEXT: orl %eax, %ecx ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %edx -; AVX1-NEXT: xorl $65535, %edx # imm = 0xFFFF +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: notl %eax -; AVX1-NEXT: shll $16, %eax -; AVX1-NEXT: orl %edx, %eax -; AVX1-NEXT: shlq $32, %rax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: je .LBB11_2 -; AVX1-NEXT: # %bb.1: # %cond.false -; AVX1-NEXT: rep bsfq %rax, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB11_2: # %cond.end +; AVX1-NEXT: vpmovmskb %xmm0, %edx +; AVX1-NEXT: notl %edx +; AVX1-NEXT: shll $16, %edx +; AVX1-NEXT: orl %eax, %edx +; AVX1-NEXT: shlq $32, %rdx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: bsfq %rdx, %rcx ; AVX1-NEXT: movl $64, %eax +; AVX1-NEXT: cmovneq %rcx, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll index bb80279e28f3d3..f383c9a2544fca 100644 --- a/llvm/test/CodeGen/X86/ctlo.ll +++ b/llvm/test/CodeGen/X86/ctlo.ll @@ -13,36 +13,44 @@ declare i32 @llvm.ctlz.i32(i32, i1) declare i64 @llvm.ctlz.i64(i64, i1) define 
i8 @ctlo_i8(i8 %x) { -; X86-LABEL: ctlo_i8: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorb $-1, %al -; X86-NEXT: je .LBB0_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $7, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; X86-NEXT: .LBB0_1: -; X86-NEXT: movb $8, %al -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlo_i8: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: xorb $-1, %al +; X86-NOCMOV-NEXT: je .LBB0_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: movzbl %al, %eax +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $7, %eax +; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB0_1: +; X86-NOCMOV-NEXT: movb $8, %al +; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlo_i8: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-CMOV-NEXT: notb %al +; X86-CMOV-NEXT: movzbl %al, %eax +; X86-CMOV-NEXT: bsrl %eax, %ecx +; X86-CMOV-NEXT: movl $15, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: xorl $7, %eax +; X86-CMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlo_i8: ; X64: # %bb.0: -; X64-NEXT: xorb $-1, %dil -; X64-NEXT: je .LBB0_1 -; X64-NEXT: # %bb.2: # %cond.false +; X64-NEXT: notb %dil ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: bsrl %eax, %eax +; X64-NEXT: bsrl %eax, %ecx +; X64-NEXT: movl $15, %eax +; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: xorl $7, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq -; X64-NEXT: .LBB0_1: -; X64-NEXT: movb $8, %al -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlo_i8: ; X86-CLZ: # %bb.0: @@ -111,34 +119,41 @@ define i8 
@ctlo_i8_undef(i8 %x) { } define i16 @ctlo_i16(i16 %x) { -; X86-LABEL: ctlo_i16: -; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorw $-1, %ax -; X86-NEXT: je .LBB2_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: bsrw %ax, %ax -; X86-NEXT: xorl $15, %eax -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: retl -; X86-NEXT: .LBB2_1: -; X86-NEXT: movw $16, %ax -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlo_i16: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: xorw $-1, %ax +; X86-NOCMOV-NEXT: je .LBB2_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: bsrw %ax, %ax +; X86-NOCMOV-NEXT: xorl $15, %eax +; X86-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB2_1: +; X86-NOCMOV-NEXT: movw $16, %ax +; X86-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlo_i16: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-CMOV-NEXT: notl %eax +; X86-CMOV-NEXT: bsrw %ax, %cx +; X86-CMOV-NEXT: movw $31, %ax +; X86-CMOV-NEXT: cmovnew %cx, %ax +; X86-CMOV-NEXT: xorl $15, %eax +; X86-CMOV-NEXT: # kill: def $ax killed $ax killed $eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlo_i16: ; X64: # %bb.0: -; X64-NEXT: xorw $-1, %di -; X64-NEXT: je .LBB2_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrw %di, %ax +; X64-NEXT: notl %edi +; X64-NEXT: bsrw %di, %cx +; X64-NEXT: movw $31, %ax +; X64-NEXT: cmovnew %cx, %ax ; X64-NEXT: xorl $15, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq -; X64-NEXT: .LBB2_1: -; X64-NEXT: movw $16, %ax -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlo_i16: ; X86-CLZ: # %bb.0: @@ -193,30 +208,37 @@ define i16 @ctlo_i16_undef(i16 %x) { } define i32 @ctlo_i32(i32 %x) { -; X86-LABEL: ctlo_i32: -; X86: # %bb.0: -; X86-NEXT: 
movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl $-1, %eax -; X86-NEXT: je .LBB4_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB4_1: -; X86-NEXT: movl $32, %eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlo_i32: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: xorl $-1, %eax +; X86-NOCMOV-NEXT: je .LBB4_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB4_1: +; X86-NOCMOV-NEXT: movl $32, %eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlo_i32: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-CMOV-NEXT: notl %eax +; X86-CMOV-NEXT: bsrl %eax, %ecx +; X86-CMOV-NEXT: movl $63, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: xorl $31, %eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlo_i32: ; X64: # %bb.0: -; X64-NEXT: xorl $-1, %edi -; X64-NEXT: je .LBB4_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrl %edi, %eax +; X64-NEXT: notl %edi +; X64-NEXT: bsrl %edi, %ecx +; X64-NEXT: movl $63, %eax +; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: xorl $31, %eax ; X64-NEXT: retq -; X64-NEXT: .LBB4_1: -; X64-NEXT: movl $32, %eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlo_i32: ; X86-CLZ: # %bb.0: @@ -314,15 +336,12 @@ define i64 @ctlo_i64(i64 %x) { ; ; X64-LABEL: ctlo_i64: ; X64: # %bb.0: -; X64-NEXT: xorq $-1, %rdi -; X64-NEXT: je .LBB6_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrq %rdi, %rax +; X64-NEXT: notq %rdi +; X64-NEXT: bsrq %rdi, %rcx +; X64-NEXT: movl $127, %eax +; X64-NEXT: cmovneq %rcx, %rax ; X64-NEXT: xorq $63, %rax ; X64-NEXT: retq -; X64-NEXT: .LBB6_1: -; X64-NEXT: movl $64, %eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlo_i64: ; X86-CLZ: # %bb.0: diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll index d8f83502bd849a..6635be18b0f7a7 100644 --- 
a/llvm/test/CodeGen/X86/ctlz.ll +++ b/llvm/test/CodeGen/X86/ctlz.ll @@ -218,36 +218,41 @@ define i64 @ctlz_i64(i64 %x) { ; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. define i8 @ctlz_i8_zero_test(i8 %n) { -; X86-LABEL: ctlz_i8_zero_test: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testb %al, %al -; X86-NEXT: je .LBB4_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $7, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; X86-NEXT: .LBB4_1: -; X86-NEXT: movb $8, %al -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlz_i8_zero_test: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testb %al, %al +; X86-NOCMOV-NEXT: je .LBB4_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: movzbl %al, %eax +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $7, %eax +; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB4_1: +; X86-NOCMOV-NEXT: movb $8, %al +; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlz_i8_zero_test: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-CMOV-NEXT: bsrl %eax, %ecx +; X86-CMOV-NEXT: movl $15, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: xorl $7, %eax +; X86-CMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlz_i8_zero_test: ; X64: # %bb.0: -; X64-NEXT: testb %dil, %dil -; X64-NEXT: je .LBB4_1 -; X64-NEXT: # %bb.2: # %cond.false ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: bsrl %eax, %eax +; X64-NEXT: bsrl %eax, %ecx +; X64-NEXT: movl $15, %eax +; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: xorl $7, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq -; X64-NEXT: .LBB4_1: -; X64-NEXT: 
movb $8, %al -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_i8_zero_test: ; X86-CLZ: # %bb.0: @@ -286,34 +291,38 @@ define i8 @ctlz_i8_zero_test(i8 %n) { ; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. define i16 @ctlz_i16_zero_test(i16 %n) { -; X86-LABEL: ctlz_i16_zero_test: -; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testw %ax, %ax -; X86-NEXT: je .LBB5_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: bsrw %ax, %ax -; X86-NEXT: xorl $15, %eax -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: retl -; X86-NEXT: .LBB5_1: -; X86-NEXT: movw $16, %ax -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlz_i16_zero_test: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testw %ax, %ax +; X86-NOCMOV-NEXT: je .LBB5_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: bsrw %ax, %ax +; X86-NOCMOV-NEXT: xorl $15, %eax +; X86-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB5_1: +; X86-NOCMOV-NEXT: movw $16, %ax +; X86-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlz_i16_zero_test: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: bsrw {{[0-9]+}}(%esp), %cx +; X86-CMOV-NEXT: movw $31, %ax +; X86-CMOV-NEXT: cmovnew %cx, %ax +; X86-CMOV-NEXT: xorl $15, %eax +; X86-CMOV-NEXT: # kill: def $ax killed $ax killed $eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlz_i16_zero_test: ; X64: # %bb.0: -; X64-NEXT: testw %di, %di -; X64-NEXT: je .LBB5_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrw %di, %ax +; X64-NEXT: bsrw %di, %cx +; X64-NEXT: movw $31, %ax +; X64-NEXT: cmovnew %cx, %ax ; X64-NEXT: xorl $15, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq -; X64-NEXT: .LBB5_1: -; X64-NEXT: movw $16, %ax -; X64-NEXT: # kill: def $ax killed $ax 
killed $eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_i16_zero_test: ; X86-CLZ: # %bb.0: @@ -340,30 +349,34 @@ define i16 @ctlz_i16_zero_test(i16 %n) { ; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. define i32 @ctlz_i32_zero_test(i32 %n) { -; X86-LABEL: ctlz_i32_zero_test: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB6_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB6_1: -; X86-NEXT: movl $32, %eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlz_i32_zero_test: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testl %eax, %eax +; X86-NOCMOV-NEXT: je .LBB6_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB6_1: +; X86-NOCMOV-NEXT: movl $32, %eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlz_i32_zero_test: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: bsrl {{[0-9]+}}(%esp), %ecx +; X86-CMOV-NEXT: movl $63, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: xorl $31, %eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlz_i32_zero_test: ; X64: # %bb.0: -; X64-NEXT: testl %edi, %edi -; X64-NEXT: je .LBB6_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrl %edi, %eax +; X64-NEXT: bsrl %edi, %ecx +; X64-NEXT: movl $63, %eax +; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: xorl $31, %eax ; X64-NEXT: retq -; X64-NEXT: .LBB6_1: -; X64-NEXT: movl $32, %eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_i32_zero_test: ; X86-CLZ: # %bb.0: @@ -429,15 +442,11 @@ define i64 @ctlz_i64_zero_test(i64 %n) { ; ; X64-LABEL: ctlz_i64_zero_test: ; X64: # %bb.0: -; X64-NEXT: testq %rdi, %rdi -; X64-NEXT: je .LBB7_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrq %rdi, %rax +; X64-NEXT: bsrq %rdi, %rcx +; X64-NEXT: movl $127, %eax +; X64-NEXT: cmovneq %rcx, %rax ; 
X64-NEXT: xorq $63, %rax ; X64-NEXT: retq -; X64-NEXT: .LBB7_1: -; X64-NEXT: movl $64, %eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_i64_zero_test: ; X86-CLZ: # %bb.0: @@ -580,33 +589,33 @@ define i32 @ctlz_bsr(i32 %n) { ; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and ; codegen doesn't know how to combine the $32 and $31 into $63. define i32 @ctlz_bsr_zero_test(i32 %n) { -; X86-LABEL: ctlz_bsr_zero_test: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB10_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB10_1: -; X86-NEXT: movl $32, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlz_bsr_zero_test: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testl %eax, %eax +; X86-NOCMOV-NEXT: je .LBB10_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB10_1: +; X86-NOCMOV-NEXT: movl $32, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlz_bsr_zero_test: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: bsrl {{[0-9]+}}(%esp), %ecx +; X86-CMOV-NEXT: movl $63, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlz_bsr_zero_test: ; X64: # %bb.0: -; X64-NEXT: testl %edi, %edi -; X64-NEXT: je .LBB10_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrl %edi, %eax -; X64-NEXT: xorl $31, %eax -; X64-NEXT: xorl $31, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB10_1: -; X64-NEXT: movl $32, %eax -; X64-NEXT: xorl $31, %eax +; X64-NEXT: bsrl %edi, %ecx +; X64-NEXT: movl $63, %eax +; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_bsr_zero_test: @@ -945,38 +954,39 @@ define i8 @ctlz_xor7_i8_true(i8 
%x) { } define i8 @ctlz_xor7_i8_false(i8 %x) { -; X86-LABEL: ctlz_xor7_i8_false: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testb %al, %al -; X86-NEXT: je .LBB16_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $7, %eax -; X86-NEXT: xorb $7, %al -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; X86-NEXT: .LBB16_1: -; X86-NEXT: movb $8, %al -; X86-NEXT: xorb $7, %al -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlz_xor7_i8_false: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testb %al, %al +; X86-NOCMOV-NEXT: je .LBB16_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: movzbl %al, %eax +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $7, %eax +; X86-NOCMOV-NEXT: xorb $7, %al +; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB16_1: +; X86-NOCMOV-NEXT: movb $8, %al +; X86-NOCMOV-NEXT: xorb $7, %al +; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlz_xor7_i8_false: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-CMOV-NEXT: bsrl %eax, %ecx +; X86-CMOV-NEXT: movl $15, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: # kill: def $al killed $al killed $eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlz_xor7_i8_false: ; X64: # %bb.0: -; X64-NEXT: testb %dil, %dil -; X64-NEXT: je .LBB16_1 -; X64-NEXT: # %bb.2: # %cond.false ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: bsrl %eax, %eax -; X64-NEXT: xorl $7, %eax -; X64-NEXT: xorb $7, %al -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq -; X64-NEXT: .LBB16_1: -; X64-NEXT: movb $8, %al -; X64-NEXT: xorb $7, %al +; X64-NEXT: bsrl %eax, %ecx +; X64-NEXT: movl $15, %eax +; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: # kill: def $al 
killed $al killed $eax ; X64-NEXT: retq ; @@ -1060,33 +1070,33 @@ define i16 @ctlz_xor15_i16_true(i16 %x) { } define i32 @ctlz_xor31_i32_false(i32 %x) { -; X86-LABEL: ctlz_xor31_i32_false: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB18_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB18_1: -; X86-NEXT: movl $32, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: ctlz_xor31_i32_false: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testl %eax, %eax +; X86-NOCMOV-NEXT: je .LBB18_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB18_1: +; X86-NOCMOV-NEXT: movl $32, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlz_xor31_i32_false: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: bsrl {{[0-9]+}}(%esp), %ecx +; X86-CMOV-NEXT: movl $63, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: ctlz_xor31_i32_false: ; X64: # %bb.0: -; X64-NEXT: testl %edi, %edi -; X64-NEXT: je .LBB18_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsrl %edi, %eax -; X64-NEXT: xorl $31, %eax -; X64-NEXT: xorl $31, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB18_1: -; X64-NEXT: movl $32, %eax -; X64-NEXT: xorl $31, %eax +; X64-NEXT: bsrl %edi, %ecx +; X64-NEXT: movl $63, %eax +; X64-NEXT: cmovnel %ecx, %eax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlz_xor31_i32_false: diff --git a/llvm/test/CodeGen/X86/cttz.ll b/llvm/test/CodeGen/X86/cttz.ll index b35a1b72fcb6f1..27f229b18bf057 100644 --- a/llvm/test/CodeGen/X86/cttz.ll +++ b/llvm/test/CodeGen/X86/cttz.ll @@ -303,17 +303,24 @@ define i16 @cttz_i16_zero_test(i16 %n) { ; Generate a test and branch to handle zero 
inputs because bsr/bsf are very slow. define i32 @cttz_i32_zero_test(i32 %n) { -; X86-LABEL: cttz_i32_zero_test: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB6_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB6_1: -; X86-NEXT: movl $32, %eax -; X86-NEXT: retl +; X86-NOCMOV-LABEL: cttz_i32_zero_test: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testl %eax, %eax +; X86-NOCMOV-NEXT: je .LBB6_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: rep bsfl %eax, %eax +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB6_1: +; X86-NOCMOV-NEXT: movl $32, %eax +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: cttz_i32_zero_test: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: bsfl {{[0-9]+}}(%esp), %ecx +; X86-CMOV-NEXT: movl $32, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: retl ; ; X64-LABEL: cttz_i32_zero_test: ; X64: # %bb.0: @@ -386,13 +393,9 @@ define i64 @cttz_i64_zero_test(i64 %n) { ; ; X64-LABEL: cttz_i64_zero_test: ; X64: # %bb.0: -; X64-NEXT: testq %rdi, %rdi -; X64-NEXT: je .LBB7_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: rep bsfq %rdi, %rax -; X64-NEXT: retq -; X64-NEXT: .LBB7_1: +; X64-NEXT: bsfq %rdi, %rcx ; X64-NEXT: movl $64, %eax +; X64-NEXT: cmovneq %rcx, %rax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: cttz_i64_zero_test: diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll index d5d604a138a719..ac41a3fe6bb7e4 100644 --- a/llvm/test/CodeGen/X86/known-never-zero.ll +++ b/llvm/test/CodeGen/X86/known-never-zero.ll @@ -44,12 +44,9 @@ define i32 @or_maybe_zero(i32 %x, i32 %y) { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: je .LBB1_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB1_1: +; X86-NEXT: bsfl %eax, %ecx ; 
X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: or_maybe_zero: @@ -94,18 +91,14 @@ define i32 @select_known_nonzero(i1 %c, i32 %x) { define i32 @select_maybe_zero(i1 %c, i32 %x) { ; X86-LABEL: select_maybe_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl $1, %ecx -; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl $1, %eax +; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: testb $1, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnel %ecx, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB3_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB3_1: +; X86-NEXT: cmovnel %eax, %ecx +; X86-NEXT: bsfl %ecx, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: select_maybe_zero: @@ -201,13 +194,9 @@ define i32 @shl_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shll %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB7_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB7_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: shl_maybe_zero: @@ -251,17 +240,13 @@ define i32 @uaddsat_known_nonzero(i32 %x) { define i32 @uaddsat_maybe_zero(i32 %x, i32 %y) { ; X86-LABEL: uaddsat_maybe_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $-1, %eax -; X86-NEXT: cmovael %ecx, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB9_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB9_1: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovael %eax, %ecx +; X86-NEXT: bsfl %ecx, %ecx ; X86-NEXT: movl 
$32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: uaddsat_maybe_zero: @@ -314,13 +299,9 @@ define i32 @umax_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: cmoval %ecx, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB11_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB11_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: umax_maybe_zero: @@ -372,17 +353,13 @@ define i32 @umin_known_nonzero(i32 %xx, i32 %yy) { define i32 @umin_maybe_zero(i32 %x, i32 %y) { ; X86-LABEL: umin_maybe_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl $54, %ecx -; X86-NEXT: movl $54, %eax -; X86-NEXT: cmovbl %ecx, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB13_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB13_1: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $54, %eax +; X86-NEXT: movl $54, %ecx +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: bsfl %ecx, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: umin_maybe_zero: @@ -490,17 +467,13 @@ define <4 x i32> @smin_known_zero_vec(<4 x i32> %x, <4 x i32> %y) { define i32 @smin_maybe_zero(i32 %x, i32 %y) { ; X86-LABEL: smin_maybe_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl $54, %ecx -; X86-NEXT: movl $54, %eax -; X86-NEXT: cmovll %ecx, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB17_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB17_1: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $54, %eax +; X86-NEXT: movl $54, %ecx +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: bsfl %ecx, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; 
X64-LABEL: smin_maybe_zero: @@ -608,17 +581,13 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) { define i32 @smax_known_zero(i32 %x, i32 %y) { ; X86-LABEL: smax_known_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: testl %ecx, %ecx -; X86-NEXT: movl $-1, %eax -; X86-NEXT: cmovnsl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB21_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB21_1: +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovnsl %eax, %ecx +; X86-NEXT: bsfl %ecx, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: smax_known_zero: @@ -643,14 +612,8 @@ define i32 @rotr_known_nonzero(i32 %xx, i32 %y) { ; X86-NEXT: movl $256, %eax # imm = 0x100 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rorl %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB22_1 -; X86-NEXT: # %bb.2: # %cond.false ; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: retl -; X86-NEXT: .LBB22_1: -; X86-NEXT: movl $32, %eax -; X86-NEXT: retl ; ; X64-LABEL: rotr_known_nonzero: ; X64: # %bb.0: @@ -675,13 +638,9 @@ define i32 @rotr_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rorl %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB23_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB23_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: rotr_maybe_zero: @@ -733,13 +692,9 @@ define i32 @rotr_with_fshr_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rorl %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB25_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; 
X86-NEXT: .LBB25_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: rotr_with_fshr_maybe_zero: @@ -765,14 +720,8 @@ define i32 @rotl_known_nonzero(i32 %xx, i32 %y) { ; X86-NEXT: movl $256, %eax # imm = 0x100 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: roll %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB26_1 -; X86-NEXT: # %bb.2: # %cond.false ; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: retl -; X86-NEXT: .LBB26_1: -; X86-NEXT: movl $32, %eax -; X86-NEXT: retl ; ; X64-LABEL: rotl_known_nonzero: ; X64: # %bb.0: @@ -797,13 +746,9 @@ define i32 @rotl_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: roll %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB27_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB27_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: rotl_maybe_zero: @@ -855,13 +800,9 @@ define i32 @rotl_with_fshl_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: roll %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB29_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB29_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: rotl_with_fshl_maybe_zero: @@ -932,13 +873,9 @@ define i32 @sra_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sarl %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB32_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB32_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel 
%ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: sra_maybe_zero: @@ -1009,13 +946,9 @@ define i32 @srl_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl %cl, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB35_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB35_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: srl_maybe_zero: @@ -1064,13 +997,9 @@ define i32 @udiv_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl {{[0-9]+}}(%esp) -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB37_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB37_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: udiv_maybe_zero: @@ -1119,13 +1048,9 @@ define i32 @sdiv_maybe_zero(i32 %x, i32 %y) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cltd ; X86-NEXT: idivl {{[0-9]+}}(%esp) -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB39_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB39_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: sdiv_maybe_zero: @@ -1171,12 +1096,9 @@ define i32 @add_maybe_zero(i32 %xx, i32 %y) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl $1, %eax ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax -; X86-NEXT: je .LBB41_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB41_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: add_maybe_zero: @@ -1249,16 +1171,13 @@ define i32 @sub_known_nonzero_ne_case(i32 %xx, i32 %yy) { 
define i32 @sub_maybe_zero(i32 %x) { ; X86-LABEL: sub_maybe_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: orl $64, %eax -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: je .LBB44_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB44_1: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: orl $64, %ecx +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: bsfl %ecx, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: sub_maybe_zero: @@ -1280,14 +1199,11 @@ define i32 @sub_maybe_zero(i32 %x) { define i32 @sub_maybe_zero2(i32 %x) { ; X86-LABEL: sub_maybe_zero2: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: negl %eax -; X86-NEXT: je .LBB45_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB45_1: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: sub_maybe_zero2: @@ -1310,13 +1226,9 @@ define i32 @mul_known_nonzero_nsw(i32 %x, i32 %yy) { ; X86-NEXT: movl $256, %eax # imm = 0x100 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB46_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB46_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: mul_known_nonzero_nsw: @@ -1341,13 +1253,9 @@ define i32 @mul_known_nonzero_nuw(i32 %x, i32 %yy) { ; X86-NEXT: movl $256, %eax # imm = 0x100 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB47_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: 
retl -; X86-NEXT: .LBB47_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: mul_known_nonzero_nuw: @@ -1371,13 +1279,9 @@ define i32 @mul_maybe_zero(i32 %x, i32 %y) { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB48_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB48_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: mul_maybe_zero: @@ -1433,13 +1337,9 @@ define i32 @bitcast_maybe_zero(<2 x i16> %x) { ; X86-LABEL: bitcast_maybe_zero: ; X86: # %bb.0: ; X86-NEXT: movd %xmm0, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB50_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB50_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: bitcast_maybe_zero: @@ -1458,15 +1358,9 @@ define i32 @bitcast_maybe_zero(<2 x i16> %x) { define i32 @bitcast_from_float(float %x) { ; X86-LABEL: bitcast_from_float: ; X86: # %bb.0: -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movd %xmm0, %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB51_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB51_1: +; X86-NEXT: bsfl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: bitcast_from_float: @@ -1511,14 +1405,9 @@ define i32 @zext_maybe_zero(i16 %x) { ; X86-LABEL: zext_maybe_zero: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testw %ax, %ax -; X86-NEXT: je .LBB53_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB53_1: +; 
X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: zext_maybe_zero: @@ -1563,13 +1452,9 @@ define i32 @sext_maybe_zero(i16 %x) { ; X86-LABEL: sext_maybe_zero: ; X86: # %bb.0: ; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB55_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB55_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: sext_maybe_zero: diff --git a/llvm/test/CodeGen/X86/lzcnt-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-cmp.ll index a9513a373661f4..4f65739cc70dd1 100644 --- a/llvm/test/CodeGen/X86/lzcnt-cmp.ll +++ b/llvm/test/CodeGen/X86/lzcnt-cmp.ll @@ -12,27 +12,11 @@ define i1 @lshr_ctlz_cmpeq_one_i64(i64 %in) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: retl ; -; X64-BSR-LABEL: lshr_ctlz_cmpeq_one_i64: -; X64-BSR: # %bb.0: -; X64-BSR-NEXT: testq %rdi, %rdi -; X64-BSR-NEXT: je .LBB0_1 -; X64-BSR-NEXT: # %bb.2: # %cond.false -; X64-BSR-NEXT: bsrq %rdi, %rax -; X64-BSR-NEXT: xorq $63, %rax -; X64-BSR-NEXT: shrl $6, %eax -; X64-BSR-NEXT: # kill: def $al killed $al killed $rax -; X64-BSR-NEXT: retq -; X64-BSR-NEXT: .LBB0_1: -; X64-BSR-NEXT: movl $64, %eax -; X64-BSR-NEXT: shrl $6, %eax -; X64-BSR-NEXT: # kill: def $al killed $al killed $rax -; X64-BSR-NEXT: retq -; -; X64-LZCNT-LABEL: lshr_ctlz_cmpeq_one_i64: -; X64-LZCNT: # %bb.0: -; X64-LZCNT-NEXT: testq %rdi, %rdi -; X64-LZCNT-NEXT: sete %al -; X64-LZCNT-NEXT: retq +; X64-LABEL: lshr_ctlz_cmpeq_one_i64: +; X64: # %bb.0: +; X64-NEXT: testq %rdi, %rdi +; X64-NEXT: sete %al +; X64-NEXT: retq %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 0) %lshr = lshr i64 %ctlz, 6 %icmp = icmp eq i64 %lshr, 1 @@ -81,27 +65,11 @@ define i1 @lshr_ctlz_cmpne_zero_i64(i64 %in) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: retl ; -; X64-BSR-LABEL: lshr_ctlz_cmpne_zero_i64: -; X64-BSR: # %bb.0: -; 
X64-BSR-NEXT: testq %rdi, %rdi -; X64-BSR-NEXT: je .LBB2_1 -; X64-BSR-NEXT: # %bb.2: # %cond.false -; X64-BSR-NEXT: bsrq %rdi, %rax -; X64-BSR-NEXT: xorq $63, %rax -; X64-BSR-NEXT: shrl $6, %eax -; X64-BSR-NEXT: # kill: def $al killed $al killed $rax -; X64-BSR-NEXT: retq -; X64-BSR-NEXT: .LBB2_1: -; X64-BSR-NEXT: movl $64, %eax -; X64-BSR-NEXT: shrl $6, %eax -; X64-BSR-NEXT: # kill: def $al killed $al killed $rax -; X64-BSR-NEXT: retq -; -; X64-LZCNT-LABEL: lshr_ctlz_cmpne_zero_i64: -; X64-LZCNT: # %bb.0: -; X64-LZCNT-NEXT: testq %rdi, %rdi -; X64-LZCNT-NEXT: sete %al -; X64-LZCNT-NEXT: retq +; X64-LABEL: lshr_ctlz_cmpne_zero_i64: +; X64: # %bb.0: +; X64-NEXT: testq %rdi, %rdi +; X64-NEXT: sete %al +; X64-NEXT: retq %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 0) %lshr = lshr i64 %ctlz, 6 %icmp = icmp ne i64 %lshr, 0 diff --git a/llvm/test/CodeGen/X86/pr57673.ll b/llvm/test/CodeGen/X86/pr57673.ll index d0ae6cea068dc0..cf7717f420480b 100644 --- a/llvm/test/CodeGen/X86/pr57673.ll +++ b/llvm/test/CodeGen/X86/pr57673.ll @@ -24,35 +24,24 @@ define void @foo() { ; NORMAL-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY [[MOV32r0_]].sub_8bit ; NORMAL-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r %stack.1.i, 1, $noreg, 0, $noreg ; NORMAL-NEXT: [[DEF:%[0-9]+]]:gr64 = IMPLICIT_DEF - ; NORMAL-NEXT: [[DEF1:%[0-9]+]]:gr64 = IMPLICIT_DEF ; NORMAL-NEXT: {{ $}} ; NORMAL-NEXT: bb.1.bb_8: - ; NORMAL-NEXT: successors: %bb.5(0x40000000), %bb.2(0x40000000) + ; NORMAL-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; NORMAL-NEXT: {{ $}} ; NORMAL-NEXT: TEST8rr [[COPY]], [[COPY]], implicit-def $eflags - ; NORMAL-NEXT: JCC_1 %bb.5, 5, implicit $eflags + ; NORMAL-NEXT: JCC_1 %bb.3, 5, implicit $eflags ; NORMAL-NEXT: JMP_1 %bb.2 ; NORMAL-NEXT: {{ $}} ; NORMAL-NEXT: bb.2.bb_mid: - ; NORMAL-NEXT: successors: %bb.4(0x30000000), %bb.3(0x50000000) + ; NORMAL-NEXT: successors: %bb.3(0x80000000) ; NORMAL-NEXT: {{ $}} - ; NORMAL-NEXT: TEST64rr [[DEF1]], [[DEF1]], implicit-def $eflags - ; NORMAL-NEXT: JCC_1 %bb.4, 
4, implicit $eflags - ; NORMAL-NEXT: JMP_1 %bb.3 - ; NORMAL-NEXT: {{ $}} - ; NORMAL-NEXT: bb.3.cond.false: - ; NORMAL-NEXT: successors: %bb.4(0x80000000) - ; NORMAL-NEXT: {{ $}} - ; NORMAL-NEXT: bb.4.cond.end: - ; NORMAL-NEXT: successors: %bb.5(0x80000000) - ; NORMAL-NEXT: {{ $}} - ; NORMAL-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8) + ; NORMAL-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8) ; NORMAL-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into `ptr null`, align 8) - ; NORMAL-NEXT: DBG_VALUE_LIST !3, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 40), [[LEA64r]], [[LEA64r]], debug-location !8 - ; NORMAL-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8) + ; NORMAL-NEXT: DBG_VALUE $noreg, $noreg, !3, !DIExpression(), debug-location !8 + ; NORMAL-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8) ; NORMAL-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm1]] :: (store (s128) into `ptr null`, align 8) ; NORMAL-NEXT: {{ $}} - ; NORMAL-NEXT: bb.5.bb_last: + ; NORMAL-NEXT: bb.3.bb_last: ; NORMAL-NEXT: successors: %bb.1(0x80000000) ; NORMAL-NEXT: {{ $}} ; NORMAL-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp @@ -74,35 +63,24 @@ define void @foo() { ; INSTRREF-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY [[MOV32r0_]].sub_8bit ; INSTRREF-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r %stack.1.i, 1, $noreg, 0, $noreg ; INSTRREF-NEXT: [[DEF:%[0-9]+]]:gr64 = IMPLICIT_DEF - ; INSTRREF-NEXT: [[DEF1:%[0-9]+]]:gr64 = IMPLICIT_DEF ; INSTRREF-NEXT: {{ $}} ; INSTRREF-NEXT: bb.1.bb_8: - ; INSTRREF-NEXT: successors: %bb.5(0x40000000), %bb.2(0x40000000) + ; INSTRREF-NEXT: successors: 
%bb.3(0x40000000), %bb.2(0x40000000) ; INSTRREF-NEXT: {{ $}} ; INSTRREF-NEXT: TEST8rr [[COPY]], [[COPY]], implicit-def $eflags - ; INSTRREF-NEXT: JCC_1 %bb.5, 5, implicit $eflags + ; INSTRREF-NEXT: JCC_1 %bb.3, 5, implicit $eflags ; INSTRREF-NEXT: JMP_1 %bb.2 ; INSTRREF-NEXT: {{ $}} ; INSTRREF-NEXT: bb.2.bb_mid: - ; INSTRREF-NEXT: successors: %bb.4(0x30000000), %bb.3(0x50000000) - ; INSTRREF-NEXT: {{ $}} - ; INSTRREF-NEXT: TEST64rr [[DEF1]], [[DEF1]], implicit-def $eflags - ; INSTRREF-NEXT: JCC_1 %bb.4, 4, implicit $eflags - ; INSTRREF-NEXT: JMP_1 %bb.3 - ; INSTRREF-NEXT: {{ $}} - ; INSTRREF-NEXT: bb.3.cond.false: - ; INSTRREF-NEXT: successors: %bb.4(0x80000000) - ; INSTRREF-NEXT: {{ $}} - ; INSTRREF-NEXT: bb.4.cond.end: - ; INSTRREF-NEXT: successors: %bb.5(0x80000000) + ; INSTRREF-NEXT: successors: %bb.3(0x80000000) ; INSTRREF-NEXT: {{ $}} - ; INSTRREF-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8) + ; INSTRREF-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8) ; INSTRREF-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into `ptr null`, align 8) - ; INSTRREF-NEXT: DBG_INSTR_REF !3, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), dbg-instr-ref(1, 0), debug-location !8 - ; INSTRREF-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm [[LEA64r]], 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8) + ; INSTRREF-NEXT: DBG_VALUE $noreg, $noreg, !3, !DIExpression(), debug-location !8 + ; INSTRREF-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8) ; INSTRREF-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm1]] :: (store (s128) into `ptr null`, align 8) ; INSTRREF-NEXT: {{ $}} - ; INSTRREF-NEXT: bb.5.bb_last: + ; INSTRREF-NEXT: bb.3.bb_last: ; INSTRREF-NEXT: successors: %bb.1(0x80000000) ; INSTRREF-NEXT: {{ $}} ; 
INSTRREF-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp diff --git a/llvm/test/CodeGen/X86/pr89877.ll b/llvm/test/CodeGen/X86/pr89877.ll index fdbe75b467d992..19baad26583ada 100644 --- a/llvm/test/CodeGen/X86/pr89877.ll +++ b/llvm/test/CodeGen/X86/pr89877.ll @@ -9,13 +9,9 @@ define i32 @sext_known_nonzero(i16 %xx) { ; X86-NEXT: movl $256, %eax # imm = 0x100 ; X86-NEXT: shll %cl, %eax ; X86-NEXT: cwtl -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB0_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: rep bsfl %eax, %eax -; X86-NEXT: retl -; X86-NEXT: .LBB0_1: +; X86-NEXT: bsfl %eax, %ecx ; X86-NEXT: movl $32, %eax +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: sext_known_nonzero: diff --git a/llvm/test/CodeGen/X86/pr92569.ll b/llvm/test/CodeGen/X86/pr92569.ll index f91063089e3a90..0fb4ed7905287c 100644 --- a/llvm/test/CodeGen/X86/pr92569.ll +++ b/llvm/test/CodeGen/X86/pr92569.ll @@ -4,17 +4,13 @@ define void @PR92569(i64 %arg, <8 x i8> %arg1) { ; CHECK-LABEL: PR92569: ; CHECK: # %bb.0: -; CHECK-NEXT: testq %rdi, %rdi -; CHECK-NEXT: je .LBB0_1 -; CHECK-NEXT: # %bb.2: # %cond.false -; CHECK-NEXT: rep bsfq %rdi, %rax -; CHECK-NEXT: jmp .LBB0_3 -; CHECK-NEXT: .LBB0_1: -; CHECK-NEXT: movl $64, %eax -; CHECK-NEXT: .LBB0_3: # %cond.end -; CHECK-NEXT: shrb $3, %al +; CHECK-NEXT: bsfq %rdi, %rax +; CHECK-NEXT: movl $64, %ecx +; CHECK-NEXT: cmovneq %rax, %rcx +; CHECK-NEXT: shrb $3, %cl ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: movzbl %cl, %eax +; CHECK-NEXT: andl $15, %eax ; CHECK-NEXT: movzbl -24(%rsp,%rax), %eax ; CHECK-NEXT: movl %eax, 0 ; CHECK-NEXT: retq diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll b/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll index 06909d950addb6..2c2923440bf7c2 100644 --- a/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll +++ 
b/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes='require,function(codegenprepare)' < %s | FileCheck %s --check-prefix=SLOW -; RUN: opt -S -passes='require,function(codegenprepare)' -mattr=+bmi < %s | FileCheck %s --check-prefix=FAST_TZ -; RUN: opt -S -passes='require,function(codegenprepare)' -mattr=+lzcnt < %s | FileCheck %s --check-prefix=FAST_LZ +; RUN: opt -S -passes="require,function(codegenprepare)" < %s | FileCheck %s --check-prefix=SLOW +; RUN: opt -S -passes="require,function(codegenprepare)" -mattr=+bmi < %s | FileCheck %s --check-prefix=FAST_TZ +; RUN: opt -S -passes="require,function(codegenprepare)" -mattr=+lzcnt < %s | FileCheck %s --check-prefix=FAST_LZ -; RUN: opt -S -enable-debugify -passes='require,function(codegenprepare)' < %s | FileCheck %s --check-prefix=DEBUGINFO -; RUN: opt -S -enable-debugify -passes='require,function(codegenprepare)' --try-experimental-debuginfo-iterators < %s | FileCheck %s --check-prefix=DEBUGINFO +; RUN: opt -S -enable-debugify -passes="require,function(codegenprepare)" < %s | FileCheck %s --check-prefix=DEBUGINFO +; RUN: opt -S -enable-debugify -passes="require,function(codegenprepare)" --try-experimental-debuginfo-iterators < %s | FileCheck %s --check-prefix=DEBUGINFO target triple = "x86_64-unknown-unknown" target datalayout = "e-n32:64" @@ -16,15 +16,8 @@ target datalayout = "e-n32:64" define i64 @cttz(i64 %A) { ; SLOW-LABEL: @cttz( ; SLOW-NEXT: entry: -; SLOW-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]] -; SLOW-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0 -; SLOW-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]] -; SLOW: cond.false: -; SLOW-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A_FR]], i1 true) -; SLOW-NEXT: br label [[COND_END]] -; SLOW: cond.end: -; SLOW-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ] -; SLOW-NEXT: ret i64 [[CTZ]] +; 
SLOW-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A:%.*]], i1 false) +; SLOW-NEXT: ret i64 [[Z]] ; ; FAST_TZ-LABEL: @cttz( ; FAST_TZ-NEXT: entry: @@ -33,28 +26,14 @@ define i64 @cttz(i64 %A) { ; ; FAST_LZ-LABEL: @cttz( ; FAST_LZ-NEXT: entry: -; FAST_LZ-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]] -; FAST_LZ-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0 -; FAST_LZ-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]] -; FAST_LZ: cond.false: -; FAST_LZ-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A_FR]], i1 true) -; FAST_LZ-NEXT: br label [[COND_END]] -; FAST_LZ: cond.end: -; FAST_LZ-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ] -; FAST_LZ-NEXT: ret i64 [[CTZ]] +; FAST_LZ-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A:%.*]], i1 false) +; FAST_LZ-NEXT: ret i64 [[Z]] ; ; DEBUGINFO-LABEL: @cttz( ; DEBUGINFO-NEXT: entry: -; DEBUGINFO-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]], !dbg [[DBG11:![0-9]+]] -; DEBUGINFO-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0, !dbg [[DBG11]] -; DEBUGINFO-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG11]] -; DEBUGINFO: cond.false: -; DEBUGINFO-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A_FR]], i1 true), !dbg [[DBG11]] -; DEBUGINFO-NEXT: br label [[COND_END]], !dbg [[DBG12:![0-9]+]] -; DEBUGINFO: cond.end: -; DEBUGINFO-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ], !dbg [[DBG12]] -; DEBUGINFO-NEXT: #dbg_value(i64 [[CTZ]], [[META9:![0-9]+]], !DIExpression(), [[DBG11]]) -; DEBUGINFO-NEXT: ret i64 [[CTZ]], !dbg [[DBG12]] +; DEBUGINFO-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A:%.*]], i1 false), !dbg [[DBG11:![0-9]+]] +; DEBUGINFO-NEXT: #dbg_value(i64 [[Z]], [[META9:![0-9]+]], !DIExpression(), [[DBG11]]) +; DEBUGINFO-NEXT: ret i64 [[Z]], !dbg [[DBG12:![0-9]+]] ; entry: %z = call i64 @llvm.cttz.i64(i64 %A, i1 false) @@ -64,27 +43,13 @@ entry: define i64 @ctlz(i64 %A) { ; SLOW-LABEL: @ctlz( ; SLOW-NEXT: entry: -; 
SLOW-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]] -; SLOW-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0 -; SLOW-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]] -; SLOW: cond.false: -; SLOW-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A_FR]], i1 true) -; SLOW-NEXT: br label [[COND_END]] -; SLOW: cond.end: -; SLOW-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ] -; SLOW-NEXT: ret i64 [[CTZ]] +; SLOW-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A:%.*]], i1 false) +; SLOW-NEXT: ret i64 [[Z]] ; ; FAST_TZ-LABEL: @ctlz( ; FAST_TZ-NEXT: entry: -; FAST_TZ-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]] -; FAST_TZ-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0 -; FAST_TZ-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]] -; FAST_TZ: cond.false: -; FAST_TZ-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A_FR]], i1 true) -; FAST_TZ-NEXT: br label [[COND_END]] -; FAST_TZ: cond.end: -; FAST_TZ-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ] -; FAST_TZ-NEXT: ret i64 [[CTZ]] +; FAST_TZ-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A:%.*]], i1 false) +; FAST_TZ-NEXT: ret i64 [[Z]] ; ; FAST_LZ-LABEL: @ctlz( ; FAST_LZ-NEXT: entry: @@ -93,16 +58,9 @@ define i64 @ctlz(i64 %A) { ; ; DEBUGINFO-LABEL: @ctlz( ; DEBUGINFO-NEXT: entry: -; DEBUGINFO-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]], !dbg [[DBG16:![0-9]+]] -; DEBUGINFO-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0, !dbg [[DBG16]] -; DEBUGINFO-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG16]] -; DEBUGINFO: cond.false: -; DEBUGINFO-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A_FR]], i1 true), !dbg [[DBG16]] -; DEBUGINFO-NEXT: br label [[COND_END]], !dbg [[DBG17:![0-9]+]] -; DEBUGINFO: cond.end: -; DEBUGINFO-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ], !dbg [[DBG17]] -; DEBUGINFO-NEXT: #dbg_value(i64 [[CTZ]], [[META15:![0-9]+]], !DIExpression(), [[DBG16]]) -; 
DEBUGINFO-NEXT: ret i64 [[CTZ]], !dbg [[DBG17]] +; DEBUGINFO-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A:%.*]], i1 false), !dbg [[DBG16:![0-9]+]] +; DEBUGINFO-NEXT: #dbg_value(i64 [[Z]], [[META15:![0-9]+]], !DIExpression(), [[DBG16]]) +; DEBUGINFO-NEXT: ret i64 [[Z]], !dbg [[DBG17:![0-9]+]] ; entry: %z = call i64 @llvm.ctlz.i64(i64 %A, i1 false) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll index 0462f125955bf4..8a22e45fe1ca57 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=icelake-server -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 @@ -136,11 +136,32 @@ define void @ctlz_4i64() #0 { } define void @ctlz_4i32() #0 { -; CHECK-LABEL: @ctlz_4i32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) -; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 -; CHECK-NEXT: ret void +; SSE2-LABEL: @ctlz_4i32( +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, 
ptr @src32, align 4 +; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; SSE2-NEXT: ret void +; +; SSE4-LABEL: @ctlz_4i32( +; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 +; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 +; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 +; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4 +; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) +; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4 +; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4 +; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4 +; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 +; SSE4-NEXT: ret void +; +; AVX-LABEL: @ctlz_4i32( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 +; AVX-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; AVX-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; AVX-NEXT: ret void ; %ld0 = load i32, ptr @src32, align 4 %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 @@ -158,14 +179,41 @@ define void @ctlz_4i32() #0 { } define void @ctlz_8i32() #0 { -; SSE-LABEL: @ctlz_8i32( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2 -; SSE-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) -; 
SSE-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 -; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false) -; SSE-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 -; SSE-NEXT: ret void +; SSE2-LABEL: @ctlz_8i32( +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2 +; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 2 +; SSE2-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 +; SSE2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false) +; SSE2-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 +; SSE2-NEXT: ret void +; +; SSE4-LABEL: @ctlz_8i32( +; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 +; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 +; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 +; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2 +; SSE4-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 +; SSE4-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2 +; SSE4-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2 +; SSE4-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2 +; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 
[[LD1]], i1 false) +; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; SSE4-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false) +; SSE4-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) +; SSE4-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) +; SSE4-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) +; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 2 +; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2 +; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2 +; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2 +; SSE4-NEXT: store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 +; SSE4-NEXT: store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2 +; SSE4-NEXT: store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2 +; SSE4-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 +; SSE4-NEXT: ret void ; ; AVX-LABEL: @ctlz_8i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 From c46b41aaa6eaa787f808738d14c61a2f8b6d839f Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 22 Aug 2024 11:37:19 +0100 Subject: [PATCH 174/426] LSV: forbid load-cycles when vectorizing; fix bug (#104815) Forbid load-load cycles which would crash LoadStoreVectorizer when reordering instructions. Fixes #37865. 
--- .../Vectorize/LoadStoreVectorizer.cpp | 24 +++++- .../LoadStoreVectorizer/AArch64/pr37865.ll | 79 ++++++++++++++++++- 2 files changed, 98 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 02ec1d5c259cd6..dd37c95eca61a3 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -998,10 +998,32 @@ bool Vectorizer::isSafeToMove( LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> " << *ChainBegin << ")\n"); - assert(isa(ChainElem) == IsLoadChain); + assert(isa(ChainElem) == IsLoadChain && + isa(ChainBegin) == IsLoadChain); + if (ChainElem == ChainBegin) return true; + if constexpr (IsLoadChain) { + // If ChainElem depends on ChainBegin, they're not safe to reorder. + SmallVector Worklist; + Worklist.emplace_back(ChainElem); + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + for (Use &O : I->operands()) { + if (isa(O)) + continue; + if (auto *J = dyn_cast(O)) { + if (J == ChainBegin) { + LLVM_DEBUG(dbgs() << "LSV: dependent loads; not safe to reorder\n"); + return false; + } + Worklist.emplace_back(J); + } + } + } + } + // Invariant loads can always be reordered; by definition they are not // clobbered by stores. 
if (isInvariantLoad(ChainElem)) diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AArch64/pr37865.ll b/llvm/test/Transforms/LoadStoreVectorizer/AArch64/pr37865.ll index 833e70814c2917..0beca8c15305f6 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AArch64/pr37865.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AArch64/pr37865.ll @@ -1,9 +1,22 @@ -; REQUIRES: asserts -; RUN: not --crash opt -mtriple=aarch64 -passes=load-store-vectorizer \ -; RUN: -disable-output %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=aarch64 -passes=load-store-vectorizer -S %s | FileCheck %s + +; LSV was attempting to vectorize this earlier, but crashed while re-ordering +; instructions due to the load-load cycle. Now, the candidate loads are no +; longer considered safe for reordering. define i32 @load_cycle(ptr %x) { -; CHECK: Unexpected cycle while re-ordering instructions +; CHECK-LABEL: define i32 @load_cycle( +; CHECK-SAME: ptr [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[GEP_X_1:%.*]] = getelementptr inbounds [2 x i32], ptr [[X]], i32 0, i32 1 +; CHECK-NEXT: [[LOAD_X_1:%.*]] = load i32, ptr [[GEP_X_1]], align 4 +; CHECK-NEXT: [[REM:%.*]] = urem i32 [[LOAD_X_1]], 1 +; CHECK-NEXT: [[GEP_X_2:%.*]] = getelementptr inbounds [2 x i32], ptr [[X]], i32 [[REM]], i32 0 +; CHECK-NEXT: [[LOAD_X_2:%.*]] = load i32, ptr [[GEP_X_2]], align 4 +; CHECK-NEXT: [[RET:%.*]] = add i32 [[LOAD_X_2]], [[LOAD_X_1]] +; CHECK-NEXT: ret i32 [[RET]] +; entry: %gep.x.1 = getelementptr inbounds [2 x i32], ptr %x, i32 0, i32 1 %load.x.1 = load i32, ptr %gep.x.1 @@ -13,3 +26,61 @@ entry: %ret = add i32 %load.x.2, %load.x.1 ret i32 %ret } + +define i32 @load_cycle2(ptr %x, i32 %y) { +; CHECK-LABEL: define i32 @load_cycle2( +; CHECK-SAME: ptr [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[GEP_X_1:%.*]] = getelementptr inbounds [2 x i32], ptr [[X]], i32 [[Y]], i32 1 +; 
CHECK-NEXT: [[LOAD_X_1:%.*]] = load i32, ptr [[GEP_X_1]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[LOAD_X_1]], 2 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[Y]], [[MUL]] +; CHECK-NEXT: [[SUB_1:%.*]] = sub i32 [[ADD]], [[LOAD_X_1]] +; CHECK-NEXT: [[SUB_2:%.*]] = sub i32 [[SUB_1]], [[LOAD_X_1]] +; CHECK-NEXT: [[GEP_X_2:%.*]] = getelementptr inbounds [2 x i32], ptr [[X]], i32 [[SUB_2]], i32 0 +; CHECK-NEXT: [[LOAD_X_2:%.*]] = load i32, ptr [[GEP_X_2]], align 4 +; CHECK-NEXT: [[RET:%.*]] = add i32 [[LOAD_X_2]], [[LOAD_X_1]] +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + %gep.x.1 = getelementptr inbounds [2 x i32], ptr %x, i32 %y, i32 1 + %load.x.1 = load i32, ptr %gep.x.1 + %mul = mul i32 %load.x.1, 2 + %add = add i32 %y, %mul + %sub.1 = sub i32 %add, %load.x.1 + %sub.2 = sub i32 %sub.1, %load.x.1 + %gep.x.2 = getelementptr inbounds [2 x i32], ptr %x, i32 %sub.2, i32 0 + %load.x.2 = load i32, ptr %gep.x.2 + %ret = add i32 %load.x.2, %load.x.1 + ret i32 %ret +} + +@global.1 = global i32 0 +@global.2 = global [1 x [3 x i32]] zeroinitializer + +define i16 @load_cycle3() { +; CHECK-LABEL: define i16 @load_cycle3() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, ptr @global.1, align 4 +; CHECK-NEXT: [[UREM_1:%.*]] = urem i32 [[LOAD_1]], 1 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [1 x [3 x i32]], ptr @global.2, i32 0, i32 [[UREM_1]] +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds [3 x i32], ptr [[GEP_1]], i32 0, i32 2 +; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, ptr [[GEP_2]], align 4 +; CHECK-NEXT: [[UREM_2:%.*]] = urem i32 [[LOAD_2]], 1 +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds [1 x [3 x i32]], ptr @global.2, i32 0, i32 [[UREM_2]] +; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds [3 x i32], ptr [[GEP_3]], i32 0, i32 1 +; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, ptr [[GEP_4]], align 4 +; CHECK-NEXT: ret i16 0 +; +entry: + %load.1 = load i32, ptr @global.1 + %urem.1 = urem i32 %load.1, 1 + %gep.1 = getelementptr inbounds 
[1 x [3 x i32]], ptr @global.2, i32 0, i32 %urem.1 + %gep.2 = getelementptr inbounds [3 x i32], ptr %gep.1, i32 0, i32 2 + %load.2 = load i32, ptr %gep.2 + %urem.2 = urem i32 %load.2, 1 + %gep.3 = getelementptr inbounds [1 x [3 x i32]], ptr @global.2, i32 0, i32 %urem.2 + %gep.4 = getelementptr inbounds [3 x i32], ptr %gep.3, i32 0, i32 1 + %load.3 = load i32, ptr %gep.4 + ret i16 0 +} From 93a9406af52a190ed37270839678b98f2e86a739 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 22 Aug 2024 08:45:35 +0100 Subject: [PATCH 175/426] [AArch64] Add GISel srem/urem tests of various sizes. NFC --- llvm/test/CodeGen/AArch64/rem.ll | 3984 ++++++++++++++++++++++++++++++ 1 file changed, 3984 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/rem.ll diff --git a/llvm/test/CodeGen/AArch64/rem.ll b/llvm/test/CodeGen/AArch64/rem.ll new file mode 100644 index 00000000000000..7f4df00d4aa794 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/rem.ll @@ -0,0 +1,3984 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for si128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ui128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sv3i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uv3i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sv3i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uv3i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sv3i32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uv3i32 +; CHECK-GI-NEXT: warning: 
Instruction selection used fallback path for sv2i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sv3i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sv4i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uv2i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uv3i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uv4i128 + +define i8 @si8(i8 %a, i8 %b) { +; CHECK-SD-LABEL: si8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sxtb w8, w1 +; CHECK-SD-NEXT: sxtb w9, w0 +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: msub w0, w10, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: si8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sxtb w8, w0 +; CHECK-GI-NEXT: sxtb w9, w1 +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: msub w0, w8, w1, w0 +; CHECK-GI-NEXT: ret +entry: + %s = srem i8 %a, %b + ret i8 %s +} + +define i8 @ui8(i8 %a, i8 %b) { +; CHECK-SD-LABEL: ui8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: and w8, w1, #0xff +; CHECK-SD-NEXT: and w9, w0, #0xff +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: msub w0, w10, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ui8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and w8, w0, #0xff +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: udiv w8, w8, w9 +; CHECK-GI-NEXT: msub w0, w8, w1, w0 +; CHECK-GI-NEXT: ret +entry: + %s = urem i8 %a, %b + ret i8 %s +} + +define i16 @si16(i16 %a, i16 %b) { +; CHECK-SD-LABEL: si16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sxth w8, w1 +; CHECK-SD-NEXT: sxth w9, w0 +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: msub w0, w10, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: si16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sxth w8, w0 +; CHECK-GI-NEXT: sxth w9, w1 +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: msub w0, w8, w1, w0 +; CHECK-GI-NEXT: ret +entry: + %s = srem i16 %a, %b + ret i16 %s 
+} + +define i16 @ui16(i16 %a, i16 %b) { +; CHECK-SD-LABEL: ui16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: and w8, w1, #0xffff +; CHECK-SD-NEXT: and w9, w0, #0xffff +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: msub w0, w10, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ui16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and w8, w0, #0xffff +; CHECK-GI-NEXT: and w9, w1, #0xffff +; CHECK-GI-NEXT: udiv w8, w8, w9 +; CHECK-GI-NEXT: msub w0, w8, w1, w0 +; CHECK-GI-NEXT: ret +entry: + %s = urem i16 %a, %b + ret i16 %s +} + +define i32 @si32(i32 %a, i32 %b) { +; CHECK-LABEL: si32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sdiv w8, w0, w1 +; CHECK-NEXT: msub w0, w8, w1, w0 +; CHECK-NEXT: ret +entry: + %s = srem i32 %a, %b + ret i32 %s +} + +define i32 @ui32(i32 %a, i32 %b) { +; CHECK-LABEL: ui32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: udiv w8, w0, w1 +; CHECK-NEXT: msub w0, w8, w1, w0 +; CHECK-NEXT: ret +entry: + %s = urem i32 %a, %b + ret i32 %s +} + +define i64 @si64(i64 %a, i64 %b) { +; CHECK-LABEL: si64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sdiv x8, x0, x1 +; CHECK-NEXT: msub x0, x8, x1, x0 +; CHECK-NEXT: ret +entry: + %s = srem i64 %a, %b + ret i64 %s +} + +define i64 @ui64(i64 %a, i64 %b) { +; CHECK-LABEL: ui64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: udiv x8, x0, x1 +; CHECK-NEXT: msub x0, x8, x1, x0 +; CHECK-NEXT: ret +entry: + %s = urem i64 %a, %b + ret i64 %s +} + +define i128 @si128(i128 %a, i128 %b) { +; CHECK-LABEL: si128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl __modti3 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %s = srem i128 %a, %b + ret i128 %s +} + +define i128 @ui128(i128 %a, i128 %b) { +; CHECK-LABEL: ui128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl __umodti3 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %s = urem i128 %a, %b + ret i128 %s +} + +define <2 x i8> @sv2i8(<2 x i8> %d, <2 x i8> %e) { +; CHECK-SD-LABEL: sv2i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24 +; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #24 +; CHECK-SD-NEXT: sshr v1.2s, v1.2s, #24 +; CHECK-SD-NEXT: fmov w8, s1 +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: mov w11, v1.s[1] +; CHECK-SD-NEXT: mov w12, v0.s[1] +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: sdiv w13, w12, w11 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: fmov s0, w8 +; CHECK-SD-NEXT: msub w9, w13, w11, w12 +; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv2i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-GI-NEXT: shl v1.2s, v1.2s, #24 +; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #24 +; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #24 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: sdiv w9, w9, w10 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = srem <2 x i8> %d, %e + ret <2 x i8> %s +} + +define <3 x i8> @sv3i8(<3 x i8> %d, <3 x i8> %e) { +; CHECK-LABEL: sv3i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtb w8, w3 +; CHECK-NEXT: sxtb w9, w0 +; CHECK-NEXT: sxtb w11, w4 +; CHECK-NEXT: sxtb w12, w1 +; CHECK-NEXT: sxtb w14, w5 +; CHECK-NEXT: sxtb w15, w2 +; CHECK-NEXT: sdiv w10, w9, w8 +; CHECK-NEXT: sdiv w13, w12, w11 +; CHECK-NEXT: msub w0, w10, w8, w9 +; 
CHECK-NEXT: sdiv w16, w15, w14 +; CHECK-NEXT: msub w1, w13, w11, w12 +; CHECK-NEXT: msub w2, w16, w14, w15 +; CHECK-NEXT: ret +entry: + %s = srem <3 x i8> %d, %e + ret <3 x i8> %s +} + +define <4 x i8> @sv4i8(<4 x i8> %d, <4 x i8> %e) { +; CHECK-SD-LABEL: sv4i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-SD-NEXT: shl v1.4h, v1.4h, #8 +; CHECK-SD-NEXT: sshr v0.4h, v0.4h, #8 +; CHECK-SD-NEXT: sshr v1.4h, v1.4h, #8 +; CHECK-SD-NEXT: smov w11, v1.h[0] +; CHECK-SD-NEXT: smov w12, v0.h[0] +; CHECK-SD-NEXT: smov w8, v1.h[1] +; CHECK-SD-NEXT: smov w9, v0.h[1] +; CHECK-SD-NEXT: smov w14, v1.h[2] +; CHECK-SD-NEXT: smov w15, v0.h[2] +; CHECK-SD-NEXT: smov w17, v1.h[3] +; CHECK-SD-NEXT: smov w18, v0.h[3] +; CHECK-SD-NEXT: sdiv w13, w12, w11 +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w13, w11, w12 +; CHECK-SD-NEXT: fmov s0, w11 +; CHECK-SD-NEXT: sdiv w16, w15, w14 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: mov v0.h[1], w8 +; CHECK-SD-NEXT: sdiv w9, w18, w17 +; CHECK-SD-NEXT: msub w8, w16, w14, w15 +; CHECK-SD-NEXT: mov v0.h[2], w8 +; CHECK-SD-NEXT: msub w8, w9, w17, w18 +; CHECK-SD-NEXT: mov v0.h[3], w8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv4i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: shl v0.4s, v0.4s, #24 +; CHECK-GI-NEXT: shl v1.4s, v1.4s, #24 +; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #24 +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #24 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: mov w11, v1.s[2] +; CHECK-GI-NEXT: mov w12, v1.s[3] +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: sdiv w9, w9, w10 +; CHECK-GI-NEXT: mov w10, v0.s[2] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w11 +; CHECK-GI-NEXT: mov w11, v0.s[3] +; CHECK-GI-NEXT: mov 
v2.s[1], w9 +; CHECK-GI-NEXT: sdiv w8, w11, w12 +; CHECK-GI-NEXT: mov v2.s[2], w10 +; CHECK-GI-NEXT: mov v2.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret +entry: + %s = srem <4 x i8> %d, %e + ret <4 x i8> %s +} + +define <8 x i8> @sv8i8(<8 x i8> %d, <8 x i8> %e) { +; CHECK-SD-LABEL: sv8i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: smov w11, v1.b[0] +; CHECK-SD-NEXT: smov w12, v0.b[0] +; CHECK-SD-NEXT: smov w8, v1.b[1] +; CHECK-SD-NEXT: smov w9, v0.b[1] +; CHECK-SD-NEXT: smov w14, v1.b[2] +; CHECK-SD-NEXT: smov w15, v0.b[2] +; CHECK-SD-NEXT: smov w17, v1.b[3] +; CHECK-SD-NEXT: smov w18, v0.b[3] +; CHECK-SD-NEXT: smov w1, v1.b[4] +; CHECK-SD-NEXT: smov w2, v0.b[4] +; CHECK-SD-NEXT: smov w4, v1.b[5] +; CHECK-SD-NEXT: smov w5, v0.b[5] +; CHECK-SD-NEXT: sdiv w13, w12, w11 +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w13, w11, w12 +; CHECK-SD-NEXT: smov w13, v1.b[7] +; CHECK-SD-NEXT: fmov s2, w11 +; CHECK-SD-NEXT: smov w11, v0.b[6] +; CHECK-SD-NEXT: sdiv w16, w15, w14 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: smov w10, v1.b[6] +; CHECK-SD-NEXT: mov v2.b[1], w8 +; CHECK-SD-NEXT: sdiv w0, w18, w17 +; CHECK-SD-NEXT: msub w8, w16, w14, w15 +; CHECK-SD-NEXT: smov w14, v0.b[7] +; CHECK-SD-NEXT: mov v2.b[2], w8 +; CHECK-SD-NEXT: sdiv w3, w2, w1 +; CHECK-SD-NEXT: msub w8, w0, w17, w18 +; CHECK-SD-NEXT: mov v2.b[3], w8 +; CHECK-SD-NEXT: sdiv w9, w5, w4 +; CHECK-SD-NEXT: msub w8, w3, w1, w2 +; CHECK-SD-NEXT: mov v2.b[4], w8 +; CHECK-SD-NEXT: sdiv w12, w11, w10 +; CHECK-SD-NEXT: msub w8, w9, w4, w5 +; CHECK-SD-NEXT: mov v2.b[5], w8 +; CHECK-SD-NEXT: sdiv w9, w14, w13 +; CHECK-SD-NEXT: msub w8, w12, w10, w11 +; CHECK-SD-NEXT: mov v2.b[6], w8 +; CHECK-SD-NEXT: msub w8, w9, w13, w14 +; CHECK-SD-NEXT: mov v2.b[7], w8 +; CHECK-SD-NEXT: fmov d0, d2 +; CHECK-SD-NEXT: ret +; +; 
CHECK-GI-LABEL: sv8i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v3.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov w10, v3.s[1] +; CHECK-GI-NEXT: mov w11, v3.s[2] +; CHECK-GI-NEXT: mov w12, v3.s[3] +; CHECK-GI-NEXT: fmov w13, s1 +; CHECK-GI-NEXT: mov w14, v1.s[1] +; CHECK-GI-NEXT: mov w15, v1.s[2] +; CHECK-GI-NEXT: mov w16, v1.s[3] +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v2.s[1] +; CHECK-GI-NEXT: sdiv w9, w9, w10 +; CHECK-GI-NEXT: mov w10, v2.s[2] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w11 +; CHECK-GI-NEXT: mov w11, v2.s[3] +; CHECK-GI-NEXT: mov v4.s[1], w9 +; CHECK-GI-NEXT: sdiv w11, w11, w12 +; CHECK-GI-NEXT: fmov w12, s0 +; CHECK-GI-NEXT: mov v4.s[2], w10 +; CHECK-GI-NEXT: sdiv w12, w12, w13 +; CHECK-GI-NEXT: mov w13, v0.s[1] +; CHECK-GI-NEXT: mov v4.s[3], w11 +; CHECK-GI-NEXT: mls v2.4s, v4.4s, v3.4s +; CHECK-GI-NEXT: sdiv w13, w13, w14 +; CHECK-GI-NEXT: mov w14, v0.s[2] +; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: sdiv w14, w14, w15 +; CHECK-GI-NEXT: mov w15, v0.s[3] +; CHECK-GI-NEXT: mov v5.s[1], w13 +; CHECK-GI-NEXT: sdiv w8, w15, w16 +; CHECK-GI-NEXT: mov v5.s[2], w14 +; CHECK-GI-NEXT: mov v5.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v5.4s, v1.4s +; CHECK-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: ret +entry: + %s = srem <8 x i8> %d, %e + ret <8 x i8> %s +} + +define <16 x i8> @sv16i8(<16 x i8> %d, <16 x i8> %e) { +; CHECK-SD-LABEL: sv16i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: stp x28, x27, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-SD-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 80 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w23, -40 +; CHECK-SD-NEXT: .cfi_offset w24, -48 +; CHECK-SD-NEXT: .cfi_offset w25, -56 +; CHECK-SD-NEXT: .cfi_offset w26, -64 +; CHECK-SD-NEXT: .cfi_offset w27, -72 +; CHECK-SD-NEXT: .cfi_offset w28, -80 +; CHECK-SD-NEXT: smov w11, v1.b[0] +; CHECK-SD-NEXT: smov w12, v0.b[0] +; CHECK-SD-NEXT: smov w8, v1.b[1] +; CHECK-SD-NEXT: smov w9, v0.b[1] +; CHECK-SD-NEXT: smov w14, v1.b[2] +; CHECK-SD-NEXT: smov w15, v0.b[2] +; CHECK-SD-NEXT: smov w17, v1.b[3] +; CHECK-SD-NEXT: smov w18, v0.b[3] +; CHECK-SD-NEXT: smov w1, v1.b[4] +; CHECK-SD-NEXT: smov w2, v0.b[4] +; CHECK-SD-NEXT: smov w4, v1.b[5] +; CHECK-SD-NEXT: smov w5, v0.b[5] +; CHECK-SD-NEXT: sdiv w13, w12, w11 +; CHECK-SD-NEXT: smov w7, v1.b[6] +; CHECK-SD-NEXT: smov w19, v0.b[6] +; CHECK-SD-NEXT: smov w21, v1.b[7] +; CHECK-SD-NEXT: smov w22, v0.b[7] +; CHECK-SD-NEXT: smov w24, v1.b[8] +; CHECK-SD-NEXT: smov w25, v0.b[8] +; CHECK-SD-NEXT: smov w27, v1.b[9] +; CHECK-SD-NEXT: smov w28, v0.b[9] +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w13, w11, w12 +; CHECK-SD-NEXT: smov w13, v1.b[11] +; CHECK-SD-NEXT: fmov s2, w11 +; CHECK-SD-NEXT: smov w11, v0.b[10] +; CHECK-SD-NEXT: sdiv w16, w15, w14 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: smov w10, v1.b[10] +; CHECK-SD-NEXT: mov v2.b[1], w8 +; CHECK-SD-NEXT: sdiv w0, w18, w17 +; CHECK-SD-NEXT: msub w8, w16, w14, w15 +; CHECK-SD-NEXT: smov w14, v0.b[11] +; CHECK-SD-NEXT: smov w16, v1.b[12] +; CHECK-SD-NEXT: mov v2.b[2], w8 +; CHECK-SD-NEXT: 
sdiv w3, w2, w1 +; CHECK-SD-NEXT: msub w8, w0, w17, w18 +; CHECK-SD-NEXT: smov w17, v0.b[12] +; CHECK-SD-NEXT: smov w0, v1.b[13] +; CHECK-SD-NEXT: mov v2.b[3], w8 +; CHECK-SD-NEXT: sdiv w6, w5, w4 +; CHECK-SD-NEXT: msub w8, w3, w1, w2 +; CHECK-SD-NEXT: smov w1, v0.b[13] +; CHECK-SD-NEXT: mov v2.b[4], w8 +; CHECK-SD-NEXT: sdiv w20, w19, w7 +; CHECK-SD-NEXT: msub w8, w6, w4, w5 +; CHECK-SD-NEXT: mov v2.b[5], w8 +; CHECK-SD-NEXT: sdiv w23, w22, w21 +; CHECK-SD-NEXT: msub w8, w20, w7, w19 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[6], w8 +; CHECK-SD-NEXT: sdiv w26, w25, w24 +; CHECK-SD-NEXT: msub w8, w23, w21, w22 +; CHECK-SD-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[7], w8 +; CHECK-SD-NEXT: sdiv w9, w28, w27 +; CHECK-SD-NEXT: msub w8, w26, w24, w25 +; CHECK-SD-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[8], w8 +; CHECK-SD-NEXT: sdiv w12, w11, w10 +; CHECK-SD-NEXT: msub w8, w9, w27, w28 +; CHECK-SD-NEXT: mov v2.b[9], w8 +; CHECK-SD-NEXT: sdiv w15, w14, w13 +; CHECK-SD-NEXT: msub w8, w12, w10, w11 +; CHECK-SD-NEXT: smov w10, v1.b[14] +; CHECK-SD-NEXT: smov w11, v0.b[14] +; CHECK-SD-NEXT: mov v2.b[10], w8 +; CHECK-SD-NEXT: sdiv w18, w17, w16 +; CHECK-SD-NEXT: msub w8, w15, w13, w14 +; CHECK-SD-NEXT: smov w13, v1.b[15] +; CHECK-SD-NEXT: smov w14, v0.b[15] +; CHECK-SD-NEXT: mov v2.b[11], w8 +; CHECK-SD-NEXT: sdiv w9, w1, w0 +; CHECK-SD-NEXT: msub w8, w18, w16, w17 +; CHECK-SD-NEXT: mov v2.b[12], w8 +; CHECK-SD-NEXT: sdiv w12, w11, w10 +; CHECK-SD-NEXT: msub w8, w9, w0, w1 +; CHECK-SD-NEXT: mov v2.b[13], w8 +; CHECK-SD-NEXT: sdiv w9, w14, w13 +; CHECK-SD-NEXT: msub w8, w12, w10, w11 +; CHECK-SD-NEXT: mov v2.b[14], w8 +; CHECK-SD-NEXT: msub w8, w9, w13, w14 +; CHECK-SD-NEXT: mov v2.b[15], w8 +; CHECK-SD-NEXT: mov v0.16b, v2.16b +; CHECK-SD-NEXT: ldp x28, x27, [sp], #80 // 16-byte 
Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv16i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v4.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v5.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll2 v6.8h, v0.16b, #0 +; CHECK-GI-NEXT: sshll2 v7.8h, v1.16b, #0 +; CHECK-GI-NEXT: sshll v2.4s, v4.4h, #0 +; CHECK-GI-NEXT: sshll v3.4s, v5.4h, #0 +; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 +; CHECK-GI-NEXT: sshll2 v5.4s, v5.8h, #0 +; CHECK-GI-NEXT: sshll v0.4s, v6.4h, #0 +; CHECK-GI-NEXT: sshll v1.4s, v7.4h, #0 +; CHECK-GI-NEXT: sshll2 v6.4s, v6.8h, #0 +; CHECK-GI-NEXT: sshll2 v7.4s, v7.8h, #0 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov w12, v3.s[3] +; CHECK-GI-NEXT: fmov w13, s5 +; CHECK-GI-NEXT: mov w16, v5.s[3] +; CHECK-GI-NEXT: fmov w17, s1 +; CHECK-GI-NEXT: mov w18, v1.s[1] +; CHECK-GI-NEXT: mov w0, v1.s[2] +; CHECK-GI-NEXT: mov w1, v1.s[3] +; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: mov w8, v2.s[1] +; CHECK-GI-NEXT: mov w9, v3.s[1] +; CHECK-GI-NEXT: fmov w2, s7 +; CHECK-GI-NEXT: mov w3, v7.s[1] +; CHECK-GI-NEXT: mov w4, v7.s[2] +; CHECK-GI-NEXT: mov w5, v7.s[3] +; CHECK-GI-NEXT: sdiv w11, w8, w9 +; CHECK-GI-NEXT: mov w8, v2.s[2] +; CHECK-GI-NEXT: mov w9, v3.s[2] +; CHECK-GI-NEXT: fmov s16, w10 +; CHECK-GI-NEXT: sdiv w9, w8, w9 +; CHECK-GI-NEXT: mov w8, v2.s[3] +; CHECK-GI-NEXT: mov v16.s[1], w11 +; CHECK-GI-NEXT: sdiv w8, w8, w12 +; CHECK-GI-NEXT: fmov w12, s4 +; CHECK-GI-NEXT: mov v16.s[2], w9 +; CHECK-GI-NEXT: sdiv w14, w12, w13 +; CHECK-GI-NEXT: mov w12, v4.s[1] +; CHECK-GI-NEXT: mov w13, v5.s[1] +; CHECK-GI-NEXT: mov v16.s[3], w8 +; CHECK-GI-NEXT: mls v2.4s, v16.4s, v3.4s +; CHECK-GI-NEXT: sdiv w15, w12, w13 +; CHECK-GI-NEXT: mov w12, v4.s[2] +; CHECK-GI-NEXT: mov w13, v5.s[2] +; CHECK-GI-NEXT: fmov s17, w14 +; CHECK-GI-NEXT: sdiv w13, w12, w13 +; CHECK-GI-NEXT: mov w12, v4.s[3] +; CHECK-GI-NEXT: mov v17.s[1], w15 +; CHECK-GI-NEXT: sdiv w12, w12, w16 +; CHECK-GI-NEXT: fmov w16, s0 +; CHECK-GI-NEXT: mov 
v17.s[2], w13 +; CHECK-GI-NEXT: sdiv w16, w16, w17 +; CHECK-GI-NEXT: mov w17, v0.s[1] +; CHECK-GI-NEXT: mov v17.s[3], w12 +; CHECK-GI-NEXT: mls v4.4s, v17.4s, v5.4s +; CHECK-GI-NEXT: sdiv w17, w17, w18 +; CHECK-GI-NEXT: mov w18, v0.s[2] +; CHECK-GI-NEXT: fmov s18, w16 +; CHECK-GI-NEXT: sdiv w18, w18, w0 +; CHECK-GI-NEXT: mov w0, v0.s[3] +; CHECK-GI-NEXT: mov v18.s[1], w17 +; CHECK-GI-NEXT: sdiv w0, w0, w1 +; CHECK-GI-NEXT: fmov w1, s6 +; CHECK-GI-NEXT: mov v18.s[2], w18 +; CHECK-GI-NEXT: sdiv w1, w1, w2 +; CHECK-GI-NEXT: mov w2, v6.s[1] +; CHECK-GI-NEXT: mov v18.s[3], w0 +; CHECK-GI-NEXT: mls v0.4s, v18.4s, v1.4s +; CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v4.8h +; CHECK-GI-NEXT: sdiv w2, w2, w3 +; CHECK-GI-NEXT: mov w3, v6.s[2] +; CHECK-GI-NEXT: fmov s19, w1 +; CHECK-GI-NEXT: sdiv w3, w3, w4 +; CHECK-GI-NEXT: mov w4, v6.s[3] +; CHECK-GI-NEXT: mov v19.s[1], w2 +; CHECK-GI-NEXT: sdiv w10, w4, w5 +; CHECK-GI-NEXT: mov v19.s[2], w3 +; CHECK-GI-NEXT: mov v19.s[3], w10 +; CHECK-GI-NEXT: mls v6.4s, v19.4s, v7.4s +; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v6.8h +; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: ret +entry: + %s = srem <16 x i8> %d, %e + ret <16 x i8> %s +} + +define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) { +; CHECK-SD-LABEL: sv32i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #304 +; CHECK-SD-NEXT: stp x29, x30, [sp, #208] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x28, x27, [sp, #224] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x26, x25, [sp, #240] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x24, x23, [sp, #256] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #272] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #288] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 304 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w23, -40 +; CHECK-SD-NEXT: 
.cfi_offset w24, -48 +; CHECK-SD-NEXT: .cfi_offset w25, -56 +; CHECK-SD-NEXT: .cfi_offset w26, -64 +; CHECK-SD-NEXT: .cfi_offset w27, -72 +; CHECK-SD-NEXT: .cfi_offset w28, -80 +; CHECK-SD-NEXT: .cfi_offset w30, -88 +; CHECK-SD-NEXT: .cfi_offset w29, -96 +; CHECK-SD-NEXT: smov w8, v2.b[1] +; CHECK-SD-NEXT: smov w9, v0.b[1] +; CHECK-SD-NEXT: smov w19, v3.b[7] +; CHECK-SD-NEXT: smov w7, v1.b[7] +; CHECK-SD-NEXT: smov w6, v3.b[8] +; CHECK-SD-NEXT: smov w3, v1.b[8] +; CHECK-SD-NEXT: smov w13, v3.b[0] +; CHECK-SD-NEXT: smov w5, v3.b[1] +; CHECK-SD-NEXT: smov w0, v1.b[1] +; CHECK-SD-NEXT: smov w12, v3.b[2] +; CHECK-SD-NEXT: smov w17, v3.b[3] +; CHECK-SD-NEXT: smov w16, v1.b[3] +; CHECK-SD-NEXT: str w8, [sp, #80] // 4-byte Folded Spill +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.b[0] +; CHECK-SD-NEXT: str w9, [sp, #88] // 4-byte Folded Spill +; CHECK-SD-NEXT: smov w9, v0.b[0] +; CHECK-SD-NEXT: ldr w30, [sp, #80] // 4-byte Folded Reload +; CHECK-SD-NEXT: smov w15, v3.b[4] +; CHECK-SD-NEXT: smov w14, v1.b[4] +; CHECK-SD-NEXT: smov w4, v3.b[5] +; CHECK-SD-NEXT: smov w1, v1.b[5] +; CHECK-SD-NEXT: smov w2, v3.b[6] +; CHECK-SD-NEXT: smov w18, v1.b[6] +; CHECK-SD-NEXT: str w8, [sp, #32] // 4-byte Folded Spill +; CHECK-SD-NEXT: smov w21, v3.b[9] +; CHECK-SD-NEXT: smov w20, v1.b[9] +; CHECK-SD-NEXT: str w9, [sp, #40] // 4-byte Folded Spill +; CHECK-SD-NEXT: ldr w29, [sp, #32] // 4-byte Folded Reload +; CHECK-SD-NEXT: sdiv w11, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.b[2] +; CHECK-SD-NEXT: smov w9, v0.b[2] +; CHECK-SD-NEXT: str w10, [sp, #96] // 4-byte Folded Spill +; CHECK-SD-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.b[3] +; CHECK-SD-NEXT: smov w9, v0.b[3] +; CHECK-SD-NEXT: stp w11, w8, [sp, #48] // 8-byte Folded Spill +; CHECK-SD-NEXT: str w10, [sp, #24] // 4-byte Folded Spill +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.b[4] +; CHECK-SD-NEXT: str w8, [sp, #28] // 
4-byte Folded Spill +; CHECK-SD-NEXT: stp w9, w10, [sp, #56] // 8-byte Folded Spill +; CHECK-SD-NEXT: smov w9, v0.b[4] +; CHECK-SD-NEXT: sdiv w27, w0, w5 +; CHECK-SD-NEXT: str w9, [sp, #36] // 4-byte Folded Spill +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.b[5] +; CHECK-SD-NEXT: smov w9, v0.b[5] +; CHECK-SD-NEXT: str w8, [sp, #76] // 4-byte Folded Spill +; CHECK-SD-NEXT: str w9, [sp, #84] // 4-byte Folded Spill +; CHECK-SD-NEXT: str w10, [sp, #44] // 4-byte Folded Spill +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.b[6] +; CHECK-SD-NEXT: smov w9, v0.b[6] +; CHECK-SD-NEXT: stp w8, w9, [sp, #64] // 8-byte Folded Spill +; CHECK-SD-NEXT: str w10, [sp, #92] // 4-byte Folded Spill +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.b[7] +; CHECK-SD-NEXT: smov w9, v0.b[7] +; CHECK-SD-NEXT: stp w8, w9, [sp, #112] // 8-byte Folded Spill +; CHECK-SD-NEXT: sdiv w11, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.b[8] +; CHECK-SD-NEXT: smov w9, v0.b[8] +; CHECK-SD-NEXT: str w10, [sp, #72] // 4-byte Folded Spill +; CHECK-SD-NEXT: stp w8, w9, [sp, #100] // 8-byte Folded Spill +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.b[9] +; CHECK-SD-NEXT: smov w9, v0.b[9] +; CHECK-SD-NEXT: stp w8, w9, [sp, #136] // 8-byte Folded Spill +; CHECK-SD-NEXT: str w10, [sp, #108] // 4-byte Folded Spill +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.b[10] +; CHECK-SD-NEXT: smov w9, v0.b[10] +; CHECK-SD-NEXT: stp w11, w8, [sp, #120] // 8-byte Folded Spill +; CHECK-SD-NEXT: str w10, [sp, #144] // 4-byte Folded Spill +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.b[11] +; CHECK-SD-NEXT: stp w9, w10, [sp, #128] // 8-byte Folded Spill +; CHECK-SD-NEXT: smov w9, v0.b[11] +; CHECK-SD-NEXT: sdiv w25, w16, w17 +; CHECK-SD-NEXT: stp w8, w9, [sp, #172] // 8-byte Folded Spill +; CHECK-SD-NEXT: sdiv w11, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.b[12] +; CHECK-SD-NEXT: smov w9, v0.b[12] +; CHECK-SD-NEXT: str w8, [sp, 
#152] // 4-byte Folded Spill +; CHECK-SD-NEXT: str w9, [sp, #160] // 4-byte Folded Spill +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.b[13] +; CHECK-SD-NEXT: smov w9, v0.b[13] +; CHECK-SD-NEXT: stp w8, w9, [sp, #196] // 8-byte Folded Spill +; CHECK-SD-NEXT: str w10, [sp, #168] // 4-byte Folded Spill +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.b[14] +; CHECK-SD-NEXT: smov w9, v0.b[14] +; CHECK-SD-NEXT: stp w11, w8, [sp, #180] // 8-byte Folded Spill +; CHECK-SD-NEXT: smov w11, v1.b[2] +; CHECK-SD-NEXT: str w10, [sp, #204] // 4-byte Folded Spill +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.b[15] +; CHECK-SD-NEXT: str w8, [sp, #148] // 4-byte Folded Spill +; CHECK-SD-NEXT: stp w9, w10, [sp, #188] // 8-byte Folded Spill +; CHECK-SD-NEXT: smov w9, v0.b[15] +; CHECK-SD-NEXT: sdiv w22, w11, w12 +; CHECK-SD-NEXT: str w9, [sp, #156] // 4-byte Folded Spill +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: str w10, [sp, #164] // 4-byte Folded Spill +; CHECK-SD-NEXT: smov w10, v1.b[0] +; CHECK-SD-NEXT: sdiv w9, w7, w19 +; CHECK-SD-NEXT: sdiv w8, w3, w6 +; CHECK-SD-NEXT: sdiv w23, w10, w13 +; CHECK-SD-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; CHECK-SD-NEXT: ldr w8, [sp, #96] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w9, [sp, #88] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w9, w8, w30, w9 +; CHECK-SD-NEXT: ldr w8, [sp, #48] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w30, [sp, #40] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w8, w8, w29, w30 +; CHECK-SD-NEXT: ldp x29, x30, [sp, #208] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov s0, w8 +; CHECK-SD-NEXT: msub w10, w23, w13, w10 +; CHECK-SD-NEXT: sdiv w24, w14, w15 +; CHECK-SD-NEXT: msub w13, w27, w5, w0 +; CHECK-SD-NEXT: ldr w5, [sp, #16] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[1], w9 +; CHECK-SD-NEXT: msub w9, w22, w12, w11 +; CHECK-SD-NEXT: smov w11, v1.b[10] +; CHECK-SD-NEXT: fmov s2, w10 +; CHECK-SD-NEXT: ldp w10, w8, [sp, #20] 
// 8-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[1], w13 +; CHECK-SD-NEXT: msub w8, w8, w5, w10 +; CHECK-SD-NEXT: ldr w5, [sp, #52] // 4-byte Folded Reload +; CHECK-SD-NEXT: smov w10, v3.b[10] +; CHECK-SD-NEXT: sdiv w28, w1, w4 +; CHECK-SD-NEXT: ldp w13, w12, [sp, #56] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[2], w9 +; CHECK-SD-NEXT: mov v0.b[2], w8 +; CHECK-SD-NEXT: msub w8, w25, w17, w16 +; CHECK-SD-NEXT: ldr w17, [sp, #28] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w16, [sp, #36] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w12, w12, w5, w13 +; CHECK-SD-NEXT: ldr w13, [sp, #44] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w5, [sp, #136] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[3], w8 +; CHECK-SD-NEXT: msub w8, w24, w15, w14 +; CHECK-SD-NEXT: ldr w15, [sp, #92] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[3], w12 +; CHECK-SD-NEXT: msub w13, w13, w17, w16 +; CHECK-SD-NEXT: ldr w17, [sp, #76] // 4-byte Folded Reload +; CHECK-SD-NEXT: sdiv w26, w18, w2 +; CHECK-SD-NEXT: ldr w16, [sp, #84] // 4-byte Folded Reload +; CHECK-SD-NEXT: smov w12, v3.b[11] +; CHECK-SD-NEXT: msub w15, w15, w17, w16 +; CHECK-SD-NEXT: smov w14, v1.b[11] +; CHECK-SD-NEXT: mov v2.b[4], w8 +; CHECK-SD-NEXT: msub w8, w28, w4, w1 +; CHECK-SD-NEXT: ldr w1, [sp, #64] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[4], w13 +; CHECK-SD-NEXT: ldr w4, [sp, #100] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldp w17, w16, [sp, #68] // 8-byte Folded Reload +; CHECK-SD-NEXT: ldp x24, x23, [sp, #256] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[5], w8 +; CHECK-SD-NEXT: ldp x28, x27, [sp, #224] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[5], w15 +; CHECK-SD-NEXT: msub w16, w16, w1, w17 +; CHECK-SD-NEXT: smov w15, v3.b[12] +; CHECK-SD-NEXT: msub w8, w26, w2, w18 +; CHECK-SD-NEXT: ldr w2, [sp, #112] // 4-byte Folded Reload +; CHECK-SD-NEXT: sdiv w0, w20, w21 +; CHECK-SD-NEXT: ldp w1, w18, [sp, #116] // 8-byte Folded Reload +; CHECK-SD-NEXT: smov w17, v1.b[12] +; 
CHECK-SD-NEXT: ldp x26, x25, [sp, #240] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[6], w8 +; CHECK-SD-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[6], w16 +; CHECK-SD-NEXT: msub w18, w18, w2, w1 +; CHECK-SD-NEXT: msub w8, w8, w19, w7 +; CHECK-SD-NEXT: ldp w2, w1, [sp, #104] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[7], w18 +; CHECK-SD-NEXT: smov w18, v3.b[13] +; CHECK-SD-NEXT: mov v2.b[7], w8 +; CHECK-SD-NEXT: ldr w8, [sp, #8] // 4-byte Folded Reload +; CHECK-SD-NEXT: sdiv w9, w11, w10 +; CHECK-SD-NEXT: msub w1, w1, w4, w2 +; CHECK-SD-NEXT: smov w2, v1.b[13] +; CHECK-SD-NEXT: msub w8, w8, w6, w3 +; CHECK-SD-NEXT: ldp w4, w3, [sp, #140] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[8], w1 +; CHECK-SD-NEXT: mov v2.b[8], w8 +; CHECK-SD-NEXT: msub w8, w0, w21, w20 +; CHECK-SD-NEXT: msub w3, w3, w5, w4 +; CHECK-SD-NEXT: ldr w5, [sp, #124] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldp w4, w1, [sp, #128] // 8-byte Folded Reload +; CHECK-SD-NEXT: sdiv w13, w14, w12 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #288] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[9], w8 +; CHECK-SD-NEXT: mov v0.b[9], w3 +; CHECK-SD-NEXT: msub w8, w9, w10, w11 +; CHECK-SD-NEXT: msub w1, w1, w5, w4 +; CHECK-SD-NEXT: ldr w4, [sp, #172] // 4-byte Folded Reload +; CHECK-SD-NEXT: smov w9, v3.b[14] +; CHECK-SD-NEXT: ldp w3, w11, [sp, #176] // 8-byte Folded Reload +; CHECK-SD-NEXT: smov w10, v1.b[14] +; CHECK-SD-NEXT: ldp x22, x21, [sp, #272] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[10], w8 +; CHECK-SD-NEXT: mov v0.b[10], w1 +; CHECK-SD-NEXT: ldr w1, [sp, #152] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w11, w11, w4, w3 +; CHECK-SD-NEXT: sdiv w16, w17, w15 +; CHECK-SD-NEXT: msub w8, w13, w12, w14 +; CHECK-SD-NEXT: ldr w13, [sp, #168] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w14, [sp, #160] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[11], w11 +; CHECK-SD-NEXT: smov w11, v3.b[15] +; CHECK-SD-NEXT: msub w13, w13, w1, w14 +; 
CHECK-SD-NEXT: smov w14, v1.b[15] +; CHECK-SD-NEXT: mov v2.b[11], w8 +; CHECK-SD-NEXT: mov v0.b[12], w13 +; CHECK-SD-NEXT: sdiv w0, w2, w18 +; CHECK-SD-NEXT: msub w8, w16, w15, w17 +; CHECK-SD-NEXT: ldr w17, [sp, #196] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldp w16, w15, [sp, #200] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[12], w8 +; CHECK-SD-NEXT: msub w15, w15, w17, w16 +; CHECK-SD-NEXT: ldp w17, w16, [sp, #188] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[13], w15 +; CHECK-SD-NEXT: sdiv w12, w10, w9 +; CHECK-SD-NEXT: msub w8, w0, w18, w2 +; CHECK-SD-NEXT: ldr w18, [sp, #184] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w16, w16, w18, w17 +; CHECK-SD-NEXT: mov v2.b[13], w8 +; CHECK-SD-NEXT: mov v0.b[14], w16 +; CHECK-SD-NEXT: sdiv w13, w14, w11 +; CHECK-SD-NEXT: msub w8, w12, w9, w10 +; CHECK-SD-NEXT: ldr w9, [sp, #164] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w12, [sp, #148] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w10, [sp, #156] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[14], w8 +; CHECK-SD-NEXT: msub w9, w9, w12, w10 +; CHECK-SD-NEXT: mov v0.b[15], w9 +; CHECK-SD-NEXT: msub w8, w13, w11, w14 +; CHECK-SD-NEXT: mov v2.b[15], w8 +; CHECK-SD-NEXT: mov v1.16b, v2.16b +; CHECK-SD-NEXT: add sp, sp, #304 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv32i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #112 +; CHECK-GI-NEXT: stp x29, x30, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x26, x25, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 112 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: 
.cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w24, -48 +; CHECK-GI-NEXT: .cfi_offset w25, -56 +; CHECK-GI-NEXT: .cfi_offset w26, -64 +; CHECK-GI-NEXT: .cfi_offset w27, -72 +; CHECK-GI-NEXT: .cfi_offset w28, -80 +; CHECK-GI-NEXT: .cfi_offset w30, -88 +; CHECK-GI-NEXT: .cfi_offset w29, -96 +; CHECK-GI-NEXT: sshll v4.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v5.8h, v2.8b, #0 +; CHECK-GI-NEXT: sshll v16.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll v17.8h, v3.8b, #0 +; CHECK-GI-NEXT: sshll v6.4s, v4.4h, #0 +; CHECK-GI-NEXT: sshll v7.4s, v5.4h, #0 +; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 +; CHECK-GI-NEXT: sshll2 v5.4s, v5.8h, #0 +; CHECK-GI-NEXT: sshll v18.4s, v16.4h, #0 +; CHECK-GI-NEXT: sshll v19.4s, v17.4h, #0 +; CHECK-GI-NEXT: sshll2 v16.4s, v16.8h, #0 +; CHECK-GI-NEXT: sshll2 v17.4s, v17.8h, #0 +; CHECK-GI-NEXT: fmov w8, s6 +; CHECK-GI-NEXT: fmov w9, s7 +; CHECK-GI-NEXT: mov w12, v7.s[3] +; CHECK-GI-NEXT: fmov w13, s5 +; CHECK-GI-NEXT: mov w14, v5.s[1] +; CHECK-GI-NEXT: mov w16, v5.s[3] +; CHECK-GI-NEXT: fmov w6, s19 +; CHECK-GI-NEXT: mov w7, v19.s[3] +; CHECK-GI-NEXT: fmov w21, s17 +; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: mov w8, v6.s[1] +; CHECK-GI-NEXT: mov w9, v7.s[1] +; CHECK-GI-NEXT: mov w22, v17.s[3] +; CHECK-GI-NEXT: sdiv w11, w8, w9 +; CHECK-GI-NEXT: mov w8, v6.s[2] +; CHECK-GI-NEXT: mov w9, v7.s[2] +; CHECK-GI-NEXT: fmov s20, w10 +; CHECK-GI-NEXT: sdiv w9, w8, w9 +; CHECK-GI-NEXT: mov w8, v6.s[3] +; CHECK-GI-NEXT: sshll2 v6.8h, v0.16b, #0 +; CHECK-GI-NEXT: mov v20.s[1], w11 +; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v28.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 +; CHECK-GI-NEXT: sdiv w8, w8, w12 +; CHECK-GI-NEXT: fmov w12, s4 +; CHECK-GI-NEXT: mov v20.s[2], w9 +; CHECK-GI-NEXT: sdiv w13, w12, w13 +; CHECK-GI-NEXT: mov w12, v4.s[1] +; CHECK-GI-NEXT: str w8, [sp, #12] // 4-byte Folded Spill +; CHECK-GI-NEXT: ldr w11, [sp, #12] // 4-byte Folded Reload +; CHECK-GI-NEXT: mov v20.s[3], w11 +; CHECK-GI-NEXT: sdiv w15, 
w12, w14 +; CHECK-GI-NEXT: mov w12, v4.s[2] +; CHECK-GI-NEXT: mov w14, v5.s[2] +; CHECK-GI-NEXT: sshll v5.4s, v6.4h, #0 +; CHECK-GI-NEXT: fmov s21, w13 +; CHECK-GI-NEXT: sdiv w14, w12, w14 +; CHECK-GI-NEXT: mov w12, v4.s[3] +; CHECK-GI-NEXT: sshll2 v4.8h, v2.16b, #0 +; CHECK-GI-NEXT: mov v21.s[1], w15 +; CHECK-GI-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-GI-NEXT: sshll v7.4s, v4.4h, #0 +; CHECK-GI-NEXT: sshll v30.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-GI-NEXT: fmov w17, s7 +; CHECK-GI-NEXT: mls v28.4s, v20.4s, v30.4s +; CHECK-GI-NEXT: sdiv w12, w12, w16 +; CHECK-GI-NEXT: fmov w16, s5 +; CHECK-GI-NEXT: mov v21.s[2], w14 +; CHECK-GI-NEXT: sdiv w18, w16, w17 +; CHECK-GI-NEXT: mov w16, v5.s[1] +; CHECK-GI-NEXT: mov w17, v7.s[1] +; CHECK-GI-NEXT: mov v21.s[3], w12 +; CHECK-GI-NEXT: mls v0.4s, v21.4s, v2.4s +; CHECK-GI-NEXT: sdiv w1, w16, w17 +; CHECK-GI-NEXT: mov w16, v5.s[2] +; CHECK-GI-NEXT: mov w17, v7.s[2] +; CHECK-GI-NEXT: fmov s22, w18 +; CHECK-GI-NEXT: uzp1 v0.8h, v28.8h, v0.8h +; CHECK-GI-NEXT: sdiv w0, w16, w17 +; CHECK-GI-NEXT: mov w16, v5.s[3] +; CHECK-GI-NEXT: mov w17, v7.s[3] +; CHECK-GI-NEXT: sshll2 v5.4s, v6.8h, #0 +; CHECK-GI-NEXT: sshll2 v7.4s, v4.8h, #0 +; CHECK-GI-NEXT: mov v22.s[1], w1 +; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0 +; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 +; CHECK-GI-NEXT: fmov w2, s7 +; CHECK-GI-NEXT: mov w3, v7.s[3] +; CHECK-GI-NEXT: sdiv w16, w16, w17 +; CHECK-GI-NEXT: fmov w17, s5 +; CHECK-GI-NEXT: mov v22.s[2], w0 +; CHECK-GI-NEXT: sdiv w5, w17, w2 +; CHECK-GI-NEXT: mov w17, v5.s[1] +; CHECK-GI-NEXT: mov w2, v7.s[1] +; CHECK-GI-NEXT: mov v22.s[3], w16 +; CHECK-GI-NEXT: mls v6.4s, v22.4s, v4.4s +; CHECK-GI-NEXT: sdiv w4, w17, w2 +; CHECK-GI-NEXT: mov w17, v5.s[2] +; CHECK-GI-NEXT: mov w2, v7.s[2] +; CHECK-GI-NEXT: fmov s23, w5 +; CHECK-GI-NEXT: sdiv w2, w17, w2 +; CHECK-GI-NEXT: mov w17, v5.s[3] +; CHECK-GI-NEXT: mov v23.s[1], w4 +; CHECK-GI-NEXT: sdiv w17, w17, w3 +; CHECK-GI-NEXT: fmov w3, s18 +; CHECK-GI-NEXT: 
mov v23.s[2], w2 +; CHECK-GI-NEXT: sdiv w20, w3, w6 +; CHECK-GI-NEXT: mov w3, v18.s[1] +; CHECK-GI-NEXT: mov w6, v19.s[1] +; CHECK-GI-NEXT: mov v23.s[3], w17 +; CHECK-GI-NEXT: mls v5.4s, v23.4s, v7.4s +; CHECK-GI-NEXT: sdiv w19, w3, w6 +; CHECK-GI-NEXT: mov w3, v18.s[2] +; CHECK-GI-NEXT: mov w6, v19.s[2] +; CHECK-GI-NEXT: fmov s24, w20 +; CHECK-GI-NEXT: uzp1 v2.8h, v6.8h, v5.8h +; CHECK-GI-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: sdiv w6, w3, w6 +; CHECK-GI-NEXT: mov w3, v18.s[3] +; CHECK-GI-NEXT: mov v24.s[1], w19 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: sdiv w3, w3, w7 +; CHECK-GI-NEXT: fmov w7, s16 +; CHECK-GI-NEXT: mov v24.s[2], w6 +; CHECK-GI-NEXT: sdiv w23, w7, w21 +; CHECK-GI-NEXT: mov w7, v16.s[1] +; CHECK-GI-NEXT: mov w21, v17.s[1] +; CHECK-GI-NEXT: mov v24.s[3], w3 +; CHECK-GI-NEXT: sdiv w24, w7, w21 +; CHECK-GI-NEXT: mov w7, v16.s[2] +; CHECK-GI-NEXT: mov w21, v17.s[2] +; CHECK-GI-NEXT: sshll2 v17.8h, v1.16b, #0 +; CHECK-GI-NEXT: fmov s25, w23 +; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll v18.4s, v17.4h, #0 +; CHECK-GI-NEXT: sshll v29.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 +; CHECK-GI-NEXT: sdiv w21, w7, w21 +; CHECK-GI-NEXT: mov w7, v16.s[3] +; CHECK-GI-NEXT: sshll2 v16.8h, v3.16b, #0 +; CHECK-GI-NEXT: mov v25.s[1], w24 +; CHECK-GI-NEXT: sshll v3.8h, v3.8b, #0 +; CHECK-GI-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: sshll v19.4s, v16.4h, #0 +; CHECK-GI-NEXT: sshll v31.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-GI-NEXT: fmov w25, s19 +; CHECK-GI-NEXT: mov w26, v19.s[1] +; CHECK-GI-NEXT: mov w27, v19.s[2] +; CHECK-GI-NEXT: mov w28, v19.s[3] +; CHECK-GI-NEXT: sshll2 v19.4s, v16.8h, #0 +; CHECK-GI-NEXT: sshll v16.4s, v16.4h, #0 +; CHECK-GI-NEXT: sdiv w7, w7, w22 +; CHECK-GI-NEXT: fmov w22, s18 +; CHECK-GI-NEXT: mov v25.s[2], w21 +; CHECK-GI-NEXT: mls v29.4s, v24.4s, v31.4s +; CHECK-GI-NEXT: fmov w29, s19 +; 
CHECK-GI-NEXT: mov w30, v19.s[1] +; CHECK-GI-NEXT: mov w8, v19.s[2] +; CHECK-GI-NEXT: mov w10, v19.s[3] +; CHECK-GI-NEXT: sdiv w25, w22, w25 +; CHECK-GI-NEXT: mov w22, v18.s[1] +; CHECK-GI-NEXT: mov v25.s[3], w7 +; CHECK-GI-NEXT: mls v1.4s, v25.4s, v3.4s +; CHECK-GI-NEXT: sdiv w26, w22, w26 +; CHECK-GI-NEXT: mov w22, v18.s[2] +; CHECK-GI-NEXT: fmov s26, w25 +; CHECK-GI-NEXT: uzp1 v1.8h, v29.8h, v1.8h +; CHECK-GI-NEXT: sdiv w27, w22, w27 +; CHECK-GI-NEXT: mov w22, v18.s[3] +; CHECK-GI-NEXT: sshll2 v18.4s, v17.8h, #0 +; CHECK-GI-NEXT: mov v26.s[1], w26 +; CHECK-GI-NEXT: sshll v17.4s, v17.4h, #0 +; CHECK-GI-NEXT: ldp x26, x25, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov w9, v18.s[3] +; CHECK-GI-NEXT: sdiv w22, w22, w28 +; CHECK-GI-NEXT: fmov w28, s18 +; CHECK-GI-NEXT: mov v26.s[2], w27 +; CHECK-GI-NEXT: sdiv w28, w28, w29 +; CHECK-GI-NEXT: mov w29, v18.s[1] +; CHECK-GI-NEXT: mov v26.s[3], w22 +; CHECK-GI-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: mls v17.4s, v26.4s, v16.4s +; CHECK-GI-NEXT: sdiv w29, w29, w30 +; CHECK-GI-NEXT: mov w30, v18.s[2] +; CHECK-GI-NEXT: fmov s27, w28 +; CHECK-GI-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: sdiv w8, w30, w8 +; CHECK-GI-NEXT: mov v27.s[1], w29 +; CHECK-GI-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: sdiv w9, w9, w10 +; CHECK-GI-NEXT: mov v27.s[2], w8 +; CHECK-GI-NEXT: mov v27.s[3], w9 +; CHECK-GI-NEXT: mls v18.4s, v27.4s, v19.4s +; CHECK-GI-NEXT: uzp1 v3.8h, v17.8h, v18.8h +; CHECK-GI-NEXT: uzp1 v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: add sp, sp, #112 +; CHECK-GI-NEXT: ret +entry: + %s = srem <32 x i8> %d, %e + ret <32 x i8> %s +} + +define <2 x i8> @uv2i8(<2 x i8> %d, <2 x i8> %e) { +; CHECK-SD-LABEL: uv2i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi d2, #0x0000ff000000ff +; CHECK-SD-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-SD-NEXT: and v1.8b, v1.8b, v2.8b +; CHECK-SD-NEXT: fmov w8, s1 +; CHECK-SD-NEXT: fmov w9, s0 +; 
CHECK-SD-NEXT: mov w11, v1.s[1] +; CHECK-SD-NEXT: mov w12, v0.s[1] +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: udiv w13, w12, w11 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: fmov s0, w8 +; CHECK-SD-NEXT: msub w9, w13, w11, w12 +; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv2i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi d2, #0x0000ff000000ff +; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: and v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: udiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: udiv w9, w9, w10 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = urem <2 x i8> %d, %e + ret <2 x i8> %s +} + +define <3 x i8> @uv3i8(<3 x i8> %d, <3 x i8> %e) { +; CHECK-LABEL: uv3i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w8, w3, #0xff +; CHECK-NEXT: and w9, w0, #0xff +; CHECK-NEXT: and w11, w4, #0xff +; CHECK-NEXT: and w12, w1, #0xff +; CHECK-NEXT: and w14, w5, #0xff +; CHECK-NEXT: and w15, w2, #0xff +; CHECK-NEXT: udiv w10, w9, w8 +; CHECK-NEXT: udiv w13, w12, w11 +; CHECK-NEXT: msub w0, w10, w8, w9 +; CHECK-NEXT: udiv w16, w15, w14 +; CHECK-NEXT: msub w1, w13, w11, w12 +; CHECK-NEXT: msub w2, w16, w14, w15 +; CHECK-NEXT: ret +entry: + %s = urem <3 x i8> %d, %e + ret <3 x i8> %s +} + +define <4 x i8> @uv4i8(<4 x i8> %d, <4 x i8> %e) { +; CHECK-SD-LABEL: uv4i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8 +; CHECK-SD-NEXT: umov w11, v1.h[0] +; CHECK-SD-NEXT: umov w12, v0.h[0] +; CHECK-SD-NEXT: umov w8, 
v1.h[1] +; CHECK-SD-NEXT: umov w9, v0.h[1] +; CHECK-SD-NEXT: umov w14, v1.h[2] +; CHECK-SD-NEXT: umov w15, v0.h[2] +; CHECK-SD-NEXT: umov w17, v1.h[3] +; CHECK-SD-NEXT: umov w18, v0.h[3] +; CHECK-SD-NEXT: udiv w13, w12, w11 +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w13, w11, w12 +; CHECK-SD-NEXT: fmov s0, w11 +; CHECK-SD-NEXT: udiv w16, w15, w14 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: mov v0.h[1], w8 +; CHECK-SD-NEXT: udiv w9, w18, w17 +; CHECK-SD-NEXT: msub w8, w16, w14, w15 +; CHECK-SD-NEXT: mov v0.h[2], w8 +; CHECK-SD-NEXT: msub w8, w9, w17, w18 +; CHECK-SD-NEXT: mov v0.h[3], w8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv4i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0x0000ff000000ff +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: mov w11, v1.s[2] +; CHECK-GI-NEXT: mov w12, v1.s[3] +; CHECK-GI-NEXT: udiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: udiv w9, w9, w10 +; CHECK-GI-NEXT: mov w10, v0.s[2] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: udiv w10, w10, w11 +; CHECK-GI-NEXT: mov w11, v0.s[3] +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: udiv w8, w11, w12 +; CHECK-GI-NEXT: mov v2.s[2], w10 +; CHECK-GI-NEXT: mov v2.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret +entry: + %s = urem <4 x i8> %d, %e + ret <4 x i8> %s +} + +define <8 x i8> @uv8i8(<8 x i8> %d, <8 x i8> %e) { +; CHECK-SD-LABEL: uv8i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: umov w11, v1.b[0] +; CHECK-SD-NEXT: umov w12, v0.b[0] +; CHECK-SD-NEXT: 
umov w8, v1.b[1] +; CHECK-SD-NEXT: umov w9, v0.b[1] +; CHECK-SD-NEXT: umov w14, v1.b[2] +; CHECK-SD-NEXT: umov w15, v0.b[2] +; CHECK-SD-NEXT: umov w17, v1.b[3] +; CHECK-SD-NEXT: umov w18, v0.b[3] +; CHECK-SD-NEXT: umov w1, v1.b[4] +; CHECK-SD-NEXT: umov w2, v0.b[4] +; CHECK-SD-NEXT: umov w4, v1.b[5] +; CHECK-SD-NEXT: umov w5, v0.b[5] +; CHECK-SD-NEXT: udiv w13, w12, w11 +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w13, w11, w12 +; CHECK-SD-NEXT: umov w13, v1.b[7] +; CHECK-SD-NEXT: fmov s2, w11 +; CHECK-SD-NEXT: umov w11, v0.b[6] +; CHECK-SD-NEXT: udiv w16, w15, w14 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: umov w10, v1.b[6] +; CHECK-SD-NEXT: mov v2.b[1], w8 +; CHECK-SD-NEXT: udiv w0, w18, w17 +; CHECK-SD-NEXT: msub w8, w16, w14, w15 +; CHECK-SD-NEXT: umov w14, v0.b[7] +; CHECK-SD-NEXT: mov v2.b[2], w8 +; CHECK-SD-NEXT: udiv w3, w2, w1 +; CHECK-SD-NEXT: msub w8, w0, w17, w18 +; CHECK-SD-NEXT: mov v2.b[3], w8 +; CHECK-SD-NEXT: udiv w9, w5, w4 +; CHECK-SD-NEXT: msub w8, w3, w1, w2 +; CHECK-SD-NEXT: mov v2.b[4], w8 +; CHECK-SD-NEXT: udiv w12, w11, w10 +; CHECK-SD-NEXT: msub w8, w9, w4, w5 +; CHECK-SD-NEXT: mov v2.b[5], w8 +; CHECK-SD-NEXT: udiv w9, w14, w13 +; CHECK-SD-NEXT: msub w8, w12, w10, w11 +; CHECK-SD-NEXT: mov v2.b[6], w8 +; CHECK-SD-NEXT: msub w8, w9, w13, w14 +; CHECK-SD-NEXT: mov v2.b[7], w8 +; CHECK-SD-NEXT: fmov d0, d2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv8i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov w10, v3.s[1] +; CHECK-GI-NEXT: mov w11, v3.s[2] +; CHECK-GI-NEXT: mov w12, v3.s[3] +; CHECK-GI-NEXT: fmov w13, s1 +; CHECK-GI-NEXT: mov w14, v1.s[1] +; CHECK-GI-NEXT: mov w15, v1.s[2] +; CHECK-GI-NEXT: 
mov w16, v1.s[3] +; CHECK-GI-NEXT: udiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v2.s[1] +; CHECK-GI-NEXT: udiv w9, w9, w10 +; CHECK-GI-NEXT: mov w10, v2.s[2] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: udiv w10, w10, w11 +; CHECK-GI-NEXT: mov w11, v2.s[3] +; CHECK-GI-NEXT: mov v4.s[1], w9 +; CHECK-GI-NEXT: udiv w11, w11, w12 +; CHECK-GI-NEXT: fmov w12, s0 +; CHECK-GI-NEXT: mov v4.s[2], w10 +; CHECK-GI-NEXT: udiv w12, w12, w13 +; CHECK-GI-NEXT: mov w13, v0.s[1] +; CHECK-GI-NEXT: mov v4.s[3], w11 +; CHECK-GI-NEXT: mls v2.4s, v4.4s, v3.4s +; CHECK-GI-NEXT: udiv w13, w13, w14 +; CHECK-GI-NEXT: mov w14, v0.s[2] +; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: udiv w14, w14, w15 +; CHECK-GI-NEXT: mov w15, v0.s[3] +; CHECK-GI-NEXT: mov v5.s[1], w13 +; CHECK-GI-NEXT: udiv w8, w15, w16 +; CHECK-GI-NEXT: mov v5.s[2], w14 +; CHECK-GI-NEXT: mov v5.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v5.4s, v1.4s +; CHECK-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: ret +entry: + %s = urem <8 x i8> %d, %e + ret <8 x i8> %s +} + +define <16 x i8> @uv16i8(<16 x i8> %d, <16 x i8> %e) { +; CHECK-SD-LABEL: uv16i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: stp x28, x27, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-SD-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 80 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w23, -40 +; CHECK-SD-NEXT: .cfi_offset w24, -48 +; CHECK-SD-NEXT: .cfi_offset w25, -56 +; CHECK-SD-NEXT: .cfi_offset w26, -64 +; CHECK-SD-NEXT: .cfi_offset w27, -72 +; CHECK-SD-NEXT: .cfi_offset w28, -80 +; CHECK-SD-NEXT: umov w11, v1.b[0] +; CHECK-SD-NEXT: umov w12, v0.b[0] +; CHECK-SD-NEXT: umov w8, v1.b[1] +; CHECK-SD-NEXT: umov w9, v0.b[1] +; CHECK-SD-NEXT: umov w14, v1.b[2] +; CHECK-SD-NEXT: umov w15, v0.b[2] +; CHECK-SD-NEXT: umov w17, v1.b[3] +; CHECK-SD-NEXT: umov w18, v0.b[3] +; CHECK-SD-NEXT: umov w1, v1.b[4] +; CHECK-SD-NEXT: umov w2, v0.b[4] +; CHECK-SD-NEXT: umov w4, v1.b[5] +; CHECK-SD-NEXT: umov w5, v0.b[5] +; CHECK-SD-NEXT: udiv w13, w12, w11 +; CHECK-SD-NEXT: umov w7, v1.b[6] +; CHECK-SD-NEXT: umov w19, v0.b[6] +; CHECK-SD-NEXT: umov w21, v1.b[7] +; CHECK-SD-NEXT: umov w22, v0.b[7] +; CHECK-SD-NEXT: umov w24, v1.b[8] +; CHECK-SD-NEXT: umov w25, v0.b[8] +; CHECK-SD-NEXT: umov w27, v1.b[9] +; CHECK-SD-NEXT: umov w28, v0.b[9] +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w13, w11, w12 +; CHECK-SD-NEXT: umov w13, v1.b[11] +; CHECK-SD-NEXT: fmov s2, w11 +; CHECK-SD-NEXT: umov w11, v0.b[10] +; CHECK-SD-NEXT: udiv w16, w15, w14 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: umov w10, v1.b[10] +; CHECK-SD-NEXT: mov v2.b[1], w8 +; CHECK-SD-NEXT: udiv w0, w18, w17 +; CHECK-SD-NEXT: msub w8, w16, w14, w15 +; CHECK-SD-NEXT: umov w14, v0.b[11] +; CHECK-SD-NEXT: umov w16, v1.b[12] +; CHECK-SD-NEXT: mov v2.b[2], w8 +; CHECK-SD-NEXT: 
udiv w3, w2, w1 +; CHECK-SD-NEXT: msub w8, w0, w17, w18 +; CHECK-SD-NEXT: umov w17, v0.b[12] +; CHECK-SD-NEXT: umov w0, v1.b[13] +; CHECK-SD-NEXT: mov v2.b[3], w8 +; CHECK-SD-NEXT: udiv w6, w5, w4 +; CHECK-SD-NEXT: msub w8, w3, w1, w2 +; CHECK-SD-NEXT: umov w1, v0.b[13] +; CHECK-SD-NEXT: mov v2.b[4], w8 +; CHECK-SD-NEXT: udiv w20, w19, w7 +; CHECK-SD-NEXT: msub w8, w6, w4, w5 +; CHECK-SD-NEXT: mov v2.b[5], w8 +; CHECK-SD-NEXT: udiv w23, w22, w21 +; CHECK-SD-NEXT: msub w8, w20, w7, w19 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[6], w8 +; CHECK-SD-NEXT: udiv w26, w25, w24 +; CHECK-SD-NEXT: msub w8, w23, w21, w22 +; CHECK-SD-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[7], w8 +; CHECK-SD-NEXT: udiv w9, w28, w27 +; CHECK-SD-NEXT: msub w8, w26, w24, w25 +; CHECK-SD-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[8], w8 +; CHECK-SD-NEXT: udiv w12, w11, w10 +; CHECK-SD-NEXT: msub w8, w9, w27, w28 +; CHECK-SD-NEXT: mov v2.b[9], w8 +; CHECK-SD-NEXT: udiv w15, w14, w13 +; CHECK-SD-NEXT: msub w8, w12, w10, w11 +; CHECK-SD-NEXT: umov w10, v1.b[14] +; CHECK-SD-NEXT: umov w11, v0.b[14] +; CHECK-SD-NEXT: mov v2.b[10], w8 +; CHECK-SD-NEXT: udiv w18, w17, w16 +; CHECK-SD-NEXT: msub w8, w15, w13, w14 +; CHECK-SD-NEXT: umov w13, v1.b[15] +; CHECK-SD-NEXT: umov w14, v0.b[15] +; CHECK-SD-NEXT: mov v2.b[11], w8 +; CHECK-SD-NEXT: udiv w9, w1, w0 +; CHECK-SD-NEXT: msub w8, w18, w16, w17 +; CHECK-SD-NEXT: mov v2.b[12], w8 +; CHECK-SD-NEXT: udiv w12, w11, w10 +; CHECK-SD-NEXT: msub w8, w9, w0, w1 +; CHECK-SD-NEXT: mov v2.b[13], w8 +; CHECK-SD-NEXT: udiv w9, w14, w13 +; CHECK-SD-NEXT: msub w8, w12, w10, w11 +; CHECK-SD-NEXT: mov v2.b[14], w8 +; CHECK-SD-NEXT: msub w8, w9, w13, w14 +; CHECK-SD-NEXT: mov v2.b[15], w8 +; CHECK-SD-NEXT: mov v0.16b, v2.16b +; CHECK-SD-NEXT: ldp x28, x27, [sp], #80 // 16-byte 
Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv16i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v4.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v5.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll2 v6.8h, v0.16b, #0 +; CHECK-GI-NEXT: ushll2 v7.8h, v1.16b, #0 +; CHECK-GI-NEXT: ushll v2.4s, v4.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v5.4h, #0 +; CHECK-GI-NEXT: ushll2 v4.4s, v4.8h, #0 +; CHECK-GI-NEXT: ushll2 v5.4s, v5.8h, #0 +; CHECK-GI-NEXT: ushll v0.4s, v6.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v7.4h, #0 +; CHECK-GI-NEXT: ushll2 v6.4s, v6.8h, #0 +; CHECK-GI-NEXT: ushll2 v7.4s, v7.8h, #0 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov w12, v3.s[3] +; CHECK-GI-NEXT: fmov w13, s5 +; CHECK-GI-NEXT: mov w16, v5.s[3] +; CHECK-GI-NEXT: fmov w17, s1 +; CHECK-GI-NEXT: mov w18, v1.s[1] +; CHECK-GI-NEXT: mov w0, v1.s[2] +; CHECK-GI-NEXT: mov w1, v1.s[3] +; CHECK-GI-NEXT: udiv w10, w8, w9 +; CHECK-GI-NEXT: mov w8, v2.s[1] +; CHECK-GI-NEXT: mov w9, v3.s[1] +; CHECK-GI-NEXT: fmov w2, s7 +; CHECK-GI-NEXT: mov w3, v7.s[1] +; CHECK-GI-NEXT: mov w4, v7.s[2] +; CHECK-GI-NEXT: mov w5, v7.s[3] +; CHECK-GI-NEXT: udiv w11, w8, w9 +; CHECK-GI-NEXT: mov w8, v2.s[2] +; CHECK-GI-NEXT: mov w9, v3.s[2] +; CHECK-GI-NEXT: fmov s16, w10 +; CHECK-GI-NEXT: udiv w9, w8, w9 +; CHECK-GI-NEXT: mov w8, v2.s[3] +; CHECK-GI-NEXT: mov v16.s[1], w11 +; CHECK-GI-NEXT: udiv w8, w8, w12 +; CHECK-GI-NEXT: fmov w12, s4 +; CHECK-GI-NEXT: mov v16.s[2], w9 +; CHECK-GI-NEXT: udiv w14, w12, w13 +; CHECK-GI-NEXT: mov w12, v4.s[1] +; CHECK-GI-NEXT: mov w13, v5.s[1] +; CHECK-GI-NEXT: mov v16.s[3], w8 +; CHECK-GI-NEXT: mls v2.4s, v16.4s, v3.4s +; CHECK-GI-NEXT: udiv w15, w12, w13 +; CHECK-GI-NEXT: mov w12, v4.s[2] +; CHECK-GI-NEXT: mov w13, v5.s[2] +; CHECK-GI-NEXT: fmov s17, w14 +; CHECK-GI-NEXT: udiv w13, w12, w13 +; CHECK-GI-NEXT: mov w12, v4.s[3] +; CHECK-GI-NEXT: mov v17.s[1], w15 +; CHECK-GI-NEXT: udiv w12, w12, w16 +; CHECK-GI-NEXT: fmov w16, s0 +; CHECK-GI-NEXT: mov 
v17.s[2], w13 +; CHECK-GI-NEXT: udiv w16, w16, w17 +; CHECK-GI-NEXT: mov w17, v0.s[1] +; CHECK-GI-NEXT: mov v17.s[3], w12 +; CHECK-GI-NEXT: mls v4.4s, v17.4s, v5.4s +; CHECK-GI-NEXT: udiv w17, w17, w18 +; CHECK-GI-NEXT: mov w18, v0.s[2] +; CHECK-GI-NEXT: fmov s18, w16 +; CHECK-GI-NEXT: udiv w18, w18, w0 +; CHECK-GI-NEXT: mov w0, v0.s[3] +; CHECK-GI-NEXT: mov v18.s[1], w17 +; CHECK-GI-NEXT: udiv w0, w0, w1 +; CHECK-GI-NEXT: fmov w1, s6 +; CHECK-GI-NEXT: mov v18.s[2], w18 +; CHECK-GI-NEXT: udiv w1, w1, w2 +; CHECK-GI-NEXT: mov w2, v6.s[1] +; CHECK-GI-NEXT: mov v18.s[3], w0 +; CHECK-GI-NEXT: mls v0.4s, v18.4s, v1.4s +; CHECK-GI-NEXT: uzp1 v1.8h, v2.8h, v4.8h +; CHECK-GI-NEXT: udiv w2, w2, w3 +; CHECK-GI-NEXT: mov w3, v6.s[2] +; CHECK-GI-NEXT: fmov s19, w1 +; CHECK-GI-NEXT: udiv w3, w3, w4 +; CHECK-GI-NEXT: mov w4, v6.s[3] +; CHECK-GI-NEXT: mov v19.s[1], w2 +; CHECK-GI-NEXT: udiv w10, w4, w5 +; CHECK-GI-NEXT: mov v19.s[2], w3 +; CHECK-GI-NEXT: mov v19.s[3], w10 +; CHECK-GI-NEXT: mls v6.4s, v19.4s, v7.4s +; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v6.8h +; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: ret +entry: + %s = urem <16 x i8> %d, %e + ret <16 x i8> %s +} + +define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) { +; CHECK-SD-LABEL: uv32i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #304 +; CHECK-SD-NEXT: stp x29, x30, [sp, #208] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x28, x27, [sp, #224] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x26, x25, [sp, #240] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x24, x23, [sp, #256] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #272] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #288] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 304 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w23, -40 +; CHECK-SD-NEXT: 
.cfi_offset w24, -48 +; CHECK-SD-NEXT: .cfi_offset w25, -56 +; CHECK-SD-NEXT: .cfi_offset w26, -64 +; CHECK-SD-NEXT: .cfi_offset w27, -72 +; CHECK-SD-NEXT: .cfi_offset w28, -80 +; CHECK-SD-NEXT: .cfi_offset w30, -88 +; CHECK-SD-NEXT: .cfi_offset w29, -96 +; CHECK-SD-NEXT: umov w8, v2.b[1] +; CHECK-SD-NEXT: umov w9, v0.b[1] +; CHECK-SD-NEXT: umov w19, v3.b[7] +; CHECK-SD-NEXT: umov w7, v1.b[7] +; CHECK-SD-NEXT: umov w6, v3.b[8] +; CHECK-SD-NEXT: umov w3, v1.b[8] +; CHECK-SD-NEXT: umov w13, v3.b[0] +; CHECK-SD-NEXT: umov w5, v3.b[1] +; CHECK-SD-NEXT: umov w0, v1.b[1] +; CHECK-SD-NEXT: umov w12, v3.b[2] +; CHECK-SD-NEXT: umov w17, v3.b[3] +; CHECK-SD-NEXT: umov w16, v1.b[3] +; CHECK-SD-NEXT: str w8, [sp, #80] // 4-byte Folded Spill +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.b[0] +; CHECK-SD-NEXT: str w9, [sp, #88] // 4-byte Folded Spill +; CHECK-SD-NEXT: umov w9, v0.b[0] +; CHECK-SD-NEXT: ldr w30, [sp, #80] // 4-byte Folded Reload +; CHECK-SD-NEXT: umov w15, v3.b[4] +; CHECK-SD-NEXT: umov w14, v1.b[4] +; CHECK-SD-NEXT: umov w4, v3.b[5] +; CHECK-SD-NEXT: umov w1, v1.b[5] +; CHECK-SD-NEXT: umov w2, v3.b[6] +; CHECK-SD-NEXT: umov w18, v1.b[6] +; CHECK-SD-NEXT: str w8, [sp, #32] // 4-byte Folded Spill +; CHECK-SD-NEXT: umov w21, v3.b[9] +; CHECK-SD-NEXT: umov w20, v1.b[9] +; CHECK-SD-NEXT: str w9, [sp, #40] // 4-byte Folded Spill +; CHECK-SD-NEXT: ldr w29, [sp, #32] // 4-byte Folded Reload +; CHECK-SD-NEXT: udiv w11, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.b[2] +; CHECK-SD-NEXT: umov w9, v0.b[2] +; CHECK-SD-NEXT: str w10, [sp, #96] // 4-byte Folded Spill +; CHECK-SD-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.b[3] +; CHECK-SD-NEXT: umov w9, v0.b[3] +; CHECK-SD-NEXT: stp w11, w8, [sp, #48] // 8-byte Folded Spill +; CHECK-SD-NEXT: str w10, [sp, #24] // 4-byte Folded Spill +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.b[4] +; CHECK-SD-NEXT: str w8, [sp, #28] // 
4-byte Folded Spill +; CHECK-SD-NEXT: stp w9, w10, [sp, #56] // 8-byte Folded Spill +; CHECK-SD-NEXT: umov w9, v0.b[4] +; CHECK-SD-NEXT: udiv w27, w0, w5 +; CHECK-SD-NEXT: str w9, [sp, #36] // 4-byte Folded Spill +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.b[5] +; CHECK-SD-NEXT: umov w9, v0.b[5] +; CHECK-SD-NEXT: str w8, [sp, #76] // 4-byte Folded Spill +; CHECK-SD-NEXT: str w9, [sp, #84] // 4-byte Folded Spill +; CHECK-SD-NEXT: str w10, [sp, #44] // 4-byte Folded Spill +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.b[6] +; CHECK-SD-NEXT: umov w9, v0.b[6] +; CHECK-SD-NEXT: stp w8, w9, [sp, #64] // 8-byte Folded Spill +; CHECK-SD-NEXT: str w10, [sp, #92] // 4-byte Folded Spill +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.b[7] +; CHECK-SD-NEXT: umov w9, v0.b[7] +; CHECK-SD-NEXT: stp w8, w9, [sp, #112] // 8-byte Folded Spill +; CHECK-SD-NEXT: udiv w11, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.b[8] +; CHECK-SD-NEXT: umov w9, v0.b[8] +; CHECK-SD-NEXT: str w10, [sp, #72] // 4-byte Folded Spill +; CHECK-SD-NEXT: stp w8, w9, [sp, #100] // 8-byte Folded Spill +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.b[9] +; CHECK-SD-NEXT: umov w9, v0.b[9] +; CHECK-SD-NEXT: stp w8, w9, [sp, #136] // 8-byte Folded Spill +; CHECK-SD-NEXT: str w10, [sp, #108] // 4-byte Folded Spill +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.b[10] +; CHECK-SD-NEXT: umov w9, v0.b[10] +; CHECK-SD-NEXT: stp w11, w8, [sp, #120] // 8-byte Folded Spill +; CHECK-SD-NEXT: str w10, [sp, #144] // 4-byte Folded Spill +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.b[11] +; CHECK-SD-NEXT: stp w9, w10, [sp, #128] // 8-byte Folded Spill +; CHECK-SD-NEXT: umov w9, v0.b[11] +; CHECK-SD-NEXT: udiv w25, w16, w17 +; CHECK-SD-NEXT: stp w8, w9, [sp, #172] // 8-byte Folded Spill +; CHECK-SD-NEXT: udiv w11, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.b[12] +; CHECK-SD-NEXT: umov w9, v0.b[12] +; CHECK-SD-NEXT: str w8, [sp, 
#152] // 4-byte Folded Spill +; CHECK-SD-NEXT: str w9, [sp, #160] // 4-byte Folded Spill +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.b[13] +; CHECK-SD-NEXT: umov w9, v0.b[13] +; CHECK-SD-NEXT: stp w8, w9, [sp, #196] // 8-byte Folded Spill +; CHECK-SD-NEXT: str w10, [sp, #168] // 4-byte Folded Spill +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.b[14] +; CHECK-SD-NEXT: umov w9, v0.b[14] +; CHECK-SD-NEXT: stp w11, w8, [sp, #180] // 8-byte Folded Spill +; CHECK-SD-NEXT: umov w11, v1.b[2] +; CHECK-SD-NEXT: str w10, [sp, #204] // 4-byte Folded Spill +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.b[15] +; CHECK-SD-NEXT: str w8, [sp, #148] // 4-byte Folded Spill +; CHECK-SD-NEXT: stp w9, w10, [sp, #188] // 8-byte Folded Spill +; CHECK-SD-NEXT: umov w9, v0.b[15] +; CHECK-SD-NEXT: udiv w22, w11, w12 +; CHECK-SD-NEXT: str w9, [sp, #156] // 4-byte Folded Spill +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: str w10, [sp, #164] // 4-byte Folded Spill +; CHECK-SD-NEXT: umov w10, v1.b[0] +; CHECK-SD-NEXT: udiv w9, w7, w19 +; CHECK-SD-NEXT: udiv w8, w3, w6 +; CHECK-SD-NEXT: udiv w23, w10, w13 +; CHECK-SD-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; CHECK-SD-NEXT: ldr w8, [sp, #96] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w9, [sp, #88] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w9, w8, w30, w9 +; CHECK-SD-NEXT: ldr w8, [sp, #48] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w30, [sp, #40] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w8, w8, w29, w30 +; CHECK-SD-NEXT: ldp x29, x30, [sp, #208] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov s0, w8 +; CHECK-SD-NEXT: msub w10, w23, w13, w10 +; CHECK-SD-NEXT: udiv w24, w14, w15 +; CHECK-SD-NEXT: msub w13, w27, w5, w0 +; CHECK-SD-NEXT: ldr w5, [sp, #16] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[1], w9 +; CHECK-SD-NEXT: msub w9, w22, w12, w11 +; CHECK-SD-NEXT: umov w11, v1.b[10] +; CHECK-SD-NEXT: fmov s2, w10 +; CHECK-SD-NEXT: ldp w10, w8, [sp, #20] 
// 8-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[1], w13 +; CHECK-SD-NEXT: msub w8, w8, w5, w10 +; CHECK-SD-NEXT: ldr w5, [sp, #52] // 4-byte Folded Reload +; CHECK-SD-NEXT: umov w10, v3.b[10] +; CHECK-SD-NEXT: udiv w28, w1, w4 +; CHECK-SD-NEXT: ldp w13, w12, [sp, #56] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[2], w9 +; CHECK-SD-NEXT: mov v0.b[2], w8 +; CHECK-SD-NEXT: msub w8, w25, w17, w16 +; CHECK-SD-NEXT: ldr w17, [sp, #28] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w16, [sp, #36] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w12, w12, w5, w13 +; CHECK-SD-NEXT: ldr w13, [sp, #44] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w5, [sp, #136] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[3], w8 +; CHECK-SD-NEXT: msub w8, w24, w15, w14 +; CHECK-SD-NEXT: ldr w15, [sp, #92] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[3], w12 +; CHECK-SD-NEXT: msub w13, w13, w17, w16 +; CHECK-SD-NEXT: ldr w17, [sp, #76] // 4-byte Folded Reload +; CHECK-SD-NEXT: udiv w26, w18, w2 +; CHECK-SD-NEXT: ldr w16, [sp, #84] // 4-byte Folded Reload +; CHECK-SD-NEXT: umov w12, v3.b[11] +; CHECK-SD-NEXT: msub w15, w15, w17, w16 +; CHECK-SD-NEXT: umov w14, v1.b[11] +; CHECK-SD-NEXT: mov v2.b[4], w8 +; CHECK-SD-NEXT: msub w8, w28, w4, w1 +; CHECK-SD-NEXT: ldr w1, [sp, #64] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[4], w13 +; CHECK-SD-NEXT: ldr w4, [sp, #100] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldp w17, w16, [sp, #68] // 8-byte Folded Reload +; CHECK-SD-NEXT: ldp x24, x23, [sp, #256] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[5], w8 +; CHECK-SD-NEXT: ldp x28, x27, [sp, #224] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[5], w15 +; CHECK-SD-NEXT: msub w16, w16, w1, w17 +; CHECK-SD-NEXT: umov w15, v3.b[12] +; CHECK-SD-NEXT: msub w8, w26, w2, w18 +; CHECK-SD-NEXT: ldr w2, [sp, #112] // 4-byte Folded Reload +; CHECK-SD-NEXT: udiv w0, w20, w21 +; CHECK-SD-NEXT: ldp w1, w18, [sp, #116] // 8-byte Folded Reload +; CHECK-SD-NEXT: umov w17, v1.b[12] +; 
CHECK-SD-NEXT: ldp x26, x25, [sp, #240] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[6], w8 +; CHECK-SD-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[6], w16 +; CHECK-SD-NEXT: msub w18, w18, w2, w1 +; CHECK-SD-NEXT: msub w8, w8, w19, w7 +; CHECK-SD-NEXT: ldp w2, w1, [sp, #104] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[7], w18 +; CHECK-SD-NEXT: umov w18, v3.b[13] +; CHECK-SD-NEXT: mov v2.b[7], w8 +; CHECK-SD-NEXT: ldr w8, [sp, #8] // 4-byte Folded Reload +; CHECK-SD-NEXT: udiv w9, w11, w10 +; CHECK-SD-NEXT: msub w1, w1, w4, w2 +; CHECK-SD-NEXT: umov w2, v1.b[13] +; CHECK-SD-NEXT: msub w8, w8, w6, w3 +; CHECK-SD-NEXT: ldp w4, w3, [sp, #140] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[8], w1 +; CHECK-SD-NEXT: mov v2.b[8], w8 +; CHECK-SD-NEXT: msub w8, w0, w21, w20 +; CHECK-SD-NEXT: msub w3, w3, w5, w4 +; CHECK-SD-NEXT: ldr w5, [sp, #124] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldp w4, w1, [sp, #128] // 8-byte Folded Reload +; CHECK-SD-NEXT: udiv w13, w14, w12 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #288] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[9], w8 +; CHECK-SD-NEXT: mov v0.b[9], w3 +; CHECK-SD-NEXT: msub w8, w9, w10, w11 +; CHECK-SD-NEXT: msub w1, w1, w5, w4 +; CHECK-SD-NEXT: ldr w4, [sp, #172] // 4-byte Folded Reload +; CHECK-SD-NEXT: umov w9, v3.b[14] +; CHECK-SD-NEXT: ldp w3, w11, [sp, #176] // 8-byte Folded Reload +; CHECK-SD-NEXT: umov w10, v1.b[14] +; CHECK-SD-NEXT: ldp x22, x21, [sp, #272] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[10], w8 +; CHECK-SD-NEXT: mov v0.b[10], w1 +; CHECK-SD-NEXT: ldr w1, [sp, #152] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w11, w11, w4, w3 +; CHECK-SD-NEXT: udiv w16, w17, w15 +; CHECK-SD-NEXT: msub w8, w13, w12, w14 +; CHECK-SD-NEXT: ldr w13, [sp, #168] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w14, [sp, #160] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[11], w11 +; CHECK-SD-NEXT: umov w11, v3.b[15] +; CHECK-SD-NEXT: msub w13, w13, w1, w14 +; 
CHECK-SD-NEXT: umov w14, v1.b[15] +; CHECK-SD-NEXT: mov v2.b[11], w8 +; CHECK-SD-NEXT: mov v0.b[12], w13 +; CHECK-SD-NEXT: udiv w0, w2, w18 +; CHECK-SD-NEXT: msub w8, w16, w15, w17 +; CHECK-SD-NEXT: ldr w17, [sp, #196] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldp w16, w15, [sp, #200] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[12], w8 +; CHECK-SD-NEXT: msub w15, w15, w17, w16 +; CHECK-SD-NEXT: ldp w17, w16, [sp, #188] // 8-byte Folded Reload +; CHECK-SD-NEXT: mov v0.b[13], w15 +; CHECK-SD-NEXT: udiv w12, w10, w9 +; CHECK-SD-NEXT: msub w8, w0, w18, w2 +; CHECK-SD-NEXT: ldr w18, [sp, #184] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w16, w16, w18, w17 +; CHECK-SD-NEXT: mov v2.b[13], w8 +; CHECK-SD-NEXT: mov v0.b[14], w16 +; CHECK-SD-NEXT: udiv w13, w14, w11 +; CHECK-SD-NEXT: msub w8, w12, w9, w10 +; CHECK-SD-NEXT: ldr w9, [sp, #164] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w12, [sp, #148] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w10, [sp, #156] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v2.b[14], w8 +; CHECK-SD-NEXT: msub w9, w9, w12, w10 +; CHECK-SD-NEXT: mov v0.b[15], w9 +; CHECK-SD-NEXT: msub w8, w13, w11, w14 +; CHECK-SD-NEXT: mov v2.b[15], w8 +; CHECK-SD-NEXT: mov v1.16b, v2.16b +; CHECK-SD-NEXT: add sp, sp, #304 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv32i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #112 +; CHECK-GI-NEXT: stp x29, x30, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x26, x25, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 112 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: 
.cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w24, -48 +; CHECK-GI-NEXT: .cfi_offset w25, -56 +; CHECK-GI-NEXT: .cfi_offset w26, -64 +; CHECK-GI-NEXT: .cfi_offset w27, -72 +; CHECK-GI-NEXT: .cfi_offset w28, -80 +; CHECK-GI-NEXT: .cfi_offset w30, -88 +; CHECK-GI-NEXT: .cfi_offset w29, -96 +; CHECK-GI-NEXT: ushll v4.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v5.8h, v2.8b, #0 +; CHECK-GI-NEXT: ushll v16.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll v17.8h, v3.8b, #0 +; CHECK-GI-NEXT: ushll v6.4s, v4.4h, #0 +; CHECK-GI-NEXT: ushll v7.4s, v5.4h, #0 +; CHECK-GI-NEXT: ushll2 v4.4s, v4.8h, #0 +; CHECK-GI-NEXT: ushll2 v5.4s, v5.8h, #0 +; CHECK-GI-NEXT: ushll v18.4s, v16.4h, #0 +; CHECK-GI-NEXT: ushll v19.4s, v17.4h, #0 +; CHECK-GI-NEXT: ushll2 v16.4s, v16.8h, #0 +; CHECK-GI-NEXT: ushll2 v17.4s, v17.8h, #0 +; CHECK-GI-NEXT: fmov w8, s6 +; CHECK-GI-NEXT: fmov w9, s7 +; CHECK-GI-NEXT: mov w12, v7.s[3] +; CHECK-GI-NEXT: fmov w13, s5 +; CHECK-GI-NEXT: mov w14, v5.s[1] +; CHECK-GI-NEXT: mov w16, v5.s[3] +; CHECK-GI-NEXT: fmov w6, s19 +; CHECK-GI-NEXT: mov w7, v19.s[3] +; CHECK-GI-NEXT: fmov w21, s17 +; CHECK-GI-NEXT: udiv w10, w8, w9 +; CHECK-GI-NEXT: mov w8, v6.s[1] +; CHECK-GI-NEXT: mov w9, v7.s[1] +; CHECK-GI-NEXT: mov w22, v17.s[3] +; CHECK-GI-NEXT: udiv w11, w8, w9 +; CHECK-GI-NEXT: mov w8, v6.s[2] +; CHECK-GI-NEXT: mov w9, v7.s[2] +; CHECK-GI-NEXT: fmov s20, w10 +; CHECK-GI-NEXT: udiv w9, w8, w9 +; CHECK-GI-NEXT: mov w8, v6.s[3] +; CHECK-GI-NEXT: ushll2 v6.8h, v0.16b, #0 +; CHECK-GI-NEXT: mov v20.s[1], w11 +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v28.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 +; CHECK-GI-NEXT: udiv w8, w8, w12 +; CHECK-GI-NEXT: fmov w12, s4 +; CHECK-GI-NEXT: mov v20.s[2], w9 +; CHECK-GI-NEXT: udiv w13, w12, w13 +; CHECK-GI-NEXT: mov w12, v4.s[1] +; CHECK-GI-NEXT: str w8, [sp, #12] // 4-byte Folded Spill +; CHECK-GI-NEXT: ldr w11, [sp, #12] // 4-byte Folded Reload +; CHECK-GI-NEXT: mov v20.s[3], w11 +; CHECK-GI-NEXT: udiv w15, 
w12, w14 +; CHECK-GI-NEXT: mov w12, v4.s[2] +; CHECK-GI-NEXT: mov w14, v5.s[2] +; CHECK-GI-NEXT: ushll v5.4s, v6.4h, #0 +; CHECK-GI-NEXT: fmov s21, w13 +; CHECK-GI-NEXT: udiv w14, w12, w14 +; CHECK-GI-NEXT: mov w12, v4.s[3] +; CHECK-GI-NEXT: ushll2 v4.8h, v2.16b, #0 +; CHECK-GI-NEXT: mov v21.s[1], w15 +; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-GI-NEXT: ushll v7.4s, v4.4h, #0 +; CHECK-GI-NEXT: ushll v30.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0 +; CHECK-GI-NEXT: fmov w17, s7 +; CHECK-GI-NEXT: mls v28.4s, v20.4s, v30.4s +; CHECK-GI-NEXT: udiv w12, w12, w16 +; CHECK-GI-NEXT: fmov w16, s5 +; CHECK-GI-NEXT: mov v21.s[2], w14 +; CHECK-GI-NEXT: udiv w18, w16, w17 +; CHECK-GI-NEXT: mov w16, v5.s[1] +; CHECK-GI-NEXT: mov w17, v7.s[1] +; CHECK-GI-NEXT: mov v21.s[3], w12 +; CHECK-GI-NEXT: mls v0.4s, v21.4s, v2.4s +; CHECK-GI-NEXT: udiv w1, w16, w17 +; CHECK-GI-NEXT: mov w16, v5.s[2] +; CHECK-GI-NEXT: mov w17, v7.s[2] +; CHECK-GI-NEXT: fmov s22, w18 +; CHECK-GI-NEXT: uzp1 v0.8h, v28.8h, v0.8h +; CHECK-GI-NEXT: udiv w0, w16, w17 +; CHECK-GI-NEXT: mov w16, v5.s[3] +; CHECK-GI-NEXT: mov w17, v7.s[3] +; CHECK-GI-NEXT: ushll2 v5.4s, v6.8h, #0 +; CHECK-GI-NEXT: ushll2 v7.4s, v4.8h, #0 +; CHECK-GI-NEXT: mov v22.s[1], w1 +; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0 +; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-GI-NEXT: fmov w2, s7 +; CHECK-GI-NEXT: mov w3, v7.s[3] +; CHECK-GI-NEXT: udiv w16, w16, w17 +; CHECK-GI-NEXT: fmov w17, s5 +; CHECK-GI-NEXT: mov v22.s[2], w0 +; CHECK-GI-NEXT: udiv w5, w17, w2 +; CHECK-GI-NEXT: mov w17, v5.s[1] +; CHECK-GI-NEXT: mov w2, v7.s[1] +; CHECK-GI-NEXT: mov v22.s[3], w16 +; CHECK-GI-NEXT: mls v6.4s, v22.4s, v4.4s +; CHECK-GI-NEXT: udiv w4, w17, w2 +; CHECK-GI-NEXT: mov w17, v5.s[2] +; CHECK-GI-NEXT: mov w2, v7.s[2] +; CHECK-GI-NEXT: fmov s23, w5 +; CHECK-GI-NEXT: udiv w2, w17, w2 +; CHECK-GI-NEXT: mov w17, v5.s[3] +; CHECK-GI-NEXT: mov v23.s[1], w4 +; CHECK-GI-NEXT: udiv w17, w17, w3 +; CHECK-GI-NEXT: fmov w3, s18 +; CHECK-GI-NEXT: 
mov v23.s[2], w2 +; CHECK-GI-NEXT: udiv w20, w3, w6 +; CHECK-GI-NEXT: mov w3, v18.s[1] +; CHECK-GI-NEXT: mov w6, v19.s[1] +; CHECK-GI-NEXT: mov v23.s[3], w17 +; CHECK-GI-NEXT: mls v5.4s, v23.4s, v7.4s +; CHECK-GI-NEXT: udiv w19, w3, w6 +; CHECK-GI-NEXT: mov w3, v18.s[2] +; CHECK-GI-NEXT: mov w6, v19.s[2] +; CHECK-GI-NEXT: fmov s24, w20 +; CHECK-GI-NEXT: uzp1 v2.8h, v6.8h, v5.8h +; CHECK-GI-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: udiv w6, w3, w6 +; CHECK-GI-NEXT: mov w3, v18.s[3] +; CHECK-GI-NEXT: mov v24.s[1], w19 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: udiv w3, w3, w7 +; CHECK-GI-NEXT: fmov w7, s16 +; CHECK-GI-NEXT: mov v24.s[2], w6 +; CHECK-GI-NEXT: udiv w23, w7, w21 +; CHECK-GI-NEXT: mov w7, v16.s[1] +; CHECK-GI-NEXT: mov w21, v17.s[1] +; CHECK-GI-NEXT: mov v24.s[3], w3 +; CHECK-GI-NEXT: udiv w24, w7, w21 +; CHECK-GI-NEXT: mov w7, v16.s[2] +; CHECK-GI-NEXT: mov w21, v17.s[2] +; CHECK-GI-NEXT: ushll2 v17.8h, v1.16b, #0 +; CHECK-GI-NEXT: fmov s25, w23 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll v18.4s, v17.4h, #0 +; CHECK-GI-NEXT: ushll v29.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 +; CHECK-GI-NEXT: udiv w21, w7, w21 +; CHECK-GI-NEXT: mov w7, v16.s[3] +; CHECK-GI-NEXT: ushll2 v16.8h, v3.16b, #0 +; CHECK-GI-NEXT: mov v25.s[1], w24 +; CHECK-GI-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-GI-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ushll v19.4s, v16.4h, #0 +; CHECK-GI-NEXT: ushll v31.4s, v3.4h, #0 +; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-GI-NEXT: fmov w25, s19 +; CHECK-GI-NEXT: mov w26, v19.s[1] +; CHECK-GI-NEXT: mov w27, v19.s[2] +; CHECK-GI-NEXT: mov w28, v19.s[3] +; CHECK-GI-NEXT: ushll2 v19.4s, v16.8h, #0 +; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0 +; CHECK-GI-NEXT: udiv w7, w7, w22 +; CHECK-GI-NEXT: fmov w22, s18 +; CHECK-GI-NEXT: mov v25.s[2], w21 +; CHECK-GI-NEXT: mls v29.4s, v24.4s, v31.4s +; CHECK-GI-NEXT: fmov w29, s19 +; 
CHECK-GI-NEXT: mov w30, v19.s[1] +; CHECK-GI-NEXT: mov w8, v19.s[2] +; CHECK-GI-NEXT: mov w10, v19.s[3] +; CHECK-GI-NEXT: udiv w25, w22, w25 +; CHECK-GI-NEXT: mov w22, v18.s[1] +; CHECK-GI-NEXT: mov v25.s[3], w7 +; CHECK-GI-NEXT: mls v1.4s, v25.4s, v3.4s +; CHECK-GI-NEXT: udiv w26, w22, w26 +; CHECK-GI-NEXT: mov w22, v18.s[2] +; CHECK-GI-NEXT: fmov s26, w25 +; CHECK-GI-NEXT: uzp1 v1.8h, v29.8h, v1.8h +; CHECK-GI-NEXT: udiv w27, w22, w27 +; CHECK-GI-NEXT: mov w22, v18.s[3] +; CHECK-GI-NEXT: ushll2 v18.4s, v17.8h, #0 +; CHECK-GI-NEXT: mov v26.s[1], w26 +; CHECK-GI-NEXT: ushll v17.4s, v17.4h, #0 +; CHECK-GI-NEXT: ldp x26, x25, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov w9, v18.s[3] +; CHECK-GI-NEXT: udiv w22, w22, w28 +; CHECK-GI-NEXT: fmov w28, s18 +; CHECK-GI-NEXT: mov v26.s[2], w27 +; CHECK-GI-NEXT: udiv w28, w28, w29 +; CHECK-GI-NEXT: mov w29, v18.s[1] +; CHECK-GI-NEXT: mov v26.s[3], w22 +; CHECK-GI-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: mls v17.4s, v26.4s, v16.4s +; CHECK-GI-NEXT: udiv w29, w29, w30 +; CHECK-GI-NEXT: mov w30, v18.s[2] +; CHECK-GI-NEXT: fmov s27, w28 +; CHECK-GI-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: udiv w8, w30, w8 +; CHECK-GI-NEXT: mov v27.s[1], w29 +; CHECK-GI-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: udiv w9, w9, w10 +; CHECK-GI-NEXT: mov v27.s[2], w8 +; CHECK-GI-NEXT: mov v27.s[3], w9 +; CHECK-GI-NEXT: mls v18.4s, v27.4s, v19.4s +; CHECK-GI-NEXT: uzp1 v3.8h, v17.8h, v18.8h +; CHECK-GI-NEXT: uzp1 v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: add sp, sp, #112 +; CHECK-GI-NEXT: ret +entry: + %s = urem <32 x i8> %d, %e + ret <32 x i8> %s +} + +define <2 x i16> @sv2i16(<2 x i16> %d, <2 x i16> %e) { +; CHECK-SD-LABEL: sv2i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-SD-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-SD-NEXT: fmov 
w8, s1 +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: mov w11, v1.s[1] +; CHECK-SD-NEXT: mov w12, v0.s[1] +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: sdiv w13, w12, w11 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: fmov s0, w8 +; CHECK-SD-NEXT: msub w9, w13, w11, w12 +; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv2i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: sdiv w9, w9, w10 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = srem <2 x i16> %d, %e + ret <2 x i16> %s +} + +define <3 x i16> @sv3i16(<3 x i16> %d, <3 x i16> %e) { +; CHECK-LABEL: sv3i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w11, v1.h[0] +; CHECK-NEXT: smov w12, v0.h[0] +; CHECK-NEXT: smov w8, v1.h[1] +; CHECK-NEXT: smov w9, v0.h[1] +; CHECK-NEXT: smov w14, v1.h[2] +; CHECK-NEXT: smov w15, v0.h[2] +; CHECK-NEXT: sdiv w13, w12, w11 +; CHECK-NEXT: sdiv w10, w9, w8 +; CHECK-NEXT: msub w11, w13, w11, w12 +; CHECK-NEXT: fmov s0, w11 +; CHECK-NEXT: sdiv w16, w15, w14 +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: msub w8, w16, w14, w15 +; CHECK-NEXT: mov v0.h[2], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %s = srem <3 x i16> %d, %e + ret <3 x i16> %s +} + +define <4 x i16> @sv4i16(<4 x i16> %d, <4 x i16> %e) { +; CHECK-SD-LABEL: sv4i16: 
+; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: smov w11, v1.h[0] +; CHECK-SD-NEXT: smov w12, v0.h[0] +; CHECK-SD-NEXT: smov w8, v1.h[1] +; CHECK-SD-NEXT: smov w9, v0.h[1] +; CHECK-SD-NEXT: smov w14, v1.h[2] +; CHECK-SD-NEXT: smov w15, v0.h[2] +; CHECK-SD-NEXT: smov w17, v1.h[3] +; CHECK-SD-NEXT: smov w18, v0.h[3] +; CHECK-SD-NEXT: sdiv w13, w12, w11 +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w13, w11, w12 +; CHECK-SD-NEXT: fmov s0, w11 +; CHECK-SD-NEXT: sdiv w16, w15, w14 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: mov v0.h[1], w8 +; CHECK-SD-NEXT: sdiv w9, w18, w17 +; CHECK-SD-NEXT: msub w8, w16, w14, w15 +; CHECK-SD-NEXT: mov v0.h[2], w8 +; CHECK-SD-NEXT: msub w8, w9, w17, w18 +; CHECK-SD-NEXT: mov v0.h[3], w8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv4i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: mov w11, v1.s[2] +; CHECK-GI-NEXT: mov w12, v1.s[3] +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: sdiv w9, w9, w10 +; CHECK-GI-NEXT: mov w10, v0.s[2] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w11 +; CHECK-GI-NEXT: mov w11, v0.s[3] +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: sdiv w8, w11, w12 +; CHECK-GI-NEXT: mov v2.s[2], w10 +; CHECK-GI-NEXT: mov v2.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret +entry: + %s = srem <4 x i16> %d, %e + ret <4 x i16> %s +} + +define <8 x i16> @sv8i16(<8 x i16> %d, <8 x i16> %e) { +; CHECK-SD-LABEL: sv8i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: smov w11, v1.h[0] +; CHECK-SD-NEXT: smov w12, v0.h[0] +; 
CHECK-SD-NEXT: smov w8, v1.h[1] +; CHECK-SD-NEXT: smov w9, v0.h[1] +; CHECK-SD-NEXT: smov w14, v1.h[2] +; CHECK-SD-NEXT: smov w15, v0.h[2] +; CHECK-SD-NEXT: smov w17, v1.h[3] +; CHECK-SD-NEXT: smov w18, v0.h[3] +; CHECK-SD-NEXT: smov w1, v1.h[4] +; CHECK-SD-NEXT: smov w2, v0.h[4] +; CHECK-SD-NEXT: smov w4, v1.h[5] +; CHECK-SD-NEXT: smov w5, v0.h[5] +; CHECK-SD-NEXT: sdiv w13, w12, w11 +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w13, w11, w12 +; CHECK-SD-NEXT: smov w13, v1.h[7] +; CHECK-SD-NEXT: fmov s2, w11 +; CHECK-SD-NEXT: smov w11, v0.h[6] +; CHECK-SD-NEXT: sdiv w16, w15, w14 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: smov w10, v1.h[6] +; CHECK-SD-NEXT: mov v2.h[1], w8 +; CHECK-SD-NEXT: sdiv w0, w18, w17 +; CHECK-SD-NEXT: msub w8, w16, w14, w15 +; CHECK-SD-NEXT: smov w14, v0.h[7] +; CHECK-SD-NEXT: mov v2.h[2], w8 +; CHECK-SD-NEXT: sdiv w3, w2, w1 +; CHECK-SD-NEXT: msub w8, w0, w17, w18 +; CHECK-SD-NEXT: mov v2.h[3], w8 +; CHECK-SD-NEXT: sdiv w9, w5, w4 +; CHECK-SD-NEXT: msub w8, w3, w1, w2 +; CHECK-SD-NEXT: mov v2.h[4], w8 +; CHECK-SD-NEXT: sdiv w12, w11, w10 +; CHECK-SD-NEXT: msub w8, w9, w4, w5 +; CHECK-SD-NEXT: mov v2.h[5], w8 +; CHECK-SD-NEXT: sdiv w9, w14, w13 +; CHECK-SD-NEXT: msub w8, w12, w10, w11 +; CHECK-SD-NEXT: mov v2.h[6], w8 +; CHECK-SD-NEXT: msub w8, w9, w13, w14 +; CHECK-SD-NEXT: mov v2.h[7], w8 +; CHECK-SD-NEXT: mov v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv8i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v3.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov w10, v3.s[1] +; CHECK-GI-NEXT: mov w11, v3.s[2] +; CHECK-GI-NEXT: mov w12, v3.s[3] +; CHECK-GI-NEXT: fmov w13, s1 +; CHECK-GI-NEXT: mov w14, v1.s[1] +; CHECK-GI-NEXT: mov w15, v1.s[2] +; CHECK-GI-NEXT: mov w16, v1.s[3] +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; 
CHECK-GI-NEXT: mov w9, v2.s[1] +; CHECK-GI-NEXT: sdiv w9, w9, w10 +; CHECK-GI-NEXT: mov w10, v2.s[2] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w11 +; CHECK-GI-NEXT: mov w11, v2.s[3] +; CHECK-GI-NEXT: mov v4.s[1], w9 +; CHECK-GI-NEXT: sdiv w11, w11, w12 +; CHECK-GI-NEXT: fmov w12, s0 +; CHECK-GI-NEXT: mov v4.s[2], w10 +; CHECK-GI-NEXT: sdiv w12, w12, w13 +; CHECK-GI-NEXT: mov w13, v0.s[1] +; CHECK-GI-NEXT: mov v4.s[3], w11 +; CHECK-GI-NEXT: mls v2.4s, v4.4s, v3.4s +; CHECK-GI-NEXT: sdiv w13, w13, w14 +; CHECK-GI-NEXT: mov w14, v0.s[2] +; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: sdiv w14, w14, w15 +; CHECK-GI-NEXT: mov w15, v0.s[3] +; CHECK-GI-NEXT: mov v5.s[1], w13 +; CHECK-GI-NEXT: sdiv w8, w15, w16 +; CHECK-GI-NEXT: mov v5.s[2], w14 +; CHECK-GI-NEXT: mov v5.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v5.4s, v1.4s +; CHECK-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-GI-NEXT: ret +entry: + %s = srem <8 x i16> %d, %e + ret <8 x i16> %s +} + +define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) { +; CHECK-SD-LABEL: sv16i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #160 +; CHECK-SD-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x28, x27, [sp, #80] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x26, x25, [sp, #96] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x24, x23, [sp, #112] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #128] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #144] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 160 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w23, -40 +; CHECK-SD-NEXT: .cfi_offset w24, -48 +; CHECK-SD-NEXT: .cfi_offset w25, -56 +; CHECK-SD-NEXT: .cfi_offset w26, -64 +; CHECK-SD-NEXT: .cfi_offset w27, -72 +; CHECK-SD-NEXT: .cfi_offset w28, -80 +; CHECK-SD-NEXT: .cfi_offset w30, -88 +; 
CHECK-SD-NEXT: .cfi_offset w29, -96 +; CHECK-SD-NEXT: smov w8, v2.h[1] +; CHECK-SD-NEXT: smov w9, v0.h[1] +; CHECK-SD-NEXT: smov w19, v2.h[2] +; CHECK-SD-NEXT: smov w22, v0.h[2] +; CHECK-SD-NEXT: smov w1, v2.h[0] +; CHECK-SD-NEXT: smov w3, v0.h[0] +; CHECK-SD-NEXT: smov w7, v2.h[3] +; CHECK-SD-NEXT: smov w18, v0.h[3] +; CHECK-SD-NEXT: smov w4, v0.h[6] +; CHECK-SD-NEXT: smov w0, v2.h[4] +; CHECK-SD-NEXT: smov w5, v0.h[4] +; CHECK-SD-NEXT: smov w2, v2.h[7] +; CHECK-SD-NEXT: str w8, [sp, #52] // 4-byte Folded Spill +; CHECK-SD-NEXT: smov w6, v0.h[7] +; CHECK-SD-NEXT: smov w27, v3.h[0] +; CHECK-SD-NEXT: str w9, [sp, #44] // 4-byte Folded Spill +; CHECK-SD-NEXT: sdiv w9, w9, w8 +; CHECK-SD-NEXT: smov w28, v1.h[0] +; CHECK-SD-NEXT: smov w24, v3.h[1] +; CHECK-SD-NEXT: smov w25, v1.h[1] +; CHECK-SD-NEXT: ldr w21, [sp, #52] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w23, [sp, #44] // 4-byte Folded Reload +; CHECK-SD-NEXT: smov w30, v3.h[2] +; CHECK-SD-NEXT: smov w12, v3.h[3] +; CHECK-SD-NEXT: smov w11, v1.h[3] +; CHECK-SD-NEXT: smov w14, v3.h[5] +; CHECK-SD-NEXT: smov w13, v1.h[5] +; CHECK-SD-NEXT: sdiv w8, w22, w19 +; CHECK-SD-NEXT: str w9, [sp, #60] // 4-byte Folded Spill +; CHECK-SD-NEXT: ldr w20, [sp, #60] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w21, w20, w21, w23 +; CHECK-SD-NEXT: sdiv w9, w3, w1 +; CHECK-SD-NEXT: str w8, [sp, #12] // 4-byte Folded Spill +; CHECK-SD-NEXT: sdiv w8, w18, w7 +; CHECK-SD-NEXT: stp w9, w8, [sp, #24] // 8-byte Folded Spill +; CHECK-SD-NEXT: smov w8, v2.h[5] +; CHECK-SD-NEXT: smov w9, v0.h[5] +; CHECK-SD-NEXT: sdiv w10, w5, w0 +; CHECK-SD-NEXT: ldr w20, [sp, #24] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w1, w20, w1, w3 +; CHECK-SD-NEXT: str w9, [sp, #40] // 4-byte Folded Spill +; CHECK-SD-NEXT: str w8, [sp, #48] // 4-byte Folded Spill +; CHECK-SD-NEXT: fmov s0, w1 +; CHECK-SD-NEXT: ldr w1, [sp, #12] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w1, w1, w19, w22 +; CHECK-SD-NEXT: ldr w19, [sp, #28] // 4-byte Folded Reload +; 
CHECK-SD-NEXT: sdiv w9, w9, w8 +; CHECK-SD-NEXT: smov w8, v2.h[6] +; CHECK-SD-NEXT: mov v0.h[1], w21 +; CHECK-SD-NEXT: msub w18, w19, w7, w18 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #144] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #128] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v0.h[2], w1 +; CHECK-SD-NEXT: str w9, [sp, #56] // 4-byte Folded Spill +; CHECK-SD-NEXT: sdiv w9, w4, w8 +; CHECK-SD-NEXT: mov v0.h[3], w18 +; CHECK-SD-NEXT: ldr w18, [sp, #40] // 4-byte Folded Reload +; CHECK-SD-NEXT: stp w8, w9, [sp, #32] // 8-byte Folded Spill +; CHECK-SD-NEXT: sdiv w8, w6, w2 +; CHECK-SD-NEXT: smov w9, v1.h[4] +; CHECK-SD-NEXT: sdiv w29, w28, w27 +; CHECK-SD-NEXT: stp w8, w10, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: smov w8, v1.h[2] +; CHECK-SD-NEXT: smov w10, v3.h[4] +; CHECK-SD-NEXT: sdiv w26, w25, w24 +; CHECK-SD-NEXT: msub w3, w29, w27, w28 +; CHECK-SD-NEXT: ldp x28, x27, [sp, #80] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov s2, w3 +; CHECK-SD-NEXT: smov w3, v1.h[6] +; CHECK-SD-NEXT: sdiv w15, w8, w30 +; CHECK-SD-NEXT: msub w24, w26, w24, w25 +; CHECK-SD-NEXT: mov v2.h[1], w24 +; CHECK-SD-NEXT: ldp x24, x23, [sp, #112] // 16-byte Folded Reload +; CHECK-SD-NEXT: sdiv w17, w11, w12 +; CHECK-SD-NEXT: msub w8, w15, w30, w8 +; CHECK-SD-NEXT: smov w15, v3.h[6] +; CHECK-SD-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.h[2], w8 +; CHECK-SD-NEXT: sdiv w16, w9, w10 +; CHECK-SD-NEXT: msub w8, w17, w12, w11 +; CHECK-SD-NEXT: ldr w12, [sp, #20] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w17, [sp, #48] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w12, w12, w0, w5 +; CHECK-SD-NEXT: mov v2.h[3], w8 +; CHECK-SD-NEXT: mov v0.h[4], w12 +; CHECK-SD-NEXT: sdiv w25, w13, w14 +; CHECK-SD-NEXT: msub w8, w16, w10, w9 +; CHECK-SD-NEXT: smov w9, v3.h[7] +; CHECK-SD-NEXT: smov w10, v1.h[7] +; CHECK-SD-NEXT: ldr w16, [sp, #56] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v2.h[4], w8 +; CHECK-SD-NEXT: msub w16, w16, w17, 
w18 +; CHECK-SD-NEXT: mov v0.h[5], w16 +; CHECK-SD-NEXT: sdiv w11, w3, w15 +; CHECK-SD-NEXT: msub w8, w25, w14, w13 +; CHECK-SD-NEXT: ldp w14, w13, [sp, #32] // 8-byte Folded Reload +; CHECK-SD-NEXT: ldp x26, x25, [sp, #96] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.h[5], w8 +; CHECK-SD-NEXT: msub w13, w13, w14, w4 +; CHECK-SD-NEXT: mov v0.h[6], w13 +; CHECK-SD-NEXT: sdiv w12, w10, w9 +; CHECK-SD-NEXT: msub w8, w11, w15, w3 +; CHECK-SD-NEXT: ldr w11, [sp, #16] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w11, w11, w2, w6 +; CHECK-SD-NEXT: mov v2.h[6], w8 +; CHECK-SD-NEXT: mov v0.h[7], w11 +; CHECK-SD-NEXT: msub w8, w12, w9, w10 +; CHECK-SD-NEXT: mov v2.h[7], w8 +; CHECK-SD-NEXT: mov v1.16b, v2.16b +; CHECK-SD-NEXT: add sp, sp, #160 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv16i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v4.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v5.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll v6.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll v7.4s, v3.4h, #0 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s5 +; CHECK-GI-NEXT: mov w12, v5.s[3] +; CHECK-GI-NEXT: fmov w17, s7 +; CHECK-GI-NEXT: mov w18, v7.s[1] +; CHECK-GI-NEXT: mov w0, v7.s[2] +; CHECK-GI-NEXT: mov w1, v7.s[3] +; CHECK-GI-NEXT: sshll2 v7.4s, v3.8h, #0 +; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: mov w8, v4.s[1] +; CHECK-GI-NEXT: mov w9, v5.s[1] +; CHECK-GI-NEXT: fmov w2, s7 +; CHECK-GI-NEXT: mov w3, v7.s[1] +; CHECK-GI-NEXT: mov w4, v7.s[2] +; CHECK-GI-NEXT: mov w5, v7.s[3] +; CHECK-GI-NEXT: sdiv w11, w8, w9 +; CHECK-GI-NEXT: mov w8, v4.s[2] +; CHECK-GI-NEXT: mov w9, v5.s[2] +; CHECK-GI-NEXT: sshll2 v5.4s, v2.8h, #0 +; CHECK-GI-NEXT: fmov s16, w10 +; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: fmov w13, s5 +; CHECK-GI-NEXT: mov w14, v5.s[1] +; CHECK-GI-NEXT: mov w15, v5.s[2] +; CHECK-GI-NEXT: mov w16, v5.s[3] +; CHECK-GI-NEXT: sdiv w9, w8, w9 +; CHECK-GI-NEXT: mov w8, v4.s[3] +; CHECK-GI-NEXT: sshll2 
v4.4s, v0.8h, #0 +; CHECK-GI-NEXT: mov v16.s[1], w11 +; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: sdiv w8, w8, w12 +; CHECK-GI-NEXT: fmov w12, s4 +; CHECK-GI-NEXT: mov v16.s[2], w9 +; CHECK-GI-NEXT: sdiv w13, w12, w13 +; CHECK-GI-NEXT: mov w12, v4.s[1] +; CHECK-GI-NEXT: mov v16.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v16.4s, v2.4s +; CHECK-GI-NEXT: sdiv w14, w12, w14 +; CHECK-GI-NEXT: mov w12, v4.s[2] +; CHECK-GI-NEXT: fmov s17, w13 +; CHECK-GI-NEXT: sdiv w15, w12, w15 +; CHECK-GI-NEXT: mov w12, v4.s[3] +; CHECK-GI-NEXT: mov v17.s[1], w14 +; CHECK-GI-NEXT: sdiv w12, w12, w16 +; CHECK-GI-NEXT: fmov w16, s6 +; CHECK-GI-NEXT: mov v17.s[2], w15 +; CHECK-GI-NEXT: sdiv w16, w16, w17 +; CHECK-GI-NEXT: mov w17, v6.s[1] +; CHECK-GI-NEXT: mov v17.s[3], w12 +; CHECK-GI-NEXT: mls v4.4s, v17.4s, v5.4s +; CHECK-GI-NEXT: sdiv w17, w17, w18 +; CHECK-GI-NEXT: mov w18, v6.s[2] +; CHECK-GI-NEXT: fmov s18, w16 +; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v4.8h +; CHECK-GI-NEXT: sdiv w18, w18, w0 +; CHECK-GI-NEXT: mov w0, v6.s[3] +; CHECK-GI-NEXT: sshll2 v6.4s, v1.8h, #0 +; CHECK-GI-NEXT: mov v18.s[1], w17 +; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: sdiv w0, w0, w1 +; CHECK-GI-NEXT: fmov w1, s6 +; CHECK-GI-NEXT: mov v18.s[2], w18 +; CHECK-GI-NEXT: sdiv w1, w1, w2 +; CHECK-GI-NEXT: mov w2, v6.s[1] +; CHECK-GI-NEXT: mov v18.s[3], w0 +; CHECK-GI-NEXT: mls v1.4s, v18.4s, v3.4s +; CHECK-GI-NEXT: sdiv w2, w2, w3 +; CHECK-GI-NEXT: mov w3, v6.s[2] +; CHECK-GI-NEXT: fmov s19, w1 +; CHECK-GI-NEXT: sdiv w3, w3, w4 +; CHECK-GI-NEXT: mov w4, v6.s[3] +; CHECK-GI-NEXT: mov v19.s[1], w2 +; CHECK-GI-NEXT: sdiv w10, w4, w5 +; CHECK-GI-NEXT: mov v19.s[2], w3 +; CHECK-GI-NEXT: mov v19.s[3], w10 +; CHECK-GI-NEXT: mls v6.4s, v19.4s, v7.4s +; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v6.8h +; CHECK-GI-NEXT: ret +entry: + %s = srem <16 x i16> %d, %e + ret <16 x i16> %s +} + +define <2 x i16> @uv2i16(<2 x i16> %d, <2 x i16> %e) { +; CHECK-SD-LABEL: uv2i16: +; CHECK-SD: // %bb.0: // %entry +; 
CHECK-SD-NEXT: movi d2, #0x00ffff0000ffff +; CHECK-SD-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-SD-NEXT: and v1.8b, v1.8b, v2.8b +; CHECK-SD-NEXT: fmov w8, s1 +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: mov w11, v1.s[1] +; CHECK-SD-NEXT: mov w12, v0.s[1] +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: udiv w13, w12, w11 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: fmov s0, w8 +; CHECK-SD-NEXT: msub w9, w13, w11, w12 +; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv2i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi d2, #0x00ffff0000ffff +; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: and v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: udiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: udiv w9, w9, w10 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = urem <2 x i16> %d, %e + ret <2 x i16> %s +} + +define <3 x i16> @uv3i16(<3 x i16> %d, <3 x i16> %e) { +; CHECK-LABEL: uv3i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w11, v1.h[0] +; CHECK-NEXT: umov w12, v0.h[0] +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: umov w9, v0.h[1] +; CHECK-NEXT: umov w13, v0.h[2] +; CHECK-NEXT: umov w14, v1.h[0] +; CHECK-NEXT: umov w16, v0.h[0] +; CHECK-NEXT: udiv w11, w12, w11 +; CHECK-NEXT: umov w12, v1.h[2] +; CHECK-NEXT: udiv w10, w9, w8 +; CHECK-NEXT: msub w11, w11, w14, w16 +; CHECK-NEXT: udiv w15, w13, w12 +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: sxth w9, w11 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: sxth w8, w8 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: msub w10, w15, w12, w13 +; 
CHECK-NEXT: sxth w8, w10 +; CHECK-NEXT: mov v0.h[2], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %s = urem <3 x i16> %d, %e + ret <3 x i16> %s +} + +define <4 x i16> @uv4i16(<4 x i16> %d, <4 x i16> %e) { +; CHECK-SD-LABEL: uv4i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: umov w11, v1.h[0] +; CHECK-SD-NEXT: umov w12, v0.h[0] +; CHECK-SD-NEXT: umov w8, v1.h[1] +; CHECK-SD-NEXT: umov w9, v0.h[1] +; CHECK-SD-NEXT: umov w14, v1.h[2] +; CHECK-SD-NEXT: umov w15, v0.h[2] +; CHECK-SD-NEXT: umov w17, v1.h[3] +; CHECK-SD-NEXT: umov w18, v0.h[3] +; CHECK-SD-NEXT: udiv w13, w12, w11 +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w13, w11, w12 +; CHECK-SD-NEXT: fmov s0, w11 +; CHECK-SD-NEXT: udiv w16, w15, w14 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: mov v0.h[1], w8 +; CHECK-SD-NEXT: udiv w9, w18, w17 +; CHECK-SD-NEXT: msub w8, w16, w14, w15 +; CHECK-SD-NEXT: mov v0.h[2], w8 +; CHECK-SD-NEXT: msub w8, w9, w17, w18 +; CHECK-SD-NEXT: mov v0.h[3], w8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv4i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: mov w11, v1.s[2] +; CHECK-GI-NEXT: mov w12, v1.s[3] +; CHECK-GI-NEXT: udiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: udiv w9, w9, w10 +; CHECK-GI-NEXT: mov w10, v0.s[2] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: udiv w10, w10, w11 +; CHECK-GI-NEXT: mov w11, v0.s[3] +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: udiv w8, w11, w12 +; CHECK-GI-NEXT: mov v2.s[2], w10 +; CHECK-GI-NEXT: mov v2.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; 
CHECK-GI-NEXT: ret +entry: + %s = urem <4 x i16> %d, %e + ret <4 x i16> %s +} + +define <8 x i16> @uv8i16(<8 x i16> %d, <8 x i16> %e) { +; CHECK-SD-LABEL: uv8i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: umov w11, v1.h[0] +; CHECK-SD-NEXT: umov w12, v0.h[0] +; CHECK-SD-NEXT: umov w8, v1.h[1] +; CHECK-SD-NEXT: umov w9, v0.h[1] +; CHECK-SD-NEXT: umov w14, v1.h[2] +; CHECK-SD-NEXT: umov w15, v0.h[2] +; CHECK-SD-NEXT: umov w17, v1.h[3] +; CHECK-SD-NEXT: umov w18, v0.h[3] +; CHECK-SD-NEXT: umov w1, v1.h[4] +; CHECK-SD-NEXT: umov w2, v0.h[4] +; CHECK-SD-NEXT: umov w4, v1.h[5] +; CHECK-SD-NEXT: umov w5, v0.h[5] +; CHECK-SD-NEXT: udiv w13, w12, w11 +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w13, w11, w12 +; CHECK-SD-NEXT: umov w13, v1.h[7] +; CHECK-SD-NEXT: fmov s2, w11 +; CHECK-SD-NEXT: umov w11, v0.h[6] +; CHECK-SD-NEXT: udiv w16, w15, w14 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: umov w10, v1.h[6] +; CHECK-SD-NEXT: mov v2.h[1], w8 +; CHECK-SD-NEXT: udiv w0, w18, w17 +; CHECK-SD-NEXT: msub w8, w16, w14, w15 +; CHECK-SD-NEXT: umov w14, v0.h[7] +; CHECK-SD-NEXT: mov v2.h[2], w8 +; CHECK-SD-NEXT: udiv w3, w2, w1 +; CHECK-SD-NEXT: msub w8, w0, w17, w18 +; CHECK-SD-NEXT: mov v2.h[3], w8 +; CHECK-SD-NEXT: udiv w9, w5, w4 +; CHECK-SD-NEXT: msub w8, w3, w1, w2 +; CHECK-SD-NEXT: mov v2.h[4], w8 +; CHECK-SD-NEXT: udiv w12, w11, w10 +; CHECK-SD-NEXT: msub w8, w9, w4, w5 +; CHECK-SD-NEXT: mov v2.h[5], w8 +; CHECK-SD-NEXT: udiv w9, w14, w13 +; CHECK-SD-NEXT: msub w8, w12, w10, w11 +; CHECK-SD-NEXT: mov v2.h[6], w8 +; CHECK-SD-NEXT: msub w8, w9, w13, w14 +; CHECK-SD-NEXT: mov v2.h[7], w8 +; CHECK-SD-NEXT: mov v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv8i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: fmov w9, s3 +; 
CHECK-GI-NEXT: mov w10, v3.s[1] +; CHECK-GI-NEXT: mov w11, v3.s[2] +; CHECK-GI-NEXT: mov w12, v3.s[3] +; CHECK-GI-NEXT: fmov w13, s1 +; CHECK-GI-NEXT: mov w14, v1.s[1] +; CHECK-GI-NEXT: mov w15, v1.s[2] +; CHECK-GI-NEXT: mov w16, v1.s[3] +; CHECK-GI-NEXT: udiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v2.s[1] +; CHECK-GI-NEXT: udiv w9, w9, w10 +; CHECK-GI-NEXT: mov w10, v2.s[2] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: udiv w10, w10, w11 +; CHECK-GI-NEXT: mov w11, v2.s[3] +; CHECK-GI-NEXT: mov v4.s[1], w9 +; CHECK-GI-NEXT: udiv w11, w11, w12 +; CHECK-GI-NEXT: fmov w12, s0 +; CHECK-GI-NEXT: mov v4.s[2], w10 +; CHECK-GI-NEXT: udiv w12, w12, w13 +; CHECK-GI-NEXT: mov w13, v0.s[1] +; CHECK-GI-NEXT: mov v4.s[3], w11 +; CHECK-GI-NEXT: mls v2.4s, v4.4s, v3.4s +; CHECK-GI-NEXT: udiv w13, w13, w14 +; CHECK-GI-NEXT: mov w14, v0.s[2] +; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: udiv w14, w14, w15 +; CHECK-GI-NEXT: mov w15, v0.s[3] +; CHECK-GI-NEXT: mov v5.s[1], w13 +; CHECK-GI-NEXT: udiv w8, w15, w16 +; CHECK-GI-NEXT: mov v5.s[2], w14 +; CHECK-GI-NEXT: mov v5.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v5.4s, v1.4s +; CHECK-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-GI-NEXT: ret +entry: + %s = urem <8 x i16> %d, %e + ret <8 x i16> %s +} + +define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) { +; CHECK-SD-LABEL: uv16i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #160 +; CHECK-SD-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x28, x27, [sp, #80] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x26, x25, [sp, #96] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x24, x23, [sp, #112] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #128] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #144] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 160 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 
+; CHECK-SD-NEXT: .cfi_offset w23, -40 +; CHECK-SD-NEXT: .cfi_offset w24, -48 +; CHECK-SD-NEXT: .cfi_offset w25, -56 +; CHECK-SD-NEXT: .cfi_offset w26, -64 +; CHECK-SD-NEXT: .cfi_offset w27, -72 +; CHECK-SD-NEXT: .cfi_offset w28, -80 +; CHECK-SD-NEXT: .cfi_offset w30, -88 +; CHECK-SD-NEXT: .cfi_offset w29, -96 +; CHECK-SD-NEXT: umov w8, v2.h[1] +; CHECK-SD-NEXT: umov w9, v0.h[1] +; CHECK-SD-NEXT: umov w19, v2.h[2] +; CHECK-SD-NEXT: umov w22, v0.h[2] +; CHECK-SD-NEXT: umov w1, v2.h[0] +; CHECK-SD-NEXT: umov w3, v0.h[0] +; CHECK-SD-NEXT: umov w7, v2.h[3] +; CHECK-SD-NEXT: umov w18, v0.h[3] +; CHECK-SD-NEXT: umov w4, v0.h[6] +; CHECK-SD-NEXT: umov w0, v2.h[4] +; CHECK-SD-NEXT: umov w5, v0.h[4] +; CHECK-SD-NEXT: umov w2, v2.h[7] +; CHECK-SD-NEXT: str w8, [sp, #52] // 4-byte Folded Spill +; CHECK-SD-NEXT: umov w6, v0.h[7] +; CHECK-SD-NEXT: umov w27, v3.h[0] +; CHECK-SD-NEXT: str w9, [sp, #44] // 4-byte Folded Spill +; CHECK-SD-NEXT: udiv w9, w9, w8 +; CHECK-SD-NEXT: umov w28, v1.h[0] +; CHECK-SD-NEXT: umov w24, v3.h[1] +; CHECK-SD-NEXT: umov w25, v1.h[1] +; CHECK-SD-NEXT: ldr w21, [sp, #52] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w23, [sp, #44] // 4-byte Folded Reload +; CHECK-SD-NEXT: umov w30, v3.h[2] +; CHECK-SD-NEXT: umov w12, v3.h[3] +; CHECK-SD-NEXT: umov w11, v1.h[3] +; CHECK-SD-NEXT: umov w14, v3.h[5] +; CHECK-SD-NEXT: umov w13, v1.h[5] +; CHECK-SD-NEXT: udiv w8, w22, w19 +; CHECK-SD-NEXT: str w9, [sp, #60] // 4-byte Folded Spill +; CHECK-SD-NEXT: ldr w20, [sp, #60] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w21, w20, w21, w23 +; CHECK-SD-NEXT: udiv w9, w3, w1 +; CHECK-SD-NEXT: str w8, [sp, #12] // 4-byte Folded Spill +; CHECK-SD-NEXT: udiv w8, w18, w7 +; CHECK-SD-NEXT: stp w9, w8, [sp, #24] // 8-byte Folded Spill +; CHECK-SD-NEXT: umov w8, v2.h[5] +; CHECK-SD-NEXT: umov w9, v0.h[5] +; CHECK-SD-NEXT: udiv w10, w5, w0 +; CHECK-SD-NEXT: ldr w20, [sp, #24] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w1, w20, w1, w3 +; CHECK-SD-NEXT: str w9, [sp, #40] 
// 4-byte Folded Spill +; CHECK-SD-NEXT: str w8, [sp, #48] // 4-byte Folded Spill +; CHECK-SD-NEXT: fmov s0, w1 +; CHECK-SD-NEXT: ldr w1, [sp, #12] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w1, w1, w19, w22 +; CHECK-SD-NEXT: ldr w19, [sp, #28] // 4-byte Folded Reload +; CHECK-SD-NEXT: udiv w9, w9, w8 +; CHECK-SD-NEXT: umov w8, v2.h[6] +; CHECK-SD-NEXT: mov v0.h[1], w21 +; CHECK-SD-NEXT: msub w18, w19, w7, w18 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #144] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #128] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v0.h[2], w1 +; CHECK-SD-NEXT: str w9, [sp, #56] // 4-byte Folded Spill +; CHECK-SD-NEXT: udiv w9, w4, w8 +; CHECK-SD-NEXT: mov v0.h[3], w18 +; CHECK-SD-NEXT: ldr w18, [sp, #40] // 4-byte Folded Reload +; CHECK-SD-NEXT: stp w8, w9, [sp, #32] // 8-byte Folded Spill +; CHECK-SD-NEXT: udiv w8, w6, w2 +; CHECK-SD-NEXT: umov w9, v1.h[4] +; CHECK-SD-NEXT: udiv w29, w28, w27 +; CHECK-SD-NEXT: stp w8, w10, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: umov w8, v1.h[2] +; CHECK-SD-NEXT: umov w10, v3.h[4] +; CHECK-SD-NEXT: udiv w26, w25, w24 +; CHECK-SD-NEXT: msub w3, w29, w27, w28 +; CHECK-SD-NEXT: ldp x28, x27, [sp, #80] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov s2, w3 +; CHECK-SD-NEXT: umov w3, v1.h[6] +; CHECK-SD-NEXT: udiv w15, w8, w30 +; CHECK-SD-NEXT: msub w24, w26, w24, w25 +; CHECK-SD-NEXT: mov v2.h[1], w24 +; CHECK-SD-NEXT: ldp x24, x23, [sp, #112] // 16-byte Folded Reload +; CHECK-SD-NEXT: udiv w17, w11, w12 +; CHECK-SD-NEXT: msub w8, w15, w30, w8 +; CHECK-SD-NEXT: umov w15, v3.h[6] +; CHECK-SD-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.h[2], w8 +; CHECK-SD-NEXT: udiv w16, w9, w10 +; CHECK-SD-NEXT: msub w8, w17, w12, w11 +; CHECK-SD-NEXT: ldr w12, [sp, #20] // 4-byte Folded Reload +; CHECK-SD-NEXT: ldr w17, [sp, #48] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w12, w12, w0, w5 +; CHECK-SD-NEXT: mov v2.h[3], w8 +; CHECK-SD-NEXT: mov v0.h[4], w12 +; 
CHECK-SD-NEXT: udiv w25, w13, w14 +; CHECK-SD-NEXT: msub w8, w16, w10, w9 +; CHECK-SD-NEXT: umov w9, v3.h[7] +; CHECK-SD-NEXT: umov w10, v1.h[7] +; CHECK-SD-NEXT: ldr w16, [sp, #56] // 4-byte Folded Reload +; CHECK-SD-NEXT: mov v2.h[4], w8 +; CHECK-SD-NEXT: msub w16, w16, w17, w18 +; CHECK-SD-NEXT: mov v0.h[5], w16 +; CHECK-SD-NEXT: udiv w11, w3, w15 +; CHECK-SD-NEXT: msub w8, w25, w14, w13 +; CHECK-SD-NEXT: ldp w14, w13, [sp, #32] // 8-byte Folded Reload +; CHECK-SD-NEXT: ldp x26, x25, [sp, #96] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.h[5], w8 +; CHECK-SD-NEXT: msub w13, w13, w14, w4 +; CHECK-SD-NEXT: mov v0.h[6], w13 +; CHECK-SD-NEXT: udiv w12, w10, w9 +; CHECK-SD-NEXT: msub w8, w11, w15, w3 +; CHECK-SD-NEXT: ldr w11, [sp, #16] // 4-byte Folded Reload +; CHECK-SD-NEXT: msub w11, w11, w2, w6 +; CHECK-SD-NEXT: mov v2.h[6], w8 +; CHECK-SD-NEXT: mov v0.h[7], w11 +; CHECK-SD-NEXT: msub w8, w12, w9, w10 +; CHECK-SD-NEXT: mov v2.h[7], w8 +; CHECK-SD-NEXT: mov v1.16b, v2.16b +; CHECK-SD-NEXT: add sp, sp, #160 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv16i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v5.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v6.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v7.4s, v3.4h, #0 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s5 +; CHECK-GI-NEXT: mov w12, v5.s[3] +; CHECK-GI-NEXT: fmov w17, s7 +; CHECK-GI-NEXT: mov w18, v7.s[1] +; CHECK-GI-NEXT: mov w0, v7.s[2] +; CHECK-GI-NEXT: mov w1, v7.s[3] +; CHECK-GI-NEXT: ushll2 v7.4s, v3.8h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: udiv w10, w8, w9 +; CHECK-GI-NEXT: mov w8, v4.s[1] +; CHECK-GI-NEXT: mov w9, v5.s[1] +; CHECK-GI-NEXT: fmov w2, s7 +; CHECK-GI-NEXT: mov w3, v7.s[1] +; CHECK-GI-NEXT: mov w4, v7.s[2] +; CHECK-GI-NEXT: mov w5, v7.s[3] +; CHECK-GI-NEXT: udiv w11, w8, w9 +; CHECK-GI-NEXT: mov w8, v4.s[2] +; CHECK-GI-NEXT: mov w9, v5.s[2] +; CHECK-GI-NEXT: ushll2 v5.4s, v2.8h, #0 +; CHECK-GI-NEXT: fmov s16, 
w10 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: fmov w13, s5 +; CHECK-GI-NEXT: mov w14, v5.s[1] +; CHECK-GI-NEXT: mov w15, v5.s[2] +; CHECK-GI-NEXT: mov w16, v5.s[3] +; CHECK-GI-NEXT: udiv w9, w8, w9 +; CHECK-GI-NEXT: mov w8, v4.s[3] +; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 +; CHECK-GI-NEXT: mov v16.s[1], w11 +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: udiv w8, w8, w12 +; CHECK-GI-NEXT: fmov w12, s4 +; CHECK-GI-NEXT: mov v16.s[2], w9 +; CHECK-GI-NEXT: udiv w13, w12, w13 +; CHECK-GI-NEXT: mov w12, v4.s[1] +; CHECK-GI-NEXT: mov v16.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v16.4s, v2.4s +; CHECK-GI-NEXT: udiv w14, w12, w14 +; CHECK-GI-NEXT: mov w12, v4.s[2] +; CHECK-GI-NEXT: fmov s17, w13 +; CHECK-GI-NEXT: udiv w15, w12, w15 +; CHECK-GI-NEXT: mov w12, v4.s[3] +; CHECK-GI-NEXT: mov v17.s[1], w14 +; CHECK-GI-NEXT: udiv w12, w12, w16 +; CHECK-GI-NEXT: fmov w16, s6 +; CHECK-GI-NEXT: mov v17.s[2], w15 +; CHECK-GI-NEXT: udiv w16, w16, w17 +; CHECK-GI-NEXT: mov w17, v6.s[1] +; CHECK-GI-NEXT: mov v17.s[3], w12 +; CHECK-GI-NEXT: mls v4.4s, v17.4s, v5.4s +; CHECK-GI-NEXT: udiv w17, w17, w18 +; CHECK-GI-NEXT: mov w18, v6.s[2] +; CHECK-GI-NEXT: fmov s18, w16 +; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v4.8h +; CHECK-GI-NEXT: udiv w18, w18, w0 +; CHECK-GI-NEXT: mov w0, v6.s[3] +; CHECK-GI-NEXT: ushll2 v6.4s, v1.8h, #0 +; CHECK-GI-NEXT: mov v18.s[1], w17 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: udiv w0, w0, w1 +; CHECK-GI-NEXT: fmov w1, s6 +; CHECK-GI-NEXT: mov v18.s[2], w18 +; CHECK-GI-NEXT: udiv w1, w1, w2 +; CHECK-GI-NEXT: mov w2, v6.s[1] +; CHECK-GI-NEXT: mov v18.s[3], w0 +; CHECK-GI-NEXT: mls v1.4s, v18.4s, v3.4s +; CHECK-GI-NEXT: udiv w2, w2, w3 +; CHECK-GI-NEXT: mov w3, v6.s[2] +; CHECK-GI-NEXT: fmov s19, w1 +; CHECK-GI-NEXT: udiv w3, w3, w4 +; CHECK-GI-NEXT: mov w4, v6.s[3] +; CHECK-GI-NEXT: mov v19.s[1], w2 +; CHECK-GI-NEXT: udiv w10, w4, w5 +; CHECK-GI-NEXT: mov v19.s[2], w3 +; CHECK-GI-NEXT: mov v19.s[3], w10 +; CHECK-GI-NEXT: mls 
v6.4s, v19.4s, v7.4s +; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v6.8h +; CHECK-GI-NEXT: ret +entry: + %s = urem <16 x i16> %d, %e + ret <16 x i16> %s +} + +define <2 x i32> @sv2i32(<2 x i32> %d, <2 x i32> %e) { +; CHECK-SD-LABEL: sv2i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov w8, s1 +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: mov w11, v1.s[1] +; CHECK-SD-NEXT: mov w12, v0.s[1] +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: sdiv w13, w12, w11 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: fmov s0, w8 +; CHECK-SD-NEXT: msub w9, w13, w11, w12 +; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv2i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: sdiv w9, w9, w10 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = srem <2 x i32> %d, %e + ret <2 x i32> %s +} + +define <3 x i32> @sv3i32(<3 x i32> %d, <3 x i32> %e) { +; CHECK-LABEL: sv3i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: fmov w12, s0 +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov w9, v0.s[1] +; CHECK-NEXT: mov w14, v1.s[2] +; CHECK-NEXT: mov w15, v0.s[2] +; CHECK-NEXT: sdiv w13, w12, w11 +; CHECK-NEXT: sdiv w10, w9, w8 +; CHECK-NEXT: msub w11, w13, w11, w12 +; CHECK-NEXT: fmov s0, w11 +; CHECK-NEXT: sdiv w16, w15, w14 +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: msub w8, w16, w14, w15 +; CHECK-NEXT: mov 
v0.s[2], w8 +; CHECK-NEXT: ret +entry: + %s = srem <3 x i32> %d, %e + ret <3 x i32> %s +} + +define <4 x i32> @sv4i32(<4 x i32> %d, <4 x i32> %e) { +; CHECK-SD-LABEL: sv4i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov w11, s1 +; CHECK-SD-NEXT: fmov w12, s0 +; CHECK-SD-NEXT: mov w8, v1.s[1] +; CHECK-SD-NEXT: mov w9, v0.s[1] +; CHECK-SD-NEXT: mov w14, v1.s[2] +; CHECK-SD-NEXT: mov w15, v0.s[2] +; CHECK-SD-NEXT: mov w17, v1.s[3] +; CHECK-SD-NEXT: mov w18, v0.s[3] +; CHECK-SD-NEXT: sdiv w13, w12, w11 +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w13, w11, w12 +; CHECK-SD-NEXT: fmov s0, w11 +; CHECK-SD-NEXT: sdiv w16, w15, w14 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: mov v0.s[1], w8 +; CHECK-SD-NEXT: sdiv w9, w18, w17 +; CHECK-SD-NEXT: msub w8, w16, w14, w15 +; CHECK-SD-NEXT: mov v0.s[2], w8 +; CHECK-SD-NEXT: msub w8, w9, w17, w18 +; CHECK-SD-NEXT: mov v0.s[3], w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv4i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: mov w11, v1.s[2] +; CHECK-GI-NEXT: mov w12, v1.s[3] +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: sdiv w9, w9, w10 +; CHECK-GI-NEXT: mov w10, v0.s[2] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w11 +; CHECK-GI-NEXT: mov w11, v0.s[3] +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: sdiv w8, w11, w12 +; CHECK-GI-NEXT: mov v2.s[2], w10 +; CHECK-GI-NEXT: mov v2.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-GI-NEXT: ret +entry: + %s = srem <4 x i32> %d, %e + ret <4 x i32> %s +} + +define <8 x i32> @sv8i32(<8 x i32> %d, <8 x i32> %e) { +; CHECK-SD-LABEL: sv8i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: stp x22, x21, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: mov w8, v2.s[1] +; CHECK-SD-NEXT: mov w9, v0.s[1] +; CHECK-SD-NEXT: fmov w11, s2 +; CHECK-SD-NEXT: fmov w12, s0 +; CHECK-SD-NEXT: fmov w4, s3 +; CHECK-SD-NEXT: fmov w5, s1 +; CHECK-SD-NEXT: mov w1, v3.s[1] +; CHECK-SD-NEXT: mov w2, v1.s[1] +; CHECK-SD-NEXT: mov w14, v2.s[2] +; CHECK-SD-NEXT: mov w15, v0.s[2] +; CHECK-SD-NEXT: mov w7, v3.s[2] +; CHECK-SD-NEXT: mov w19, v1.s[2] +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: mov w17, v2.s[3] +; CHECK-SD-NEXT: mov w18, v0.s[3] +; CHECK-SD-NEXT: mov w21, v3.s[3] +; CHECK-SD-NEXT: mov w22, v1.s[3] +; CHECK-SD-NEXT: sdiv w13, w12, w11 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: sdiv w6, w5, w4 +; CHECK-SD-NEXT: msub w9, w13, w11, w12 +; CHECK-SD-NEXT: fmov s0, w9 +; CHECK-SD-NEXT: mov v0.s[1], w8 +; CHECK-SD-NEXT: sdiv w3, w2, w1 +; CHECK-SD-NEXT: msub w10, w6, w4, w5 +; CHECK-SD-NEXT: fmov s1, w10 +; CHECK-SD-NEXT: sdiv w16, w15, w14 +; CHECK-SD-NEXT: msub w11, w3, w1, w2 +; CHECK-SD-NEXT: mov v1.s[1], w11 +; CHECK-SD-NEXT: sdiv w20, w19, w7 +; CHECK-SD-NEXT: msub w9, w16, w14, w15 +; CHECK-SD-NEXT: mov v0.s[2], w9 +; CHECK-SD-NEXT: sdiv w0, w18, w17 +; CHECK-SD-NEXT: msub w8, w20, w7, w19 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v1.s[2], w8 +; CHECK-SD-NEXT: sdiv w12, w22, w21 +; CHECK-SD-NEXT: msub w10, w0, w17, w18 +; CHECK-SD-NEXT: mov v0.s[3], w10 +; CHECK-SD-NEXT: msub w8, w12, w21, w22 +; CHECK-SD-NEXT: mov v1.s[3], w8 +; CHECK-SD-NEXT: ldp x22, x21, [sp], #32 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv8i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: mov 
w10, v2.s[1] +; CHECK-GI-NEXT: mov w11, v2.s[2] +; CHECK-GI-NEXT: mov w12, v2.s[3] +; CHECK-GI-NEXT: fmov w13, s3 +; CHECK-GI-NEXT: mov w14, v3.s[1] +; CHECK-GI-NEXT: mov w15, v3.s[2] +; CHECK-GI-NEXT: mov w16, v3.s[3] +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: sdiv w9, w9, w10 +; CHECK-GI-NEXT: mov w10, v0.s[2] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w11 +; CHECK-GI-NEXT: mov w11, v0.s[3] +; CHECK-GI-NEXT: mov v4.s[1], w9 +; CHECK-GI-NEXT: sdiv w11, w11, w12 +; CHECK-GI-NEXT: fmov w12, s1 +; CHECK-GI-NEXT: mov v4.s[2], w10 +; CHECK-GI-NEXT: sdiv w12, w12, w13 +; CHECK-GI-NEXT: mov w13, v1.s[1] +; CHECK-GI-NEXT: mov v4.s[3], w11 +; CHECK-GI-NEXT: mls v0.4s, v4.4s, v2.4s +; CHECK-GI-NEXT: sdiv w13, w13, w14 +; CHECK-GI-NEXT: mov w14, v1.s[2] +; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: sdiv w14, w14, w15 +; CHECK-GI-NEXT: mov w15, v1.s[3] +; CHECK-GI-NEXT: mov v5.s[1], w13 +; CHECK-GI-NEXT: sdiv w8, w15, w16 +; CHECK-GI-NEXT: mov v5.s[2], w14 +; CHECK-GI-NEXT: mov v5.s[3], w8 +; CHECK-GI-NEXT: mls v1.4s, v5.4s, v3.4s +; CHECK-GI-NEXT: ret +entry: + %s = srem <8 x i32> %d, %e + ret <8 x i32> %s +} + +define <2 x i32> @uv2i32(<2 x i32> %d, <2 x i32> %e) { +; CHECK-SD-LABEL: uv2i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov w8, s1 +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: mov w11, v1.s[1] +; CHECK-SD-NEXT: mov w12, v0.s[1] +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: udiv w13, w12, w11 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: fmov s0, w8 +; CHECK-SD-NEXT: msub w9, w13, w11, w12 +; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv2i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 
def $q1 +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: udiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: udiv w9, w9, w10 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = urem <2 x i32> %d, %e + ret <2 x i32> %s +} + +define <3 x i32> @uv3i32(<3 x i32> %d, <3 x i32> %e) { +; CHECK-LABEL: uv3i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: fmov w12, s0 +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov w9, v0.s[1] +; CHECK-NEXT: mov w14, v1.s[2] +; CHECK-NEXT: mov w15, v0.s[2] +; CHECK-NEXT: udiv w13, w12, w11 +; CHECK-NEXT: udiv w10, w9, w8 +; CHECK-NEXT: msub w11, w13, w11, w12 +; CHECK-NEXT: fmov s0, w11 +; CHECK-NEXT: udiv w16, w15, w14 +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: msub w8, w16, w14, w15 +; CHECK-NEXT: mov v0.s[2], w8 +; CHECK-NEXT: ret +entry: + %s = urem <3 x i32> %d, %e + ret <3 x i32> %s +} + +define <4 x i32> @uv4i32(<4 x i32> %d, <4 x i32> %e) { +; CHECK-SD-LABEL: uv4i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov w11, s1 +; CHECK-SD-NEXT: fmov w12, s0 +; CHECK-SD-NEXT: mov w8, v1.s[1] +; CHECK-SD-NEXT: mov w9, v0.s[1] +; CHECK-SD-NEXT: mov w14, v1.s[2] +; CHECK-SD-NEXT: mov w15, v0.s[2] +; CHECK-SD-NEXT: mov w17, v1.s[3] +; CHECK-SD-NEXT: mov w18, v0.s[3] +; CHECK-SD-NEXT: udiv w13, w12, w11 +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w13, w11, w12 +; CHECK-SD-NEXT: fmov s0, w11 +; CHECK-SD-NEXT: udiv w16, w15, w14 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: mov v0.s[1], w8 +; CHECK-SD-NEXT: udiv w9, w18, w17 +; CHECK-SD-NEXT: msub w8, w16, w14, w15 +; CHECK-SD-NEXT: mov v0.s[2], w8 +; CHECK-SD-NEXT: msub w8, w9, w17, w18 +; CHECK-SD-NEXT: mov v0.s[3], w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: 
uv4i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: mov w11, v1.s[2] +; CHECK-GI-NEXT: mov w12, v1.s[3] +; CHECK-GI-NEXT: udiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: udiv w9, w9, w10 +; CHECK-GI-NEXT: mov w10, v0.s[2] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: udiv w10, w10, w11 +; CHECK-GI-NEXT: mov w11, v0.s[3] +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: udiv w8, w11, w12 +; CHECK-GI-NEXT: mov v2.s[2], w10 +; CHECK-GI-NEXT: mov v2.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-GI-NEXT: ret +entry: + %s = urem <4 x i32> %d, %e + ret <4 x i32> %s +} + +define <8 x i32> @uv8i32(<8 x i32> %d, <8 x i32> %e) { +; CHECK-SD-LABEL: uv8i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: stp x22, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: mov w8, v2.s[1] +; CHECK-SD-NEXT: mov w9, v0.s[1] +; CHECK-SD-NEXT: fmov w11, s2 +; CHECK-SD-NEXT: fmov w12, s0 +; CHECK-SD-NEXT: fmov w4, s3 +; CHECK-SD-NEXT: fmov w5, s1 +; CHECK-SD-NEXT: mov w1, v3.s[1] +; CHECK-SD-NEXT: mov w2, v1.s[1] +; CHECK-SD-NEXT: mov w14, v2.s[2] +; CHECK-SD-NEXT: mov w15, v0.s[2] +; CHECK-SD-NEXT: mov w7, v3.s[2] +; CHECK-SD-NEXT: mov w19, v1.s[2] +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: mov w17, v2.s[3] +; CHECK-SD-NEXT: mov w18, v0.s[3] +; CHECK-SD-NEXT: mov w21, v3.s[3] +; CHECK-SD-NEXT: mov w22, v1.s[3] +; CHECK-SD-NEXT: udiv w13, w12, w11 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: udiv w6, w5, w4 +; CHECK-SD-NEXT: msub w9, w13, w11, w12 +; CHECK-SD-NEXT: fmov s0, w9 +; CHECK-SD-NEXT: mov v0.s[1], w8 +; CHECK-SD-NEXT: udiv w3, w2, w1 +; CHECK-SD-NEXT: msub 
w10, w6, w4, w5 +; CHECK-SD-NEXT: fmov s1, w10 +; CHECK-SD-NEXT: udiv w16, w15, w14 +; CHECK-SD-NEXT: msub w11, w3, w1, w2 +; CHECK-SD-NEXT: mov v1.s[1], w11 +; CHECK-SD-NEXT: udiv w20, w19, w7 +; CHECK-SD-NEXT: msub w9, w16, w14, w15 +; CHECK-SD-NEXT: mov v0.s[2], w9 +; CHECK-SD-NEXT: udiv w0, w18, w17 +; CHECK-SD-NEXT: msub w8, w20, w7, w19 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v1.s[2], w8 +; CHECK-SD-NEXT: udiv w12, w22, w21 +; CHECK-SD-NEXT: msub w10, w0, w17, w18 +; CHECK-SD-NEXT: mov v0.s[3], w10 +; CHECK-SD-NEXT: msub w8, w12, w21, w22 +; CHECK-SD-NEXT: mov v1.s[3], w8 +; CHECK-SD-NEXT: ldp x22, x21, [sp], #32 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv8i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: mov w10, v2.s[1] +; CHECK-GI-NEXT: mov w11, v2.s[2] +; CHECK-GI-NEXT: mov w12, v2.s[3] +; CHECK-GI-NEXT: fmov w13, s3 +; CHECK-GI-NEXT: mov w14, v3.s[1] +; CHECK-GI-NEXT: mov w15, v3.s[2] +; CHECK-GI-NEXT: mov w16, v3.s[3] +; CHECK-GI-NEXT: udiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: udiv w9, w9, w10 +; CHECK-GI-NEXT: mov w10, v0.s[2] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: udiv w10, w10, w11 +; CHECK-GI-NEXT: mov w11, v0.s[3] +; CHECK-GI-NEXT: mov v4.s[1], w9 +; CHECK-GI-NEXT: udiv w11, w11, w12 +; CHECK-GI-NEXT: fmov w12, s1 +; CHECK-GI-NEXT: mov v4.s[2], w10 +; CHECK-GI-NEXT: udiv w12, w12, w13 +; CHECK-GI-NEXT: mov w13, v1.s[1] +; CHECK-GI-NEXT: mov v4.s[3], w11 +; CHECK-GI-NEXT: mls v0.4s, v4.4s, v2.4s +; CHECK-GI-NEXT: udiv w13, w13, w14 +; CHECK-GI-NEXT: mov w14, v1.s[2] +; CHECK-GI-NEXT: fmov s5, w12 +; CHECK-GI-NEXT: udiv w14, w14, w15 +; CHECK-GI-NEXT: mov w15, v1.s[3] +; CHECK-GI-NEXT: mov v5.s[1], w13 +; CHECK-GI-NEXT: udiv w8, w15, w16 +; CHECK-GI-NEXT: mov v5.s[2], w14 +; CHECK-GI-NEXT: mov v5.s[3], w8 +; CHECK-GI-NEXT: mls v1.4s, v5.4s, v3.4s +; CHECK-GI-NEXT: ret +entry: 
+ %s = urem <8 x i32> %d, %e + ret <8 x i32> %s +} + +define <2 x i64> @sv2i64(<2 x i64> %d, <2 x i64> %e) { +; CHECK-SD-LABEL: sv2i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov x8, d1 +; CHECK-SD-NEXT: fmov x9, d0 +; CHECK-SD-NEXT: mov x11, v1.d[1] +; CHECK-SD-NEXT: mov x12, v0.d[1] +; CHECK-SD-NEXT: sdiv x10, x9, x8 +; CHECK-SD-NEXT: sdiv x13, x12, x11 +; CHECK-SD-NEXT: msub x8, x10, x8, x9 +; CHECK-SD-NEXT: fmov d0, x8 +; CHECK-SD-NEXT: msub x9, x13, x11, x12 +; CHECK-SD-NEXT: mov v0.d[1], x9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv2i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mov x10, v1.d[1] +; CHECK-GI-NEXT: mov x11, v0.d[1] +; CHECK-GI-NEXT: sdiv x8, x8, x9 +; CHECK-GI-NEXT: sdiv x11, x11, x10 +; CHECK-GI-NEXT: fmov d1, x8 +; CHECK-GI-NEXT: mov v1.d[1], x11 +; CHECK-GI-NEXT: fmov x11, d1 +; CHECK-GI-NEXT: mov x8, v1.d[1] +; CHECK-GI-NEXT: mul x9, x11, x9 +; CHECK-GI-NEXT: mul x8, x8, x10 +; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: ret +entry: + %s = srem <2 x i64> %d, %e + ret <2 x i64> %s +} + +define <3 x i64> @sv3i64(<3 x i64> %d, <3 x i64> %e) { +; CHECK-SD-LABEL: sv3i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d5 killed $d5 def $q5 +; CHECK-SD-NEXT: // kill: def $d4 killed $d4 def $q4 +; CHECK-SD-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov x8, d3 +; CHECK-SD-NEXT: fmov x9, d0 +; CHECK-SD-NEXT: fmov x11, d4 +; CHECK-SD-NEXT: fmov x12, d1 +; CHECK-SD-NEXT: fmov x14, d5 +; CHECK-SD-NEXT: fmov x15, d2 +; CHECK-SD-NEXT: sdiv x10, x9, x8 +; CHECK-SD-NEXT: sdiv x13, x12, x11 +; CHECK-SD-NEXT: msub x8, x10, x8, x9 +; CHECK-SD-NEXT: fmov d0, x8 +; CHECK-SD-NEXT: sdiv x16, x15, x14 +; 
CHECK-SD-NEXT: msub x9, x13, x11, x12 +; CHECK-SD-NEXT: fmov d1, x9 +; CHECK-SD-NEXT: msub x10, x16, x14, x15 +; CHECK-SD-NEXT: fmov d2, x10 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv3i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d3 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4 +; CHECK-GI-NEXT: fmov x10, d4 +; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: sdiv x8, x8, x9 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: fmov x14, d3 +; CHECK-GI-NEXT: mov x12, v3.d[1] +; CHECK-GI-NEXT: sdiv x9, x9, x10 +; CHECK-GI-NEXT: fmov d6, x8 +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mov v6.d[1], x9 +; CHECK-GI-NEXT: fmov x9, d5 +; CHECK-GI-NEXT: sdiv x10, x8, x9 +; CHECK-GI-NEXT: fmov x13, d6 +; CHECK-GI-NEXT: mov x11, v6.d[1] +; CHECK-GI-NEXT: mul x13, x13, x14 +; CHECK-GI-NEXT: mul x11, x11, x12 +; CHECK-GI-NEXT: fmov d2, x13 +; CHECK-GI-NEXT: mov v2.d[1], x11 +; CHECK-GI-NEXT: msub x8, x10, x9, x8 +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: ret +entry: + %s = srem <3 x i64> %d, %e + ret <3 x i64> %s +} + +define <4 x i64> @sv4i64(<4 x i64> %d, <4 x i64> %e) { +; CHECK-SD-LABEL: sv4i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, v2.d[1] +; CHECK-SD-NEXT: mov x9, v0.d[1] +; CHECK-SD-NEXT: fmov x11, d2 +; CHECK-SD-NEXT: fmov x12, d0 +; CHECK-SD-NEXT: fmov x14, d3 +; CHECK-SD-NEXT: fmov x15, d1 +; CHECK-SD-NEXT: mov x17, v3.d[1] +; CHECK-SD-NEXT: mov x18, v1.d[1] +; CHECK-SD-NEXT: sdiv x10, x9, x8 +; CHECK-SD-NEXT: sdiv x13, x12, x11 +; CHECK-SD-NEXT: msub x8, x10, x8, x9 +; CHECK-SD-NEXT: sdiv x16, x15, x14 +; CHECK-SD-NEXT: msub x9, x13, x11, x12 +; 
CHECK-SD-NEXT: fmov d0, x9 +; CHECK-SD-NEXT: mov v0.d[1], x8 +; CHECK-SD-NEXT: sdiv x0, x18, x17 +; CHECK-SD-NEXT: msub x10, x16, x14, x15 +; CHECK-SD-NEXT: fmov d1, x10 +; CHECK-SD-NEXT: msub x11, x0, x17, x18 +; CHECK-SD-NEXT: mov v1.d[1], x11 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv4i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d2 +; CHECK-GI-NEXT: mov x10, v2.d[1] +; CHECK-GI-NEXT: mov x11, v0.d[1] +; CHECK-GI-NEXT: fmov x12, d1 +; CHECK-GI-NEXT: fmov x13, d3 +; CHECK-GI-NEXT: mov x14, v3.d[1] +; CHECK-GI-NEXT: mov x15, v1.d[1] +; CHECK-GI-NEXT: sdiv x8, x8, x9 +; CHECK-GI-NEXT: sdiv x11, x11, x10 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: sdiv x12, x12, x13 +; CHECK-GI-NEXT: mov v2.d[1], x11 +; CHECK-GI-NEXT: fmov x11, d2 +; CHECK-GI-NEXT: mov x8, v2.d[1] +; CHECK-GI-NEXT: mul x9, x11, x9 +; CHECK-GI-NEXT: mul x8, x8, x10 +; CHECK-GI-NEXT: fmov d2, x9 +; CHECK-GI-NEXT: mov v2.d[1], x8 +; CHECK-GI-NEXT: sdiv x15, x15, x14 +; CHECK-GI-NEXT: fmov d3, x12 +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mov v3.d[1], x15 +; CHECK-GI-NEXT: fmov x11, d3 +; CHECK-GI-NEXT: mov x10, v3.d[1] +; CHECK-GI-NEXT: mul x11, x11, x13 +; CHECK-GI-NEXT: mul x10, x10, x14 +; CHECK-GI-NEXT: fmov d3, x11 +; CHECK-GI-NEXT: mov v3.d[1], x10 +; CHECK-GI-NEXT: sub v1.2d, v1.2d, v3.2d +; CHECK-GI-NEXT: ret +entry: + %s = srem <4 x i64> %d, %e + ret <4 x i64> %s +} + +define <2 x i64> @uv2i64(<2 x i64> %d, <2 x i64> %e) { +; CHECK-SD-LABEL: uv2i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov x8, d1 +; CHECK-SD-NEXT: fmov x9, d0 +; CHECK-SD-NEXT: mov x11, v1.d[1] +; CHECK-SD-NEXT: mov x12, v0.d[1] +; CHECK-SD-NEXT: udiv x10, x9, x8 +; CHECK-SD-NEXT: udiv x13, x12, x11 +; CHECK-SD-NEXT: msub x8, x10, x8, x9 +; CHECK-SD-NEXT: fmov d0, x8 +; CHECK-SD-NEXT: msub x9, x13, x11, x12 +; CHECK-SD-NEXT: mov v0.d[1], x9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv2i64: +; CHECK-GI: // %bb.0: // %entry +; 
CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mov x10, v1.d[1] +; CHECK-GI-NEXT: mov x11, v0.d[1] +; CHECK-GI-NEXT: udiv x8, x8, x9 +; CHECK-GI-NEXT: udiv x11, x11, x10 +; CHECK-GI-NEXT: fmov d1, x8 +; CHECK-GI-NEXT: mov v1.d[1], x11 +; CHECK-GI-NEXT: fmov x11, d1 +; CHECK-GI-NEXT: mov x8, v1.d[1] +; CHECK-GI-NEXT: mul x9, x11, x9 +; CHECK-GI-NEXT: mul x8, x8, x10 +; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: ret +entry: + %s = urem <2 x i64> %d, %e + ret <2 x i64> %s +} + +define <3 x i64> @uv3i64(<3 x i64> %d, <3 x i64> %e) { +; CHECK-SD-LABEL: uv3i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d5 killed $d5 def $q5 +; CHECK-SD-NEXT: // kill: def $d4 killed $d4 def $q4 +; CHECK-SD-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov x8, d3 +; CHECK-SD-NEXT: fmov x9, d0 +; CHECK-SD-NEXT: fmov x11, d4 +; CHECK-SD-NEXT: fmov x12, d1 +; CHECK-SD-NEXT: fmov x14, d5 +; CHECK-SD-NEXT: fmov x15, d2 +; CHECK-SD-NEXT: udiv x10, x9, x8 +; CHECK-SD-NEXT: udiv x13, x12, x11 +; CHECK-SD-NEXT: msub x8, x10, x8, x9 +; CHECK-SD-NEXT: fmov d0, x8 +; CHECK-SD-NEXT: udiv x16, x15, x14 +; CHECK-SD-NEXT: msub x9, x13, x11, x12 +; CHECK-SD-NEXT: fmov d1, x9 +; CHECK-SD-NEXT: msub x10, x16, x14, x15 +; CHECK-SD-NEXT: fmov d2, x10 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv3i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d3 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4 +; CHECK-GI-NEXT: fmov x10, d4 +; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: mov v0.d[1], 
v1.d[0] +; CHECK-GI-NEXT: udiv x8, x8, x9 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: fmov x14, d3 +; CHECK-GI-NEXT: mov x12, v3.d[1] +; CHECK-GI-NEXT: udiv x9, x9, x10 +; CHECK-GI-NEXT: fmov d6, x8 +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mov v6.d[1], x9 +; CHECK-GI-NEXT: fmov x9, d5 +; CHECK-GI-NEXT: udiv x10, x8, x9 +; CHECK-GI-NEXT: fmov x13, d6 +; CHECK-GI-NEXT: mov x11, v6.d[1] +; CHECK-GI-NEXT: mul x13, x13, x14 +; CHECK-GI-NEXT: mul x11, x11, x12 +; CHECK-GI-NEXT: fmov d2, x13 +; CHECK-GI-NEXT: mov v2.d[1], x11 +; CHECK-GI-NEXT: msub x8, x10, x9, x8 +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: ret +entry: + %s = urem <3 x i64> %d, %e + ret <3 x i64> %s +} + +define <4 x i64> @uv4i64(<4 x i64> %d, <4 x i64> %e) { +; CHECK-SD-LABEL: uv4i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, v2.d[1] +; CHECK-SD-NEXT: mov x9, v0.d[1] +; CHECK-SD-NEXT: fmov x11, d2 +; CHECK-SD-NEXT: fmov x12, d0 +; CHECK-SD-NEXT: fmov x14, d3 +; CHECK-SD-NEXT: fmov x15, d1 +; CHECK-SD-NEXT: mov x17, v3.d[1] +; CHECK-SD-NEXT: mov x18, v1.d[1] +; CHECK-SD-NEXT: udiv x10, x9, x8 +; CHECK-SD-NEXT: udiv x13, x12, x11 +; CHECK-SD-NEXT: msub x8, x10, x8, x9 +; CHECK-SD-NEXT: udiv x16, x15, x14 +; CHECK-SD-NEXT: msub x9, x13, x11, x12 +; CHECK-SD-NEXT: fmov d0, x9 +; CHECK-SD-NEXT: mov v0.d[1], x8 +; CHECK-SD-NEXT: udiv x0, x18, x17 +; CHECK-SD-NEXT: msub x10, x16, x14, x15 +; CHECK-SD-NEXT: fmov d1, x10 +; CHECK-SD-NEXT: msub x11, x0, x17, x18 +; CHECK-SD-NEXT: mov v1.d[1], x11 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv4i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d2 +; CHECK-GI-NEXT: mov x10, v2.d[1] +; CHECK-GI-NEXT: mov x11, v0.d[1] +; CHECK-GI-NEXT: fmov x12, d1 +; CHECK-GI-NEXT: fmov x13, d3 +; CHECK-GI-NEXT: mov x14, v3.d[1] +; CHECK-GI-NEXT: mov x15, v1.d[1] +; 
CHECK-GI-NEXT: udiv x8, x8, x9 +; CHECK-GI-NEXT: udiv x11, x11, x10 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: udiv x12, x12, x13 +; CHECK-GI-NEXT: mov v2.d[1], x11 +; CHECK-GI-NEXT: fmov x11, d2 +; CHECK-GI-NEXT: mov x8, v2.d[1] +; CHECK-GI-NEXT: mul x9, x11, x9 +; CHECK-GI-NEXT: mul x8, x8, x10 +; CHECK-GI-NEXT: fmov d2, x9 +; CHECK-GI-NEXT: mov v2.d[1], x8 +; CHECK-GI-NEXT: udiv x15, x15, x14 +; CHECK-GI-NEXT: fmov d3, x12 +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: mov v3.d[1], x15 +; CHECK-GI-NEXT: fmov x11, d3 +; CHECK-GI-NEXT: mov x10, v3.d[1] +; CHECK-GI-NEXT: mul x11, x11, x13 +; CHECK-GI-NEXT: mul x10, x10, x14 +; CHECK-GI-NEXT: fmov d3, x11 +; CHECK-GI-NEXT: mov v3.d[1], x10 +; CHECK-GI-NEXT: sub v1.2d, v1.2d, v3.2d +; CHECK-GI-NEXT: ret +entry: + %s = urem <4 x i64> %d, %e + ret <4 x i64> %s +} + +define <2 x i128> @sv2i128(<2 x i128> %d, <2 x i128> %e) { +; CHECK-LABEL: sv2i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w24, -48 +; CHECK-NEXT: .cfi_offset w30, -64 +; CHECK-NEXT: mov x21, x3 +; CHECK-NEXT: mov x22, x2 +; CHECK-NEXT: mov x2, x4 +; CHECK-NEXT: mov x3, x5 +; CHECK-NEXT: mov x19, x7 +; CHECK-NEXT: mov x20, x6 +; CHECK-NEXT: bl __modti3 +; CHECK-NEXT: mov x23, x0 +; CHECK-NEXT: mov x24, x1 +; CHECK-NEXT: mov x0, x22 +; CHECK-NEXT: mov x1, x21 +; CHECK-NEXT: mov x2, x20 +; CHECK-NEXT: mov x3, x19 +; CHECK-NEXT: bl __modti3 +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: mov x3, x1 +; CHECK-NEXT: mov x0, x23 +; CHECK-NEXT: mov x1, x24 +; CHECK-NEXT: ldp x20, 
x19, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %s = srem <2 x i128> %d, %e + ret <2 x i128> %s +} + +define <3 x i128> @sv3i128(<3 x i128> %d, <3 x i128> %e) { +; CHECK-LABEL: sv3i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-96]! // 8-byte Folded Spill +; CHECK-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w24, -48 +; CHECK-NEXT: .cfi_offset w25, -56 +; CHECK-NEXT: .cfi_offset w26, -64 +; CHECK-NEXT: .cfi_offset w27, -72 +; CHECK-NEXT: .cfi_offset w28, -80 +; CHECK-NEXT: .cfi_offset w30, -96 +; CHECK-NEXT: ldp x23, x24, [sp, #112] +; CHECK-NEXT: mov x21, x3 +; CHECK-NEXT: ldp x25, x26, [sp, #96] +; CHECK-NEXT: mov x22, x2 +; CHECK-NEXT: mov x2, x6 +; CHECK-NEXT: mov x3, x7 +; CHECK-NEXT: mov x19, x5 +; CHECK-NEXT: mov x20, x4 +; CHECK-NEXT: bl __modti3 +; CHECK-NEXT: mov x27, x0 +; CHECK-NEXT: mov x28, x1 +; CHECK-NEXT: mov x0, x22 +; CHECK-NEXT: mov x1, x21 +; CHECK-NEXT: mov x2, x25 +; CHECK-NEXT: mov x3, x26 +; CHECK-NEXT: bl __modti3 +; CHECK-NEXT: mov x21, x0 +; CHECK-NEXT: mov x22, x1 +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: mov x1, x19 +; CHECK-NEXT: mov x2, x23 +; CHECK-NEXT: mov x3, x24 +; CHECK-NEXT: bl __modti3 +; CHECK-NEXT: mov x4, x0 +; CHECK-NEXT: mov x5, x1 +; CHECK-NEXT: mov x0, x27 +; CHECK-NEXT: mov x1, x28 +; CHECK-NEXT: mov 
x2, x21 +; CHECK-NEXT: mov x3, x22 +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #96 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %s = srem <3 x i128> %d, %e + ret <3 x i128> %s +} + +define <4 x i128> @sv4i128(<4 x i128> %d, <4 x i128> %e) { +; CHECK-LABEL: sv4i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #128 +; CHECK-NEXT: stp x29, x30, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x28, x27, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x26, x25, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x24, x23, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 128 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w24, -48 +; CHECK-NEXT: .cfi_offset w25, -56 +; CHECK-NEXT: .cfi_offset w26, -64 +; CHECK-NEXT: .cfi_offset w27, -72 +; CHECK-NEXT: .cfi_offset w28, -80 +; CHECK-NEXT: .cfi_offset w30, -88 +; CHECK-NEXT: .cfi_offset w29, -96 +; CHECK-NEXT: mov x23, x3 +; CHECK-NEXT: mov x24, x2 +; CHECK-NEXT: stp x6, x7, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: ldp x8, x26, [sp, #176] +; CHECK-NEXT: mov x21, x5 +; CHECK-NEXT: ldp x2, x3, [sp, #128] +; CHECK-NEXT: mov x22, x4 +; CHECK-NEXT: ldp x27, x28, [sp, #160] +; CHECK-NEXT: ldp x29, x19, [sp, #144] +; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: bl __modti3 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: mov x25, x1 +; CHECK-NEXT: mov x0, x24 +; CHECK-NEXT: mov 
x1, x23 +; CHECK-NEXT: mov x2, x29 +; CHECK-NEXT: mov x3, x19 +; CHECK-NEXT: bl __modti3 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mov x23, x1 +; CHECK-NEXT: mov x0, x22 +; CHECK-NEXT: mov x1, x21 +; CHECK-NEXT: mov x2, x27 +; CHECK-NEXT: mov x3, x28 +; CHECK-NEXT: bl __modti3 +; CHECK-NEXT: mov x21, x0 +; CHECK-NEXT: mov x22, x1 +; CHECK-NEXT: ldr x2, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ldp x0, x1, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: mov x3, x26 +; CHECK-NEXT: bl __modti3 +; CHECK-NEXT: mov x6, x0 +; CHECK-NEXT: mov x7, x1 +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: mov x1, x25 +; CHECK-NEXT: mov x2, x19 +; CHECK-NEXT: mov x3, x23 +; CHECK-NEXT: mov x4, x21 +; CHECK-NEXT: mov x5, x22 +; CHECK-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x24, x23, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp x26, x25, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp x28, x27, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #128 +; CHECK-NEXT: ret +entry: + %s = srem <4 x i128> %d, %e + ret <4 x i128> %s +} + +define <2 x i128> @uv2i128(<2 x i128> %d, <2 x i128> %e) { +; CHECK-LABEL: uv2i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-64]! 
// 8-byte Folded Spill +; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w24, -48 +; CHECK-NEXT: .cfi_offset w30, -64 +; CHECK-NEXT: mov x21, x3 +; CHECK-NEXT: mov x22, x2 +; CHECK-NEXT: mov x2, x4 +; CHECK-NEXT: mov x3, x5 +; CHECK-NEXT: mov x19, x7 +; CHECK-NEXT: mov x20, x6 +; CHECK-NEXT: bl __umodti3 +; CHECK-NEXT: mov x23, x0 +; CHECK-NEXT: mov x24, x1 +; CHECK-NEXT: mov x0, x22 +; CHECK-NEXT: mov x1, x21 +; CHECK-NEXT: mov x2, x20 +; CHECK-NEXT: mov x3, x19 +; CHECK-NEXT: bl __umodti3 +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: mov x3, x1 +; CHECK-NEXT: mov x0, x23 +; CHECK-NEXT: mov x1, x24 +; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %s = urem <2 x i128> %d, %e + ret <2 x i128> %s +} + +define <3 x i128> @uv3i128(<3 x i128> %d, <3 x i128> %e) { +; CHECK-LABEL: uv3i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-96]! 
// 8-byte Folded Spill +; CHECK-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w24, -48 +; CHECK-NEXT: .cfi_offset w25, -56 +; CHECK-NEXT: .cfi_offset w26, -64 +; CHECK-NEXT: .cfi_offset w27, -72 +; CHECK-NEXT: .cfi_offset w28, -80 +; CHECK-NEXT: .cfi_offset w30, -96 +; CHECK-NEXT: ldp x23, x24, [sp, #112] +; CHECK-NEXT: mov x21, x3 +; CHECK-NEXT: ldp x25, x26, [sp, #96] +; CHECK-NEXT: mov x22, x2 +; CHECK-NEXT: mov x2, x6 +; CHECK-NEXT: mov x3, x7 +; CHECK-NEXT: mov x19, x5 +; CHECK-NEXT: mov x20, x4 +; CHECK-NEXT: bl __umodti3 +; CHECK-NEXT: mov x27, x0 +; CHECK-NEXT: mov x28, x1 +; CHECK-NEXT: mov x0, x22 +; CHECK-NEXT: mov x1, x21 +; CHECK-NEXT: mov x2, x25 +; CHECK-NEXT: mov x3, x26 +; CHECK-NEXT: bl __umodti3 +; CHECK-NEXT: mov x21, x0 +; CHECK-NEXT: mov x22, x1 +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: mov x1, x19 +; CHECK-NEXT: mov x2, x23 +; CHECK-NEXT: mov x3, x24 +; CHECK-NEXT: bl __umodti3 +; CHECK-NEXT: mov x4, x0 +; CHECK-NEXT: mov x5, x1 +; CHECK-NEXT: mov x0, x27 +; CHECK-NEXT: mov x1, x28 +; CHECK-NEXT: mov x2, x21 +; CHECK-NEXT: mov x3, x22 +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #96 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %s = urem <3 x 
i128> %d, %e + ret <3 x i128> %s +} + +define <4 x i128> @uv4i128(<4 x i128> %d, <4 x i128> %e) { +; CHECK-LABEL: uv4i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #128 +; CHECK-NEXT: stp x29, x30, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x28, x27, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x26, x25, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x24, x23, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 128 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w24, -48 +; CHECK-NEXT: .cfi_offset w25, -56 +; CHECK-NEXT: .cfi_offset w26, -64 +; CHECK-NEXT: .cfi_offset w27, -72 +; CHECK-NEXT: .cfi_offset w28, -80 +; CHECK-NEXT: .cfi_offset w30, -88 +; CHECK-NEXT: .cfi_offset w29, -96 +; CHECK-NEXT: mov x23, x3 +; CHECK-NEXT: mov x24, x2 +; CHECK-NEXT: stp x6, x7, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: ldp x8, x26, [sp, #176] +; CHECK-NEXT: mov x21, x5 +; CHECK-NEXT: ldp x2, x3, [sp, #128] +; CHECK-NEXT: mov x22, x4 +; CHECK-NEXT: ldp x27, x28, [sp, #160] +; CHECK-NEXT: ldp x29, x19, [sp, #144] +; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: bl __umodti3 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: mov x25, x1 +; CHECK-NEXT: mov x0, x24 +; CHECK-NEXT: mov x1, x23 +; CHECK-NEXT: mov x2, x29 +; CHECK-NEXT: mov x3, x19 +; CHECK-NEXT: bl __umodti3 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mov x23, x1 +; CHECK-NEXT: mov x0, x22 +; CHECK-NEXT: mov x1, x21 +; CHECK-NEXT: mov x2, x27 +; CHECK-NEXT: mov x3, x28 +; CHECK-NEXT: bl __umodti3 +; CHECK-NEXT: mov x21, x0 +; CHECK-NEXT: mov x22, x1 +; CHECK-NEXT: ldr x2, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ldp x0, x1, [sp, #16] // 16-byte Folded Reload +; 
CHECK-NEXT: mov x3, x26 +; CHECK-NEXT: bl __umodti3 +; CHECK-NEXT: mov x6, x0 +; CHECK-NEXT: mov x7, x1 +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: mov x1, x25 +; CHECK-NEXT: mov x2, x19 +; CHECK-NEXT: mov x3, x23 +; CHECK-NEXT: mov x4, x21 +; CHECK-NEXT: mov x5, x22 +; CHECK-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x24, x23, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp x26, x25, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp x28, x27, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #128 +; CHECK-NEXT: ret +entry: + %s = urem <4 x i128> %d, %e + ret <4 x i128> %s +} From 02cb7c9ef5aecea3a820bc98b50adf4d7c4c5eb6 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 22 Aug 2024 11:39:27 +0100 Subject: [PATCH 176/426] [AArch64][GlobalISel] Libcall i128 srem/urem and scalarize more vector types. This better handles i128 scalar and vector types, and allows some of the other odd-sized-vectors to successfully lower under GISel. 
--- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 6 +- llvm/test/CodeGen/AArch64/rem.ll | 1349 +++++++++++------ 2 files changed, 899 insertions(+), 456 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 33a1fa1ad04fdf..35d73d36df46fe 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -196,12 +196,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32}) + .libcallFor({s128}) .widenScalarOrEltToNextPow2(0) - .clampScalarOrElt(0, s32, s64) + .minScalarOrElt(0, s32) .clampNumElements(0, v2s32, v4s32) .clampNumElements(0, v2s64, v2s64) - .moreElementsToNextPow2(0); - + .scalarize(0); getActionDefinitionsBuilder({G_SMULO, G_UMULO}) .widenScalarToNextPow2(0, /*Min = */ 32) diff --git a/llvm/test/CodeGen/AArch64/rem.ll b/llvm/test/CodeGen/AArch64/rem.ll index 7f4df00d4aa794..81682c5f0ce85d 100644 --- a/llvm/test/CodeGen/AArch64/rem.ll +++ b/llvm/test/CodeGen/AArch64/rem.ll @@ -1,21 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for si128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ui128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sv3i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uv3i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sv3i16 -; CHECK-GI-NEXT: warning: 
Instruction selection used fallback path for uv3i16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sv3i32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uv3i32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sv2i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sv3i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sv4i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uv2i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uv3i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uv4i128 +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI define i8 @si8(i8 %a, i8 %b) { ; CHECK-SD-LABEL: si8: @@ -216,21 +201,37 @@ entry: } define <3 x i8> @sv3i8(<3 x i8> %d, <3 x i8> %e) { -; CHECK-LABEL: sv3i8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sxtb w8, w3 -; CHECK-NEXT: sxtb w9, w0 -; CHECK-NEXT: sxtb w11, w4 -; CHECK-NEXT: sxtb w12, w1 -; CHECK-NEXT: sxtb w14, w5 -; CHECK-NEXT: sxtb w15, w2 -; CHECK-NEXT: sdiv w10, w9, w8 -; CHECK-NEXT: sdiv w13, w12, w11 -; CHECK-NEXT: msub w0, w10, w8, w9 -; CHECK-NEXT: sdiv w16, w15, w14 -; CHECK-NEXT: msub w1, w13, w11, w12 -; CHECK-NEXT: msub w2, w16, w14, w15 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sv3i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sxtb w8, w3 +; CHECK-SD-NEXT: sxtb w9, w0 +; CHECK-SD-NEXT: sxtb w11, w4 +; CHECK-SD-NEXT: sxtb w12, w1 +; CHECK-SD-NEXT: sxtb w14, w5 +; CHECK-SD-NEXT: sxtb w15, w2 +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: sdiv w13, w12, w11 +; CHECK-SD-NEXT: msub w0, w10, w8, w9 +; CHECK-SD-NEXT: sdiv w16, w15, w14 +; CHECK-SD-NEXT: msub w1, w13, w11, w12 +; CHECK-SD-NEXT: msub w2, w16, w14, w15 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv3i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sxtb w8, w0 +; 
CHECK-GI-NEXT: sxtb w9, w3 +; CHECK-GI-NEXT: sxtb w11, w1 +; CHECK-GI-NEXT: sxtb w12, w4 +; CHECK-GI-NEXT: sxtb w14, w2 +; CHECK-GI-NEXT: sxtb w15, w5 +; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: sdiv w13, w11, w12 +; CHECK-GI-NEXT: msub w0, w10, w9, w8 +; CHECK-GI-NEXT: sdiv w16, w14, w15 +; CHECK-GI-NEXT: msub w1, w13, w12, w11 +; CHECK-GI-NEXT: msub w2, w16, w15, w14 +; CHECK-GI-NEXT: ret entry: %s = srem <3 x i8> %d, %e ret <3 x i8> %s @@ -1123,21 +1124,37 @@ entry: } define <3 x i8> @uv3i8(<3 x i8> %d, <3 x i8> %e) { -; CHECK-LABEL: uv3i8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and w8, w3, #0xff -; CHECK-NEXT: and w9, w0, #0xff -; CHECK-NEXT: and w11, w4, #0xff -; CHECK-NEXT: and w12, w1, #0xff -; CHECK-NEXT: and w14, w5, #0xff -; CHECK-NEXT: and w15, w2, #0xff -; CHECK-NEXT: udiv w10, w9, w8 -; CHECK-NEXT: udiv w13, w12, w11 -; CHECK-NEXT: msub w0, w10, w8, w9 -; CHECK-NEXT: udiv w16, w15, w14 -; CHECK-NEXT: msub w1, w13, w11, w12 -; CHECK-NEXT: msub w2, w16, w14, w15 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: uv3i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: and w8, w3, #0xff +; CHECK-SD-NEXT: and w9, w0, #0xff +; CHECK-SD-NEXT: and w11, w4, #0xff +; CHECK-SD-NEXT: and w12, w1, #0xff +; CHECK-SD-NEXT: and w14, w5, #0xff +; CHECK-SD-NEXT: and w15, w2, #0xff +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: udiv w13, w12, w11 +; CHECK-SD-NEXT: msub w0, w10, w8, w9 +; CHECK-SD-NEXT: udiv w16, w15, w14 +; CHECK-SD-NEXT: msub w1, w13, w11, w12 +; CHECK-SD-NEXT: msub w2, w16, w14, w15 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv3i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and w8, w0, #0xff +; CHECK-GI-NEXT: and w9, w3, #0xff +; CHECK-GI-NEXT: and w11, w1, #0xff +; CHECK-GI-NEXT: and w12, w4, #0xff +; CHECK-GI-NEXT: and w14, w2, #0xff +; CHECK-GI-NEXT: and w15, w5, #0xff +; CHECK-GI-NEXT: udiv w10, w8, w9 +; CHECK-GI-NEXT: udiv w13, w11, w12 +; CHECK-GI-NEXT: msub w0, w10, w9, w8 +; CHECK-GI-NEXT: udiv w16, w14, w15 +; 
CHECK-GI-NEXT: msub w1, w13, w12, w11 +; CHECK-GI-NEXT: msub w2, w16, w15, w14 +; CHECK-GI-NEXT: ret entry: %s = urem <3 x i8> %d, %e ret <3 x i8> %s @@ -2031,27 +2048,51 @@ entry: } define <3 x i16> @sv3i16(<3 x i16> %d, <3 x i16> %e) { -; CHECK-LABEL: sv3i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w11, v1.h[0] -; CHECK-NEXT: smov w12, v0.h[0] -; CHECK-NEXT: smov w8, v1.h[1] -; CHECK-NEXT: smov w9, v0.h[1] -; CHECK-NEXT: smov w14, v1.h[2] -; CHECK-NEXT: smov w15, v0.h[2] -; CHECK-NEXT: sdiv w13, w12, w11 -; CHECK-NEXT: sdiv w10, w9, w8 -; CHECK-NEXT: msub w11, w13, w11, w12 -; CHECK-NEXT: fmov s0, w11 -; CHECK-NEXT: sdiv w16, w15, w14 -; CHECK-NEXT: msub w8, w10, w8, w9 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: msub w8, w16, w14, w15 -; CHECK-NEXT: mov v0.h[2], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sv3i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: smov w11, v1.h[0] +; CHECK-SD-NEXT: smov w12, v0.h[0] +; CHECK-SD-NEXT: smov w8, v1.h[1] +; CHECK-SD-NEXT: smov w9, v0.h[1] +; CHECK-SD-NEXT: smov w14, v1.h[2] +; CHECK-SD-NEXT: smov w15, v0.h[2] +; CHECK-SD-NEXT: sdiv w13, w12, w11 +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w13, w11, w12 +; CHECK-SD-NEXT: fmov s0, w11 +; CHECK-SD-NEXT: sdiv w16, w15, w14 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: mov v0.h[1], w8 +; CHECK-SD-NEXT: msub w8, w16, w14, w15 +; CHECK-SD-NEXT: mov v0.h[2], w8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv3i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: smov w8, v0.h[0] +; CHECK-GI-NEXT: smov w9, v1.h[0] 
+; CHECK-GI-NEXT: smov w11, v0.h[1] +; CHECK-GI-NEXT: smov w12, v1.h[1] +; CHECK-GI-NEXT: smov w14, v0.h[2] +; CHECK-GI-NEXT: smov w15, v1.h[2] +; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: sdiv w13, w11, w12 +; CHECK-GI-NEXT: msub w8, w10, w9, w8 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: sdiv w16, w14, w15 +; CHECK-GI-NEXT: msub w9, w13, w12, w11 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: msub w8, w16, w15, w14 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %s = srem <3 x i16> %d, %e ret <3 x i16> %s @@ -2472,32 +2513,56 @@ entry: } define <3 x i16> @uv3i16(<3 x i16> %d, <3 x i16> %e) { -; CHECK-LABEL: uv3i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w11, v1.h[0] -; CHECK-NEXT: umov w12, v0.h[0] -; CHECK-NEXT: umov w8, v1.h[1] -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w13, v0.h[2] -; CHECK-NEXT: umov w14, v1.h[0] -; CHECK-NEXT: umov w16, v0.h[0] -; CHECK-NEXT: udiv w11, w12, w11 -; CHECK-NEXT: umov w12, v1.h[2] -; CHECK-NEXT: udiv w10, w9, w8 -; CHECK-NEXT: msub w11, w11, w14, w16 -; CHECK-NEXT: udiv w15, w13, w12 -; CHECK-NEXT: msub w8, w10, w8, w9 -; CHECK-NEXT: sxth w9, w11 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: sxth w8, w8 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: msub w10, w15, w12, w13 -; CHECK-NEXT: sxth w8, w10 -; CHECK-NEXT: mov v0.h[2], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: uv3i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: umov w11, v1.h[0] +; CHECK-SD-NEXT: umov w12, v0.h[0] +; CHECK-SD-NEXT: umov w8, v1.h[1] +; CHECK-SD-NEXT: umov w9, v0.h[1] +; CHECK-SD-NEXT: umov w13, v0.h[2] +; 
CHECK-SD-NEXT: umov w14, v1.h[0] +; CHECK-SD-NEXT: umov w16, v0.h[0] +; CHECK-SD-NEXT: udiv w11, w12, w11 +; CHECK-SD-NEXT: umov w12, v1.h[2] +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w11, w14, w16 +; CHECK-SD-NEXT: udiv w15, w13, w12 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: sxth w9, w11 +; CHECK-SD-NEXT: fmov s0, w9 +; CHECK-SD-NEXT: sxth w8, w8 +; CHECK-SD-NEXT: mov v0.h[1], w8 +; CHECK-SD-NEXT: msub w10, w15, w12, w13 +; CHECK-SD-NEXT: sxth w8, w10 +; CHECK-SD-NEXT: mov v0.h[2], w8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv3i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v1.h[0] +; CHECK-GI-NEXT: umov w11, v0.h[1] +; CHECK-GI-NEXT: umov w12, v1.h[1] +; CHECK-GI-NEXT: umov w14, v0.h[2] +; CHECK-GI-NEXT: umov w15, v1.h[2] +; CHECK-GI-NEXT: udiv w10, w8, w9 +; CHECK-GI-NEXT: udiv w13, w11, w12 +; CHECK-GI-NEXT: msub w8, w10, w9, w8 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: udiv w16, w14, w15 +; CHECK-GI-NEXT: msub w9, w13, w12, w11 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: msub w8, w16, w15, w14 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %s = urem <3 x i16> %d, %e ret <3 x i16> %s @@ -2916,24 +2981,47 @@ entry: } define <3 x i32> @sv3i32(<3 x i32> %d, <3 x i32> %e) { -; CHECK-LABEL: sv3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov w11, s1 -; CHECK-NEXT: fmov w12, s0 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: mov w14, v1.s[2] -; CHECK-NEXT: mov w15, v0.s[2] -; CHECK-NEXT: sdiv w13, w12, w11 -; CHECK-NEXT: sdiv w10, w9, w8 -; CHECK-NEXT: msub w11, w13, w11, w12 -; CHECK-NEXT: fmov s0, w11 -; CHECK-NEXT: sdiv 
w16, w15, w14 -; CHECK-NEXT: msub w8, w10, w8, w9 -; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: msub w8, w16, w14, w15 -; CHECK-NEXT: mov v0.s[2], w8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sv3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov w11, s1 +; CHECK-SD-NEXT: fmov w12, s0 +; CHECK-SD-NEXT: mov w8, v1.s[1] +; CHECK-SD-NEXT: mov w9, v0.s[1] +; CHECK-SD-NEXT: mov w14, v1.s[2] +; CHECK-SD-NEXT: mov w15, v0.s[2] +; CHECK-SD-NEXT: sdiv w13, w12, w11 +; CHECK-SD-NEXT: sdiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w13, w11, w12 +; CHECK-SD-NEXT: fmov s0, w11 +; CHECK-SD-NEXT: sdiv w16, w15, w14 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: mov v0.s[1], w8 +; CHECK-SD-NEXT: msub w8, w16, w14, w15 +; CHECK-SD-NEXT: mov v0.s[2], w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov s0, v0.s[2] +; CHECK-GI-NEXT: mov s1, v1.s[2] +; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: fmov w11, s2 +; CHECK-GI-NEXT: fmov w12, s3 +; CHECK-GI-NEXT: fmov w14, s0 +; CHECK-GI-NEXT: fmov w15, s1 +; CHECK-GI-NEXT: sdiv w13, w11, w12 +; CHECK-GI-NEXT: msub w8, w10, w9, w8 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: sdiv w16, w14, w15 +; CHECK-GI-NEXT: msub w9, w13, w12, w11 +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: msub w8, w16, w15, w14 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: ret entry: %s = srem <3 x i32> %d, %e ret <3 x i32> %s @@ -3124,24 +3212,47 @@ entry: } define <3 x i32> @uv3i32(<3 x i32> %d, <3 x i32> %e) { -; CHECK-LABEL: uv3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov w11, s1 -; CHECK-NEXT: fmov w12, s0 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: mov w14, v1.s[2] -; CHECK-NEXT: mov w15, v0.s[2] -; CHECK-NEXT: udiv w13, w12, w11 -; CHECK-NEXT: udiv w10, w9, w8 -; CHECK-NEXT: msub w11, w13, w11, w12 -; 
CHECK-NEXT: fmov s0, w11 -; CHECK-NEXT: udiv w16, w15, w14 -; CHECK-NEXT: msub w8, w10, w8, w9 -; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: msub w8, w16, w14, w15 -; CHECK-NEXT: mov v0.s[2], w8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: uv3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov w11, s1 +; CHECK-SD-NEXT: fmov w12, s0 +; CHECK-SD-NEXT: mov w8, v1.s[1] +; CHECK-SD-NEXT: mov w9, v0.s[1] +; CHECK-SD-NEXT: mov w14, v1.s[2] +; CHECK-SD-NEXT: mov w15, v0.s[2] +; CHECK-SD-NEXT: udiv w13, w12, w11 +; CHECK-SD-NEXT: udiv w10, w9, w8 +; CHECK-SD-NEXT: msub w11, w13, w11, w12 +; CHECK-SD-NEXT: fmov s0, w11 +; CHECK-SD-NEXT: udiv w16, w15, w14 +; CHECK-SD-NEXT: msub w8, w10, w8, w9 +; CHECK-SD-NEXT: mov v0.s[1], w8 +; CHECK-SD-NEXT: msub w8, w16, w14, w15 +; CHECK-SD-NEXT: mov v0.s[2], w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov s0, v0.s[2] +; CHECK-GI-NEXT: mov s1, v1.s[2] +; CHECK-GI-NEXT: udiv w10, w8, w9 +; CHECK-GI-NEXT: fmov w11, s2 +; CHECK-GI-NEXT: fmov w12, s3 +; CHECK-GI-NEXT: fmov w14, s0 +; CHECK-GI-NEXT: fmov w15, s1 +; CHECK-GI-NEXT: udiv w13, w11, w12 +; CHECK-GI-NEXT: msub w8, w10, w9, w8 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: udiv w16, w14, w15 +; CHECK-GI-NEXT: msub w9, w13, w12, w11 +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: msub w8, w16, w15, w14 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: ret entry: %s = urem <3 x i32> %d, %e ret <3 x i32> %s @@ -3624,360 +3735,692 @@ entry: } define <2 x i128> @sv2i128(<2 x i128> %d, <2 x i128> %e) { -; CHECK-LABEL: sv2i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-64]! 
// 8-byte Folded Spill -; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w23, -40 -; CHECK-NEXT: .cfi_offset w24, -48 -; CHECK-NEXT: .cfi_offset w30, -64 -; CHECK-NEXT: mov x21, x3 -; CHECK-NEXT: mov x22, x2 -; CHECK-NEXT: mov x2, x4 -; CHECK-NEXT: mov x3, x5 -; CHECK-NEXT: mov x19, x7 -; CHECK-NEXT: mov x20, x6 -; CHECK-NEXT: bl __modti3 -; CHECK-NEXT: mov x23, x0 -; CHECK-NEXT: mov x24, x1 -; CHECK-NEXT: mov x0, x22 -; CHECK-NEXT: mov x1, x21 -; CHECK-NEXT: mov x2, x20 -; CHECK-NEXT: mov x3, x19 -; CHECK-NEXT: bl __modti3 -; CHECK-NEXT: mov x2, x0 -; CHECK-NEXT: mov x3, x1 -; CHECK-NEXT: mov x0, x23 -; CHECK-NEXT: mov x1, x24 -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sv2i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-64]! 
// 8-byte Folded Spill +; CHECK-SD-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w23, -40 +; CHECK-SD-NEXT: .cfi_offset w24, -48 +; CHECK-SD-NEXT: .cfi_offset w30, -64 +; CHECK-SD-NEXT: mov x21, x3 +; CHECK-SD-NEXT: mov x22, x2 +; CHECK-SD-NEXT: mov x2, x4 +; CHECK-SD-NEXT: mov x3, x5 +; CHECK-SD-NEXT: mov x19, x7 +; CHECK-SD-NEXT: mov x20, x6 +; CHECK-SD-NEXT: bl __modti3 +; CHECK-SD-NEXT: mov x23, x0 +; CHECK-SD-NEXT: mov x24, x1 +; CHECK-SD-NEXT: mov x0, x22 +; CHECK-SD-NEXT: mov x1, x21 +; CHECK-SD-NEXT: mov x2, x20 +; CHECK-SD-NEXT: mov x3, x19 +; CHECK-SD-NEXT: bl __modti3 +; CHECK-SD-NEXT: mov x2, x0 +; CHECK-SD-NEXT: mov x3, x1 +; CHECK-SD-NEXT: mov x0, x23 +; CHECK-SD-NEXT: mov x1, x24 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv2i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: str x30, [sp, #-64]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w24, -48 +; CHECK-GI-NEXT: .cfi_offset w30, -64 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: mov x2, x4 +; CHECK-GI-NEXT: mov x3, x5 +; CHECK-GI-NEXT: mov x21, x6 +; CHECK-GI-NEXT: mov x22, x7 +; CHECK-GI-NEXT: bl __modti3 +; CHECK-GI-NEXT: mov x23, x0 +; CHECK-GI-NEXT: mov x24, x1 +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: mov x2, x21 +; CHECK-GI-NEXT: mov x3, x22 +; CHECK-GI-NEXT: bl __modti3 +; CHECK-GI-NEXT: mov x2, x0 +; CHECK-GI-NEXT: mov x3, x1 +; CHECK-GI-NEXT: mov x0, x23 +; CHECK-GI-NEXT: mov x1, x24 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %s = srem <2 x i128> %d, %e ret <2 x i128> %s } define <3 x i128> @sv3i128(<3 x i128> %d, <3 x i128> %e) { -; CHECK-LABEL: sv3i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-96]! 
// 8-byte Folded Spill -; CHECK-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w23, -40 -; CHECK-NEXT: .cfi_offset w24, -48 -; CHECK-NEXT: .cfi_offset w25, -56 -; CHECK-NEXT: .cfi_offset w26, -64 -; CHECK-NEXT: .cfi_offset w27, -72 -; CHECK-NEXT: .cfi_offset w28, -80 -; CHECK-NEXT: .cfi_offset w30, -96 -; CHECK-NEXT: ldp x23, x24, [sp, #112] -; CHECK-NEXT: mov x21, x3 -; CHECK-NEXT: ldp x25, x26, [sp, #96] -; CHECK-NEXT: mov x22, x2 -; CHECK-NEXT: mov x2, x6 -; CHECK-NEXT: mov x3, x7 -; CHECK-NEXT: mov x19, x5 -; CHECK-NEXT: mov x20, x4 -; CHECK-NEXT: bl __modti3 -; CHECK-NEXT: mov x27, x0 -; CHECK-NEXT: mov x28, x1 -; CHECK-NEXT: mov x0, x22 -; CHECK-NEXT: mov x1, x21 -; CHECK-NEXT: mov x2, x25 -; CHECK-NEXT: mov x3, x26 -; CHECK-NEXT: bl __modti3 -; CHECK-NEXT: mov x21, x0 -; CHECK-NEXT: mov x22, x1 -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x19 -; CHECK-NEXT: mov x2, x23 -; CHECK-NEXT: mov x3, x24 -; CHECK-NEXT: bl __modti3 -; CHECK-NEXT: mov x4, x0 -; CHECK-NEXT: mov x5, x1 -; CHECK-NEXT: mov x0, x27 -; CHECK-NEXT: mov x1, x28 -; CHECK-NEXT: mov x2, x21 -; CHECK-NEXT: mov x3, x22 -; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp], #96 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sv3i128: 
+; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-96]! // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 96 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w23, -40 +; CHECK-SD-NEXT: .cfi_offset w24, -48 +; CHECK-SD-NEXT: .cfi_offset w25, -56 +; CHECK-SD-NEXT: .cfi_offset w26, -64 +; CHECK-SD-NEXT: .cfi_offset w27, -72 +; CHECK-SD-NEXT: .cfi_offset w28, -80 +; CHECK-SD-NEXT: .cfi_offset w30, -96 +; CHECK-SD-NEXT: ldp x23, x24, [sp, #112] +; CHECK-SD-NEXT: mov x21, x3 +; CHECK-SD-NEXT: ldp x25, x26, [sp, #96] +; CHECK-SD-NEXT: mov x22, x2 +; CHECK-SD-NEXT: mov x2, x6 +; CHECK-SD-NEXT: mov x3, x7 +; CHECK-SD-NEXT: mov x19, x5 +; CHECK-SD-NEXT: mov x20, x4 +; CHECK-SD-NEXT: bl __modti3 +; CHECK-SD-NEXT: mov x27, x0 +; CHECK-SD-NEXT: mov x28, x1 +; CHECK-SD-NEXT: mov x0, x22 +; CHECK-SD-NEXT: mov x1, x21 +; CHECK-SD-NEXT: mov x2, x25 +; CHECK-SD-NEXT: mov x3, x26 +; CHECK-SD-NEXT: bl __modti3 +; CHECK-SD-NEXT: mov x21, x0 +; CHECK-SD-NEXT: mov x22, x1 +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: mov x2, x23 +; CHECK-SD-NEXT: mov x3, x24 +; CHECK-SD-NEXT: bl __modti3 +; CHECK-SD-NEXT: mov x4, x0 +; CHECK-SD-NEXT: mov x5, x1 +; CHECK-SD-NEXT: mov x0, x27 +; CHECK-SD-NEXT: mov x1, x28 +; CHECK-SD-NEXT: mov x2, x21 +; CHECK-SD-NEXT: mov x3, x22 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; 
CHECK-SD-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp], #96 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv3i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: str x30, [sp, #-96]! // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 96 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w24, -48 +; CHECK-GI-NEXT: .cfi_offset w25, -56 +; CHECK-GI-NEXT: .cfi_offset w26, -64 +; CHECK-GI-NEXT: .cfi_offset w27, -72 +; CHECK-GI-NEXT: .cfi_offset w28, -80 +; CHECK-GI-NEXT: .cfi_offset w30, -96 +; CHECK-GI-NEXT: ldp x23, x24, [sp, #96] +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: ldp x25, x26, [sp, #112] +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: mov x2, x6 +; CHECK-GI-NEXT: mov x3, x7 +; CHECK-GI-NEXT: mov x21, x4 +; CHECK-GI-NEXT: mov x22, x5 +; CHECK-GI-NEXT: bl __modti3 +; CHECK-GI-NEXT: mov x27, x0 +; CHECK-GI-NEXT: mov x28, x1 +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: mov x2, x23 +; CHECK-GI-NEXT: mov x3, x24 +; CHECK-GI-NEXT: bl __modti3 +; CHECK-GI-NEXT: mov x19, x0 +; CHECK-GI-NEXT: mov x20, x1 +; CHECK-GI-NEXT: mov x0, x21 +; CHECK-GI-NEXT: mov x1, x22 +; CHECK-GI-NEXT: mov x2, x25 +; CHECK-GI-NEXT: mov x3, x26 +; CHECK-GI-NEXT: bl __modti3 +; CHECK-GI-NEXT: mov x4, x0 +; CHECK-GI-NEXT: mov x5, x1 +; CHECK-GI-NEXT: mov x0, x27 +; CHECK-GI-NEXT: mov x1, x28 +; CHECK-GI-NEXT: mov x2, x19 
+; CHECK-GI-NEXT: mov x3, x20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #96 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %s = srem <3 x i128> %d, %e ret <3 x i128> %s } define <4 x i128> @sv4i128(<4 x i128> %d, <4 x i128> %e) { -; CHECK-LABEL: sv4i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #128 -; CHECK-NEXT: stp x29, x30, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x28, x27, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x26, x25, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x24, x23, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #96] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w23, -40 -; CHECK-NEXT: .cfi_offset w24, -48 -; CHECK-NEXT: .cfi_offset w25, -56 -; CHECK-NEXT: .cfi_offset w26, -64 -; CHECK-NEXT: .cfi_offset w27, -72 -; CHECK-NEXT: .cfi_offset w28, -80 -; CHECK-NEXT: .cfi_offset w30, -88 -; CHECK-NEXT: .cfi_offset w29, -96 -; CHECK-NEXT: mov x23, x3 -; CHECK-NEXT: mov x24, x2 -; CHECK-NEXT: stp x6, x7, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: ldp x8, x26, [sp, #176] -; CHECK-NEXT: mov x21, x5 -; CHECK-NEXT: ldp x2, x3, [sp, #128] -; CHECK-NEXT: mov x22, x4 -; CHECK-NEXT: ldp x27, x28, [sp, #160] -; CHECK-NEXT: ldp x29, x19, [sp, #144] -; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill -; CHECK-NEXT: bl __modti3 -; CHECK-NEXT: mov x20, x0 -; CHECK-NEXT: mov x25, x1 -; CHECK-NEXT: mov x0, x24 -; 
CHECK-NEXT: mov x1, x23 -; CHECK-NEXT: mov x2, x29 -; CHECK-NEXT: mov x3, x19 -; CHECK-NEXT: bl __modti3 -; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: mov x23, x1 -; CHECK-NEXT: mov x0, x22 -; CHECK-NEXT: mov x1, x21 -; CHECK-NEXT: mov x2, x27 -; CHECK-NEXT: mov x3, x28 -; CHECK-NEXT: bl __modti3 -; CHECK-NEXT: mov x21, x0 -; CHECK-NEXT: mov x22, x1 -; CHECK-NEXT: ldr x2, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: ldp x0, x1, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov x3, x26 -; CHECK-NEXT: bl __modti3 -; CHECK-NEXT: mov x6, x0 -; CHECK-NEXT: mov x7, x1 -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x25 -; CHECK-NEXT: mov x2, x19 -; CHECK-NEXT: mov x3, x23 -; CHECK-NEXT: mov x4, x21 -; CHECK-NEXT: mov x5, x22 -; CHECK-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: ldp x24, x23, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldp x26, x25, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldp x28, x27, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #128 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sv4i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #128 +; CHECK-SD-NEXT: stp x29, x30, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x28, x27, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x26, x25, [sp, #64] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x24, x23, [sp, #80] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #96] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 128 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w23, -40 +; CHECK-SD-NEXT: .cfi_offset w24, -48 +; CHECK-SD-NEXT: .cfi_offset w25, -56 +; CHECK-SD-NEXT: 
.cfi_offset w26, -64 +; CHECK-SD-NEXT: .cfi_offset w27, -72 +; CHECK-SD-NEXT: .cfi_offset w28, -80 +; CHECK-SD-NEXT: .cfi_offset w30, -88 +; CHECK-SD-NEXT: .cfi_offset w29, -96 +; CHECK-SD-NEXT: mov x23, x3 +; CHECK-SD-NEXT: mov x24, x2 +; CHECK-SD-NEXT: stp x6, x7, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldp x8, x26, [sp, #176] +; CHECK-SD-NEXT: mov x21, x5 +; CHECK-SD-NEXT: ldp x2, x3, [sp, #128] +; CHECK-SD-NEXT: mov x22, x4 +; CHECK-SD-NEXT: ldp x27, x28, [sp, #160] +; CHECK-SD-NEXT: ldp x29, x19, [sp, #144] +; CHECK-SD-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; CHECK-SD-NEXT: bl __modti3 +; CHECK-SD-NEXT: mov x20, x0 +; CHECK-SD-NEXT: mov x25, x1 +; CHECK-SD-NEXT: mov x0, x24 +; CHECK-SD-NEXT: mov x1, x23 +; CHECK-SD-NEXT: mov x2, x29 +; CHECK-SD-NEXT: mov x3, x19 +; CHECK-SD-NEXT: bl __modti3 +; CHECK-SD-NEXT: mov x19, x0 +; CHECK-SD-NEXT: mov x23, x1 +; CHECK-SD-NEXT: mov x0, x22 +; CHECK-SD-NEXT: mov x1, x21 +; CHECK-SD-NEXT: mov x2, x27 +; CHECK-SD-NEXT: mov x3, x28 +; CHECK-SD-NEXT: bl __modti3 +; CHECK-SD-NEXT: mov x21, x0 +; CHECK-SD-NEXT: mov x22, x1 +; CHECK-SD-NEXT: ldr x2, [sp, #8] // 8-byte Folded Reload +; CHECK-SD-NEXT: ldp x0, x1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov x3, x26 +; CHECK-SD-NEXT: bl __modti3 +; CHECK-SD-NEXT: mov x6, x0 +; CHECK-SD-NEXT: mov x7, x1 +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x25 +; CHECK-SD-NEXT: mov x2, x19 +; CHECK-SD-NEXT: mov x3, x23 +; CHECK-SD-NEXT: mov x4, x21 +; CHECK-SD-NEXT: mov x5, x22 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x24, x23, [sp, #80] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x26, x25, [sp, #64] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x28, x27, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #128 +; CHECK-SD-NEXT: ret +; +; 
CHECK-GI-LABEL: sv4i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #128 +; CHECK-GI-NEXT: stp x29, x30, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x28, x27, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x26, x25, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 128 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w24, -48 +; CHECK-GI-NEXT: .cfi_offset w25, -56 +; CHECK-GI-NEXT: .cfi_offset w26, -64 +; CHECK-GI-NEXT: .cfi_offset w27, -72 +; CHECK-GI-NEXT: .cfi_offset w28, -80 +; CHECK-GI-NEXT: .cfi_offset w30, -88 +; CHECK-GI-NEXT: .cfi_offset w29, -96 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: mov x21, x4 +; CHECK-GI-NEXT: ldp x2, x3, [sp, #128] +; CHECK-GI-NEXT: mov x22, x5 +; CHECK-GI-NEXT: ldp x9, x8, [sp, #176] +; CHECK-GI-NEXT: mov x23, x7 +; CHECK-GI-NEXT: ldp x24, x25, [sp, #144] +; CHECK-GI-NEXT: ldp x26, x27, [sp, #160] +; CHECK-GI-NEXT: stp x9, x6, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; CHECK-GI-NEXT: bl __modti3 +; CHECK-GI-NEXT: mov x28, x0 +; CHECK-GI-NEXT: mov x29, x1 +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: mov x2, x24 +; CHECK-GI-NEXT: mov x3, x25 +; CHECK-GI-NEXT: bl __modti3 +; CHECK-GI-NEXT: mov x19, x0 +; CHECK-GI-NEXT: mov x20, x1 +; CHECK-GI-NEXT: mov x0, x21 +; CHECK-GI-NEXT: mov x1, x22 +; CHECK-GI-NEXT: mov x2, x26 +; CHECK-GI-NEXT: mov x3, x27 +; CHECK-GI-NEXT: bl __modti3 +; CHECK-GI-NEXT: mov x21, x0 +; CHECK-GI-NEXT: ldp x2, x0, [sp, #16] // 16-byte Folded Reload +; 
CHECK-GI-NEXT: ldr x3, [sp, #8] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov x22, x1 +; CHECK-GI-NEXT: mov x1, x23 +; CHECK-GI-NEXT: bl __modti3 +; CHECK-GI-NEXT: mov x6, x0 +; CHECK-GI-NEXT: mov x7, x1 +; CHECK-GI-NEXT: mov x0, x28 +; CHECK-GI-NEXT: mov x1, x29 +; CHECK-GI-NEXT: mov x2, x19 +; CHECK-GI-NEXT: mov x3, x20 +; CHECK-GI-NEXT: mov x4, x21 +; CHECK-GI-NEXT: mov x5, x22 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x24, x23, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x26, x25, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x28, x27, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #128 +; CHECK-GI-NEXT: ret entry: %s = srem <4 x i128> %d, %e ret <4 x i128> %s } define <2 x i128> @uv2i128(<2 x i128> %d, <2 x i128> %e) { -; CHECK-LABEL: uv2i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-64]! 
// 8-byte Folded Spill -; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w23, -40 -; CHECK-NEXT: .cfi_offset w24, -48 -; CHECK-NEXT: .cfi_offset w30, -64 -; CHECK-NEXT: mov x21, x3 -; CHECK-NEXT: mov x22, x2 -; CHECK-NEXT: mov x2, x4 -; CHECK-NEXT: mov x3, x5 -; CHECK-NEXT: mov x19, x7 -; CHECK-NEXT: mov x20, x6 -; CHECK-NEXT: bl __umodti3 -; CHECK-NEXT: mov x23, x0 -; CHECK-NEXT: mov x24, x1 -; CHECK-NEXT: mov x0, x22 -; CHECK-NEXT: mov x1, x21 -; CHECK-NEXT: mov x2, x20 -; CHECK-NEXT: mov x3, x19 -; CHECK-NEXT: bl __umodti3 -; CHECK-NEXT: mov x2, x0 -; CHECK-NEXT: mov x3, x1 -; CHECK-NEXT: mov x0, x23 -; CHECK-NEXT: mov x1, x24 -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: uv2i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-64]! 
// 8-byte Folded Spill +; CHECK-SD-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w23, -40 +; CHECK-SD-NEXT: .cfi_offset w24, -48 +; CHECK-SD-NEXT: .cfi_offset w30, -64 +; CHECK-SD-NEXT: mov x21, x3 +; CHECK-SD-NEXT: mov x22, x2 +; CHECK-SD-NEXT: mov x2, x4 +; CHECK-SD-NEXT: mov x3, x5 +; CHECK-SD-NEXT: mov x19, x7 +; CHECK-SD-NEXT: mov x20, x6 +; CHECK-SD-NEXT: bl __umodti3 +; CHECK-SD-NEXT: mov x23, x0 +; CHECK-SD-NEXT: mov x24, x1 +; CHECK-SD-NEXT: mov x0, x22 +; CHECK-SD-NEXT: mov x1, x21 +; CHECK-SD-NEXT: mov x2, x20 +; CHECK-SD-NEXT: mov x3, x19 +; CHECK-SD-NEXT: bl __umodti3 +; CHECK-SD-NEXT: mov x2, x0 +; CHECK-SD-NEXT: mov x3, x1 +; CHECK-SD-NEXT: mov x0, x23 +; CHECK-SD-NEXT: mov x1, x24 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv2i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: str x30, [sp, #-64]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w24, -48 +; CHECK-GI-NEXT: .cfi_offset w30, -64 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: mov x2, x4 +; CHECK-GI-NEXT: mov x3, x5 +; CHECK-GI-NEXT: mov x21, x6 +; CHECK-GI-NEXT: mov x22, x7 +; CHECK-GI-NEXT: bl __umodti3 +; CHECK-GI-NEXT: mov x23, x0 +; CHECK-GI-NEXT: mov x24, x1 +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: mov x2, x21 +; CHECK-GI-NEXT: mov x3, x22 +; CHECK-GI-NEXT: bl __umodti3 +; CHECK-GI-NEXT: mov x2, x0 +; CHECK-GI-NEXT: mov x3, x1 +; CHECK-GI-NEXT: mov x0, x23 +; CHECK-GI-NEXT: mov x1, x24 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %s = urem <2 x i128> %d, %e ret <2 x i128> %s } define <3 x i128> @uv3i128(<3 x i128> %d, <3 x i128> %e) { -; CHECK-LABEL: uv3i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-96]! 
// 8-byte Folded Spill -; CHECK-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w23, -40 -; CHECK-NEXT: .cfi_offset w24, -48 -; CHECK-NEXT: .cfi_offset w25, -56 -; CHECK-NEXT: .cfi_offset w26, -64 -; CHECK-NEXT: .cfi_offset w27, -72 -; CHECK-NEXT: .cfi_offset w28, -80 -; CHECK-NEXT: .cfi_offset w30, -96 -; CHECK-NEXT: ldp x23, x24, [sp, #112] -; CHECK-NEXT: mov x21, x3 -; CHECK-NEXT: ldp x25, x26, [sp, #96] -; CHECK-NEXT: mov x22, x2 -; CHECK-NEXT: mov x2, x6 -; CHECK-NEXT: mov x3, x7 -; CHECK-NEXT: mov x19, x5 -; CHECK-NEXT: mov x20, x4 -; CHECK-NEXT: bl __umodti3 -; CHECK-NEXT: mov x27, x0 -; CHECK-NEXT: mov x28, x1 -; CHECK-NEXT: mov x0, x22 -; CHECK-NEXT: mov x1, x21 -; CHECK-NEXT: mov x2, x25 -; CHECK-NEXT: mov x3, x26 -; CHECK-NEXT: bl __umodti3 -; CHECK-NEXT: mov x21, x0 -; CHECK-NEXT: mov x22, x1 -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x19 -; CHECK-NEXT: mov x2, x23 -; CHECK-NEXT: mov x3, x24 -; CHECK-NEXT: bl __umodti3 -; CHECK-NEXT: mov x4, x0 -; CHECK-NEXT: mov x5, x1 -; CHECK-NEXT: mov x0, x27 -; CHECK-NEXT: mov x1, x28 -; CHECK-NEXT: mov x2, x21 -; CHECK-NEXT: mov x3, x22 -; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp], #96 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: 
uv3i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-96]! // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 96 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w23, -40 +; CHECK-SD-NEXT: .cfi_offset w24, -48 +; CHECK-SD-NEXT: .cfi_offset w25, -56 +; CHECK-SD-NEXT: .cfi_offset w26, -64 +; CHECK-SD-NEXT: .cfi_offset w27, -72 +; CHECK-SD-NEXT: .cfi_offset w28, -80 +; CHECK-SD-NEXT: .cfi_offset w30, -96 +; CHECK-SD-NEXT: ldp x23, x24, [sp, #112] +; CHECK-SD-NEXT: mov x21, x3 +; CHECK-SD-NEXT: ldp x25, x26, [sp, #96] +; CHECK-SD-NEXT: mov x22, x2 +; CHECK-SD-NEXT: mov x2, x6 +; CHECK-SD-NEXT: mov x3, x7 +; CHECK-SD-NEXT: mov x19, x5 +; CHECK-SD-NEXT: mov x20, x4 +; CHECK-SD-NEXT: bl __umodti3 +; CHECK-SD-NEXT: mov x27, x0 +; CHECK-SD-NEXT: mov x28, x1 +; CHECK-SD-NEXT: mov x0, x22 +; CHECK-SD-NEXT: mov x1, x21 +; CHECK-SD-NEXT: mov x2, x25 +; CHECK-SD-NEXT: mov x3, x26 +; CHECK-SD-NEXT: bl __umodti3 +; CHECK-SD-NEXT: mov x21, x0 +; CHECK-SD-NEXT: mov x22, x1 +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: mov x2, x23 +; CHECK-SD-NEXT: mov x3, x24 +; CHECK-SD-NEXT: bl __umodti3 +; CHECK-SD-NEXT: mov x4, x0 +; CHECK-SD-NEXT: mov x5, x1 +; CHECK-SD-NEXT: mov x0, x27 +; CHECK-SD-NEXT: mov x1, x28 +; CHECK-SD-NEXT: mov x2, x21 +; CHECK-SD-NEXT: mov x3, x22 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload 
+; CHECK-SD-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp], #96 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv3i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: str x30, [sp, #-96]! // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 96 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w24, -48 +; CHECK-GI-NEXT: .cfi_offset w25, -56 +; CHECK-GI-NEXT: .cfi_offset w26, -64 +; CHECK-GI-NEXT: .cfi_offset w27, -72 +; CHECK-GI-NEXT: .cfi_offset w28, -80 +; CHECK-GI-NEXT: .cfi_offset w30, -96 +; CHECK-GI-NEXT: ldp x23, x24, [sp, #96] +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: ldp x25, x26, [sp, #112] +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: mov x2, x6 +; CHECK-GI-NEXT: mov x3, x7 +; CHECK-GI-NEXT: mov x21, x4 +; CHECK-GI-NEXT: mov x22, x5 +; CHECK-GI-NEXT: bl __umodti3 +; CHECK-GI-NEXT: mov x27, x0 +; CHECK-GI-NEXT: mov x28, x1 +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: mov x2, x23 +; CHECK-GI-NEXT: mov x3, x24 +; CHECK-GI-NEXT: bl __umodti3 +; CHECK-GI-NEXT: mov x19, x0 +; CHECK-GI-NEXT: mov x20, x1 +; CHECK-GI-NEXT: mov x0, x21 +; CHECK-GI-NEXT: mov x1, x22 +; CHECK-GI-NEXT: mov x2, x25 +; CHECK-GI-NEXT: mov x3, x26 +; CHECK-GI-NEXT: bl __umodti3 +; CHECK-GI-NEXT: mov x4, x0 +; CHECK-GI-NEXT: mov x5, x1 +; CHECK-GI-NEXT: mov x0, x27 +; CHECK-GI-NEXT: mov x1, x28 +; CHECK-GI-NEXT: mov 
x2, x19 +; CHECK-GI-NEXT: mov x3, x20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #96 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %s = urem <3 x i128> %d, %e ret <3 x i128> %s } define <4 x i128> @uv4i128(<4 x i128> %d, <4 x i128> %e) { -; CHECK-LABEL: uv4i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #128 -; CHECK-NEXT: stp x29, x30, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x28, x27, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x26, x25, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x24, x23, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #96] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w23, -40 -; CHECK-NEXT: .cfi_offset w24, -48 -; CHECK-NEXT: .cfi_offset w25, -56 -; CHECK-NEXT: .cfi_offset w26, -64 -; CHECK-NEXT: .cfi_offset w27, -72 -; CHECK-NEXT: .cfi_offset w28, -80 -; CHECK-NEXT: .cfi_offset w30, -88 -; CHECK-NEXT: .cfi_offset w29, -96 -; CHECK-NEXT: mov x23, x3 -; CHECK-NEXT: mov x24, x2 -; CHECK-NEXT: stp x6, x7, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: ldp x8, x26, [sp, #176] -; CHECK-NEXT: mov x21, x5 -; CHECK-NEXT: ldp x2, x3, [sp, #128] -; CHECK-NEXT: mov x22, x4 -; CHECK-NEXT: ldp x27, x28, [sp, #160] -; CHECK-NEXT: ldp x29, x19, [sp, #144] -; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill -; CHECK-NEXT: bl __umodti3 -; CHECK-NEXT: mov x20, x0 -; CHECK-NEXT: mov x25, x1 -; CHECK-NEXT: mov x0, x24 -; 
CHECK-NEXT: mov x1, x23 -; CHECK-NEXT: mov x2, x29 -; CHECK-NEXT: mov x3, x19 -; CHECK-NEXT: bl __umodti3 -; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: mov x23, x1 -; CHECK-NEXT: mov x0, x22 -; CHECK-NEXT: mov x1, x21 -; CHECK-NEXT: mov x2, x27 -; CHECK-NEXT: mov x3, x28 -; CHECK-NEXT: bl __umodti3 -; CHECK-NEXT: mov x21, x0 -; CHECK-NEXT: mov x22, x1 -; CHECK-NEXT: ldr x2, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: ldp x0, x1, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov x3, x26 -; CHECK-NEXT: bl __umodti3 -; CHECK-NEXT: mov x6, x0 -; CHECK-NEXT: mov x7, x1 -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: mov x1, x25 -; CHECK-NEXT: mov x2, x19 -; CHECK-NEXT: mov x3, x23 -; CHECK-NEXT: mov x4, x21 -; CHECK-NEXT: mov x5, x22 -; CHECK-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: ldp x24, x23, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldp x26, x25, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldp x28, x27, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #128 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: uv4i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #128 +; CHECK-SD-NEXT: stp x29, x30, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x28, x27, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x26, x25, [sp, #64] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x24, x23, [sp, #80] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #96] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 128 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w23, -40 +; CHECK-SD-NEXT: .cfi_offset w24, -48 +; CHECK-SD-NEXT: .cfi_offset w25, -56 +; CHECK-SD-NEXT: 
.cfi_offset w26, -64 +; CHECK-SD-NEXT: .cfi_offset w27, -72 +; CHECK-SD-NEXT: .cfi_offset w28, -80 +; CHECK-SD-NEXT: .cfi_offset w30, -88 +; CHECK-SD-NEXT: .cfi_offset w29, -96 +; CHECK-SD-NEXT: mov x23, x3 +; CHECK-SD-NEXT: mov x24, x2 +; CHECK-SD-NEXT: stp x6, x7, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldp x8, x26, [sp, #176] +; CHECK-SD-NEXT: mov x21, x5 +; CHECK-SD-NEXT: ldp x2, x3, [sp, #128] +; CHECK-SD-NEXT: mov x22, x4 +; CHECK-SD-NEXT: ldp x27, x28, [sp, #160] +; CHECK-SD-NEXT: ldp x29, x19, [sp, #144] +; CHECK-SD-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; CHECK-SD-NEXT: bl __umodti3 +; CHECK-SD-NEXT: mov x20, x0 +; CHECK-SD-NEXT: mov x25, x1 +; CHECK-SD-NEXT: mov x0, x24 +; CHECK-SD-NEXT: mov x1, x23 +; CHECK-SD-NEXT: mov x2, x29 +; CHECK-SD-NEXT: mov x3, x19 +; CHECK-SD-NEXT: bl __umodti3 +; CHECK-SD-NEXT: mov x19, x0 +; CHECK-SD-NEXT: mov x23, x1 +; CHECK-SD-NEXT: mov x0, x22 +; CHECK-SD-NEXT: mov x1, x21 +; CHECK-SD-NEXT: mov x2, x27 +; CHECK-SD-NEXT: mov x3, x28 +; CHECK-SD-NEXT: bl __umodti3 +; CHECK-SD-NEXT: mov x21, x0 +; CHECK-SD-NEXT: mov x22, x1 +; CHECK-SD-NEXT: ldr x2, [sp, #8] // 8-byte Folded Reload +; CHECK-SD-NEXT: ldp x0, x1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov x3, x26 +; CHECK-SD-NEXT: bl __umodti3 +; CHECK-SD-NEXT: mov x6, x0 +; CHECK-SD-NEXT: mov x7, x1 +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x25 +; CHECK-SD-NEXT: mov x2, x19 +; CHECK-SD-NEXT: mov x3, x23 +; CHECK-SD-NEXT: mov x4, x21 +; CHECK-SD-NEXT: mov x5, x22 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x24, x23, [sp, #80] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x26, x25, [sp, #64] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x28, x27, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #128 +; CHECK-SD-NEXT: ret +; +; 
CHECK-GI-LABEL: uv4i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #128 +; CHECK-GI-NEXT: stp x29, x30, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x28, x27, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x26, x25, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 128 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w24, -48 +; CHECK-GI-NEXT: .cfi_offset w25, -56 +; CHECK-GI-NEXT: .cfi_offset w26, -64 +; CHECK-GI-NEXT: .cfi_offset w27, -72 +; CHECK-GI-NEXT: .cfi_offset w28, -80 +; CHECK-GI-NEXT: .cfi_offset w30, -88 +; CHECK-GI-NEXT: .cfi_offset w29, -96 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: mov x21, x4 +; CHECK-GI-NEXT: ldp x2, x3, [sp, #128] +; CHECK-GI-NEXT: mov x22, x5 +; CHECK-GI-NEXT: ldp x9, x8, [sp, #176] +; CHECK-GI-NEXT: mov x23, x7 +; CHECK-GI-NEXT: ldp x24, x25, [sp, #144] +; CHECK-GI-NEXT: ldp x26, x27, [sp, #160] +; CHECK-GI-NEXT: stp x9, x6, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; CHECK-GI-NEXT: bl __umodti3 +; CHECK-GI-NEXT: mov x28, x0 +; CHECK-GI-NEXT: mov x29, x1 +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: mov x2, x24 +; CHECK-GI-NEXT: mov x3, x25 +; CHECK-GI-NEXT: bl __umodti3 +; CHECK-GI-NEXT: mov x19, x0 +; CHECK-GI-NEXT: mov x20, x1 +; CHECK-GI-NEXT: mov x0, x21 +; CHECK-GI-NEXT: mov x1, x22 +; CHECK-GI-NEXT: mov x2, x26 +; CHECK-GI-NEXT: mov x3, x27 +; CHECK-GI-NEXT: bl __umodti3 +; CHECK-GI-NEXT: mov x21, x0 +; CHECK-GI-NEXT: ldp x2, x0, [sp, #16] // 16-byte Folded Reload +; 
CHECK-GI-NEXT: ldr x3, [sp, #8] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov x22, x1 +; CHECK-GI-NEXT: mov x1, x23 +; CHECK-GI-NEXT: bl __umodti3 +; CHECK-GI-NEXT: mov x6, x0 +; CHECK-GI-NEXT: mov x7, x1 +; CHECK-GI-NEXT: mov x0, x28 +; CHECK-GI-NEXT: mov x1, x29 +; CHECK-GI-NEXT: mov x2, x19 +; CHECK-GI-NEXT: mov x3, x20 +; CHECK-GI-NEXT: mov x4, x21 +; CHECK-GI-NEXT: mov x5, x22 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x24, x23, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x26, x25, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x28, x27, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #128 +; CHECK-GI-NEXT: ret entry: %s = urem <4 x i128> %d, %e ret <4 x i128> %s From 61194617ad7862f144e0f6db34175553e8c34763 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 22 Aug 2024 11:42:57 +0100 Subject: [PATCH 177/426] [AMDGPU] Add GFX12 test coverage for vmcnt flushing in loop headers (#105548) --- .../CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir | 129 ++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir index 2417becb7c2167..e51174919b8d3a 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir @@ -1,5 +1,6 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s --- @@ -20,6 +21,13 @@ # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: 
+ +# GFX12-LABEL: waitcnt_vm_loop +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop body: | bb.0: @@ -58,6 +66,13 @@ body: | # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop_noterm +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop_noterm body: | bb.0: @@ -129,6 +144,13 @@ body: | # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop_load +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop_load body: | bb.0: @@ -170,6 +192,13 @@ body: | # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop_no_store +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop_no_store body: | bb.0: @@ -212,6 +241,13 @@ body: | # GFX10-LABEL: bb.1: # GFX10-NOT: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop_no_use +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop_no_use body: | bb.0: @@ -255,6 +291,14 @@ body: | # GFX10-LABEL: bb.1: # GFX10-NOT: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop2 +# GFX12-LABEL: bb.0: +# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop2 body: | bb.0: @@ -294,6 +338,14 @@ body: | # GFX10-LABEL: bb.1: # GFX10-NOT: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop2_store +# GFX12-LABEL: bb.0: +# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12-NOT: 
S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop2_store body: | bb.0: @@ -334,6 +386,13 @@ body: | # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop2_use_in_loop +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop2_use_in_loop body: | bb.0: @@ -379,6 +438,15 @@ body: | # GFX10-LABEL: bb.2: # GFX10-NOT: S_WAITCNT 16 # GFX10-LABEL: bb.3: + +# GFX12-LABEL: waitcnt_vm_loop2_nowait +# GFX12-LABEL: bb.0: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.3: name: waitcnt_vm_loop2_nowait body: | bb.0: @@ -427,6 +495,14 @@ body: | # GFX10-LABEL: bb.1: # GFX10-NOT: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop2_reginterval +# GFX12-LABEL: bb.0: +# GFX12: GLOBAL_LOAD_DWORDX4 +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop2_reginterval body: | bb.0: @@ -467,6 +543,13 @@ body: | # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop2_reginterval2 +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop2_reginterval2 body: | bb.0: @@ -513,6 +596,15 @@ body: | # GFX10-NOT: S_WAITCNT 16240 # GFX10-LABEL: bb.2: +# GFX12-LABEL: waitcnt_vm_zero +# GFX12-LABEL: bb.0: +# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN +# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: + name: waitcnt_vm_zero body: | bb.0: @@ -548,6 +640,14 @@ body: | # GFX10-LABEL: bb.1: # GFX10-NOT: S_WAITCNT +# GFX12-LABEL: waitcnt_vm_necessary +# GFX12-LABEL: bb.0: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12: $vgpr4 +# GFX12-NOT: S_WAITCNT +# GFX12-LABEL: bb.1: +# GFX12-NOT: 
S_WAITCNT + # GFX9-LABEL: waitcnt_vm_necessary # GFX9-LABEL: bb.0: # GFX9: S_WAITCNT 3952 @@ -590,6 +690,13 @@ body: | # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: +# GFX12-LABEL: waitcnt_vm_loop_global_mem +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: + name: waitcnt_vm_loop_global_mem body: | bb.0: @@ -631,6 +738,13 @@ body: | # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: +# GFX12-LABEL: waitcnt_vm_loop_scratch_mem +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: + name: waitcnt_vm_loop_scratch_mem body: | bb.0: @@ -671,6 +785,14 @@ body: | # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 11 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop_flat_mem +# GFX12-LABEL: bb.0: +# GFX12: FLAT_LOAD_DWORD +# GFX12-NOT: S_WAIT_LOADCNT_DSCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT_DSCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop_flat_mem body: | bb.0: @@ -713,6 +835,13 @@ body: | # GFX10-LABEL: bb.1: # GFX10: S_WAITCNT 16 # GFX10-LABEL: bb.2: + +# GFX12-LABEL: waitcnt_vm_loop_flat_load +# GFX12-LABEL: bb.0: +# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.2: name: waitcnt_vm_loop_flat_load body: | bb.0: From 5506831f7bc8dc04ebe77f4d26940007bfb4ab39 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 22 Aug 2024 11:46:51 +0100 Subject: [PATCH 178/426] [AMDGPU] GFX12 VMEM loads can write VGPR results out of order (#105549) Fix SIInsertWaitcnts to account for this by adding extra waits to avoid WAW dependencies. 
--- llvm/lib/Target/AMDGPU/AMDGPU.td | 23 ++++++++++++++----- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 +++ llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 7 +++--- .../buffer-fat-pointer-atomicrmw-fadd.ll | 3 +++ .../buffer-fat-pointer-atomicrmw-fmax.ll | 5 ++++ .../buffer-fat-pointer-atomicrmw-fmin.ll | 5 ++++ ....amdgcn.struct.buffer.load.format.v3f16.ll | 1 + llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 10 +++++++- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 10 ++++++++ llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 2 ++ .../AMDGPU/spill-csr-frame-ptr-reg-copy.ll | 1 + .../CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir | 8 +++---- 12 files changed, 64 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 7906e0ee9d7858..9efdbd751d96e3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -953,6 +953,12 @@ def FeatureRequiredExportPriority : SubtargetFeature<"required-export-priority", "Export priority must be explicitly manipulated on GFX11.5" >; +def FeatureVmemWriteVgprInOrder : SubtargetFeature<"vmem-write-vgpr-in-order", + "HasVmemWriteVgprInOrder", + "true", + "VMEM instructions of the same type write VGPR results in order" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -1123,7 +1129,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, - FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts + FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, + FeatureVmemWriteVgprInOrder ] >; @@ -1136,7 +1143,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", 
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, - FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts + FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, + FeatureVmemWriteVgprInOrder ] >; @@ -1152,7 +1160,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, - FeatureDefaultComponentZero + FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder ] >; @@ -1170,7 +1178,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero + FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero, + FeatureVmemWriteVgprInOrder ] >; @@ -1193,7 +1202,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureMaxHardClauseLength63, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, - FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts + FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, + FeatureVmemWriteVgprInOrder ] >; @@ -1215,7 +1225,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureMaxHardClauseLength32, - FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts + 
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, + FeatureVmemWriteVgprInOrder ] >; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 902f51ae358d59..9386bcf0d74b22 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -239,6 +239,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasVALUTransUseHazard = false; bool HasForceStoreSC0SC1 = false; bool HasRequiredExportPriority = false; + bool HasVmemWriteVgprInOrder = false; bool RequiresCOV6 = false; @@ -1285,6 +1286,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasRequiredExportPriority() const { return HasRequiredExportPriority; } + bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; } + /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively. bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 59a1eee8d4f91d..4262e7b5d9c250 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1778,11 +1778,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (IsVGPR) { // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the // previous write and this write are the same type of VMEM - // instruction, in which case they're guaranteed to write their - // results in order anyway. + // instruction, in which case they are (in some architectures) + // guaranteed to write their results in order anyway. 
if (Op.isUse() || !updateVMCntOnly(MI) || ScoreBrackets.hasOtherPendingVmemTypes(RegNo, - getVmemType(MI))) { + getVmemType(MI)) || + !ST->hasVmemWriteVgprInOrder()) { ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait); ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait); diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 9d9e6898417e87..63cdd8a3bb16dc 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -2599,6 +2599,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -4432,6 +4433,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 @@ -5911,6 +5913,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 diff 
--git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 172ce4c065e13d..c90296124eb127 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -1737,6 +1737,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -3459,6 +3460,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 @@ -4959,6 +4961,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 @@ -6329,6 +6332,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 
v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -7822,6 +7826,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 61ee956747135f..91adbfa5599761 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -1737,6 +1737,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -3459,6 +3460,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 @@ -4959,6 +4961,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; 
GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 @@ -6329,6 +6332,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 @@ -7822,6 +7826,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll index 4c1ae4c228adb3..0522d5258b9b5f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll @@ -128,6 +128,7 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: ; implicit-def: $vgpr4 diff --git 
a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 355c296d122ff2..22b718935738bd 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -745,7 +745,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_clause 0xf +; GFX12-NEXT: s_clause 0x7 ; GFX12-NEXT: global_load_u16 v3, v8, s[0:1] offset:28 ; GFX12-NEXT: global_load_u16 v2, v8, s[0:1] offset:24 ; GFX12-NEXT: global_load_u16 v1, v8, s[0:1] offset:20 @@ -754,13 +754,21 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GFX12-NEXT: global_load_u16 v6, v8, s[0:1] offset:8 ; GFX12-NEXT: global_load_u16 v5, v8, s[0:1] offset:4 ; GFX12-NEXT: global_load_u16 v4, v8, s[0:1] +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2 ; GFX12-NEXT: s_wait_loadcnt 0x4 ; GFX12-NEXT: global_store_b128 v[0:1], v[0:3], off diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 142bc37fdeb755..4cc47b09d813d6 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -3563,15 +3563,19 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 @@ -4371,8 +4375,10 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 
4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 @@ -7341,8 +7347,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39 @@ -7364,8 +7372,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 
c0649322c81953..7cdf270810dea0 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -3091,8 +3091,10 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192 ; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload +; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) ; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) ; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll index b045dd559aac26..34bcc3f02ac66d 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -15,6 +15,7 @@ ; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 ; GCN: s_xor_saveexec_b64 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir index e51174919b8d3a..bdef55ab956a01 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir @@ -297,7 +297,7 @@ body: | # GFX12: BUFFER_LOAD_FORMAT_X_IDXEN # GFX12: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.1: -# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12: S_WAIT_LOADCNT 0 # GFX12-LABEL: 
bb.2: name: waitcnt_vm_loop2 body: | @@ -344,7 +344,7 @@ body: | # GFX12: BUFFER_LOAD_FORMAT_X_IDXEN # GFX12: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.1: -# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.2: name: waitcnt_vm_loop2_store body: | @@ -445,7 +445,7 @@ body: | # GFX12-LABEL: bb.1: # GFX12-NOT: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.2: -# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.3: name: waitcnt_vm_loop2_nowait body: | @@ -602,7 +602,7 @@ body: | # GFX12: BUFFER_LOAD_FORMAT_X_IDXEN # GFX12: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.1: -# GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.2: name: waitcnt_vm_zero From 5bbd5984306ab0bdd89a2e81cd4965e5ae51c3fb Mon Sep 17 00:00:00 2001 From: Vassil Vassilev Date: Thu, 22 Aug 2024 13:04:33 +0200 Subject: [PATCH 179/426] [cmake] Include GNUInstallDirs before using variables defined by it. (#83807) This fixes an odd problem with the regex when `CMAKE_INSTALL_LIBDIR` is not defined: `string sub-command REGEX, mode REPLACE: regex "$" matched an empty string.` Fixes llvm/llvm-project#83802 --- llvm/cmake/modules/Findzstd.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/cmake/modules/Findzstd.cmake b/llvm/cmake/modules/Findzstd.cmake index 4bc0b793e51c9a..86b6d48b6ec6b6 100644 --- a/llvm/cmake/modules/Findzstd.cmake +++ b/llvm/cmake/modules/Findzstd.cmake @@ -34,6 +34,7 @@ if(zstd_FOUND) elseif (NOT TARGET zstd::libzstd_shared) add_library(zstd::libzstd_shared SHARED IMPORTED) if(MSVC) + include(GNUInstallDirs) # For CMAKE_INSTALL_LIBDIR and friends. # IMPORTED_LOCATION is the path to the DLL and IMPORTED_IMPLIB is the "library". 
get_filename_component(zstd_DIRNAME "${zstd_LIBRARY}" DIRECTORY) if(NOT "${CMAKE_INSTALL_LIBDIR}" STREQUAL "" AND NOT "${CMAKE_INSTALL_BINDIR}" STREQUAL "") From 743e70bb7578276ac331c534547ef0d65600a8c1 Mon Sep 17 00:00:00 2001 From: Matt Davis Date: Thu, 22 Aug 2024 04:12:33 -0700 Subject: [PATCH 180/426] [DebugInfo][NFC] Constify debug DbgVariableRecord::{isDbgValue,isDbgDeclare} (#105570) Constify debug DbgVariableRecord::{isDbgValue,isDbgDeclare}. --- llvm/include/llvm/IR/DebugProgramInstruction.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h index e6dd1e979794e2..a6605052ba83d3 100644 --- a/llvm/include/llvm/IR/DebugProgramInstruction.h +++ b/llvm/include/llvm/IR/DebugProgramInstruction.h @@ -398,8 +398,8 @@ class DbgVariableRecord : public DbgRecord, protected DebugValueUser { } }; - bool isDbgDeclare() { return Type == LocationType::Declare; } - bool isDbgValue() { return Type == LocationType::Value; } + bool isDbgDeclare() const { return Type == LocationType::Declare; } + bool isDbgValue() const { return Type == LocationType::Value; } /// Get the locations corresponding to the variable referenced by the debug /// info intrinsic. Depending on the intrinsic, this could be the From 7323e7eee3a819e9a2d8ec29f00d362bcad87731 Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Thu, 22 Aug 2024 11:56:23 +0200 Subject: [PATCH 181/426] Revert "[lldb][swig] Use the correct variable in the return statement" This reverts commit 65281570afd7e35e01533b07c6c2937de410fc52. I'm reverting https://github.com/llvm/llvm-project/pull/104523 (https://github.com/llvm/llvm-project/commit/f01f80ce6ca7640bb0e267b84b1ed0e89b57e2d9) and this fixup belongs to the same series of changes. 
--- lldb/bindings/python/python-wrapper.swig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index 360c392235a866..2ce42e3e017d5b 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -837,7 +837,7 @@ bool lldb_private::python::SWIGBridge::LLDBSwigPython_ShouldHide( bool ret_val = result ? PyObject_IsTrue(result) : false; Py_XDECREF(result); - return ret_val; + return result; } void *lldb_private::python::SWIGBridge::LLDBSWIGPython_GetDynamicSetting( From aa70f83e660453c006193aab7ba67c94db236948 Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Thu, 22 Aug 2024 13:24:15 +0200 Subject: [PATCH 182/426] Revert "[lldb-dap] Mark hidden frames as "subtle" (#105457)" This reverts commit 6f456024c37424d9c8cc1cea07126a28f246588d, which depends on https://github.com/llvm/llvm-project/pull/104523, which I'm reverting. --- .../lldb-dap/stackTrace/subtleFrames/Makefile | 3 -- .../subtleFrames/TestDAP_subtleFrames.py | 29 ------------------- .../lldb-dap/stackTrace/subtleFrames/main.cpp | 13 --------- lldb/tools/lldb-dap/JSONUtils.cpp | 3 -- 4 files changed, 48 deletions(-) delete mode 100644 lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/Makefile delete mode 100644 lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/TestDAP_subtleFrames.py delete mode 100644 lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/main.cpp diff --git a/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/Makefile b/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/Makefile deleted file mode 100644 index 99998b20bcb050..00000000000000 --- a/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -CXX_SOURCES := main.cpp - -include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/TestDAP_subtleFrames.py 
b/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/TestDAP_subtleFrames.py deleted file mode 100644 index 1e41e841e39bc8..00000000000000 --- a/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/TestDAP_subtleFrames.py +++ /dev/null @@ -1,29 +0,0 @@ -""" -Test lldb-dap stack trace response -""" - - -import dap_server -from lldbsuite.test.decorators import * - -import lldbdap_testcase -from lldbsuite.test.lldbtest import * - - -class TestDAP_subtleFrames(lldbdap_testcase.DAPTestCaseBase): - @add_test_categories(["libc++"]) - def test_subtleFrames(self): - """ - Internal stack frames (such as the ones used by `std::function`) are marked as "subtle". - """ - program = self.getBuildArtifact("a.out") - self.build_and_launch(program) - source = "main.cpp" - self.set_source_breakpoints(source, [line_number(source, "BREAK HERE")]) - self.continue_to_next_stop() - - frames = self.get_stackFrames() - for f in frames: - if "__function" in f["name"]: - self.assertEqual(f["presentationHint"], "subtle") - self.assertTrue(any(f.get("presentationHint") == "subtle" for f in frames)) diff --git a/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/main.cpp b/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/main.cpp deleted file mode 100644 index 71944528441e38..00000000000000 --- a/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/main.cpp +++ /dev/null @@ -1,13 +0,0 @@ -#include -#include - -void greet() { - // BREAK HERE - std::cout << "Hello\n"; -} - -int main() { - std::function func{greet}; - func(); - return 0; -} diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index c080fd395b7288..a8b85f55939e17 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -763,9 +763,6 @@ llvm::json::Value CreateStackFrame(lldb::SBFrame &frame) { object.try_emplace("instructionPointerReference", formatted_addr); } - if (frame.IsArtificial() || frame.IsHidden()) - object.try_emplace("presentationHint", "subtle"); - 
return llvm::json::Value(std::move(object)); } From 547917aebd1e79a8929b53f0ddf3b5185ee4df74 Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Thu, 22 Aug 2024 11:58:19 +0200 Subject: [PATCH 183/426] Revert "[lldb] Extend frame recognizers to hide frames from backtraces (#104523)" This reverts commit f01f80ce6ca7640bb0e267b84b1ed0e89b57e2d9. This commit introduces an msan violation. See the discussion on https://github.com/llvm/llvm-project/pull/104523. --- lldb/bindings/python/python-wrapper.swig | 18 +--- lldb/include/lldb/API/SBFrame.h | 4 - .../lldb/Interpreter/ScriptInterpreter.h | 5 -- lldb/include/lldb/Target/StackFrame.h | 36 ++++---- lldb/include/lldb/Target/StackFrameList.h | 2 +- .../lldb/Target/StackFrameRecognizer.h | 21 ++--- lldb/include/lldb/Target/Thread.h | 4 +- lldb/source/API/SBFrame.cpp | 15 +--- lldb/source/API/SBThread.cpp | 3 +- lldb/source/Commands/CommandCompletions.cpp | 4 +- lldb/source/Commands/CommandObjectFrame.cpp | 24 ------ lldb/source/Commands/CommandObjectMemory.cpp | 3 +- lldb/source/Commands/CommandObjectThread.cpp | 19 +---- lldb/source/Commands/Options.td | 2 - lldb/source/Core/Debugger.cpp | 3 +- .../source/Interpreter/CommandInterpreter.cpp | 9 +- .../CPlusPlus/CPPLanguageRuntime.cpp | 44 +--------- .../Python/SWIGPythonBridge.h | 3 - .../Python/ScriptInterpreterPython.cpp | 29 ------- .../Python/ScriptInterpreterPythonImpl.h | 3 - lldb/source/Target/Process.cpp | 7 +- lldb/source/Target/StackFrame.cpp | 26 ++---- lldb/source/Target/StackFrameList.cpp | 8 +- lldb/source/Target/StackFrameRecognizer.cpp | 29 ++----- lldb/source/Target/Thread.cpp | 12 ++- lldb/source/Target/ThreadPlanStepOut.cpp | 2 +- .../frame/recognizer/TestFrameRecognizer.py | 40 --------- .../test/API/commands/frame/recognizer/main.m | 21 +++-- .../commands/frame/recognizer/recognizer.py | 5 -- .../lang/cpp/std-function-recognizer/Makefile | 4 - .../TestStdFunctionRecognizer.py | 84 ------------------- .../lang/cpp/std-function-recognizer/main.cpp | 10 
--- 32 files changed, 75 insertions(+), 424 deletions(-) delete mode 100644 lldb/test/API/lang/cpp/std-function-recognizer/Makefile delete mode 100644 lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py delete mode 100644 lldb/test/API/lang/cpp/std-function-recognizer/main.cpp diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index 2ce42e3e017d5b..8f050643fa68b3 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -813,7 +813,7 @@ PythonObject lldb_private::python::SWIGBridge::LLDBSWIGPython_CreateFrameRecogni } PyObject *lldb_private::python::SWIGBridge::LLDBSwigPython_GetRecognizedArguments( - PyObject *implementor, const lldb::StackFrameSP &frame_sp) { + PyObject * implementor, const lldb::StackFrameSP &frame_sp) { static char callee_name[] = "get_recognized_arguments"; PythonObject arg = SWIGBridge::ToSWIGWrapper(frame_sp); @@ -824,22 +824,6 @@ PyObject *lldb_private::python::SWIGBridge::LLDBSwigPython_GetRecognizedArgument return result; } -bool lldb_private::python::SWIGBridge::LLDBSwigPython_ShouldHide( - PyObject *implementor, const lldb::StackFrameSP &frame_sp) { - static char callee_name[] = "should_hide"; - - PythonObject arg = SWIGBridge::ToSWIGWrapper(frame_sp); - - PythonString str(callee_name); - - PyObject *result = - PyObject_CallMethodObjArgs(implementor, str.get(), arg.get(), NULL); - bool ret_val = result ? 
PyObject_IsTrue(result) : false; - Py_XDECREF(result); - - return result; -} - void *lldb_private::python::SWIGBridge::LLDBSWIGPython_GetDynamicSetting( void *module, const char *setting, const lldb::TargetSP &target_sp) { if (!module || !setting) diff --git a/lldb/include/lldb/API/SBFrame.h b/lldb/include/lldb/API/SBFrame.h index e0d15c3ecc5b1c..821ff3cf7ce519 100644 --- a/lldb/include/lldb/API/SBFrame.h +++ b/lldb/include/lldb/API/SBFrame.h @@ -104,10 +104,6 @@ class LLDB_API SBFrame { bool IsArtificial() const; - /// Return whether a frame recognizer decided this frame should not - /// be displayes in backtraces etc. - bool IsHidden() const; - /// The version that doesn't supply a 'use_dynamic' value will use the /// target's default. lldb::SBValue EvaluateExpression(const char *expr); diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h index 89a480a28880aa..05f0d7f0955f3e 100644 --- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h +++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h @@ -252,11 +252,6 @@ class ScriptInterpreter : public PluginInterface { return lldb::ValueObjectListSP(); } - virtual bool ShouldHide(const StructuredData::ObjectSP &implementor, - lldb::StackFrameSP frame_sp) { - return false; - } - virtual StructuredData::GenericSP CreateScriptedBreakpointResolver(const char *class_name, const StructuredDataImpl &args_data, diff --git a/lldb/include/lldb/Target/StackFrame.h b/lldb/include/lldb/Target/StackFrame.h index e4d17847763acf..52f0a1ee662176 100644 --- a/lldb/include/lldb/Target/StackFrame.h +++ b/lldb/include/lldb/Target/StackFrame.h @@ -407,11 +407,6 @@ class StackFrame : public ExecutionContextScope, /// may have limited support for inspecting variables. bool IsArtificial() const; - /// Query whether this frame should be hidden from backtraces. Frame - /// recognizers can customize this behavior and hide distracting - /// system implementation details this way. 
- bool IsHidden(); - /// Query this frame to find what frame it is in this Thread's /// StackFrameList. /// @@ -523,36 +518,33 @@ class StackFrame : public ExecutionContextScope, bool HasCachedData() const; private: - /// For StackFrame only. - /// \{ + // For StackFrame only lldb::ThreadWP m_thread_wp; uint32_t m_frame_index; uint32_t m_concrete_frame_index; lldb::RegisterContextSP m_reg_context_sp; StackID m_id; - /// \} - - /// The frame code address (might not be the same as the actual PC - /// for inlined frames) as a section/offset address. - Address m_frame_code_addr; + Address m_frame_code_addr; // The frame code address (might not be the same as + // the actual PC for inlined frames) as a + // section/offset address SymbolContext m_sc; Flags m_flags; Scalar m_frame_base; Status m_frame_base_error; - uint16_t m_frame_recognizer_generation; - /// Does this frame have a CFA? Different from CFA == LLDB_INVALID_ADDRESS. - bool m_cfa_is_valid; + bool m_cfa_is_valid; // Does this frame have a CFA? Different from CFA == + // LLDB_INVALID_ADDRESS Kind m_stack_frame_kind; - /// Whether this frame behaves like the zeroth frame, in the sense - /// that its pc value might not immediately follow a call (and thus might - /// be the first address of its function). True for actual frame zero as - /// well as any other frame with the same trait. + // Whether this frame behaves like the zeroth frame, in the sense + // that its pc value might not immediately follow a call (and thus might + // be the first address of its function). True for actual frame zero as + // well as any other frame with the same trait. bool m_behaves_like_zeroth_frame; lldb::VariableListSP m_variable_list_sp; - /// Value objects for each variable in m_variable_list_sp. 
- ValueObjectList m_variable_list_value_objects; - std::optional m_recognized_frame_sp; + ValueObjectList m_variable_list_value_objects; // Value objects for each + // variable in + // m_variable_list_sp + lldb::RecognizedStackFrameSP m_recognized_frame_sp; StreamString m_disassembly; std::recursive_mutex m_mutex; diff --git a/lldb/include/lldb/Target/StackFrameList.h b/lldb/include/lldb/Target/StackFrameList.h index 7d0e7a5b9a71b2..88e211ff692bd9 100644 --- a/lldb/include/lldb/Target/StackFrameList.h +++ b/lldb/include/lldb/Target/StackFrameList.h @@ -91,7 +91,7 @@ class StackFrameList { size_t GetStatus(Stream &strm, uint32_t first_frame, uint32_t num_frames, bool show_frame_info, uint32_t num_frames_with_source, - bool show_unique = false, bool show_hidden = false, + bool show_unique = false, const char *frame_marker = nullptr); protected: diff --git a/lldb/include/lldb/Target/StackFrameRecognizer.h b/lldb/include/lldb/Target/StackFrameRecognizer.h index 8acebc12c4b1dc..e9ac2750192ef6 100644 --- a/lldb/include/lldb/Target/StackFrameRecognizer.h +++ b/lldb/include/lldb/Target/StackFrameRecognizer.h @@ -17,7 +17,6 @@ #include "lldb/lldb-private-forward.h" #include "lldb/lldb-public.h" -#include #include #include #include @@ -29,23 +28,20 @@ namespace lldb_private { /// This class provides extra information about a stack frame that was /// provided by a specific stack frame recognizer. Right now, this class only /// holds recognized arguments (via GetRecognizedArguments). 
+ class RecognizedStackFrame : public std::enable_shared_from_this { public: - virtual ~RecognizedStackFrame() = default; - virtual lldb::ValueObjectListSP GetRecognizedArguments() { return m_arguments; } virtual lldb::ValueObjectSP GetExceptionObject() { return lldb::ValueObjectSP(); } - virtual lldb::StackFrameSP GetMostRelevantFrame() { return nullptr; } + virtual lldb::StackFrameSP GetMostRelevantFrame() { return nullptr; }; + virtual ~RecognizedStackFrame() = default; std::string GetStopDescription() { return m_stop_desc; } - /// Controls whether this frame should be filtered out when - /// displaying backtraces, for example. - virtual bool ShouldHide() { return false; } protected: lldb::ValueObjectListSP m_arguments; @@ -57,6 +53,7 @@ class RecognizedStackFrame /// A base class for frame recognizers. Subclasses (actual frame recognizers) /// should implement RecognizeFrame to provide a RecognizedStackFrame for a /// given stack frame. + class StackFrameRecognizer : public std::enable_shared_from_this { public: @@ -76,10 +73,10 @@ class StackFrameRecognizer /// Python implementation for frame recognizers. An instance of this class /// tracks a particular Python classobject, which will be asked to recognize /// stack frames. + class ScriptedStackFrameRecognizer : public StackFrameRecognizer { lldb_private::ScriptInterpreter *m_interpreter; lldb_private::StructuredData::ObjectSP m_python_object_sp; - std::string m_python_class; public: @@ -126,14 +123,8 @@ class StackFrameRecognizerManager { lldb::StackFrameRecognizerSP GetRecognizerForFrame(lldb::StackFrameSP frame); lldb::RecognizedStackFrameSP RecognizeFrame(lldb::StackFrameSP frame); - /// Returns a number that changes whenever the list of recognizers - /// has been modified. - uint16_t GetGeneration() const { return m_generation; } private: - /// Increase the generation counter. 
- void BumpGeneration(); - struct RegisteredEntry { uint32_t recognizer_id; lldb::StackFrameRecognizerSP recognizer; @@ -146,7 +137,6 @@ class StackFrameRecognizerManager { }; std::deque m_recognizers; - uint16_t m_generation; }; /// \class ValueObjectRecognizerSynthesizedValue @@ -154,6 +144,7 @@ class StackFrameRecognizerManager { /// ValueObject subclass that presents the passed ValueObject as a recognized /// value with the specified ValueType. Frame recognizers should return /// instances of this class as the returned objects in GetRecognizedArguments(). + class ValueObjectRecognizerSynthesizedValue : public ValueObject { public: static lldb::ValueObjectSP Create(ValueObject &parent, lldb::ValueType type) { diff --git a/lldb/include/lldb/Target/Thread.h b/lldb/include/lldb/Target/Thread.h index 38b65b2bc58490..aacc59c292ec79 100644 --- a/lldb/include/lldb/Target/Thread.h +++ b/lldb/include/lldb/Target/Thread.h @@ -1128,11 +1128,11 @@ class Thread : public std::enable_shared_from_this, size_t GetStatus(Stream &strm, uint32_t start_frame, uint32_t num_frames, uint32_t num_frames_with_source, bool stop_format, - bool show_hidden, bool only_stacks = false); + bool only_stacks = false); size_t GetStackFrameStatus(Stream &strm, uint32_t first_frame, uint32_t num_frames, bool show_frame_info, - uint32_t num_frames_with_source, bool show_hidden); + uint32_t num_frames_with_source); // We need a way to verify that even though we have a thread in a shared // pointer that the object itself is still valid. 
Currently this won't be the diff --git a/lldb/source/API/SBFrame.cpp b/lldb/source/API/SBFrame.cpp index 2689ecb2ab7bc7..47fc88625e30c5 100644 --- a/lldb/source/API/SBFrame.cpp +++ b/lldb/source/API/SBFrame.cpp @@ -1195,24 +1195,13 @@ bool SBFrame::IsArtificial() const { std::unique_lock lock; ExecutionContext exe_ctx(m_opaque_sp.get(), lock); - if (StackFrame *frame = exe_ctx.GetFramePtr()) + StackFrame *frame = exe_ctx.GetFramePtr(); + if (frame) return frame->IsArtificial(); return false; } -bool SBFrame::IsHidden() const { - LLDB_INSTRUMENT_VA(this); - - std::unique_lock lock; - ExecutionContext exe_ctx(m_opaque_sp.get(), lock); - - if (StackFrame *frame = exe_ctx.GetFramePtr()) - return frame->IsHidden(); - - return false; -} - const char *SBFrame::GetFunctionName() { LLDB_INSTRUMENT_VA(this); diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index 140a2920f05673..786f62bd66d520 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -1208,8 +1208,7 @@ bool SBThread::GetStatus(SBStream &status) const { ExecutionContext exe_ctx(m_opaque_sp.get(), lock); if (exe_ctx.HasThreadScope()) { - exe_ctx.GetThreadPtr()->GetStatus(strm, 0, 1, 1, true, - /*show_hidden=*/true); + exe_ctx.GetThreadPtr()->GetStatus(strm, 0, 1, 1, true); } else strm.PutCString("No status"); diff --git a/lldb/source/Commands/CommandCompletions.cpp b/lldb/source/Commands/CommandCompletions.cpp index 216aaf9abce6cf..54f4b368166492 100644 --- a/lldb/source/Commands/CommandCompletions.cpp +++ b/lldb/source/Commands/CommandCompletions.cpp @@ -791,7 +791,7 @@ void CommandCompletions::ThreadIndexes(CommandInterpreter &interpreter, lldb::ThreadSP thread_sp; for (uint32_t idx = 0; (thread_sp = threads.GetThreadAtIndex(idx)); ++idx) { StreamString strm; - thread_sp->GetStatus(strm, 0, 1, 1, true, /*show_hidden*/ true); + thread_sp->GetStatus(strm, 0, 1, 1, true); request.TryCompleteCurrentArg(std::to_string(thread_sp->GetIndexID()), strm.GetString()); } @@ 
-835,7 +835,7 @@ void CommandCompletions::ThreadIDs(CommandInterpreter &interpreter, lldb::ThreadSP thread_sp; for (uint32_t idx = 0; (thread_sp = threads.GetThreadAtIndex(idx)); ++idx) { StreamString strm; - thread_sp->GetStatus(strm, 0, 1, 1, true, /*show_hidden*/ true); + thread_sp->GetStatus(strm, 0, 1, 1, true); request.TryCompleteCurrentArg(std::to_string(thread_sp->GetID()), strm.GetString()); } diff --git a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp index 46c75e3dd159c0..29e460fe3885ff 100644 --- a/lldb/source/Commands/CommandObjectFrame.cpp +++ b/lldb/source/Commands/CommandObjectFrame.cpp @@ -278,30 +278,6 @@ class CommandObjectFrameSelect : public CommandObjectParsed { if (frame_idx == UINT32_MAX) frame_idx = 0; - // If moving up/down by one, skip over hidden frames. - if (*m_options.relative_frame_offset == 1 || - *m_options.relative_frame_offset == -1) { - uint32_t candidate_idx = frame_idx; - const unsigned max_depth = 12; - for (unsigned num_try = 0; num_try < max_depth; ++num_try) { - if (candidate_idx == 0 && *m_options.relative_frame_offset == -1) { - candidate_idx = UINT32_MAX; - break; - } - candidate_idx += *m_options.relative_frame_offset; - if (auto candidate_sp = thread->GetStackFrameAtIndex(candidate_idx)) { - if (candidate_sp->IsHidden()) - continue; - // Now candidate_idx is the first non-hidden frame. 
- break; - } - candidate_idx = UINT32_MAX; - break; - }; - if (candidate_idx != UINT32_MAX) - m_options.relative_frame_offset = candidate_idx - frame_idx; - } - if (*m_options.relative_frame_offset < 0) { if (static_cast(frame_idx) >= -*m_options.relative_frame_offset) diff --git a/lldb/source/Commands/CommandObjectMemory.cpp b/lldb/source/Commands/CommandObjectMemory.cpp index baf5d9196e553e..137b1ad981073c 100644 --- a/lldb/source/Commands/CommandObjectMemory.cpp +++ b/lldb/source/Commands/CommandObjectMemory.cpp @@ -1570,8 +1570,7 @@ class CommandObjectMemoryHistory : public CommandObjectParsed { const bool stop_format = false; for (auto thread : thread_list) { - thread->GetStatus(*output_stream, 0, UINT32_MAX, 0, stop_format, - /*should_filter*/ false); + thread->GetStatus(*output_stream, 0, UINT32_MAX, 0, stop_format); } result.SetStatus(eReturnStatusSuccessFinishResult); diff --git a/lldb/source/Commands/CommandObjectThread.cpp b/lldb/source/Commands/CommandObjectThread.cpp index 6a89c163f37d51..605f872a9f45e1 100644 --- a/lldb/source/Commands/CommandObjectThread.cpp +++ b/lldb/source/Commands/CommandObjectThread.cpp @@ -89,9 +89,6 @@ class CommandObjectThreadBacktrace : public CommandObjectIterateOverThreads { "invalid boolean value for option '%c': %s", short_option, option_arg.data()); } break; - case 'u': - m_filtered_backtrace = false; - break; default: llvm_unreachable("Unimplemented option"); } @@ -102,7 +99,6 @@ class CommandObjectThreadBacktrace : public CommandObjectIterateOverThreads { m_count = UINT32_MAX; m_start = 0; m_extended_backtrace = false; - m_filtered_backtrace = true; } llvm::ArrayRef GetDefinitions() override { @@ -113,7 +109,6 @@ class CommandObjectThreadBacktrace : public CommandObjectIterateOverThreads { uint32_t m_count; uint32_t m_start; bool m_extended_backtrace; - bool m_filtered_backtrace; }; CommandObjectThreadBacktrace(CommandInterpreter &interpreter) @@ -126,10 +121,7 @@ class CommandObjectThreadBacktrace : public 
CommandObjectIterateOverThreads { "call stacks.\n" "Use 'settings set frame-format' to customize the printing of " "frames in the backtrace and 'settings set thread-format' to " - "customize the thread header.\n" - "Customizable frame recognizers may filter out less interesting " - "frames, which results in gaps in the numbering. " - "Use '-u' to see all frames.", + "customize the thread header.", nullptr, eCommandRequiresProcess | eCommandRequiresThread | eCommandTryTargetAPILock | eCommandProcessMustBeLaunched | @@ -207,8 +199,7 @@ class CommandObjectThreadBacktrace : public CommandObjectIterateOverThreads { strm.PutChar('\n'); if (ext_thread_sp->GetStatus(strm, m_options.m_start, m_options.m_count, - num_frames_with_source, stop_format, - !m_options.m_filtered_backtrace)) { + num_frames_with_source, stop_format)) { DoExtendedBacktrace(ext_thread_sp.get(), result); } } @@ -237,8 +228,7 @@ class CommandObjectThreadBacktrace : public CommandObjectIterateOverThreads { const uint32_t num_frames_with_source = 0; const bool stop_format = true; if (!thread->GetStatus(strm, m_options.m_start, m_options.m_count, - num_frames_with_source, stop_format, - !m_options.m_filtered_backtrace, only_stacks)) { + num_frames_with_source, stop_format, only_stacks)) { result.AppendErrorWithFormat( "error displaying backtrace for thread: \"0x%4.4x\"\n", thread->GetIndexID()); @@ -1402,8 +1392,7 @@ class CommandObjectThreadException : public CommandObjectIterateOverThreads { const uint32_t num_frames_with_source = 0; const bool stop_format = false; exception_thread_sp->GetStatus(strm, 0, UINT32_MAX, - num_frames_with_source, stop_format, - /*filtered*/ false); + num_frames_with_source, stop_format); } return true; diff --git a/lldb/source/Commands/Options.td b/lldb/source/Commands/Options.td index 9c4dbed6939ba9..f050cd2ebb5ae0 100644 --- a/lldb/source/Commands/Options.td +++ b/lldb/source/Commands/Options.td @@ -1048,8 +1048,6 @@ let Command = "thread backtrace" in { Arg<"FrameIndex">, 
Desc<"Frame in which to start the backtrace">; def thread_backtrace_extended : Option<"extended", "e">, Group<1>, Arg<"Boolean">, Desc<"Show the extended backtrace, if available">; - def thread_backtrace_unfiltered : Option<"unfiltered", "u">, Group<1>, - Desc<"Filter out frames according to installed frame recognizers">; } let Command = "thread step scope" in { diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 67f01707a2afee..309e01e456580c 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -1869,8 +1869,7 @@ void Debugger::HandleThreadEvent(const EventSP &event_sp) { ThreadSP thread_sp( Thread::ThreadEventData::GetThreadFromEvent(event_sp.get())); if (thread_sp) { - thread_sp->GetStatus(*GetAsyncOutputStream(), 0, 1, 1, stop_format, - /*show_hidden*/ true); + thread_sp->GetStatus(*GetAsyncOutputStream(), 0, 1, 1, stop_format); } } } diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index 87298803e8415a..e45112530404b8 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -835,12 +835,11 @@ void CommandInterpreter::LoadCommandDictionary() { std::unique_ptr bt_regex_cmd_up( new CommandObjectRegexCommand( *this, "_regexp-bt", - "Show backtrace of the current thread's call stack. Any numeric " - "argument displays at most that many frames. The argument 'all' " - "displays all threads. Use 'settings set frame-format' to customize " + "Show backtrace of the current thread's call stack. Any numeric " + "argument displays at most that many frames. The argument 'all' " + "displays all threads. Use 'settings set frame-format' to customize " "the printing of individual frames and 'settings set thread-format' " - "to customize the thread header. Frame recognizers may filter the" - "list. 
Use 'thread backtrace -u (--unfiltered)' to see them all.", + "to customize the thread header.", "bt [ | all]", 0, false)); if (bt_regex_cmd_up) { // accept but don't document "bt -c " -- before bt was a regex diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp index c60200ab186d09..c7202a47d0157e 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp @@ -26,7 +26,6 @@ #include "lldb/Target/RegisterContext.h" #include "lldb/Target/SectionLoadList.h" #include "lldb/Target/StackFrame.h" -#include "lldb/Target/StackFrameRecognizer.h" #include "lldb/Target/ThreadPlanRunToAddress.h" #include "lldb/Target/ThreadPlanStepInRange.h" #include "lldb/Utility/Timer.h" @@ -41,49 +40,8 @@ static ConstString g_coro_frame = ConstString("__coro_frame"); char CPPLanguageRuntime::ID = 0; -/// A frame recognizer that is installed to hide libc++ implementation -/// details from the backtrace. -class LibCXXFrameRecognizer : public StackFrameRecognizer { - RegularExpression m_hidden_function_regex; - RecognizedStackFrameSP m_hidden_frame; - - struct LibCXXHiddenFrame : public RecognizedStackFrame { - bool ShouldHide() override { return true; } - }; - -public: - LibCXXFrameRecognizer() - : m_hidden_function_regex( - R"(^std::__1::(__function.*::operator\(\)|__invoke))" - R"((\[.*\])?)" // ABI tag. - R"(( const)?$)"), // const. 
- m_hidden_frame(new LibCXXHiddenFrame()) {} - - std::string GetName() override { return "libc++ frame recognizer"; } - - lldb::RecognizedStackFrameSP - RecognizeFrame(lldb::StackFrameSP frame_sp) override { - if (!frame_sp) - return {}; - const auto &sc = frame_sp->GetSymbolContext(lldb::eSymbolContextFunction); - if (!sc.function) - return {}; - - if (m_hidden_function_regex.Execute(sc.function->GetNameNoArguments())) - return m_hidden_frame; - - return {}; - } -}; - CPPLanguageRuntime::CPPLanguageRuntime(Process *process) - : LanguageRuntime(process) { - if (process) - process->GetTarget().GetFrameRecognizerManager().AddRecognizer( - StackFrameRecognizerSP(new LibCXXFrameRecognizer()), {}, - std::make_shared("^std::__1::"), - /*first_instruction_only*/ false); -} + : LanguageRuntime(process) {} bool CPPLanguageRuntime::IsAllowedRuntimeValue(ConstString name) { return name == g_this || name == g_promise || name == g_coro_frame; diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h index 5351c1a698b4a7..3026b6113ae8f3 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h @@ -227,9 +227,6 @@ class SWIGBridge { LLDBSwigPython_GetRecognizedArguments(PyObject *implementor, const lldb::StackFrameSP &frame_sp); - static bool LLDBSwigPython_ShouldHide(PyObject *implementor, - const lldb::StackFrameSP &frame_sp); - static bool LLDBSWIGPythonRunScriptKeywordProcess( const char *python_function_name, const char *session_dictionary_name, const lldb::ProcessSP &process, std::string &output); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index 2a94f110910400..335c482f8495ad 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ 
b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -1524,35 +1524,6 @@ lldb::ValueObjectListSP ScriptInterpreterPythonImpl::GetRecognizedArguments( return ValueObjectListSP(); } -bool ScriptInterpreterPythonImpl::ShouldHide( - const StructuredData::ObjectSP &os_plugin_object_sp, - lldb::StackFrameSP frame_sp) { - Locker py_lock(this, Locker::AcquireLock | Locker::NoSTDIN, Locker::FreeLock); - - if (!os_plugin_object_sp) - return false; - - StructuredData::Generic *generic = os_plugin_object_sp->GetAsGeneric(); - if (!generic) - return false; - - PythonObject implementor(PyRefType::Borrowed, - (PyObject *)generic->GetValue()); - - if (!implementor.IsAllocated()) - return false; - - bool result = - SWIGBridge::LLDBSwigPython_ShouldHide(implementor.get(), frame_sp); - - // if it fails, print the error but otherwise go on - if (PyErr_Occurred()) { - PyErr_Print(); - PyErr_Clear(); - } - return result; -} - ScriptedProcessInterfaceUP ScriptInterpreterPythonImpl::CreateScriptedProcessInterface() { return std::make_unique(*this); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h index 85d79955e45efc..c2024efb395d70 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h @@ -107,9 +107,6 @@ class ScriptInterpreterPythonImpl : public ScriptInterpreterPython { GetRecognizedArguments(const StructuredData::ObjectSP &implementor, lldb::StackFrameSP frame_sp) override; - bool ShouldHide(const StructuredData::ObjectSP &implementor, - lldb::StackFrameSP frame_sp) override; - lldb::ScriptedProcessInterfaceUP CreateScriptedProcessInterface() override; lldb::ScriptedThreadInterfaceSP CreateScriptedThreadInterface() override; diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 
b2a0f13b9a1549..3c9247fdbbbc96 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -5545,8 +5545,7 @@ Process::RunThreadPlan(ExecutionContext &exe_ctx, // Print a backtrace into the log so we can figure out where we are: StreamString s; s.PutCString("Thread state after unsuccessful completion: \n"); - thread->GetStackFrameStatus(s, 0, UINT32_MAX, true, UINT32_MAX, - /*show_hidden*/ true); + thread->GetStackFrameStatus(s, 0, UINT32_MAX, true, UINT32_MAX); log->PutString(s.GetString()); } // Restore the thread state if we are going to discard the plan execution. @@ -5820,8 +5819,8 @@ size_t Process::GetThreadStatus(Stream &strm, continue; } thread_sp->GetStatus(strm, start_frame, num_frames, - num_frames_with_source, stop_format, - /*show_hidden*/ num_frames <= 1); + num_frames_with_source, + stop_format); ++num_thread_infos_dumped; } else { Log *log = GetLog(LLDBLog::Process); diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index 0ebaf555f86beb..3a2b4d05b28810 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1198,12 +1198,6 @@ bool StackFrame::IsArtificial() const { return m_stack_frame_kind == StackFrame::Kind::Artificial; } -bool StackFrame::IsHidden() { - if (auto recognized_frame_sp = GetRecognizedFrame()) - return recognized_frame_sp->ShouldHide(); - return false; -} - SourceLanguage StackFrame::GetLanguage() { CompileUnit *cu = GetSymbolContext(eSymbolContextCompUnit).comp_unit; if (cu) @@ -1977,16 +1971,12 @@ bool StackFrame::GetStatus(Stream &strm, bool show_frame_info, bool show_source, } RecognizedStackFrameSP StackFrame::GetRecognizedFrame() { - auto process = GetThread()->GetProcess(); - if (!process) - return {}; - // If recognizer list has been modified, discard cache. 
- auto &manager = process->GetTarget().GetFrameRecognizerManager(); - auto new_generation = manager.GetGeneration(); - if (m_frame_recognizer_generation != new_generation) - m_recognized_frame_sp.reset(); - m_frame_recognizer_generation = new_generation; - if (!m_recognized_frame_sp.has_value()) - m_recognized_frame_sp = manager.RecognizeFrame(CalculateStackFrame()); - return m_recognized_frame_sp.value(); + if (!m_recognized_frame_sp) { + m_recognized_frame_sp = GetThread() + ->GetProcess() + ->GetTarget() + .GetFrameRecognizerManager() + .RecognizeFrame(CalculateStackFrame()); + } + return m_recognized_frame_sp; } diff --git a/lldb/source/Target/StackFrameList.cpp b/lldb/source/Target/StackFrameList.cpp index 7808bd3674ab19..0cf9ce1bf043f5 100644 --- a/lldb/source/Target/StackFrameList.cpp +++ b/lldb/source/Target/StackFrameList.cpp @@ -924,7 +924,7 @@ StackFrameList::GetStackFrameSPForStackFramePtr(StackFrame *stack_frame_ptr) { size_t StackFrameList::GetStatus(Stream &strm, uint32_t first_frame, uint32_t num_frames, bool show_frame_info, uint32_t num_frames_with_source, - bool show_unique, bool show_hidden, + bool show_unique, const char *selected_frame_marker) { size_t num_frames_displayed = 0; @@ -951,6 +951,7 @@ size_t StackFrameList::GetStatus(Stream &strm, uint32_t first_frame, unselected_marker = buffer.c_str(); } const char *marker = nullptr; + for (frame_idx = first_frame; frame_idx < last_frame; ++frame_idx) { frame_sp = GetFrameAtIndex(frame_idx); if (!frame_sp) @@ -962,11 +963,6 @@ size_t StackFrameList::GetStatus(Stream &strm, uint32_t first_frame, else marker = unselected_marker; } - - // Hide uninteresting frames unless it's the selected frame. - if (!show_hidden && frame_sp != selected_frame_sp && frame_sp->IsHidden()) - continue; - // Check for interruption here. 
If we're fetching arguments, this loop // can go slowly: Debugger &dbg = m_thread.GetProcess()->GetTarget().GetDebugger(); diff --git a/lldb/source/Target/StackFrameRecognizer.cpp b/lldb/source/Target/StackFrameRecognizer.cpp index 44411afc65dda9..0ccb1ae9c031e3 100644 --- a/lldb/source/Target/StackFrameRecognizer.cpp +++ b/lldb/source/Target/StackFrameRecognizer.cpp @@ -17,14 +17,10 @@ using namespace lldb; using namespace lldb_private; class ScriptedRecognizedStackFrame : public RecognizedStackFrame { - bool m_hidden; - public: - ScriptedRecognizedStackFrame(ValueObjectListSP args, bool hidden) - : m_hidden(hidden) { - m_arguments = std::move(args); + ScriptedRecognizedStackFrame(ValueObjectListSP args) { + m_arguments = args; } - bool ShouldHide() override { return m_hidden; } }; ScriptedStackFrameRecognizer::ScriptedStackFrameRecognizer( @@ -42,22 +38,13 @@ ScriptedStackFrameRecognizer::RecognizeFrame(lldb::StackFrameSP frame) { ValueObjectListSP args = m_interpreter->GetRecognizedArguments(m_python_object_sp, frame); auto args_synthesized = ValueObjectListSP(new ValueObjectList()); - if (args) { - for (const auto &o : args->GetObjects()) - args_synthesized->Append(ValueObjectRecognizerSynthesizedValue::Create( - *o, eValueTypeVariableArgument)); + for (const auto &o : args->GetObjects()) { + args_synthesized->Append(ValueObjectRecognizerSynthesizedValue::Create( + *o, eValueTypeVariableArgument)); } - bool hidden = m_interpreter->ShouldHide(m_python_object_sp, frame); - return RecognizedStackFrameSP( - new ScriptedRecognizedStackFrame(args_synthesized, hidden)); -} - -void StackFrameRecognizerManager::BumpGeneration() { - uint32_t n = m_generation; - n = (n + 1) & ((1 << 16) - 1); - m_generation = n; + new ScriptedRecognizedStackFrame(args_synthesized)); } void StackFrameRecognizerManager::AddRecognizer( @@ -66,7 +53,6 @@ void StackFrameRecognizerManager::AddRecognizer( m_recognizers.push_front({(uint32_t)m_recognizers.size(), recognizer, false, module, 
RegularExpressionSP(), symbols, RegularExpressionSP(), first_instruction_only}); - BumpGeneration(); } void StackFrameRecognizerManager::AddRecognizer( @@ -75,7 +61,6 @@ void StackFrameRecognizerManager::AddRecognizer( m_recognizers.push_front({(uint32_t)m_recognizers.size(), recognizer, true, ConstString(), module, std::vector(), symbol, first_instruction_only}); - BumpGeneration(); } void StackFrameRecognizerManager::ForEach( @@ -112,12 +97,10 @@ bool StackFrameRecognizerManager::RemoveRecognizerWithID( if (found == m_recognizers.end()) return false; m_recognizers.erase(found); - BumpGeneration(); return true; } void StackFrameRecognizerManager::RemoveAllRecognizers() { - BumpGeneration(); m_recognizers.clear(); } diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index fcf0f4e2519085..74d1a268c6dffb 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -1748,7 +1748,7 @@ std::string Thread::RunModeAsString(lldb::RunMode mode) { size_t Thread::GetStatus(Stream &strm, uint32_t start_frame, uint32_t num_frames, uint32_t num_frames_with_source, - bool stop_format, bool show_hidden, bool only_stacks) { + bool stop_format, bool only_stacks) { if (!only_stacks) { ExecutionContext exe_ctx(shared_from_this()); @@ -1795,7 +1795,7 @@ size_t Thread::GetStatus(Stream &strm, uint32_t start_frame, num_frames_shown = GetStackFrameList()->GetStatus( strm, start_frame, num_frames, show_frame_info, num_frames_with_source, - show_frame_unique, show_hidden, selected_frame_marker); + show_frame_unique, selected_frame_marker); if (num_frames == 1) strm.IndentLess(); strm.IndentLess(); @@ -1893,11 +1893,9 @@ bool Thread::GetDescription(Stream &strm, lldb::DescriptionLevel level, size_t Thread::GetStackFrameStatus(Stream &strm, uint32_t first_frame, uint32_t num_frames, bool show_frame_info, - uint32_t num_frames_with_source, - bool show_hidden) { - return GetStackFrameList()->GetStatus(strm, first_frame, num_frames, - show_frame_info, 
num_frames_with_source, - /*show_unique*/ false, show_hidden); + uint32_t num_frames_with_source) { + return GetStackFrameList()->GetStatus( + strm, first_frame, num_frames, show_frame_info, num_frames_with_source); } Unwind &Thread::GetUnwinder() { diff --git a/lldb/source/Target/ThreadPlanStepOut.cpp b/lldb/source/Target/ThreadPlanStepOut.cpp index 8ca1dbc2fe4c46..0a1e2ae605efcf 100644 --- a/lldb/source/Target/ThreadPlanStepOut.cpp +++ b/lldb/source/Target/ThreadPlanStepOut.cpp @@ -58,7 +58,7 @@ ThreadPlanStepOut::ThreadPlanStepOut( return; // we can't do anything here. ValidatePlan() will return false. // While stepping out, behave as-if artificial frames are not present. - while (return_frame_sp->IsArtificial() || return_frame_sp->IsHidden()) { + while (return_frame_sp->IsArtificial()) { m_stepped_past_frames.push_back(return_frame_sp); ++return_frame_index; diff --git a/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py b/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py index 6174ac61a709dd..eea0aafce6e25e 100644 --- a/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py +++ b/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py @@ -162,46 +162,6 @@ def test_frame_recognizer_1(self): substrs=['*a = 78']) """ - @skipUnlessDarwin - def test_frame_recognizer_hiding(self): - self.build() - - target, process, thread, _ = lldbutil.run_to_name_breakpoint(self, "nested") - frame = thread.GetSelectedFrame() - - # Sanity check. - self.expect( - "thread backtrace", patterns=["frame.*nested", "frame.*baz", "frame.*main"] - ) - - self.expect("frame recognizer clear") - self.expect( - "command script import " - + os.path.join(self.getSourceDir(), "recognizer.py") - ) - - self.expect( - "frame recognizer add -l recognizer.BazFrameRecognizer -f false -s a.out -n baz" - ) - - self.expect( - "frame recognizer list", - substrs=["0: recognizer.BazFrameRecognizer"], - ) - - # Now main should be hidden. 
- self.expect("thread backtrace", matching=False, patterns=["frame.*baz"]) - self.assertFalse(frame.IsHidden()) - frame = thread.SetSelectedFrame(1) - self.assertIn("baz", frame.name) - self.assertTrue(frame.IsHidden()) - - # Test StepOut. - frame = thread.SetSelectedFrame(0) - thread.StepOut() - frame = thread.GetSelectedFrame() - self.assertIn("main", frame.name) - @skipUnlessDarwin def test_frame_recognizer_multi_symbol(self): self.build() diff --git a/lldb/test/API/commands/frame/recognizer/main.m b/lldb/test/API/commands/frame/recognizer/main.m index 74d219f1fff4c5..6546692bba772e 100644 --- a/lldb/test/API/commands/frame/recognizer/main.m +++ b/lldb/test/API/commands/frame/recognizer/main.m @@ -1,17 +1,16 @@ #import -void foo(int a, int b) { printf("%d %d\n", a, b); } +void foo(int a, int b) +{ + printf("%d %d\n", a, b); +} void bar(int *ptr) { printf("%d\n", *ptr); } -void nested(int *ptr) { bar(ptr); } - -void baz(int *ptr) { nested(ptr); } - -int main(int argc, const char *argv[]) { - foo(42, 56); - int i = 78; - bar(&i); - baz(&i); - return 0; +int main (int argc, const char * argv[]) +{ + foo(42, 56); + int i = 78; + bar(&i); + return 0; } diff --git a/lldb/test/API/commands/frame/recognizer/recognizer.py b/lldb/test/API/commands/frame/recognizer/recognizer.py index 98666b720b1e2b..1a2a2d5c265070 100644 --- a/lldb/test/API/commands/frame/recognizer/recognizer.py +++ b/lldb/test/API/commands/frame/recognizer/recognizer.py @@ -36,8 +36,3 @@ def get_recognized_arguments(self, frame): class MyOtherFrameRecognizer(object): def get_recognized_arguments(self, frame): return [] - - -class BazFrameRecognizer(object): - def should_hide(self, frame): - return "baz" in frame.name diff --git a/lldb/test/API/lang/cpp/std-function-recognizer/Makefile b/lldb/test/API/lang/cpp/std-function-recognizer/Makefile deleted file mode 100644 index ab034edd121f9f..00000000000000 --- a/lldb/test/API/lang/cpp/std-function-recognizer/Makefile +++ /dev/null @@ -1,4 +0,0 @@ 
-CXX_SOURCES := main.cpp -USE_LIBCPP := 1 - -include Makefile.rules diff --git a/lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py b/lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py deleted file mode 100644 index 30fe3ecb1e4bf4..00000000000000 --- a/lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py +++ /dev/null @@ -1,84 +0,0 @@ -import lldb -from lldbsuite.test.decorators import * -from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil - - -class LibCxxStdFunctionRecognizerTestCase(TestBase): - NO_DEBUG_INFO_TESTCASE = True - - @add_test_categories(["libc++"]) - def test_backtrace(self): - """Test that std::function implementation details are hidden in bt""" - self.build() - (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( - self, "// break here", lldb.SBFileSpec("main.cpp") - ) - # Filtered. - self.expect( - "thread backtrace", - ordered=True, - substrs=["frame", "foo", "frame", "main"], - ) - self.expect( - "thread backtrace", matching=False, patterns=["frame.*std::__1::__function"] - ) - # Unfiltered. 
- self.expect( - "thread backtrace -u", - ordered=True, - patterns=["frame.*foo", "frame.*std::__1::__function", "frame.*main"], - ) - self.expect( - "thread backtrace --unfiltered", - ordered=True, - patterns=["frame.*foo", "frame.*std::__1::__function", "frame.*main"], - ) - - @add_test_categories(["libc++"]) - def test_up_down(self): - """Test that std::function implementation details are skipped""" - self.build() - (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( - self, "// break here", lldb.SBFileSpec("main.cpp") - ) - frame = thread.GetSelectedFrame() - # up - self.assertIn("foo", frame.GetFunctionName()) - start_idx = frame.GetFrameID() - i = 0 - while i < thread.GetNumFrames(): - self.expect("up") - frame = thread.GetSelectedFrame() - if frame.GetFunctionName() == "main": - break - end_idx = frame.GetFrameID() - self.assertLess(i, end_idx - start_idx, "skipped frames") - - # Back down again. - start_idx = frame.GetFrameID() - for i in range(1, thread.GetNumFrames()): - self.expect("down") - frame = thread.GetSelectedFrame() - if "foo" in frame.GetFunctionName(): - break - end_idx = frame.GetFrameID() - self.assertLess(i, start_idx - end_idx, "skipped frames") - - @add_test_categories(["libc++"]) - def test_api(self): - """Test that std::function implementation details are skipped""" - self.build() - (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( - self, "// break here", lldb.SBFileSpec("main.cpp") - ) - frame = thread.GetSelectedFrame() - num_hidden = 0 - for i in range(1, thread.GetNumFrames()): - thread.SetSelectedFrame(i) - frame = thread.GetSelectedFrame() - if frame.IsHidden(): - num_hidden += 1 - - self.assertGreater(num_hidden, 0) - self.assertLess(num_hidden, thread.GetNumFrames()) diff --git a/lldb/test/API/lang/cpp/std-function-recognizer/main.cpp b/lldb/test/API/lang/cpp/std-function-recognizer/main.cpp deleted file mode 100644 index 8cf4eaa2e51929..00000000000000 --- 
a/lldb/test/API/lang/cpp/std-function-recognizer/main.cpp +++ /dev/null @@ -1,10 +0,0 @@ -#include - -int foo(int x, int y) { - return x * y; // break here -} - -int main(int argc, char *argv[]) { - std::function fn = foo; - return fn(argc, 1); -} From 125aa10b3d645bd26523a1bc321bb2e6b1cf04e1 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Thu, 22 Aug 2024 14:05:17 +0200 Subject: [PATCH 184/426] [clang][bytecode] Fix void unary * operators (#105640) Discard the subexpr. --- clang/lib/AST/ByteCode/Compiler.cpp | 2 +- clang/test/AST/ByteCode/invalid.cpp | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 10f3222726fd43..9d376641f9c5a3 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -5145,7 +5145,7 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { // We should already have a pointer when we get here. return this->delegate(SubExpr); case UO_Deref: // *x - if (DiscardResult) + if (DiscardResult || E->getType()->isVoidType()) return this->discard(SubExpr); return this->visit(SubExpr); case UO_Not: // ~x diff --git a/clang/test/AST/ByteCode/invalid.cpp b/clang/test/AST/ByteCode/invalid.cpp index 522ad02f71ce07..3c142481f78119 100644 --- a/clang/test/AST/ByteCode/invalid.cpp +++ b/clang/test/AST/ByteCode/invalid.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -fexperimental-new-constant-interpreter -verify %s -// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -verify=ref %s +// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -verify=ref,both %s namespace Throw { @@ -65,4 +65,9 @@ namespace Casts { // ref-error {{must be initialized by a constant expression}} \ // ref-note {{reinterpret_cast is not allowed}} + void func() { + struct B {}; + B b; + (void)*reinterpret_cast(&b); // both-error 
{{indirection not permitted on operand of type 'void *'}} + } } From 6932f47cfdf4734d68759586047aee240861058e Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Thu, 22 Aug 2024 12:04:26 +0000 Subject: [PATCH 185/426] [NFC][VPlan] Correct two typos in comments. --- llvm/lib/Transforms/Vectorize/VPlan.h | 2 +- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 24da8f6700dfae..36a1aa08654d5b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2042,7 +2042,7 @@ class VPBlendRecipe : public VPSingleDefRecipe { public: /// The blend operation is a User of the incoming values and of their /// respective masks, ordered [I0, M0, I1, M1, I2, M2, ...]. Note that M0 can - /// be ommited (implied by passing an odd number of operands) in which case + /// be omitted (implied by passing an odd number of operands) in which case /// all other incoming values are merged into it. VPBlendRecipe(PHINode *Phi, ArrayRef Operands) : VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, Phi->getDebugLoc()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 55e90298b36cda..8deded031dc391 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -914,7 +914,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (Blend->isNormalized()) return; - // Normalize the blend so its first incomming value is used as the initial + // Normalize the blend so its first incoming value is used as the initial // value with the others blended into it. 
unsigned StartIndex = 0; From d7da79f2cd025ab1a526c7011aab062817a656b2 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 22 Aug 2024 05:47:31 -0700 Subject: [PATCH 186/426] [NFC][SetTheory] Refactor to use const pointers and range loops (#105544) - Refactor SetTheory code to use const pointers when possible. - Use auto for variables initialized using dyn_cast<>. - Use range based for loops and early continue. --- clang/utils/TableGen/NeonEmitter.cpp | 9 +- llvm/include/llvm/TableGen/SetTheory.h | 10 +- llvm/lib/TableGen/SetTheory.cpp | 94 +++++++++---------- .../TableGen/Common/CodeGenRegisters.cpp | 3 +- .../utils/TableGen/Common/CodeGenSchedule.cpp | 4 +- 5 files changed, 61 insertions(+), 59 deletions(-) diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index 30fbb8c5d65e5f..8ec8e67388bbd2 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -1569,7 +1569,7 @@ std::pair Intrinsic::DagEmitter::emitDagShuffle(DagInit *DI){ // See the documentation in arm_neon.td for a description of these operators. 
class LowHalf : public SetTheory::Operator { public: - void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts, + void apply(SetTheory &ST, const DagInit *Expr, SetTheory::RecSet &Elts, ArrayRef Loc) override { SetTheory::RecSet Elts2; ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts2, Loc); @@ -1579,7 +1579,7 @@ std::pair Intrinsic::DagEmitter::emitDagShuffle(DagInit *DI){ class HighHalf : public SetTheory::Operator { public: - void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts, + void apply(SetTheory &ST, const DagInit *Expr, SetTheory::RecSet &Elts, ArrayRef Loc) override { SetTheory::RecSet Elts2; ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts2, Loc); @@ -1593,7 +1593,7 @@ std::pair Intrinsic::DagEmitter::emitDagShuffle(DagInit *DI){ public: Rev(unsigned ElementSize) : ElementSize(ElementSize) {} - void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts, + void apply(SetTheory &ST, const DagInit *Expr, SetTheory::RecSet &Elts, ArrayRef Loc) override { SetTheory::RecSet Elts2; ST.evaluate(Expr->arg_begin() + 1, Expr->arg_end(), Elts2, Loc); @@ -1618,7 +1618,8 @@ std::pair Intrinsic::DagEmitter::emitDagShuffle(DagInit *DI){ public: MaskExpander(unsigned N) : N(N) {} - void expand(SetTheory &ST, Record *R, SetTheory::RecSet &Elts) override { + void expand(SetTheory &ST, const Record *R, + SetTheory::RecSet &Elts) override { unsigned Addend = 0; if (R->getName() == "mask0") Addend = 0; diff --git a/llvm/include/llvm/TableGen/SetTheory.h b/llvm/include/llvm/TableGen/SetTheory.h index 4cff688164b0c4..954453b783d4d8 100644 --- a/llvm/include/llvm/TableGen/SetTheory.h +++ b/llvm/include/llvm/TableGen/SetTheory.h @@ -76,7 +76,7 @@ class SetTheory { /// apply - Apply this operator to Expr's arguments and insert the result /// in Elts. 
- virtual void apply(SetTheory&, DagInit *Expr, RecSet &Elts, + virtual void apply(SetTheory &, const DagInit *Expr, RecSet &Elts, ArrayRef Loc) = 0; }; @@ -89,13 +89,13 @@ class SetTheory { public: virtual ~Expander() = default; - virtual void expand(SetTheory&, Record*, RecSet &Elts) = 0; + virtual void expand(SetTheory &, const Record *, RecSet &Elts) = 0; }; private: // Map set defs to their fully expanded contents. This serves as a memoization // cache and it makes it possible to return const references on queries. - using ExpandMap = std::map; + using ExpandMap = std::map; ExpandMap Expansions; // Known DAG operators by name. @@ -125,7 +125,7 @@ class SetTheory { void addOperator(StringRef Name, std::unique_ptr); /// evaluate - Evaluate Expr and append the resulting set to Elts. - void evaluate(Init *Expr, RecSet &Elts, ArrayRef Loc); + void evaluate(const Init *Expr, RecSet &Elts, ArrayRef Loc); /// evaluate - Evaluate a sequence of Inits and append to Elts. template @@ -137,7 +137,7 @@ class SetTheory { /// expand - Expand a record into a set of elements if possible. Return a /// pointer to the expanded elements, or NULL if Set cannot be expanded /// further. - const RecVec *expand(Record *Set); + const RecVec *expand(const Record *Set); }; } // end namespace llvm diff --git a/llvm/lib/TableGen/SetTheory.cpp b/llvm/lib/TableGen/SetTheory.cpp index f4e3e3d4ce473b..edb99827f7c676 100644 --- a/llvm/lib/TableGen/SetTheory.cpp +++ b/llvm/lib/TableGen/SetTheory.cpp @@ -13,6 +13,7 @@ #include "llvm/TableGen/SetTheory.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" @@ -36,7 +37,7 @@ using RecVec = SetTheory::RecVec; // (add a, b, ...) Evaluate and union all arguments. 
struct AddOp : public SetTheory::Operator { - void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, + void apply(SetTheory &ST, const DagInit *Expr, RecSet &Elts, ArrayRef Loc) override { ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts, Loc); } @@ -44,7 +45,7 @@ struct AddOp : public SetTheory::Operator { // (sub Add, Sub, ...) Set difference. struct SubOp : public SetTheory::Operator { - void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, + void apply(SetTheory &ST, const DagInit *Expr, RecSet &Elts, ArrayRef Loc) override { if (Expr->arg_size() < 2) PrintFatalError(Loc, "Set difference needs at least two arguments: " + @@ -60,7 +61,7 @@ struct SubOp : public SetTheory::Operator { // (and S1, S2) Set intersection. struct AndOp : public SetTheory::Operator { - void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, + void apply(SetTheory &ST, const DagInit *Expr, RecSet &Elts, ArrayRef Loc) override { if (Expr->arg_size() != 2) PrintFatalError(Loc, "Set intersection requires two arguments: " + @@ -76,17 +77,17 @@ struct AndOp : public SetTheory::Operator { // SetIntBinOp - Abstract base class for (Op S, N) operators. 
struct SetIntBinOp : public SetTheory::Operator { - virtual void apply2(SetTheory &ST, DagInit *Expr, RecSet &Set, int64_t N, - RecSet &Elts, ArrayRef Loc) = 0; + virtual void apply2(SetTheory &ST, const DagInit *Expr, RecSet &Set, + int64_t N, RecSet &Elts, ArrayRef Loc) = 0; - void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, + void apply(SetTheory &ST, const DagInit *Expr, RecSet &Elts, ArrayRef Loc) override { if (Expr->arg_size() != 2) PrintFatalError(Loc, "Operator requires (Op Set, Int) arguments: " + Expr->getAsString()); RecSet Set; ST.evaluate(Expr->arg_begin()[0], Set, Loc); - IntInit *II = dyn_cast(Expr->arg_begin()[1]); + const auto *II = dyn_cast(Expr->arg_begin()[1]); if (!II) PrintFatalError(Loc, "Second argument must be an integer: " + Expr->getAsString()); @@ -96,7 +97,7 @@ struct SetIntBinOp : public SetTheory::Operator { // (shl S, N) Shift left, remove the first N elements. struct ShlOp : public SetIntBinOp { - void apply2(SetTheory &ST, DagInit *Expr, RecSet &Set, int64_t N, + void apply2(SetTheory &ST, const DagInit *Expr, RecSet &Set, int64_t N, RecSet &Elts, ArrayRef Loc) override { if (N < 0) PrintFatalError(Loc, "Positive shift required: " + @@ -108,7 +109,7 @@ struct ShlOp : public SetIntBinOp { // (trunc S, N) Truncate after the first N elements. struct TruncOp : public SetIntBinOp { - void apply2(SetTheory &ST, DagInit *Expr, RecSet &Set, int64_t N, + void apply2(SetTheory &ST, const DagInit *Expr, RecSet &Set, int64_t N, RecSet &Elts, ArrayRef Loc) override { if (N < 0) PrintFatalError(Loc, "Positive length required: " + @@ -125,7 +126,7 @@ struct RotOp : public SetIntBinOp { RotOp(bool Rev) : Reverse(Rev) {} - void apply2(SetTheory &ST, DagInit *Expr, RecSet &Set, int64_t N, + void apply2(SetTheory &ST, const DagInit *Expr, RecSet &Set, int64_t N, RecSet &Elts, ArrayRef Loc) override { if (Reverse) N = -N; @@ -143,7 +144,7 @@ struct RotOp : public SetIntBinOp { // (decimate S, N) Pick every N'th element of S. 
struct DecimateOp : public SetIntBinOp { - void apply2(SetTheory &ST, DagInit *Expr, RecSet &Set, int64_t N, + void apply2(SetTheory &ST, const DagInit *Expr, RecSet &Set, int64_t N, RecSet &Elts, ArrayRef Loc) override { if (N <= 0) PrintFatalError(Loc, "Positive stride required: " + @@ -155,62 +156,62 @@ struct DecimateOp : public SetIntBinOp { // (interleave S1, S2, ...) Interleave elements of the arguments. struct InterleaveOp : public SetTheory::Operator { - void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, + void apply(SetTheory &ST, const DagInit *Expr, RecSet &Elts, ArrayRef Loc) override { // Evaluate the arguments individually. - SmallVector Args(Expr->getNumArgs()); + SmallVector Values(Expr->getNumArgs()); unsigned MaxSize = 0; - for (unsigned i = 0, e = Expr->getNumArgs(); i != e; ++i) { - ST.evaluate(Expr->getArg(i), Args[i], Loc); - MaxSize = std::max(MaxSize, unsigned(Args[i].size())); + for (auto [Arg, Value] : zip(Expr->getArgs(), Values)) { + ST.evaluate(Arg, Value, Loc); + MaxSize = std::max(MaxSize, unsigned(Value.size())); } // Interleave arguments into Elts. for (unsigned n = 0; n != MaxSize; ++n) - for (unsigned i = 0, e = Expr->getNumArgs(); i != e; ++i) - if (n < Args[i].size()) - Elts.insert(Args[i][n]); + for (const RecSet &Value : Values) + if (n < Value.size()) + Elts.insert(Value[n]); } }; // (sequence "Format", From, To) Generate a sequence of records by name. 
struct SequenceOp : public SetTheory::Operator { - void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, + void apply(SetTheory &ST, const DagInit *Expr, RecSet &Elts, ArrayRef Loc) override { int Step = 1; if (Expr->arg_size() > 4) PrintFatalError(Loc, "Bad args to (sequence \"Format\", From, To): " + Expr->getAsString()); - else if (Expr->arg_size() == 4) { - if (IntInit *II = dyn_cast(Expr->arg_begin()[3])) { + if (Expr->arg_size() == 4) { + if (const auto *II = dyn_cast(Expr->arg_begin()[3])) Step = II->getValue(); - } else + else PrintFatalError(Loc, "Stride must be an integer: " + Expr->getAsString()); } std::string Format; - if (StringInit *SI = dyn_cast(Expr->arg_begin()[0])) + if (const auto *SI = dyn_cast(Expr->arg_begin()[0])) Format = std::string(SI->getValue()); else PrintFatalError(Loc, "Format must be a string: " + Expr->getAsString()); int64_t From, To; - if (IntInit *II = dyn_cast(Expr->arg_begin()[1])) + if (const auto *II = dyn_cast(Expr->arg_begin()[1])) From = II->getValue(); else PrintFatalError(Loc, "From must be an integer: " + Expr->getAsString()); if (From < 0 || From >= (1 << 30)) PrintFatalError(Loc, "From out of range"); - if (IntInit *II = dyn_cast(Expr->arg_begin()[2])) + if (const auto *II = dyn_cast(Expr->arg_begin()[2])) To = II->getValue(); else PrintFatalError(Loc, "To must be an integer: " + Expr->getAsString()); if (To < 0 || To >= (1 << 30)) PrintFatalError(Loc, "To out of range"); - RecordKeeper &Records = - cast(Expr->getOperator())->getDef()->getRecords(); + const RecordKeeper &Records = + cast(Expr->getOperator())->getDef()->getRecords(); Step *= From <= To ? 
1 : -1; while (true) { @@ -242,7 +243,7 @@ struct FieldExpander : public SetTheory::Expander { FieldExpander(StringRef fn) : FieldName(fn) {} - void expand(SetTheory &ST, Record *Def, RecSet &Elts) override { + void expand(SetTheory &ST, const Record *Def, RecSet &Elts) override { ST.evaluate(Def->getValueInit(FieldName), Elts, Def->getLoc()); } }; @@ -278,9 +279,9 @@ void SetTheory::addFieldExpander(StringRef ClassName, StringRef FieldName) { addExpander(ClassName, std::make_unique(FieldName)); } -void SetTheory::evaluate(Init *Expr, RecSet &Elts, ArrayRef Loc) { +void SetTheory::evaluate(const Init *Expr, RecSet &Elts, ArrayRef Loc) { // A def in a list can be a just an element, or it may expand. - if (DefInit *Def = dyn_cast(Expr)) { + if (const auto *Def = dyn_cast(Expr)) { if (const RecVec *Result = expand(Def->getDef())) return Elts.insert(Result->begin(), Result->end()); Elts.insert(Def->getDef()); @@ -288,14 +289,14 @@ void SetTheory::evaluate(Init *Expr, RecSet &Elts, ArrayRef Loc) { } // Lists simply expand. - if (ListInit *LI = dyn_cast(Expr)) + if (const auto *LI = dyn_cast(Expr)) return evaluate(LI->begin(), LI->end(), Elts, Loc); // Anything else must be a DAG. - DagInit *DagExpr = dyn_cast(Expr); + const auto *DagExpr = dyn_cast(Expr); if (!DagExpr) PrintFatalError(Loc, "Invalid set element: " + Expr->getAsString()); - DefInit *OpInit = dyn_cast(DagExpr->getOperator()); + const DefInit *OpInit = dyn_cast(DagExpr->getOperator()); if (!OpInit) PrintFatalError(Loc, "Bad set expression: " + Expr->getAsString()); auto I = Operators.find(OpInit->getDef()->getName()); @@ -304,27 +305,26 @@ void SetTheory::evaluate(Init *Expr, RecSet &Elts, ArrayRef Loc) { I->second->apply(*this, DagExpr, Elts, Loc); } -const RecVec *SetTheory::expand(Record *Set) { +const RecVec *SetTheory::expand(const Record *Set) { // Check existing entries for Set and return early. 
ExpandMap::iterator I = Expansions.find(Set); if (I != Expansions.end()) return &I->second; // This is the first time we see Set. Find a suitable expander. - ArrayRef> SC = Set->getSuperClasses(); - for (const auto &SCPair : SC) { + for (const auto &[SuperClass, Loc] : Set->getSuperClasses()) { // Skip unnamed superclasses. - if (!isa(SCPair.first->getNameInit())) + if (!isa(SuperClass->getNameInit())) continue; - auto I = Expanders.find(SCPair.first->getName()); - if (I != Expanders.end()) { - // This breaks recursive definitions. - RecVec &EltVec = Expansions[Set]; - RecSet Elts; - I->second->expand(*this, Set, Elts); - EltVec.assign(Elts.begin(), Elts.end()); - return &EltVec; - } + auto I = Expanders.find(SuperClass->getName()); + if (I == Expanders.end()) + continue; + // This breaks recursive definitions. + RecVec &EltVec = Expansions[Set]; + RecSet Elts; + I->second->expand(*this, Set, Elts); + EltVec.assign(Elts.begin(), Elts.end()); + return &EltVec; } // Set is not expandable. 
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp index b5a6c1395c60e8..ee58cad358a4f1 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp @@ -625,7 +625,8 @@ struct TupleExpander : SetTheory::Expander { TupleExpander(std::vector> &SynthDefs) : SynthDefs(SynthDefs) {} - void expand(SetTheory &ST, Record *Def, SetTheory::RecSet &Elts) override { + void expand(SetTheory &ST, const Record *Def, + SetTheory::RecSet &Elts) override { std::vector Indices = Def->getValueAsListOfDefs("SubRegIndices"); unsigned Dim = Indices.size(); ListInit *SubRegs = Def->getValueAsListInit("SubRegs"); diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp index 6386cc8eb32db3..5c266808f2e272 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp @@ -43,7 +43,7 @@ namespace { // (instrs a, b, ...) Evaluate and union all arguments. Identical to AddOp. struct InstrsOp : public SetTheory::Operator { - void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts, + void apply(SetTheory &ST, const DagInit *Expr, SetTheory::RecSet &Elts, ArrayRef Loc) override { ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts, Loc); } @@ -75,7 +75,7 @@ struct InstRegexOp : public SetTheory::Operator { return Result; } - void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts, + void apply(SetTheory &ST, const DagInit *Expr, SetTheory::RecSet &Elts, ArrayRef Loc) override { ArrayRef Instructions = Target.getInstructionsByEnumValue(); From c73b14ceaaea9b98d7318b97b70453388e758704 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 22 Aug 2024 08:51:21 -0400 Subject: [PATCH 187/426] [libc++] Fix the documentation build There was a duplicate link target. 
--- libcxx/docs/Status/Cxx17.rst | 4 ++-- libcxx/docs/Status/Cxx17Papers.csv | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libcxx/docs/Status/Cxx17.rst b/libcxx/docs/Status/Cxx17.rst index c1073c0b411b06..13928fe9e1b272 100644 --- a/libcxx/docs/Status/Cxx17.rst +++ b/libcxx/docs/Status/Cxx17.rst @@ -40,14 +40,14 @@ Paper Status .. note:: - .. [#note-P0067] P0067: ``std::(to|from)_chars`` for integrals has been available since version 7.0. ``std::to_chars`` for ``float`` and ``double`` since version 14.0 ``std::to_chars`` for ``long double`` uses the implementation for ``double``. + .. [#note-P0067R5] P0067R5: ``std::(to|from)_chars`` for integrals has been available since version 7.0. ``std::to_chars`` for ``float`` and ``double`` since version 14.0 ``std::to_chars`` for ``long double`` uses the implementation for ``double``. .. [#note-P0226] P0226: Progress is tracked `here `_. .. [#note-P0607] P0607: The parts of P0607 that are not done are the ```` bits. .. [#note-P0154] P0154: The required macros are only implemented as of clang 19. .. [#note-P0452] P0452: The changes to ``std::transform_inclusive_scan`` and ``std::transform_exclusive_scan`` have not yet been implemented. .. [#note-P0156] P0156: That paper was pulled out of the draft at the 2017-01 meeting in Kona. .. [#note-P0181] P0181: That paper was pulled out of the draft at the 2017-01 meeting in Kona. - .. [#note-P0067] P0067: That paper was resolved by `P0067R5 `__. + .. [#note-P0067R3] P0067R3: That paper was resolved by `P0067R5 `__. .. [#note-LWG2587] LWG2587: That LWG issue was resolved by `LWG2567 `__. .. [#note-LWG2588] LWG2588: That LWG issue was resolved by `LWG2568 `__. .. [#note-LWG2955] LWG2955: That LWG issue was resolved by `P0682R1 `__. 
diff --git a/libcxx/docs/Status/Cxx17Papers.csv b/libcxx/docs/Status/Cxx17Papers.csv index 0aeb15f18b76bb..0c65436c3b6bcd 100644 --- a/libcxx/docs/Status/Cxx17Papers.csv +++ b/libcxx/docs/Status/Cxx17Papers.csv @@ -44,7 +44,7 @@ "`P0032R3 `__","Homogeneous interface for variant, any and optional","2016-06 (Oulu)","|Complete|","4.0","" "`P0040R3 `__","Extending memory management tools","2016-06 (Oulu)","|Complete|","4.0","" "`P0063R3 `__","C++17 should refer to C11 instead of C99","2016-06 (Oulu)","|Complete|","7.0","" -"`P0067R3 `__","Elementary string conversions","2016-06 (Oulu)","|Nothing To Do| [#note-P0067]_","n/a","" +"`P0067R3 `__","Elementary string conversions","2016-06 (Oulu)","|Nothing To Do| [#note-P0067R3]_","n/a","" "`P0083R3 `__","Splicing Maps and Sets","2016-06 (Oulu)","|Complete|","8.0","" "`P0084R2 `__","Emplace Return Type","2016-06 (Oulu)","|Complete|","4.0","" "`P0088R3 `__","Variant: a type-safe union for C++17","2016-06 (Oulu)","|Complete|","4.0","" @@ -71,7 +71,7 @@ "`P0394R4 `__","Hotel Parallelifornia: terminate() for Parallel Algorithms Exception Handling","2016-06 (Oulu)","|Complete|","17.0","" "","","","","","" "`P0003R5 `__","Removing Deprecated Exception Specifications from C++17","2016-11 (Issaquah)","|Complete|","5.0","" -"`P0067R5 `__","Elementary string conversions, revision 5","2016-11 (Issaquah)","|Partial| [#note-P0067]_","","" +"`P0067R5 `__","Elementary string conversions, revision 5","2016-11 (Issaquah)","|Partial| [#note-P0067R5]_","","" "`P0403R1 `__","Literal suffixes for ``basic_string_view``\ ","2016-11 (Issaquah)","|Complete|","4.0","" "`P0414R2 `__","Merging shared_ptr changes from Library Fundamentals to C++17","2016-11 (Issaquah)","|Complete|","11.0","" "`P0418R2 `__","Fail or succeed: there is no atomic lattice","2016-11 (Issaquah)","","","" From 6d30b67cf0fdd5f417af53b4acd593ded37b2db9 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 22 Aug 2024 09:01:31 -0400 Subject: [PATCH 188/426] [libc++] Add link to 
the Github conformance table from the documentation --- libcxx/docs/Status/Cxx17.rst | 2 -- libcxx/docs/Status/Cxx20.rst | 2 -- libcxx/docs/Status/Cxx23.rst | 2 -- libcxx/docs/index.rst | 9 +++++++-- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/libcxx/docs/Status/Cxx17.rst b/libcxx/docs/Status/Cxx17.rst index 13928fe9e1b272..94f9d890c36417 100644 --- a/libcxx/docs/Status/Cxx17.rst +++ b/libcxx/docs/Status/Cxx17.rst @@ -18,8 +18,6 @@ In February 2017, the C++ standard committee approved this draft, and sent it to This page shows the status of libc++; the status of clang's support of the language features is `here `__. -.. attention:: Features in unreleased drafts of the standard are subject to change. - The groups that have contributed papers: - CWG - Core Language Working group diff --git a/libcxx/docs/Status/Cxx20.rst b/libcxx/docs/Status/Cxx20.rst index f5b35d7ccc39e7..2a26212f1f8806 100644 --- a/libcxx/docs/Status/Cxx20.rst +++ b/libcxx/docs/Status/Cxx20.rst @@ -18,8 +18,6 @@ In September 2020, the C++ standard committee approved this draft, and sent it t This page shows the status of libc++; the status of clang's support of the language features is `here `__. -.. attention:: Features in unreleased drafts of the standard are subject to change. - The groups that have contributed papers: - CWG - Core Language Working group diff --git a/libcxx/docs/Status/Cxx23.rst b/libcxx/docs/Status/Cxx23.rst index b3918149a735f1..1a8d43bff74752 100644 --- a/libcxx/docs/Status/Cxx23.rst +++ b/libcxx/docs/Status/Cxx23.rst @@ -18,8 +18,6 @@ In February 2023, the C++ standard committee approved this draft, and sent it to This page shows the status of libc++; the status of clang's support of the language features is `here `__. -.. attention:: Features in unreleased drafts of the standard are subject to change. 
- The groups that have contributed papers: - CWG - Core Language Working group diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index c3b724568bc51e..2dc08563358aba 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -168,8 +168,13 @@ to be formally supported and listed here, please work with the libc++ team to se up testing for your configuration. -C++ Dialect Support -=================== +C++ Standards Conformance +========================= + +Libc++ provides full support for C++11 and C++14, and provides most of newer standards +with a few omissions. The conformance status of the library's tip is tracked in real-time +using `this page `_. The conformance status of +this release is described in the pages below: * C++11 - Complete * C++14 - Complete From a964635939ed9fadcaf6833b29f4ebeb9a9df4ef Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Thu, 22 Aug 2024 14:11:22 +0100 Subject: [PATCH 189/426] [mlir][OpenMP] Add optional alloc region to reduction decl (#102522) This region is intended to separate alloca operations from reduction variable initialization. This makes it easier to hoist allocas to the entry block before control flow and complex code for initialization. The verifier checks that there is at most one block in the alloc region. This is not sufficient to avoid control flow in general MLIR, but by the time we are converting to LLVMIR structured control flow should already have been lowered to the cf dialect. 
1/3 Part 2: https://github.com/llvm/llvm-project/pull/102524 Part 3: https://github.com/llvm/llvm-project/pull/102525 --- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 40 +++++++-- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 72 ++++++++++++---- mlir/test/Dialect/OpenMP/invalid.mlir | 85 ++++++++++++++++++- mlir/test/Dialect/OpenMP/ops.mlir | 30 +++++++ 4 files changed, 201 insertions(+), 26 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index d63fdd88f79104..739b1f67be7cb2 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -1528,21 +1528,32 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove, Symbol]> { let summary = "declares a reduction kind"; let description = [{ - Declares an OpenMP reduction kind. This requires two mandatory and two + Declares an OpenMP reduction kind. This requires two mandatory and three optional regions. - 1. The initializer region specifies how to initialize the thread-local + 1. The optional alloc region specifies how to allocate the thread-local + reduction value. This region should not contain control flow and all + IR should be suitable for inlining straight into an entry block. In + the common case this is expected to contain only allocas. It is + expected to `omp.yield` the allocated value on all control paths. + If allocation is conditional (e.g. only allocate if the mold is + allocated), this should be done in the initializer region and this + region not included. The alloc region is not used for by-value + reductions (where allocation is implicit). + 2. The initializer region specifies how to initialize the thread-local reduction value. This is usually the neutral element of the reduction. For convenience, the region has an argument that contains the value - of the reduction accumulator at the start of the reduction. 
It is - expected to `omp.yield` the new value on all control flow paths. - 2. The reduction region specifies how to combine two values into one, i.e. + of the reduction accumulator at the start of the reduction. If an alloc + region is specified, there is a second block argument containing the + address of the allocated memory. The initializer region is expected to + `omp.yield` the new value on all control flow paths. + 3. The reduction region specifies how to combine two values into one, i.e. the reduction operator. It accepts the two values as arguments and is expected to `omp.yield` the combined value on all control flow paths. - 3. The atomic reduction region is optional and specifies how two values + 4. The atomic reduction region is optional and specifies how two values can be combined atomically given local accumulator variables. It is expected to store the combined value in the first accumulator variable. - 4. The cleanup region is optional and specifies how to clean up any memory + 5. The cleanup region is optional and specifies how to clean up any memory allocated by the initializer region. The region has an argument that contains the value of the thread-local reduction accumulator. This will be executed after the reduction has completed. 
@@ -1558,12 +1569,14 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove, let arguments = (ins SymbolNameAttr:$sym_name, TypeAttr:$type); - let regions = (region AnyRegion:$initializerRegion, + let regions = (region MaxSizedRegion<1>:$allocRegion, + AnyRegion:$initializerRegion, AnyRegion:$reductionRegion, AnyRegion:$atomicReductionRegion, AnyRegion:$cleanupRegion); let assemblyFormat = "$sym_name `:` $type attr-dict-with-keyword " + "custom($allocRegion) " "`init` $initializerRegion " "`combiner` $reductionRegion " "custom($atomicReductionRegion) " @@ -1576,6 +1589,17 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove, return cast(getAtomicReductionRegion().front().getArgument(0).getType()); } + + Value getInitializerMoldArg() { + return getInitializerRegion().front().getArgument(0); + } + + Value getInitializerAllocArg() { + if (getAllocRegion().empty() || + getInitializerRegion().front().getNumArguments() != 2) + return {nullptr}; + return getInitializerRegion().front().getArgument(1); + } }]; let hasRegionVerifier = 1; } diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 4c943ebbe3144f..273f49b8b12b67 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1883,46 +1883,84 @@ LogicalResult DistributeOp::verify() { // DeclareReductionOp //===----------------------------------------------------------------------===// -static ParseResult parseAtomicReductionRegion(OpAsmParser &parser, - Region ®ion) { - if (parser.parseOptionalKeyword("atomic")) +static ParseResult parseOptionalReductionRegion(OpAsmParser &parser, + Region ®ion, + StringRef keyword) { + if (parser.parseOptionalKeyword(keyword)) return success(); return parser.parseRegion(region); } -static void printAtomicReductionRegion(OpAsmPrinter &printer, - DeclareReductionOp op, Region ®ion) { +static void 
printOptionalReductionRegion(OpAsmPrinter &printer, Region ®ion, + StringRef keyword) { if (region.empty()) return; - printer << "atomic "; + printer << keyword << " "; printer.printRegion(region); } +static ParseResult parseAllocReductionRegion(OpAsmParser &parser, + Region ®ion) { + return parseOptionalReductionRegion(parser, region, "alloc"); +} + +static void printAllocReductionRegion(OpAsmPrinter &printer, + DeclareReductionOp op, Region ®ion) { + printOptionalReductionRegion(printer, region, "alloc"); +} + +static ParseResult parseAtomicReductionRegion(OpAsmParser &parser, + Region ®ion) { + return parseOptionalReductionRegion(parser, region, "atomic"); +} + +static void printAtomicReductionRegion(OpAsmPrinter &printer, + DeclareReductionOp op, Region ®ion) { + printOptionalReductionRegion(printer, region, "atomic"); +} + static ParseResult parseCleanupReductionRegion(OpAsmParser &parser, Region ®ion) { - if (parser.parseOptionalKeyword("cleanup")) - return success(); - return parser.parseRegion(region); + return parseOptionalReductionRegion(parser, region, "cleanup"); } static void printCleanupReductionRegion(OpAsmPrinter &printer, DeclareReductionOp op, Region ®ion) { - if (region.empty()) - return; - printer << "cleanup "; - printer.printRegion(region); + printOptionalReductionRegion(printer, region, "cleanup"); } LogicalResult DeclareReductionOp::verifyRegions() { + if (!getAllocRegion().empty()) { + for (YieldOp yieldOp : getAllocRegion().getOps()) { + if (yieldOp.getResults().size() != 1 || + yieldOp.getResults().getTypes()[0] != getType()) + return emitOpError() << "expects alloc region to yield a value " + "of the reduction type"; + } + } + if (getInitializerRegion().empty()) return emitOpError() << "expects non-empty initializer region"; Block &initializerEntryBlock = getInitializerRegion().front(); - if (initializerEntryBlock.getNumArguments() != 1 || - initializerEntryBlock.getArgument(0).getType() != getType()) { - return emitOpError() << "expects 
initializer region with one argument " - "of the reduction type"; + + if (initializerEntryBlock.getNumArguments() == 1) { + if (!getAllocRegion().empty()) + return emitOpError() << "expects two arguments to the initializer region " + "when an allocation region is used"; + } else if (initializerEntryBlock.getNumArguments() == 2) { + if (getAllocRegion().empty()) + return emitOpError() << "expects one argument to the initializer region " + "when no allocation region is used"; + } else { + return emitOpError() + << "expects one or two arguments to the initializer region"; } + for (mlir::Value arg : initializerEntryBlock.getArguments()) + if (arg.getType() != getType()) + return emitOpError() << "expects initializer region argument to match " + "the reduction type"; + for (YieldOp yieldOp : getInitializerRegion().getOps()) { if (yieldOp.getResults().size() != 1 || yieldOp.getResults().getTypes()[0] != getType()) diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index c76b07ec94a597..332d22fc2c6425 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -565,7 +565,63 @@ func.func @omp_simd_pretty_simdlen_safelen(%lb : index, %ub : index, %step : ind // ----- -// expected-error @below {{op expects initializer region with one argument of the reduction type}} +// expected-error @below {{op expects alloc region to yield a value of the reduction type}} +omp.declare_reduction @add_f32 : f32 +alloc { +^bb0(%arg: f32): +// nonsense test code + %0 = arith.constant 0.0 : f64 + omp.yield (%0 : f64) +} +init { +^bb0(%arg0: f32, %arg1: f32): + %0 = arith.constant 0.0 : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = arith.addf %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} + +// ----- + +// expected-error @below {{op expects two arguments to the initializer region when an allocation region is used}} +omp.declare_reduction @add_f32 : f32 +alloc { +^bb0(%arg: f32): +// 
nonsense test code + omp.yield (%arg : f32) +} +init { +^bb0(%arg0: f32): + %0 = arith.constant 0.0 : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = arith.addf %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} + +// ----- + +// expected-error @below {{op expects one argument to the initializer region when no allocation region is used}} +omp.declare_reduction @add_f32 : f32 +init { +^bb0(%arg: f32, %arg2: f32): + %0 = arith.constant 0.0 : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = arith.addf %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} + +// ----- + +// expected-error @below {{op expects initializer region argument to match the reduction type}} omp.declare_reduction @add_f32 : f64 init { ^bb0(%arg: f32): @@ -683,6 +739,33 @@ cleanup { // ----- +// expected-error @below {{op region #0 ('allocRegion') failed to verify constraint: region with at most 1 blocks}} +omp.declare_reduction @alloc_reduction : !llvm.ptr +alloc { +^bb0(%arg: !llvm.ptr): + %c1 = arith.constant 1 : i32 + %0 = llvm.alloca %c1 x f32 : (i32) -> !llvm.ptr + cf.br ^bb1(%0: !llvm.ptr) +^bb1(%ret: !llvm.ptr): + omp.yield (%ret : !llvm.ptr) +} +init { +^bb0(%arg: !llvm.ptr): + %cst = arith.constant 1.0 : f32 + llvm.store %cst, %arg : f32, !llvm.ptr + omp.yield (%arg : !llvm.ptr) +} +combiner { +^bb1(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.load %arg0 : !llvm.ptr -> f32 + %1 = llvm.load %arg1 : !llvm.ptr -> f32 + %2 = arith.addf %0, %1 : f32 + llvm.store %2, %arg0 : f32, !llvm.ptr + omp.yield (%arg0 : !llvm.ptr) +} + +// ----- + func.func @foo(%lb : index, %ub : index, %step : index) { %c1 = arith.constant 1 : i32 %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 9ac97e069addd2..9c308cc0108493 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -2541,6 +2541,36 @@ atomic { omp.yield } +// CHECK-LABEL: 
@alloc_reduction +// CHECK-SAME: alloc { +// CHECK-NEXT: ^bb0(%[[ARG0:.*]]: !llvm.ptr): +// ... +// CHECK: omp.yield +// CHECK-NEXT: } init { +// CHECK: } combiner { +// CHECK: } +omp.declare_reduction @alloc_reduction : !llvm.ptr +alloc { +^bb0(%arg: !llvm.ptr): + %c1 = arith.constant 1 : i32 + %0 = llvm.alloca %c1 x f32 : (i32) -> !llvm.ptr + omp.yield (%0 : !llvm.ptr) +} +init { +^bb0(%mold: !llvm.ptr, %alloc: !llvm.ptr): + %cst = arith.constant 1.0 : f32 + llvm.store %cst, %alloc : f32, !llvm.ptr + omp.yield (%alloc : !llvm.ptr) +} +combiner { +^bb1(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.load %arg0 : !llvm.ptr -> f32 + %1 = llvm.load %arg1 : !llvm.ptr -> f32 + %2 = arith.addf %0, %1 : f32 + llvm.store %2, %arg0 : f32, !llvm.ptr + omp.yield (%arg0 : !llvm.ptr) +} + // CHECK-LABEL: omp_targets_with_map_bounds // CHECK-SAME: (%[[ARG0:.*]]: !llvm.ptr, %[[ARG1:.*]]: !llvm.ptr) func.func @omp_targets_with_map_bounds(%arg0: !llvm.ptr, %arg1: !llvm.ptr) -> () { From 2efc81aff4a18a640c585d507c357868162dbd43 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Thu, 22 Aug 2024 14:11:51 +0100 Subject: [PATCH 190/426] [mlir][OpenMP] Convert reduction alloc region to LLVMIR (#102524) The intention of this change is to ensure that allocas end up in the entry block not spread out amongst complex reduction variable initialization code. The tests we have are quite minimized for readability and maintainability, making the benefits less obvious. The use case for this is when there are multiple reduction variables each with multiple blocks inside of the init region for that reduction. 
2/3 Part 1: https://github.com/llvm/llvm-project/pull/102522 Part 3: https://github.com/llvm/llvm-project/pull/102525 --- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 134 +++++++++++++----- mlir/test/Target/LLVMIR/openmp-private.mlir | 6 +- .../openmp-reduction-array-sections.mlir | 14 +- .../Target/LLVMIR/openmp-reduction-byref.mlir | 12 +- 4 files changed, 119 insertions(+), 47 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 1f3fb95c339c7c..6d14d77c440e67 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -594,45 +594,85 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder, /// Allocate space for privatized reduction variables. template -static void allocByValReductionVars( - T loop, ArrayRef reductionArgs, llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation, - llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, - SmallVectorImpl &reductionDecls, - SmallVectorImpl &privateReductionVariables, - DenseMap &reductionVariableMap, - llvm::ArrayRef isByRefs) { +static LogicalResult +allocReductionVars(T loop, ArrayRef reductionArgs, + llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation, + llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, + SmallVectorImpl &reductionDecls, + SmallVectorImpl &privateReductionVariables, + DenseMap &reductionVariableMap, + llvm::ArrayRef isByRefs) { llvm::IRBuilderBase::InsertPointGuard guard(builder); builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); + // delay creating stores until after all allocas + SmallVector> storesToCreate; + storesToCreate.reserve(loop.getNumReductionVars()); + for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) { - if (isByRefs[i]) - continue; - llvm::Value *var = builder.CreateAlloca( - 
moduleTranslation.convertType(reductionDecls[i].getType())); - moduleTranslation.mapValue(reductionArgs[i], var); - privateReductionVariables[i] = var; - reductionVariableMap.try_emplace(loop.getReductionVars()[i], var); + Region &allocRegion = reductionDecls[i].getAllocRegion(); + if (isByRefs[i]) { + if (allocRegion.empty()) + continue; + + SmallVector phis; + if (failed(inlineConvertOmpRegions(allocRegion, "omp.reduction.alloc", + builder, moduleTranslation, &phis))) + return failure(); + assert(phis.size() == 1 && "expected one allocation to be yielded"); + + builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); + + // Allocate reduction variable (which is a pointer to the real reduction + // variable allocated in the inlined region) + llvm::Value *var = builder.CreateAlloca( + moduleTranslation.convertType(reductionDecls[i].getType())); + storesToCreate.emplace_back(phis[0], var); + + privateReductionVariables[i] = var; + moduleTranslation.mapValue(reductionArgs[i], phis[0]); + reductionVariableMap.try_emplace(loop.getReductionVars()[i], phis[0]); + } else { + assert(allocRegion.empty() && + "allocation is implicit for by-val reduction"); + llvm::Value *var = builder.CreateAlloca( + moduleTranslation.convertType(reductionDecls[i].getType())); + moduleTranslation.mapValue(reductionArgs[i], var); + privateReductionVariables[i] = var; + reductionVariableMap.try_emplace(loop.getReductionVars()[i], var); + } } + + // TODO: further delay this so it doesn't come in the entry block at all + for (auto [data, addr] : storesToCreate) + builder.CreateStore(data, addr); + + return success(); } -/// Map input argument to all reduction initialization regions +/// Map input arguments to reduction initialization region template static void -mapInitializationArg(T loop, LLVM::ModuleTranslation &moduleTranslation, - SmallVectorImpl &reductionDecls, - unsigned i) { +mapInitializationArgs(T loop, LLVM::ModuleTranslation &moduleTranslation, + SmallVectorImpl 
&reductionDecls, + DenseMap &reductionVariableMap, + unsigned i) { // map input argument to the initialization region mlir::omp::DeclareReductionOp &reduction = reductionDecls[i]; Region &initializerRegion = reduction.getInitializerRegion(); Block &entry = initializerRegion.front(); - assert(entry.getNumArguments() == 1 && - "the initialization region has one argument"); mlir::Value mlirSource = loop.getReductionVars()[i]; llvm::Value *llvmSource = moduleTranslation.lookupValue(mlirSource); assert(llvmSource && "lookup reduction var"); - moduleTranslation.mapValue(entry.getArgument(0), llvmSource); + moduleTranslation.mapValue(reduction.getInitializerMoldArg(), llvmSource); + + if (entry.getNumArguments() > 1) { + llvm::Value *allocation = + reductionVariableMap.lookup(loop.getReductionVars()[i]); + moduleTranslation.mapValue(reduction.getInitializerAllocArg(), allocation); + } } /// Collect reduction info @@ -779,18 +819,21 @@ static LogicalResult allocAndInitializeReductionVars( if (op.getNumReductionVars() == 0) return success(); - allocByValReductionVars(op, reductionArgs, builder, moduleTranslation, - allocaIP, reductionDecls, privateReductionVariables, - reductionVariableMap, isByRef); + if (failed(allocReductionVars(op, reductionArgs, builder, moduleTranslation, + allocaIP, reductionDecls, + privateReductionVariables, reductionVariableMap, + isByRef))) + return failure(); // Before the loop, store the initial values of reductions into reduction // variables. Although this could be done after allocas, we don't want to mess // up with the alloca insertion point. 
for (unsigned i = 0; i < op.getNumReductionVars(); ++i) { - SmallVector phis; + SmallVector phis; // map block argument to initializer region - mapInitializationArg(op, moduleTranslation, reductionDecls, i); + mapInitializationArgs(op, moduleTranslation, reductionDecls, + reductionVariableMap, i); if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(), "omp.reduction.neutral", builder, @@ -799,6 +842,13 @@ static LogicalResult allocAndInitializeReductionVars( assert(phis.size() == 1 && "expected one value to be yielded from the " "reduction neutral element declaration region"); if (isByRef[i]) { + if (!reductionDecls[i].getAllocRegion().empty()) + // done in allocReductionVars + continue; + + // TODO: this path can be removed once all users of by-ref are updated to + // use an alloc region + // Allocate reduction variable (which is a pointer to the real reduction // variable allocated in the inlined region) llvm::Value *var = builder.CreateAlloca( @@ -1319,9 +1369,15 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, opInst.getNumAllocateVars() + opInst.getNumAllocatorsVars(), opInst.getNumReductionVars()); - allocByValReductionVars(opInst, reductionArgs, builder, moduleTranslation, - allocaIP, reductionDecls, privateReductionVariables, - reductionVariableMap, isByRef); + allocaIP = + InsertPointTy(allocaIP.getBlock(), + allocaIP.getBlock()->getTerminator()->getIterator()); + + if (failed(allocReductionVars(opInst, reductionArgs, builder, + moduleTranslation, allocaIP, reductionDecls, + privateReductionVariables, + reductionVariableMap, isByRef))) + bodyGenStatus = failure(); // Initialize reduction vars builder.restoreIP(allocaIP); @@ -1332,8 +1388,12 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, SmallVector byRefVars(opInst.getNumReductionVars()); for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) { if (isByRef[i]) { - // Allocate reduction variable (which is a pointer to the 
real reduciton - // variable allocated in the inlined region) + if (!reductionDecls[i].getAllocRegion().empty()) + continue; + + // TODO: remove after all users of by-ref are updated to use the alloc + // region: Allocate reduction variable (which is a pointer to the real + // reduciton variable allocated in the inlined region) byRefVars[i] = builder.CreateAlloca( moduleTranslation.convertType(reductionDecls[i].getType())); } @@ -1345,7 +1405,8 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, SmallVector phis; // map the block argument - mapInitializationArg(opInst, moduleTranslation, reductionDecls, i); + mapInitializationArgs(opInst, moduleTranslation, reductionDecls, + reductionVariableMap, i); if (failed(inlineConvertOmpRegions( reductionDecls[i].getInitializerRegion(), "omp.reduction.neutral", builder, moduleTranslation, &phis))) @@ -1354,11 +1415,14 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, "expected one value to be yielded from the " "reduction neutral element declaration region"); - // mapInitializationArg finishes its block with a terminator. We need to - // insert before that terminator. 
builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator()); if (isByRef[i]) { + if (!reductionDecls[i].getAllocRegion().empty()) + continue; + + // TODO: remove after all users of by-ref are updated to use the alloc + // Store the result of the inlined region to the allocated reduction var // ptr builder.CreateStore(phis[0], byRefVars[i]); diff --git a/mlir/test/Target/LLVMIR/openmp-private.mlir b/mlir/test/Target/LLVMIR/openmp-private.mlir index e76f7e4d40f7af..21167668bbee16 100644 --- a/mlir/test/Target/LLVMIR/openmp-private.mlir +++ b/mlir/test/Target/LLVMIR/openmp-private.mlir @@ -222,11 +222,13 @@ omp.private {type = private} @privatizer.part : !llvm.ptr alloc { omp.yield(%1 : !llvm.ptr) } -omp.declare_reduction @reducer.part : !llvm.ptr init { -^bb0(%arg0: !llvm.ptr): +omp.declare_reduction @reducer.part : !llvm.ptr alloc { %0 = llvm.mlir.constant(1 : i64) : i64 %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr omp.yield(%1 : !llvm.ptr) +} init { +^bb0(%mold: !llvm.ptr, %alloc: !llvm.ptr): + omp.yield(%alloc : !llvm.ptr) } combiner { ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): omp.yield(%arg0 : !llvm.ptr) diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir index 5682e7e96ab186..da6f9430046123 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir @@ -4,11 +4,15 @@ // for array reductions. 
The important thing here is that we are testing a byref // reduction with a cleanup region, and the various regions contain multiple // blocks -omp.declare_reduction @add_reduction_byref_box_Uxf32 : !llvm.ptr init { -^bb0(%arg0: !llvm.ptr): +omp.declare_reduction @add_reduction_byref_box_Uxf32 : !llvm.ptr alloc { %0 = llvm.mlir.constant(1 : i64) : i64 %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr omp.yield(%1 : !llvm.ptr) +} init { +^bb0(%arg0: !llvm.ptr, %alloc: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i64) : i64 + llvm.store %0, %alloc : i64, !llvm.ptr + omp.yield(%alloc : !llvm.ptr) } combiner { ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): %0 = llvm.mlir.constant(0 : i64) : i64 @@ -83,6 +87,9 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: %[[VAL_11:.*]] = load i32, ptr %[[VAL_12:.*]], align 4 // CHECK: store i32 %[[VAL_11]], ptr %[[VAL_10]], align 4 // CHECK: %[[VAL_13:.*]] = load i32, ptr %[[VAL_10]], align 4 +// CHECK: %[[VAL_20:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 +// CHECK: %[[VAL_21:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_21]], align 8 // CHECK: %[[VAL_14:.*]] = alloca [1 x ptr], align 8 // CHECK: br label %[[VAL_15:.*]] // CHECK: omp.reduction.init: ; preds = %[[VAL_16:.*]] @@ -91,9 +98,6 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: br label %[[VAL_18:.*]] // CHECK: omp.par.region1: ; preds = %[[VAL_17]] // CHECK: %[[VAL_19:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 -// CHECK: %[[VAL_20:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 -// CHECK: %[[VAL_21:.*]] = alloca ptr, align 8 -// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_21]], align 8 // CHECK: br label %[[VAL_22:.*]] // CHECK: omp_section_loop.preheader: ; preds = %[[VAL_18]] // CHECK: store i32 
0, ptr %[[VAL_7]], align 4 diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir index ef1284547a88a7..a0ca31b7d811e3 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir @@ -1,12 +1,14 @@ // RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s - omp.declare_reduction @add_reduction_i_32 : !llvm.ptr init { - ^bb0(%arg0: !llvm.ptr): - %0 = llvm.mlir.constant(0 : i32) : i32 + omp.declare_reduction @add_reduction_i_32 : !llvm.ptr alloc { %1 = llvm.mlir.constant(1 : i64) : i64 %2 = llvm.alloca %1 x i32 : (i64) -> !llvm.ptr - llvm.store %0, %2 : i32, !llvm.ptr omp.yield(%2 : !llvm.ptr) + } init { + ^bb0(%arg0: !llvm.ptr, %alloc: !llvm.ptr): + %0 = llvm.mlir.constant(0 : i32) : i32 + llvm.store %0, %alloc : i32, !llvm.ptr + omp.yield(%alloc : !llvm.ptr) } combiner { ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): %0 = llvm.load %arg0 : !llvm.ptr -> i32 @@ -42,8 +44,8 @@ // Private reduction variable and its initialization. // CHECK: %tid.addr.local = alloca i32 // CHECK: %[[PRIVATE:.+]] = alloca i32 -// CHECK: store i32 0, ptr %[[PRIVATE]] // CHECK: store ptr %[[PRIVATE]], ptr %[[PRIV_PTR:.+]], +// CHECK: store i32 0, ptr %[[PRIVATE]] // Call to the reduction function. // CHECK: call i32 @__kmpc_reduce From f2027a9388728094d84837fc0fdd2e0325362e51 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Thu, 22 Aug 2024 14:12:07 +0100 Subject: [PATCH 191/426] [flang][OpenMP] use reduction alloc region (#102525) I removed the `*-hlfir*` tests because they are duplicate now that the other tests have been updated to use the HLFIR lowering. 
3/3 Part 1: https://github.com/llvm/llvm-project/pull/102522 Part 2: https://github.com/llvm/llvm-project/pull/102524 --- flang/lib/Lower/OpenMP/ReductionProcessor.cpp | 86 ++++++++++++++----- .../delayed-privatization-reduction-byref.f90 | 2 +- .../OpenMP/parallel-reduction-add-byref.f90 | 20 +++-- .../parallel-reduction-allocatable-array.f90 | 14 +-- .../OpenMP/parallel-reduction-array-lb.f90 | 12 +-- .../Lower/OpenMP/parallel-reduction-array.f90 | 12 +-- .../OpenMP/parallel-reduction-array2.f90 | 12 +-- .../Lower/OpenMP/parallel-reduction-byref.f90 | 12 +-- .../Lower/OpenMP/parallel-reduction-mixed.f90 | 4 +- .../parallel-reduction-pointer-array.f90 | 14 +-- .../test/Lower/OpenMP/parallel-reduction3.f90 | 12 +-- .../OpenMP/reduction-array-intrinsic.f90 | 12 +-- .../Lower/OpenMP/sections-array-reduction.f90 | 5 +- .../OpenMP/wsloop-reduction-add-byref.f90 | 48 ++++++----- .../wsloop-reduction-add-hlfir-byref.f90 | 58 ------------- .../OpenMP/wsloop-reduction-add-hlfir.f90 | 54 ------------ ...oop-reduction-allocatable-array-minmax.f90 | 28 +++--- .../OpenMP/wsloop-reduction-allocatable.f90 | 14 +-- .../wsloop-reduction-array-assumed-shape.f90 | 12 +-- .../Lower/OpenMP/wsloop-reduction-array.f90 | 12 +-- .../Lower/OpenMP/wsloop-reduction-array2.f90 | 12 +-- .../OpenMP/wsloop-reduction-iand-byref.f90 | 10 ++- .../OpenMP/wsloop-reduction-ieor-byref.f90 | 10 ++- .../OpenMP/wsloop-reduction-ior-byref.f90 | 10 ++- .../wsloop-reduction-logical-and-byref.f90 | 12 +-- .../wsloop-reduction-logical-eqv-byref.f90 | 12 +-- .../wsloop-reduction-logical-neqv-byref.f90 | 12 +-- .../wsloop-reduction-logical-or-byref.f90 | 12 +-- .../OpenMP/wsloop-reduction-max-byref.f90 | 18 ++-- .../wsloop-reduction-max-hlfir-byref.f90 | 64 -------------- .../OpenMP/wsloop-reduction-max-hlfir.f90 | 60 ------------- .../OpenMP/wsloop-reduction-min-byref.f90 | 18 ++-- .../OpenMP/wsloop-reduction-mul-byref.f90 | 40 +++++---- .../wsloop-reduction-multiple-clauses.f90 | 12 +-- 
.../Lower/OpenMP/wsloop-reduction-pointer.f90 | 8 +- 35 files changed, 317 insertions(+), 436 deletions(-) delete mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir-byref.f90 delete mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir.f90 delete mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90 delete mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir.f90 diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp index c3c1f363033c27..c87182abe3d187 100644 --- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp @@ -489,23 +489,57 @@ static mlir::Type unwrapSeqOrBoxedType(mlir::Type ty) { return ty; } -static mlir::Value -createReductionInitRegion(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::omp::DeclareReductionOp &reductionDecl, - const ReductionProcessor::ReductionIdentifier redId, - mlir::Type type, bool isByRef) { +static void createReductionAllocAndInitRegions( + fir::FirOpBuilder &builder, mlir::Location loc, + mlir::omp::DeclareReductionOp &reductionDecl, + const ReductionProcessor::ReductionIdentifier redId, mlir::Type type, + bool isByRef) { + auto yield = [&](mlir::Value ret) { + builder.create(loc, ret); + }; + + mlir::Block *allocBlock = nullptr; + mlir::Block *initBlock = nullptr; + if (isByRef) { + allocBlock = + builder.createBlock(&reductionDecl.getAllocRegion(), + reductionDecl.getAllocRegion().end(), {}, {}); + initBlock = builder.createBlock(&reductionDecl.getInitializerRegion(), + reductionDecl.getInitializerRegion().end(), + {type, type}, {loc, loc}); + } else { + initBlock = builder.createBlock(&reductionDecl.getInitializerRegion(), + reductionDecl.getInitializerRegion().end(), + {type}, {loc}); + } + mlir::Type ty = fir::unwrapRefType(type); + builder.setInsertionPointToEnd(initBlock); mlir::Value initValue = ReductionProcessor::getReductionInitValue( loc, unwrapSeqOrBoxedType(ty), 
redId, builder); if (fir::isa_trivial(ty)) { if (isByRef) { - mlir::Value alloca = builder.create(loc, ty); - builder.createStoreWithConvert(loc, initValue, alloca); - return alloca; + // alloc region + { + builder.setInsertionPointToEnd(allocBlock); + mlir::Value alloca = builder.create(loc, ty); + yield(alloca); + } + + // init region + { + builder.setInsertionPointToEnd(initBlock); + // block arg is mapped to the alloca yielded from the alloc region + mlir::Value alloc = reductionDecl.getInitializerAllocArg(); + builder.createStoreWithConvert(loc, initValue, alloc); + yield(alloc); + } + return; } // by val - return initValue; + yield(initValue); + return; } // check if an allocatable box is unallocated. If so, initialize the boxAlloca @@ -520,10 +554,10 @@ createReductionInitRegion(fir::FirOpBuilder &builder, mlir::Location loc, // fir.store %something to %box_alloca // } // omp.yield %box_alloca - mlir::Value blockArg = - builder.loadIfRef(loc, builder.getBlock()->getArgument(0)); + mlir::Value moldArg = + builder.loadIfRef(loc, reductionDecl.getInitializerMoldArg()); auto handleNullAllocatable = [&](mlir::Value boxAlloca) -> fir::IfOp { - mlir::Value addr = builder.create(loc, blockArg); + mlir::Value addr = builder.create(loc, moldArg); mlir::Value isNotAllocated = builder.genIsNullAddr(loc, addr); fir::IfOp ifOp = builder.create(loc, isNotAllocated, /*withElseRegion=*/true); @@ -539,7 +573,17 @@ createReductionInitRegion(fir::FirOpBuilder &builder, mlir::Location loc, assert(isByRef && "passing boxes by value is unsupported"); bool isAllocatableOrPointer = mlir::isa(boxTy.getEleTy()); - mlir::Value boxAlloca = builder.create(loc, ty); + + // alloc region + { + builder.setInsertionPointToEnd(allocBlock); + mlir::Value boxAlloca = builder.create(loc, ty); + yield(boxAlloca); + } + + // init region + builder.setInsertionPointToEnd(initBlock); + mlir::Value boxAlloca = reductionDecl.getInitializerAllocArg(); mlir::Type innerTy = 
fir::unwrapRefType(boxTy.getEleTy()); if (fir::isa_trivial(innerTy)) { // boxed non-sequence value e.g. !fir.box> @@ -558,7 +602,8 @@ createReductionInitRegion(fir::FirOpBuilder &builder, mlir::Location loc, createReductionCleanupRegion(builder, loc, reductionDecl); builder.restoreInsertionPoint(insPt); builder.setInsertionPointAfter(ifUnallocated); - return boxAlloca; + yield(boxAlloca); + return; } innerTy = fir::extractSequenceType(boxTy); if (!mlir::isa(innerTy)) @@ -571,7 +616,7 @@ createReductionInitRegion(fir::FirOpBuilder &builder, mlir::Location loc, } // Create the private copy from the initial fir.box: - mlir::Value loadedBox = builder.loadIfRef(loc, blockArg); + mlir::Value loadedBox = builder.loadIfRef(loc, moldArg); hlfir::Entity source = hlfir::Entity{loadedBox}; // Allocating on the heap in case the whole reduction is nested inside of a @@ -616,7 +661,8 @@ createReductionInitRegion(fir::FirOpBuilder &builder, mlir::Location loc, builder.create(loc, box, boxAlloca); if (ifUnallocated) builder.setInsertionPointAfter(ifUnallocated); - return boxAlloca; + yield(boxAlloca); + return; } TODO(loc, "createReductionInitRegion for unsupported type"); @@ -643,13 +689,7 @@ mlir::omp::DeclareReductionOp ReductionProcessor::createDeclareReduction( decl = modBuilder.create(loc, reductionOpName, type); - builder.createBlock(&decl.getInitializerRegion(), - decl.getInitializerRegion().end(), {type}, {loc}); - builder.setInsertionPointToEnd(&decl.getInitializerRegion().back()); - - mlir::Value init = - createReductionInitRegion(builder, loc, decl, redId, type, isByRef); - builder.create(loc, init); + createReductionAllocAndInitRegions(builder, loc, decl, redId, type, isByRef); builder.createBlock(&decl.getReductionRegion(), decl.getReductionRegion().end(), {type, type}, diff --git a/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 index 72e91680a43104..29439571179322 100644 --- 
a/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 @@ -22,7 +22,7 @@ subroutine red_and_delayed_private ! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref alloc { ! CHECK-LABEL: omp.declare_reduction -! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref init +! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref alloc ! CHECK-LABEL: _QPred_and_delayed_private ! CHECK: omp.parallel diff --git a/flang/test/Lower/OpenMP/parallel-reduction-add-byref.f90 b/flang/test/Lower/OpenMP/parallel-reduction-add-byref.f90 index 7347d9324feac8..ad97b17d6857d6 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-add-byref.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-add-byref.f90 @@ -3,12 +3,14 @@ !CHECK-LABEL: omp.declare_reduction !CHECK-SAME: @[[RED_F32_NAME:.*]] : !fir.ref -!CHECK-SAME: init { -!CHECK: ^bb0(%{{.*}}: !fir.ref): -!CHECK: %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32 +!CHECK-SAME: alloc { !CHECK: %[[REF:.*]] = fir.alloca f32 -!CHECKL fir.store [[%C0_1]] to %[[REF]] : !fir.ref !CHECK: omp.yield(%[[REF]] : !fir.ref) +!CHECK-LABEL: } init { +!CHECK: ^bb0(%{{.*}}: !fir.ref, %[[ALLOC:.*]]: !fir.ref): +!CHECK: %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32 +!CHECKL fir.store [[%C0_1]] to %[[ALLOC]] : !fir.ref +!CHECK: omp.yield(%[[ALLOC]] : !fir.ref) !CHECK: } combiner { !CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): !CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref @@ -20,12 +22,14 @@ !CHECK-LABEL: omp.declare_reduction !CHECK-SAME: @[[RED_I32_NAME:.*]] : !fir.ref -!CHECK-SAME: init { -!CHECK: ^bb0(%{{.*}}: !fir.ref): -!CHECK: %[[C0_1:.*]] = arith.constant 0 : i32 +!CHECK-SAME: alloc { !CHECK: %[[REF:.*]] = fir.alloca i32 -!CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref !CHECK: omp.yield(%[[REF]] : !fir.ref) +!CHECK-LABEL: } init { +!CHECK: ^bb0(%{{.*}}: !fir.ref, %[[ALLOC:.*]]: !fir.ref): +!CHECK: %[[C0_1:.*]] = arith.constant 0 : i32 +!CHECK: fir.store 
%[[C0_1]] to %[[ALLOC]] : !fir.ref +!CHECK: omp.yield(%[[ALLOC]] : !fir.ref) !CHECK: } combiner { !CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): !CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref diff --git a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 index fdb7e974f1c5c6..7a2db3299784c7 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 @@ -18,18 +18,20 @@ program reduce end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_Uxi32 : !fir.ref>>> init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>>): +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_Uxi32 : !fir.ref>>> alloc { +! CHECK: %[[VAL_10:.*]] = fir.alloca !fir.box>> +! CHECK: omp.yield(%[[VAL_10]] : !fir.ref>>>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>>, %[[ALLOC:.*]]: !fir.ref>>>): ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> -! CHECK: %[[VAL_10:.*]] = fir.alloca !fir.box>> ! CHECK: %[[ADDR:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box>>) -> !fir.heap> ! CHECK: %[[ADDRI:.*]] = fir.convert %[[ADDR]] : (!fir.heap>) -> i64 ! CHECK: %[[C0_I64:.*]] = arith.constant 0 : i64 ! CHECK: %[[IS_NULL:.*]] = arith.cmpi eq, %[[ADDRI]], %[[C0_I64]] : i64 ! CHECK: fir.if %[[IS_NULL]] { ! CHECK: %[[NULL_BOX:.*]] = fir.embox %[[ADDR]] : (!fir.heap>) -> !fir.box>> -! CHECK: fir.store %[[NULL_BOX]] to %[[VAL_10]] : !fir.ref>>> +! CHECK: fir.store %[[NULL_BOX]] to %[[ALLOC]] : !fir.ref>>> ! CHECK: } else { ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) @@ -42,9 +44,9 @@ program reduce ! CHECK: %[[SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1> ! 
CHECK: %[[REBOX:.*]] = fir.rebox %[[VAL_8]]#0(%[[SHIFT]]) : (!fir.box>, !fir.shapeshift<1>) -> !fir.box>> ! CHECK: hlfir.assign %[[VAL_1]] to %[[REBOX]] : i32, !fir.box>> -! CHECK: fir.store %[[REBOX]] to %[[VAL_10]] : !fir.ref>>> +! CHECK: fir.store %[[REBOX]] to %[[ALLOC]] : !fir.ref>>> ! CHECK: } -! CHECK: omp.yield(%[[VAL_10]] : !fir.ref>>>) +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>>>) ! CHECK: } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>>, %[[VAL_1:.*]]: !fir.ref>>>): ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 index b44fe4c1f4cc28..59902bd13a1c2e 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 @@ -12,11 +12,13 @@ program reduce end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x2xi32 : !fir.ref>> init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>): +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x2xi32 : !fir.ref>> alloc { +! CHECK: %[[VAL_15:.*]] = fir.alloca !fir.box> +! CHECK: omp.yield(%[[VAL_15]] : !fir.ref>>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[ALLOC:.*]]: !fir.ref>>): ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> -! CHECK: %[[VAL_15:.*]] = fir.alloca !fir.box> ! CHECK: %[[VAL_3:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_4:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_3]], %[[VAL_4]] : (index, index) -> !fir.shape<2> @@ -30,8 +32,8 @@ program reduce ! CHECK: %[[VAL_13:.*]] = fir.shape_shift %[[VAL_10]]#0, %[[VAL_10]]#1, %[[VAL_12]]#0, %[[VAL_12]]#1 : (index, index, index, index) -> !fir.shapeshift<2> ! CHECK: %[[VAL_14:.*]] = fir.embox %[[VAL_8]]#0(%[[VAL_13]]) : (!fir.heap>, !fir.shapeshift<2>) -> !fir.box> ! 
CHECK: hlfir.assign %[[VAL_1]] to %[[VAL_14]] : i32, !fir.box> -! CHECK: fir.store %[[VAL_14]] to %[[VAL_15]] : !fir.ref>> -! CHECK: omp.yield(%[[VAL_15]] : !fir.ref>>) +! CHECK: fir.store %[[VAL_14]] to %[[ALLOC]] : !fir.ref>> +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>>) ! CHECK: } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[VAL_1:.*]]: !fir.ref>>): ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array.f90 index 60b21c9b1ebbe0..8835c1f5b5e18d 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array.f90 @@ -13,11 +13,13 @@ program reduce print *,i end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref>> init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>): +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref>> alloc { +! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> +! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[ALLOC:.*]]: !fir.ref>>): ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref>> -! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> ! CHECK: %[[VAL_4:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_1:.*]] = fir.allocmem !fir.array<3xi32> {bindc_name = ".tmp", uniq_name = ""} @@ -29,8 +31,8 @@ program reduce ! CHECK: %[[SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[SHIFT]]) : (!fir.heap>, !fir.shapeshift<1>) -> !fir.box> ! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box> -! CHECK: fir.store %[[VAL_7]] to %[[VAL_8]] : !fir.ref>> -! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) +! 
CHECK: fir.store %[[VAL_7]] to %[[ALLOC]] : !fir.ref>> +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>>) ! CHECK: } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[VAL_1:.*]]: !fir.ref>>): ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 index 5d4c86d1d76e84..2d4c0239b5d2e6 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 @@ -13,11 +13,13 @@ program reduce print *,i end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref>> init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>): +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref>> alloc { +! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> +! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[ALLOC:.*]]: !fir.ref>>): ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref>> -! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> ! CHECK: %[[VAL_4:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_1:.*]] = fir.allocmem !fir.array<3xi32> @@ -28,8 +30,8 @@ program reduce ! CHECK: %[[SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[SHIFT]]) : (!fir.heap>, !fir.shapeshift<1>) -> !fir.box> ! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box> -! CHECK: fir.store %[[VAL_7]] to %[[VAL_8]] : !fir.ref>> -! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) +! CHECK: fir.store %[[VAL_7]] to %[[ALLOC]] : !fir.ref>> +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>>) ! CHECK: } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[VAL_1:.*]]: !fir.ref>>): ! 
CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> diff --git a/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 b/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 index 5685e2c584ace7..596276a99cafc9 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 @@ -2,12 +2,14 @@ ! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s !CHECK: omp.declare_reduction @[[REDUCTION_DECLARE:[_a-z0-9]+]] : !fir.ref -!CHECK-SAME: init { -!CHECK: ^bb0(%{{.*}}: !fir.ref): -!CHECK: %[[I0:[_a-z0-9]+]] = arith.constant 0 : i32 -!CHECK: %[[REF:.*]] = fir.alloca i32 -!CHECKL fir.store [[%I0]] to %[[REF]] : !fir.ref +!CHECK-SAME: alloc { +!CHECK: %[[REF:.*]] = fir.alloca i32 !CHECK: omp.yield(%[[REF]] : !fir.ref) +!CHECK-LABEL: } init { +!CHECK: ^bb0(%{{.*}}: !fir.ref, %[[ALLOC:.*]]: !fir.ref): +!CHECK: %[[I0:[_a-z0-9]+]] = arith.constant 0 : i32 +!CHECKL fir.store [[%I0]] to %[[ALLOC]] : !fir.ref +!CHECK: omp.yield(%[[ALLOC]] : !fir.ref) !CHECK: } combiner { !CHECK: ^bb0(%[[C0:[_a-z0-9]+]]: !fir.ref, %[[C1:[_a-z0-9]+]]: !fir.ref): !CHECK: %[[LD0:.*]] = fir.load %[[C0]] : !fir.ref diff --git a/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 b/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 index 6a2eacaaf7bd1a..1457be05ca1025 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 @@ -24,11 +24,11 @@ end subroutine proc !CHECK: %[[TID_LOCAL:.*]] = alloca i32 !CHECK: %[[TID:.*]] = load i32, ptr %[[TID_ADDR]] !CHECK: store i32 %[[TID]], ptr %[[TID_LOCAL]] -!CHECK: %[[I_priv:.*]] = alloca i32 !CHECK: %[[F_priv:.*]] = alloca ptr +!CHECK: %[[I_priv:.*]] = alloca i32 +!CHECK: store ptr %{{.*}}, ptr %[[F_priv]] !CHECK: omp.reduction.init: -!CHECK: store ptr %{{.*}}, ptr %[[F_priv]] !CHECK: store i32 0, ptr %[[I_priv]] !CHECK: omp.par.region: diff --git 
a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 index 2c2f60cb72c9a1..1273b250117da4 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 @@ -19,18 +19,20 @@ program reduce end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_Uxi32 : !fir.ref>>> init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>>): +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_Uxi32 : !fir.ref>>> alloc { +! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box>> +! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>>, %[[ALLOC:.*]]: !fir.ref>>>): ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> -! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box>> ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box>>) -> !fir.ptr> ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ptr>) -> i64 ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 ! CHECK: %[[VAL_7:.*]] = arith.cmpi eq, %[[VAL_5]], %[[VAL_6]] : i64 ! CHECK: fir.if %[[VAL_7]] { ! CHECK: %[[VAL_8:.*]] = fir.embox %[[VAL_4]] : (!fir.ptr>) -> !fir.box>> -! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref>>> +! CHECK: fir.store %[[VAL_8]] to %[[ALLOC]] : !fir.ref>>> ! CHECK: } else { ! CHECK: %[[VAL_9:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_10:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_9]] : (!fir.box>>, index) -> (index, index, index) @@ -43,9 +45,9 @@ program reduce ! CHECK: %[[VAL_17:.*]] = fir.shape_shift %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_18:.*]] = fir.rebox %[[VAL_14]]#0(%[[VAL_17]]) : (!fir.box>, !fir.shapeshift<1>) -> !fir.box>> ! CHECK: hlfir.assign %[[VAL_1]] to %[[VAL_18]] : i32, !fir.box>> -! CHECK: fir.store %[[VAL_18]] to %[[VAL_3]] : !fir.ref>>> +! 
CHECK: fir.store %[[VAL_18]] to %[[ALLOC]] : !fir.ref>>> ! CHECK: } -! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>>) +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>>>) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>>, %[[VAL_1:.*]]: !fir.ref>>>): ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> diff --git a/flang/test/Lower/OpenMP/parallel-reduction3.f90 b/flang/test/Lower/OpenMP/parallel-reduction3.f90 index 669d528a8ae14a..441dff34553d4f 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction3.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction3.f90 @@ -1,11 +1,13 @@ ! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref>> init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>): +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref>> alloc { +! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> +! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[ALLOC:.*]]: !fir.ref>>): ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> -! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1> @@ -17,8 +19,8 @@ ! CHECK: %[[SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[REBOX:.*]] = fir.rebox %[[VAL_7]]#0(%[[SHIFT]]) : (!fir.box>, !fir.shapeshift<1>) -> !fir.box> ! CHECK: hlfir.assign %[[VAL_1]] to %[[REBOX]] : i32, !fir.box> -! CHECK: fir.store %[[REBOX]] to %[[VAL_8]] : !fir.ref>> -! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) +! CHECK: fir.store %[[REBOX]] to %[[ALLOC]] : !fir.ref>> +! 
CHECK: omp.yield(%[[ALLOC]] : !fir.ref>>) ! CHECK: } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[VAL_1:.*]]: !fir.ref>>): ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> diff --git a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 index 208cda28a3e594..d7af34923827cd 100644 --- a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 +++ b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 @@ -9,11 +9,13 @@ subroutine max_array_reduction(l, r) !$omp end parallel end subroutine -! CHECK-LABEL: omp.declare_reduction @max_byref_box_Uxi32 : !fir.ref>> init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>): +! CHECK-LABEL: omp.declare_reduction @max_byref_box_Uxi32 : !fir.ref>> alloc { +! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box> +! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[ALLOC:.*]]: !fir.ref>>): ! CHECK: %[[VAL_1:.*]] = arith.constant -2147483648 : i32 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> -! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box> ! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_4]] : (!fir.box>, index) -> (index, index, index) ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1> @@ -25,8 +27,8 @@ subroutine max_array_reduction(l, r) ! CHECK: %[[VAL_12:.*]] = fir.shape_shift %[[VAL_11]]#0, %[[VAL_11]]#1 : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_13:.*]] = fir.rebox %[[VAL_9]]#0(%[[VAL_12]]) : (!fir.box>, !fir.shapeshift<1>) -> !fir.box> ! CHECK: hlfir.assign %[[VAL_1]] to %[[VAL_13]] : i32, !fir.box> -! CHECK: fir.store %[[VAL_13]] to %[[VAL_3]] : !fir.ref>> -! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>) +! CHECK: fir.store %[[VAL_13]] to %[[ALLOC]] : !fir.ref>> +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>>) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[VAL_1:.*]]: !fir.ref>>): ! 
CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> diff --git a/flang/test/Lower/OpenMP/sections-array-reduction.f90 b/flang/test/Lower/OpenMP/sections-array-reduction.f90 index 709d4c444dd0fa..e5319e8d6bcc79 100644 --- a/flang/test/Lower/OpenMP/sections-array-reduction.f90 +++ b/flang/test/Lower/OpenMP/sections-array-reduction.f90 @@ -14,7 +14,10 @@ subroutine sectionsReduction(x) end subroutine -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref>> init { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref>> alloc { +! [...] +! CHECK: omp.yield +! CHECK-LABEL: } init { ! [...] ! CHECK: omp.yield ! CHECK-LABEL: } combiner { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90 index 8dc2b43ad56a34..67d8964622275e 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90 @@ -2,12 +2,14 @@ ! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s ! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_f64 : !fir.ref -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref): -! CHECK: %[[C0_1:.*]] = arith.constant 0.000000e+00 : f64 +! CHECK-SAME: alloc { ! CHECK: %[[REF:.*]] = fir.alloca f64 -! CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref -! CHECK: omp.yield(%[[REF]] : !fir.ref) +! CHECK: omp.yield(%[[REF:.*]] : !fir.ref) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref, %[[ALLOC:.*]]: !fir.ref): +! CHECK: %[[C0_1:.*]] = arith.constant 0.000000e+00 : f64 +! CHECK: fir.store %[[C0_1]] to %[[ALLOC]] : !fir.ref +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): @@ -19,12 +21,14 @@ ! CHECK: } ! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_i64 : !fir.ref -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref): -! 
CHECK: %[[C0_1:.*]] = arith.constant 0 : i64 +! CHECK-SAME: alloc { ! CHECK: %[[REF:.*]] = fir.alloca i64 -! CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref -! CHECK: omp.yield(%[[REF]] : !fir.ref) +! CHECK: omp.yield(%[[REF:.*]] : !fir.ref) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref, %[[ALLOC:.*]]: !fir.ref): +! CHECK: %[[C0_1:.*]] = arith.constant 0 : i64 +! CHECK: fir.store %[[C0_1]] to %[[ALLOC]] : !fir.ref +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): @@ -36,12 +40,14 @@ ! CHECK: } ! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_f32 : !fir.ref -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref): -! CHECK: %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32 -! CHECK: %[[REF:.*]] = fir.alloca f32 -! CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref +! CHECK-SAME: alloc { +! CHECK: %[[REF:.*]] = fir.alloca f32 ! CHECK: omp.yield(%[[REF]] : !fir.ref) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref, %[[ALLOC:.*]]: !fir.ref): +! CHECK: %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32 +! CHECK: fir.store %[[C0_1]] to %[[ALLOC]] : !fir.ref +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): @@ -53,12 +59,14 @@ ! CHECK: } ! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_i32 : !fir.ref -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref): -! CHECK: %[[C0_1:.*]] = arith.constant 0 : i32 -! CHECK: %[[REF:.*]] = fir.alloca i32 -! CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref +! CHECK-SAME: alloc { +! CHECK: %[[REF:.*]] = fir.alloca i32 ! CHECK: omp.yield(%[[REF]] : !fir.ref) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref, %[[ALLOC:.*]]: !fir.ref): +! CHECK: %[[C0_1:.*]] = arith.constant 0 : i32 +! CHECK: fir.store %[[C0_1]] to %[[ALLOC]] : !fir.ref +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref) ! 
CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir-byref.f90 deleted file mode 100644 index cef86d1c1bd494..00000000000000 --- a/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir-byref.f90 +++ /dev/null @@ -1,58 +0,0 @@ -! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s -! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s - -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_i32 : !fir.ref -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref): -! CHECK: %[[C0_1:.*]] = arith.constant 0 : i32 -! CHECK: %[[REF:.*]] = fir.alloca i32 -! CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref -! CHECK: omp.yield(%[[REF]] : !fir.ref) - -! CHECK-LABEL: } combiner { -! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): -! CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref -! CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref -! CHECK: %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i32 -! CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref -! CHECK: omp.yield(%[[ARG0]] : !fir.ref) -! CHECK: } - -! CHECK-LABEL: func.func @_QPsimple_int_reduction() -! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"} -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"} -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32 -! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref -! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! 
CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 -! CHECK: %[[VAL_8:.*]] = arith.constant 100 : i32 -! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @add_reduction_byref_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { -! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { -! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref -! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref -! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref -! CHECK: %[[VAL_15:.*]] = arith.addi %[[VAL_13]], %[[VAL_14]] : i32 -! CHECK: hlfir.assign %[[VAL_15]] to %[[VAL_12]]#0 : i32, !fir.ref -! CHECK: omp.yield -! CHECK: omp.terminator -! CHECK: omp.terminator -! CHECK: return - - -subroutine simple_int_reduction - integer :: x - x = 0 - !$omp parallel - !$omp do reduction(+:x) - do i=1, 100 - x = x + i - end do - !$omp end do - !$omp end parallel -end subroutine diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir.f90 deleted file mode 100644 index d0ba2cdff8174d..00000000000000 --- a/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir.f90 +++ /dev/null @@ -1,54 +0,0 @@ -! RUN: bbc -emit-hlfir -fopenmp %s -o - | FileCheck %s -! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s - -! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py - -! CHECK-LABEL: omp.declare_reduction @add_reduction_i32 : i32 init { -! CHECK: ^bb0(%[[VAL_0:.*]]: i32): -! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 -! CHECK: omp.yield(%[[VAL_1]] : i32) - -! CHECK-LABEL: } combiner { -! CHECK: ^bb0(%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32): -! 
CHECK: %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i32 -! CHECK: omp.yield(%[[VAL_2]] : i32) -! CHECK: } - -! CHECK-LABEL: func.func @_QPsimple_int_reduction() -! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"} -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"} -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32 -! CHECK: hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref -! CHECK: omp.parallel { -! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 -! CHECK: %[[VAL_8:.*]] = arith.constant 100 : i32 -! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@add_reduction_i32 %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref) { -! CHECK-NEXT: omp.loop_nest (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) { -! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref -! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref -! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref -! CHECK: %[[VAL_15:.*]] = arith.addi %[[VAL_13]], %[[VAL_14]] : i32 -! CHECK: hlfir.assign %[[VAL_15]] to %[[VAL_12]]#0 : i32, !fir.ref -! CHECK: omp.yield -! CHECK: omp.terminator -! CHECK: omp.terminator -! 
CHECK: return - - -subroutine simple_int_reduction - integer :: x - x = 0 - !$omp parallel - !$omp do reduction(+:x) - do i=1, 100 - x = x + i - end do - !$omp end do - !$omp end parallel -end subroutine diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 index cda7593b217ab2..6b901bae539ff9 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 @@ -32,18 +32,20 @@ program reduce15 print *,"min: ", mins end program -! CHECK-LABEL: omp.declare_reduction @min_byref_box_heap_Uxi32 : !fir.ref>>> init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>>): +! CHECK-LABEL: omp.declare_reduction @min_byref_box_heap_Uxi32 : !fir.ref>>> alloc { +! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box>> +! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>>, %[[ALLOC:.*]]: !fir.ref>>>): ! CHECK: %[[VAL_1:.*]] = arith.constant 2147483647 : i32 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> -! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box>> ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box>>) -> !fir.heap> ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.heap>) -> i64 ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 ! CHECK: %[[VAL_7:.*]] = arith.cmpi eq, %[[VAL_5]], %[[VAL_6]] : i64 ! CHECK: fir.if %[[VAL_7]] { ! CHECK: %[[VAL_8:.*]] = fir.embox %[[VAL_4]] : (!fir.heap>) -> !fir.box>> -! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref>>> +! CHECK: fir.store %[[VAL_8]] to %[[ALLOC]] : !fir.ref>>> ! CHECK: } else { ! CHECK: %[[VAL_9:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_10:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_9]] : (!fir.box>>, index) -> (index, index, index) @@ -56,9 +58,9 @@ program reduce15 ! 
CHECK: %[[VAL_17:.*]] = fir.shape_shift %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_18:.*]] = fir.rebox %[[VAL_14]]#0(%[[VAL_17]]) : (!fir.box>, !fir.shapeshift<1>) -> !fir.box>> ! CHECK: hlfir.assign %[[VAL_1]] to %[[VAL_18]] : i32, !fir.box>> -! CHECK: fir.store %[[VAL_18]] to %[[VAL_3]] : !fir.ref>>> +! CHECK: fir.store %[[VAL_18]] to %[[ALLOC]] : !fir.ref>>> ! CHECK: } -! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>>) +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>>>) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>>, %[[VAL_1:.*]]: !fir.ref>>>): ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> @@ -89,18 +91,20 @@ program reduce15 ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: omp.declare_reduction @max_byref_box_heap_Uxi32 : !fir.ref>>> init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>>): +! CHECK-LABEL: omp.declare_reduction @max_byref_box_heap_Uxi32 : !fir.ref>>> alloc { +! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box>> +! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>>, %[[ALLOC:.*]]: !fir.ref>>>): ! CHECK: %[[VAL_1:.*]] = arith.constant -2147483648 : i32 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> -! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box>> ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box>>) -> !fir.heap> ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.heap>) -> i64 ! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 ! CHECK: %[[VAL_7:.*]] = arith.cmpi eq, %[[VAL_5]], %[[VAL_6]] : i64 ! CHECK: fir.if %[[VAL_7]] { ! CHECK: %[[VAL_8:.*]] = fir.embox %[[VAL_4]] : (!fir.heap>) -> !fir.box>> -! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref>>> +! CHECK: fir.store %[[VAL_8]] to %[[ALLOC]] : !fir.ref>>> ! CHECK: } else { ! CHECK: %[[VAL_9:.*]] = arith.constant 0 : index ! 
CHECK: %[[VAL_10:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_9]] : (!fir.box>>, index) -> (index, index, index) @@ -113,9 +117,9 @@ program reduce15 ! CHECK: %[[VAL_17:.*]] = fir.shape_shift %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_18:.*]] = fir.rebox %[[VAL_14]]#0(%[[VAL_17]]) : (!fir.box>, !fir.shapeshift<1>) -> !fir.box>> ! CHECK: hlfir.assign %[[VAL_1]] to %[[VAL_18]] : i32, !fir.box>> -! CHECK: fir.store %[[VAL_18]] to %[[VAL_3]] : !fir.ref>>> +! CHECK: fir.store %[[VAL_18]] to %[[ALLOC]] : !fir.ref>>> ! CHECK: } -! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>>) +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>>>) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>>, %[[VAL_1:.*]]: !fir.ref>>>): ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 index 3c5388b7e5d906..66db62a36bc175 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 @@ -18,25 +18,27 @@ program reduce end program -! CHECK: omp.declare_reduction @add_reduction_byref_box_heap_i32 : !fir.ref>> init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>): +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_i32 : !fir.ref>> alloc { +! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box> +! CHECK: omp.yield(%[[VAL_2]] : !fir.ref>>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[ALLOC:.*]]: !fir.ref>>): ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[LOAD:.*]] = fir.load %[[VAL_0]] : !fir.ref>> -! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box> ! CHECK: %[[ADDR:.*]] = fir.box_addr %[[LOAD]] : (!fir.box>) -> !fir.heap ! CHECK: %[[ADDRI:.*]] = fir.convert %[[ADDR]] : (!fir.heap) -> i64 ! CHECK: %[[C0_I64:.*]] = arith.constant 0 : i64 ! CHECK: %[[IS_NULL:.*]] = arith.cmpi eq, %[[ADDRI]], %[[C0_I64]] : i64 ! 
CHECK: fir.if %[[IS_NULL]] { ! CHECK: %[[NULL_BOX:.*]] = fir.embox %[[ADDR]] : (!fir.heap) -> !fir.box> -! CHECK: fir.store %[[NULL_BOX]] to %[[VAL_2]] : !fir.ref> +! CHECK: fir.store %[[NULL_BOX]] to %[[ALLOC]] : !fir.ref> ! CHECK: } else { ! CHECK: %[[VAL_3:.*]] = fir.allocmem i32 ! CHECK: fir.store %[[VAL_1]] to %[[VAL_3]] : !fir.heap ! CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_3]] : (!fir.heap) -> !fir.box> -! CHECK: fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref>> +! CHECK: fir.store %[[VAL_4]] to %[[ALLOC]] : !fir.ref>> ! CHECK: } -! CHECK: omp.yield(%[[VAL_2]] : !fir.ref>>) +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>>) ! CHECK: } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[VAL_1:.*]]: !fir.ref>>): ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 index b79c3b4f749d2a..c984ab61bedb3b 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 @@ -22,11 +22,13 @@ subroutine reduce(r) end subroutine end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf64 : !fir.ref>> init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>): +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf64 : !fir.ref>> alloc { +! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> +! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[ALLOC:.*]]: !fir.ref>>): ! CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> -! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) ! 
CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1> @@ -38,8 +40,8 @@ subroutine reduce(r) ! CHECK: %[[SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[REBOX:.*]] = fir.rebox %[[VAL_7]]#0(%[[SHIFT]]) : (!fir.box>, !fir.shapeshift<1>) -> !fir.box ! CHECK: hlfir.assign %[[VAL_1]] to %[[REBOX]] : f64, !fir.box> -! CHECK: fir.store %[[REBOX]] to %[[VAL_8]] : !fir.ref>> -! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) +! CHECK: fir.store %[[REBOX]] to %[[ALLOC]] : !fir.ref>> +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>>) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[VAL_1:.*]]: !fir.ref>>): ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 index b92a096de4e1ca..43e4c86b6bade2 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 @@ -14,11 +14,13 @@ program reduce print *,r end program -! CHECK-LABEL omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref>> init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>): +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref>> alloc { +! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> +! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[ALLOC:.*]]: !fir.ref>>): ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref>> -! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> ! CHECK: %[[VAL_4:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_1:.*]] = fir.allocmem !fir.array<2xi32> {bindc_name = ".tmp", uniq_name = ""} @@ -29,8 +31,8 @@ program reduce ! CHECK: %[[SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1> ! 
CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[SHIFT]]) : (!fir.heap>, !fir.shapeshift<1>) -> !fir.box> ! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box> -! CHECK: fir.store %[[VAL_7]] to %[[VAL_8]] : !fir.ref>> -! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) +! CHECK: fir.store %[[VAL_7]] to %[[ALLOC]] : !fir.ref>> +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>>) ! CHECK-LABEL } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[VAL_1:.*]]: !fir.ref>>): diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 index 9105a76ec6e97b..be5273ea36c99f 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 @@ -14,11 +14,13 @@ program reduce print *,r end program -! CHECK-LABEL omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref>> init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>): +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref>> alloc { +! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> +! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[ALLOC:.*]]: !fir.ref>>): ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref>> -! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> ! CHECK: %[[VAL_4:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_1:.*]] = fir.allocmem !fir.array<2xi32> {bindc_name = ".tmp", uniq_name = ""} @@ -29,8 +31,8 @@ program reduce ! CHECK: %[[SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[SHIFT]]) : (!fir.heap>, !fir.shapeshift<1>) -> !fir.box> ! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box> -! CHECK: fir.store %[[VAL_7]] to %[[VAL_8]] : !fir.ref>> -! 
CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) +! CHECK: fir.store %[[VAL_7]] to %[[ALLOC]] : !fir.ref>> +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>>) ! CHECK: } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[VAL_1:.*]]: !fir.ref>>): diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90 index 8eb4f4c6eb4c7a..0696236e8f0736 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90 @@ -4,12 +4,14 @@ ! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py ! CHECK-LABEL: omp.declare_reduction @iand_byref_i32 : !fir.ref -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref): -! CHECK: %[[C0_1:.*]] = arith.constant -1 : i32 +! CHECK-SAME: alloc { ! CHECK: %[[REF:.*]] = fir.alloca i32 -! CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref ! CHECK: omp.yield(%[[REF]] : !fir.ref) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref, %[[ALLOC:.*]]: !fir.ref): +! CHECK: %[[C0_1:.*]] = arith.constant -1 : i32 +! CHECK: fir.store %[[C0_1]] to %[[ALLOC]] : !fir.ref +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 index 6a5d942cb74e9f..5b0758ac3fcc1d 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 @@ -2,12 +2,14 @@ ! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s ! CHECK-LABEL: omp.declare_reduction @ieor_byref_i32 : !fir.ref -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref): -! CHECK: %[[C0_1:.*]] = arith.constant 0 : i32 +! CHECK-SAME: alloc { ! CHECK: %[[REF:.*]] = fir.alloca i32 -! CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref ! 
CHECK: omp.yield(%[[REF]] : !fir.ref) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref, %[[ALLOC:.*]]: !fir.ref): +! CHECK: %[[C0_1:.*]] = arith.constant 0 : i32 +! CHECK: fir.store %[[C0_1]] to %[[ALLOC]] : !fir.ref +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 index 2956cd9ef53c37..8604a274a659fd 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 @@ -2,12 +2,14 @@ ! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s ! CHECK-LABEL: omp.declare_reduction @ior_byref_i32 : !fir.ref -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref): -! CHECK: %[[C0_1:.*]] = arith.constant 0 : i32 +! CHECK-SAME: alloc { ! CHECK: %[[REF:.*]] = fir.alloca i32 -! CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref ! CHECK: omp.yield(%[[REF]] : !fir.ref) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref, %[[ALLOC:.*]]: !fir.ref): +! CHECK: %[[C0_1:.*]] = arith.constant 0 : i32 +! CHECK: fir.store %[[C0_1]] to %[[ALLOC]] : !fir.ref +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 index b505585e5cb0e3..ed89ee1fade8e3 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 @@ -4,13 +4,15 @@ ! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py ! CHECK-LABEL: omp.declare_reduction @and_reduction : !fir.ref> -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>): -! 
CHECK: %[[VAL_1:.*]] = arith.constant true -! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4> +! CHECK-SAME: alloc { ! CHECK: %[[REF:.*]] = fir.alloca !fir.logical<4> -! CHECK: fir.store %[[VAL_2]] to %[[REF]] : !fir.ref> ! CHECK: omp.yield(%[[REF]] : !fir.ref>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>, %[[ALLOC:.*]]: !fir.ref>): +! CHECK: %[[VAL_1:.*]] = arith.constant true +! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4> +! CHECK: fir.store %[[VAL_2]] to %[[ALLOC]] : !fir.ref> +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref>, %[[ARG1:.*]]: !fir.ref>): diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 index a103bf58e16b9a..dd2176e4f2de1e 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 @@ -4,13 +4,15 @@ ! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py ! CHECK-LABEL: omp.declare_reduction @eqv_reduction : !fir.ref> -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>): -! CHECK: %[[VAL_1:.*]] = arith.constant true -! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4> +! CHECK-SAME: alloc { ! CHECK: %[[REF:.*]] = fir.alloca !fir.logical<4> -! CHECK: fir.store %[[VAL_2]] to %[[REF]] : !fir.ref> ! CHECK: omp.yield(%[[REF]] : !fir.ref>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>, %[[ALLOC:.*]]: !fir.ref>): +! CHECK: %[[VAL_1:.*]] = arith.constant true +! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4> +! CHECK: fir.store %[[VAL_2]] to %[[ALLOC]] : !fir.ref> +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>) ! CHECK-LABEL: } combiner { ! 
CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref>, %[[ARG1:.*]]: !fir.ref>): diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 index 8abc9b61c42e53..4ce4f258f5ec18 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 @@ -4,13 +4,15 @@ ! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py ! CHECK-LABEL: omp.declare_reduction @neqv_reduction : !fir.ref> -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>): -! CHECK: %[[VAL_1:.*]] = arith.constant false -! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4> +! CHECK-SAME: alloc { ! CHECK: %[[REF:.*]] = fir.alloca !fir.logical<4> -! CHECK: fir.store %[[VAL_2]] to %[[REF]] : !fir.ref> ! CHECK: omp.yield(%[[REF]] : !fir.ref>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>, %[[ALLOC:.*]]: !fir.ref>): +! CHECK: %[[VAL_1:.*]] = arith.constant false +! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4> +! CHECK: fir.store %[[VAL_2]] to %[[ALLOC]] : !fir.ref> +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref>, %[[ARG1:.*]]: !fir.ref>): diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 index e6def280cf70df..2b750605519cf5 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 @@ -4,13 +4,15 @@ ! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py ! CHECK-LABEL: omp.declare_reduction @or_reduction : !fir.ref> -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>): -! CHECK: %[[VAL_1:.*]] = arith.constant false -! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4> +! CHECK-SAME: alloc { ! 
CHECK: %[[REF:.*]] = fir.alloca !fir.logical<4> -! CHECK: fir.store %[[VAL_2]] to %[[REF]] : !fir.ref> ! CHECK: omp.yield(%[[REF]] : !fir.ref>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>, %[[ALLOC:.*]]: !fir.ref>): +! CHECK: %[[VAL_1:.*]] = arith.constant false +! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4> +! CHECK: fir.store %[[VAL_2]] to %[[ALLOC]] : !fir.ref> +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref>, %[[ARG1:.*]]: !fir.ref>): diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 index 018fb28c6f68a8..7e4890dd00fea3 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 @@ -3,12 +3,14 @@ ! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py -!CHECK: omp.declare_reduction @max_byref_f32 : !fir.ref -!CHECK-SAME: init { -!CHECK: %[[MINIMUM_VAL:.*]] = arith.constant -3.40282347E+38 : f32 +!CHECK-LABEL: omp.declare_reduction @max_byref_f32 : !fir.ref +!CHECK-SAME: alloc { !CHECK: %[[REF:.*]] = fir.alloca f32 -!CHECK: fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref !CHECK: omp.yield(%[[REF]] : !fir.ref) +!CHECK-LABEL: } init { +!CHECK: %[[MINIMUM_VAL:.*]] = arith.constant -3.40282347E+38 : f32 +!CHECK: fir.store %[[MINIMUM_VAL]] to %[[ALLOC:.*]] : !fir.ref +!CHECK: omp.yield(%[[ALLOC]] : !fir.ref) !CHECK: combiner !CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): !CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref @@ -18,11 +20,13 @@ !CHECK: omp.yield(%[[ARG0]] : !fir.ref) !CHECK-LABEL: omp.declare_reduction @max_byref_i32 : !fir.ref -!CHECK-SAME: init { -!CHECK: %[[MINIMUM_VAL:.*]] = arith.constant -2147483648 : i32 +!CHECK-SAME: alloc { !CHECK: %[[REF:.*]] = fir.alloca i32 -!CHECK: fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref !CHECK: omp.yield(%[[REF]] : !fir.ref) 
+!CHECK-LABEL: } init { +!CHECK: %[[MINIMUM_VAL:.*]] = arith.constant -2147483648 : i32 +!CHECK: fir.store %[[MINIMUM_VAL]] to %[[ALLOC:.*]] : !fir.ref +!CHECK: omp.yield(%[[ALLOC]] : !fir.ref) !CHECK: combiner !CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): !CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90 deleted file mode 100644 index 130a580cd6851e..00000000000000 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90 +++ /dev/null @@ -1,64 +0,0 @@ -! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s -! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s - -! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py - -! CHECK-LABEL: omp.declare_reduction @max_byref_i32 : !fir.ref -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref): -! CHECK: %[[MINIMUM_VAL:.*]] = arith.constant -2147483648 : i32 -! CHECK: %[[REF:.*]] = fir.alloca i32 -! CHECK: fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref -! CHECK: omp.yield(%[[REF]] : !fir.ref) -! CHECK: combiner -! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): -! CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref -! CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref -! CHECK: %[[RES:.*]] = arith.maxsi %[[LD0]], %[[LD1]] : i32 -! CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref -! CHECK: omp.yield(%[[ARG0]] : !fir.ref) - -! CHECK-LABEL: func.func @_QPreduction_max_int( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_max_intEi"} -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! 
CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"} -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_intEy"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) -! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 -! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref -! CHECK: omp.parallel { -! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: %[[VAL_10:.*]] = arith.constant 100 : i32 -! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(byref @max_byref_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { -! CHECK-NEXT: omp.loop_nest (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) { -! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref -! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref -! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64 -! CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]]) : (!fir.box>, i64) -> !fir.ref -! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref -! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_17]] : !fir.ref -! CHECK: %[[VAL_20:.*]] = arith.cmpi sgt, %[[VAL_18]], %[[VAL_19]] : i32 -! CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_18]], %[[VAL_19]] : i32 -! CHECK: hlfir.assign %[[VAL_21]] to %[[VAL_14]]#0 : i32, !fir.ref -! CHECK: omp.yield -! CHECK: omp.terminator -! 
CHECK: omp.terminator - - -subroutine reduction_max_int(y) - integer :: x, y(:) - x = 0 - !$omp parallel - !$omp do reduction(max:x) - do i=1, 100 - x = max(x, y(i)) - end do - !$omp end do - !$omp end parallel - print *, x -end subroutine diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir.f90 deleted file mode 100644 index 23e2ae98a02780..00000000000000 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir.f90 +++ /dev/null @@ -1,60 +0,0 @@ -! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s -! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s - -! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py - -! CHECK-LABEL: omp.declare_reduction @max_i32 : i32 init { -! CHECK: ^bb0(%[[VAL_0:.*]]: i32): -! CHECK: %[[VAL_1:.*]] = arith.constant -2147483648 : i32 -! CHECK: omp.yield(%[[VAL_1]] : i32) - -! CHECK-LABEL: } combiner { -! CHECK: ^bb0(%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32): -! CHECK: %[[VAL_2:.*]] = arith.maxsi %[[VAL_0]], %[[VAL_1]] : i32 -! CHECK: omp.yield(%[[VAL_2]] : i32) -! CHECK: } - -! CHECK-LABEL: func.func @_QPreduction_max_int( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "y"}) { -! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_max_intEi"} -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"} -! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFreduction_max_intEy"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) -! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 -! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref -! 
CHECK: omp.parallel { -! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {bindc_name = "i", pinned, {{.*}}} -! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32 -! CHECK: %[[VAL_10:.*]] = arith.constant 100 : i32 -! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 -! CHECK: omp.wsloop reduction(@max_i32 %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref) { -! CHECK-NEXT: omp.loop_nest (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) { -! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref -! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref -! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64 -! CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]]) : (!fir.box>, i64) -> !fir.ref -! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref -! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_17]] : !fir.ref -! CHECK: %[[VAL_20:.*]] = arith.cmpi sgt, %[[VAL_18]], %[[VAL_19]] : i32 -! CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_18]], %[[VAL_19]] : i32 -! CHECK: hlfir.assign %[[VAL_21]] to %[[VAL_14]]#0 : i32, !fir.ref -! CHECK: omp.yield -! CHECK: omp.terminator -! CHECK: omp.terminator - - -subroutine reduction_max_int(y) - integer :: x, y(:) - x = 0 - !$omp parallel - !$omp do reduction(max:x) - do i=1, 100 - x = max(x, y(i)) - end do - !$omp end do - !$omp end parallel - print *, x -end subroutine diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 index 754b3fd400d378..41fcc979cdc9d9 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 @@ -3,12 +3,14 @@ ! 
NOTE: Assertions have been autogenerated by utils/generate-test-checks.py -!CHECK: omp.declare_reduction @min_byref_f32 : !fir.ref -!CHECK-SAME: init { -!CHECK: %[[MAXIMUM_VAL:.*]] = arith.constant 3.40282347E+38 : f32 +!CHECK-LABEL: omp.declare_reduction @min_byref_f32 : !fir.ref +!CHECK-SAME: alloc { !CHECK: %[[REF:.*]] = fir.alloca f32 -!CHECK: fir.store %[[MAXIMUM_VAL]] to %[[REF]] : !fir.ref !CHECK: omp.yield(%[[REF]] : !fir.ref) +!CHECK-LABEL: } init { +!CHECK: %[[MAXIMUM_VAL:.*]] = arith.constant 3.40282347E+38 : f32 +!CHECK: fir.store %[[MAXIMUM_VAL]] to %[[ALLOC:.*]] : !fir.ref +!CHECK: omp.yield(%[[ALLOC]] : !fir.ref) !CHECK: combiner !CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): !CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref @@ -18,11 +20,13 @@ !CHECK: omp.yield(%[[ARG0]] : !fir.ref) !CHECK-LABEL: omp.declare_reduction @min_byref_i32 : !fir.ref -!CHECK-SAME: init { -!CHECK: %[[MAXIMUM_VAL:.*]] = arith.constant 2147483647 : i32 +!CHECK-SAME: alloc { !CHECK: %[[REF:.*]] = fir.alloca i32 -!CHECK: fir.store %[[MAXIMUM_VAL]] to %[[REF]] : !fir.ref !CHECK: omp.yield(%[[REF]] : !fir.ref) +!CHECK-LABEL: } init { +!CHECK: %[[MAXIMUM_VAL:.*]] = arith.constant 2147483647 : i32 +!CHECK: fir.store %[[MAXIMUM_VAL]] to %[[ALLOC:.*]] : !fir.ref +!CHECK: omp.yield(%[[ALLOC]] : !fir.ref) !CHECK: combiner !CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): !CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90 index eddb9112d3b0c9..28b78e41be2a08 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90 @@ -5,12 +5,14 @@ ! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py ! CHECK-LABEL: omp.declare_reduction @multiply_reduction_byref_f64 : !fir.ref -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref): -! 
CHECK: %[[VAL_1:.*]] = arith.constant 1.000000e+00 : f64 +! CHECK-SAME: alloc { ! CHECK: %[[REF:.*]] = fir.alloca f64 -! CHECK: fir.store %[[VAL_1]] to %[[REF]] : !fir.ref ! CHECK: omp.yield(%[[REF]] : !fir.ref) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref, %[[ALLOC:.*]]: !fir.ref): +! CHECK: %[[VAL_1:.*]] = arith.constant 1.000000e+00 : f64 +! CHECK: fir.store %[[VAL_1]] to %[[ALLOC]] : !fir.ref +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): @@ -22,12 +24,14 @@ ! CHECK: } ! CHECK-LABEL: omp.declare_reduction @multiply_reduction_byref_i64 : !fir.ref -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref): -! CHECK: %[[VAL_1:.*]] = arith.constant 1 : i64 +! CHECK-SAME: alloc { ! CHECK: %[[REF:.*]] = fir.alloca i64 -! CHECK: fir.store %[[VAL_1]] to %[[REF]] : !fir.ref ! CHECK: omp.yield(%[[REF]] : !fir.ref) +! CHECK-LABE: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref, %[[ALLOC:.*]]: !fir.ref): +! CHECK: %[[VAL_1:.*]] = arith.constant 1 : i64 +! CHECK: fir.store %[[VAL_1]] to %[[ALLOC]] : !fir.ref +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): @@ -39,12 +43,14 @@ ! CHECK: } ! CHECK-LABEL: omp.declare_reduction @multiply_reduction_byref_f32 : !fir.ref -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref): -! CHECK: %[[VAL_1:.*]] = arith.constant 1.000000e+00 : f32 +! CHECK-SAME: alloc { ! CHECK: %[[REF:.*]] = fir.alloca f32 -! CHECK: fir.store %[[VAL_1]] to %[[REF]] : !fir.ref ! CHECK: omp.yield(%[[REF]] : !fir.ref) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref, %[[ALLOC:.*]]: !fir.ref): +! CHECK: %[[VAL_1:.*]] = arith.constant 1.000000e+00 : f32 +! CHECK: fir.store %[[VAL_1]] to %[[ALLOC]] : !fir.ref +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref) ! CHECK-LABEL: } combiner { ! 
CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): @@ -56,12 +62,14 @@ ! CHECK: } ! CHECK-LABEL: omp.declare_reduction @multiply_reduction_byref_i32 : !fir.ref -! CHECK-SAME: init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref): -! CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32 +! CHECK-SAME: alloc { ! CHECK: %[[REF:.*]] = fir.alloca i32 -! CHECK: fir.store %[[VAL_1]] to %[[REF]] : !fir.ref ! CHECK: omp.yield(%[[REF]] : !fir.ref) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref, %[[ALLOC:.*]]: !fir.ref): +! CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32 +! CHECK: fir.store %[[VAL_1]] to %[[ALLOC]] : !fir.ref +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 index eb7f7a59d5d524..db4b4d33da7579 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 @@ -24,11 +24,13 @@ program main endprogram -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x3xf64 : !fir.ref>> init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>): +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x3xf64 : !fir.ref>> alloc { +! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box> +! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[ALLOC:.*]]: !fir.ref>>): ! CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> -! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box> ! CHECK: %[[VAL_4:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_5:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_4]], %[[VAL_5]] : (index, index) -> !fir.shape<2> @@ -42,8 +44,8 @@ program main ! 
CHECK: %[[VAL_14:.*]] = fir.shape_shift %[[VAL_11]]#0, %[[VAL_11]]#1, %[[VAL_13]]#0, %[[VAL_13]]#1 : (index, index, index, index) -> !fir.shapeshift<2> ! CHECK: %[[VAL_15:.*]] = fir.embox %[[VAL_9]]#0(%[[VAL_14]]) : (!fir.heap>, !fir.shapeshift<2>) -> !fir.box> ! CHECK: hlfir.assign %[[VAL_1]] to %[[VAL_15]] : f64, !fir.box> -! CHECK: fir.store %[[VAL_15]] to %[[VAL_3]] : !fir.ref>> -! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>) +! CHECK: fir.store %[[VAL_15]] to %[[ALLOC]] : !fir.ref>> +! CHECK: omp.yield(%[[ALLOC]] : !fir.ref>>) ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[VAL_1:.*]]: !fir.ref>>): diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 index aab6efbcbc5fe7..8d4aa8cd830389 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 @@ -18,11 +18,13 @@ program reduce_pointer deallocate(v) end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_i32 : !fir.ref>> init { -! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>): +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_i32 : !fir.ref>> alloc { +! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box> +! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>) +! CHECK-LABEL: } init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[VAL_3:.*]]: !fir.ref>>): ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> -! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box> ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box>) -> !fir.ptr ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ptr) -> i64 ! 
CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 From d163935585cd8d538da8326a8e4e9e7da8aa1755 Mon Sep 17 00:00:00 2001 From: Volodymyr Vasylkun Date: Thu, 22 Aug 2024 14:18:48 +0100 Subject: [PATCH 192/426] [InstCombine] Fold `scmp(x -nsw y, 0)` to `scmp(x, y)` (#105583) Proof: https://alive2.llvm.org/ce/z/v6VtXz --- .../InstCombine/InstCombineCalls.cpp | 9 ++++ llvm/test/Transforms/InstCombine/scmp.ll | 51 +++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 2c2e1bc4686a4e..eb94e894b57b06 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1929,6 +1929,15 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { break; } + case Intrinsic::scmp: { + Value *I0 = II->getArgOperand(0), *I1 = II->getArgOperand(1); + Value *LHS, *RHS; + if (match(I0, m_NSWSub(m_Value(LHS), m_Value(RHS))) && match(I1, m_Zero())) + return replaceInstUsesWith( + CI, + Builder.CreateIntrinsic(II->getType(), Intrinsic::scmp, {LHS, RHS})); + break; + } case Intrinsic::bitreverse: { Value *IIOperand = II->getArgOperand(0); // bitrev (zext i1 X to ?) --> X ? 
SignBitC : 0 diff --git a/llvm/test/Transforms/InstCombine/scmp.ll b/llvm/test/Transforms/InstCombine/scmp.ll index 7f374c5f9a1d64..a3334599a67f1c 100644 --- a/llvm/test/Transforms/InstCombine/scmp.ll +++ b/llvm/test/Transforms/InstCombine/scmp.ll @@ -264,3 +264,54 @@ define i8 @scmp_from_select_ge(i32 %x, i32 %y) { %r = select i1 %ge, i8 %ne, i8 -1 ret i8 %r } + +; Fold scmp(x nsw- y, 0) to scmp(x, y) +define i8 @scmp_of_sub_and_zero(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @scmp_of_sub_and_zero( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: ret i8 [[R]] +; + %diff = sub nsw i32 %x, %y + %r = call i8 @llvm.scmp(i32 %diff, i32 0) + ret i8 %r +} + +; Negative test: no nsw +define i8 @scmp_of_sub_and_zero_neg_1(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @scmp_of_sub_and_zero_neg_1( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[X]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[DIFF]], i32 0) +; CHECK-NEXT: ret i8 [[R]] +; + %diff = sub i32 %x, %y + %r = call i8 @llvm.scmp(i32 %diff, i32 0) + ret i8 %r +} + +; Negative test: second argument of scmp is not 0 +define i8 @scmp_of_sub_and_zero_neg2(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @scmp_of_sub_and_zero_neg2( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[DIFF:%.*]] = sub nsw i32 [[X]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[DIFF]], i32 15) +; CHECK-NEXT: ret i8 [[R]] +; + %diff = sub nsw i32 %x, %y + %r = call i8 @llvm.scmp(i32 %diff, i32 15) + ret i8 %r +} + +; Negative test: calling ucmp instead of scmp +define i8 @scmp_of_sub_and_zero_neg3(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @scmp_of_sub_and_zero_neg3( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[DIFF:%.*]] = sub nsw i32 [[X]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[DIFF]], i32 0) +; CHECK-NEXT: ret i8 [[R]] +; 
+ %diff = sub nsw i32 %x, %y + %r = call i8 @llvm.ucmp(i32 %diff, i32 0) + ret i8 %r +} From c82f7976ae20a7c76904415eae1964bab78f1a04 Mon Sep 17 00:00:00 2001 From: Krystian Stasiowski Date: Thu, 22 Aug 2024 09:22:33 -0400 Subject: [PATCH 193/426] [Clang][Sema] Rebuild template parameters for out-of-line template definitions and partial specializations (#104030) We need to rebuild the template parameters of out-of-line definitions/specializations of member templates in the context of the current instantiation for the purposes of declaration matching. We already do this for function templates and class templates, but not variable templates, partial specializations of variable template, and partial specializations of class templates. This patch fixes the latter cases. --- clang/docs/ReleaseNotes.rst | 2 + clang/lib/Sema/SemaDecl.cpp | 6 + clang/lib/Sema/SemaTemplate.cpp | 20 ++-- .../test/CXX/temp/temp.decls/temp.mem/p1.cpp | 113 +++++++++++++++++- 4 files changed, 131 insertions(+), 10 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index bb47350f76b308..12a924acc14331 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -294,6 +294,8 @@ Bug Fixes to C++ Support - Correctly check constraints of explicit instantiations of member functions. (#GH46029) - Fixed an assertion failure about a constraint of a friend function template references to a value with greater template depth than the friend function template. (#GH98258) +- Clang now rebuilds the template parameters of out-of-line declarations and specializations in the context + of the current instantiation in all cases. 
Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 503e93f9257137..b0ccbbe34b70c3 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -7502,6 +7502,12 @@ NamedDecl *Sema::ActOnVariableDeclarator( /*never a friend*/ false, IsMemberSpecialization, Invalid); if (TemplateParams) { + if (DC->isDependentContext()) { + ContextRAII SavedContext(*this, DC); + if (RebuildTemplateParamsInCurrentInstantiation(TemplateParams)) + Invalid = true; + } + if (!TemplateParams->size() && D.getName().getKind() != UnqualifiedIdKind::IK_TemplateId) { // There is an extraneous 'template<>' for this variable. Complain diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 992565701d40ca..f8f41d0bafffc3 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -8089,13 +8089,14 @@ DeclResult Sema::ActOnClassTemplateSpecialization( return true; } + DeclContext *DC = ClassTemplate->getDeclContext(); + bool isMemberSpecialization = false; bool isPartialSpecialization = false; if (SS.isSet()) { if (TUK != TagUseKind::Reference && TUK != TagUseKind::Friend && - diagnoseQualifiedDeclaration(SS, ClassTemplate->getDeclContext(), - ClassTemplate->getDeclName(), + diagnoseQualifiedDeclaration(SS, DC, ClassTemplate->getDeclName(), TemplateNameLoc, &TemplateId, /*IsMemberSpecialization=*/false)) return true; @@ -8117,6 +8118,12 @@ DeclResult Sema::ActOnClassTemplateSpecialization( if (TemplateParams && CheckTemplateDeclScope(S, TemplateParams)) return true; + if (TemplateParams && DC->isDependentContext()) { + ContextRAII SavedContext(*this, DC); + if (RebuildTemplateParamsInCurrentInstantiation(TemplateParams)) + return true; + } + if (TemplateParams && TemplateParams->size() > 0) { isPartialSpecialization = true; @@ -8282,9 +8289,8 @@ DeclResult Sema::ActOnClassTemplateSpecialization( = cast_or_null(PrevDecl); 
ClassTemplatePartialSpecializationDecl *Partial = ClassTemplatePartialSpecializationDecl::Create( - Context, Kind, ClassTemplate->getDeclContext(), KWLoc, - TemplateNameLoc, TemplateParams, ClassTemplate, CanonicalConverted, - CanonType, PrevPartial); + Context, Kind, DC, KWLoc, TemplateNameLoc, TemplateParams, + ClassTemplate, CanonicalConverted, CanonType, PrevPartial); Partial->setTemplateArgsAsWritten(TemplateArgs); SetNestedNameSpecifier(*this, Partial, SS); if (TemplateParameterLists.size() > 1 && SS.isSet()) { @@ -8306,8 +8312,8 @@ DeclResult Sema::ActOnClassTemplateSpecialization( // Create a new class template specialization declaration node for // this explicit specialization or friend declaration. Specialization = ClassTemplateSpecializationDecl::Create( - Context, Kind, ClassTemplate->getDeclContext(), KWLoc, TemplateNameLoc, - ClassTemplate, CanonicalConverted, PrevDecl); + Context, Kind, DC, KWLoc, TemplateNameLoc, ClassTemplate, + CanonicalConverted, PrevDecl); Specialization->setTemplateArgsAsWritten(TemplateArgs); SetNestedNameSpecifier(*this, Specialization, SS); if (TemplateParameterLists.size() > 0) { diff --git a/clang/test/CXX/temp/temp.decls/temp.mem/p1.cpp b/clang/test/CXX/temp/temp.decls/temp.mem/p1.cpp index b48e145e1468db..4ec41521f9a3b1 100644 --- a/clang/test/CXX/temp/temp.decls/temp.mem/p1.cpp +++ b/clang/test/CXX/temp/temp.decls/temp.mem/p1.cpp @@ -1,9 +1,8 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s -// expected-no-diagnostics +// RUN: %clang_cc1 -fsyntax-only -std=c++20 -verify %s template struct A { static T cond; - + template struct B { static T twice(U value) { return (cond ? 
value + value : value); @@ -35,3 +34,111 @@ namespace PR6376 { Z z0; } + +namespace OutOfLine { + template + struct A { + struct B { }; + + template + void f(); + + template + void g() { } // expected-note {{previous definition is here}} + + template + static int x; + + template + static int x; + + template + static inline int x = 0; // expected-note {{previous definition is here}} + + template + struct C; + + template + struct C; + + template + struct C { }; // expected-note {{previous definition is here}} + }; + + template + template::B V> + void A::f() { } + + template + template::B V> + void A::g() { } // expected-error {{redefinition of 'g'}} + + template + template::B V> + int A::x = 0; + + template + template::B V> + int A::x = 0; + + template + template::B V> + int A::x = 0; // expected-error {{redefinition of 'x'}} + + template + template::B V> + struct A::C { }; + + template + template::B V> + struct A::C { }; + + template + template::B V> + struct A::C { }; // expected-error {{redefinition of 'C'}} + + // FIXME: Crashes when parsing the non-type template parameter prior to C++20 + template<> + template::B V> + void A::f() { } + + template<> + template::B V> + void A::g() { } // expected-note {{previous definition is here}} + + template<> + template::B V> + void A::g() { } // expected-error {{redefinition of 'g'}} + + template<> + template::B V> + int A::x = 0; + + template<> + template::B V> + int A::x = 0; + + template<> + template::B V> + int A::x = 0; // expected-note {{previous definition is here}} + + template<> + template::B V> + int A::x = 0; // expected-error {{redefinition of 'x'}} + + template<> + template::B V> + struct A::C { }; + + template<> + template::B V> + struct A::C { }; + + template<> + template::B V> + struct A::C { }; // expected-note {{previous definition is here}} + + template<> + template::B V> + struct A::C { }; // expected-error {{redefinition of 'C'}} +} From db94852b9b4ca1008ef2889175fe3af51f26a5b0 Mon Sep 17 00:00:00 2001 
From: Timm Baeder Date: Thu, 22 Aug 2024 15:23:50 +0200 Subject: [PATCH 194/426] [clang][bytecode] Allow adding offsets to function pointers (#105641) Convert them to Pointers, do the offset calculation and then convert them back to function pointers. --- clang/lib/AST/ByteCode/Compiler.cpp | 40 ++++++++++++++++---- clang/lib/AST/ByteCode/FunctionPointer.cpp | 43 ++++++++++++++++++++++ clang/lib/AST/ByteCode/FunctionPointer.h | 41 +++++---------------- clang/lib/AST/ByteCode/Interp.h | 37 ++++++++++++++++--- clang/lib/AST/ByteCode/Pointer.h | 6 ++- clang/lib/AST/CMakeLists.txt | 1 + clang/test/AST/ByteCode/c.c | 16 ++++++++ 7 files changed, 139 insertions(+), 45 deletions(-) create mode 100644 clang/lib/AST/ByteCode/FunctionPointer.cpp diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 9d376641f9c5a3..3a3927a9671345 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -885,12 +885,21 @@ bool Compiler::VisitPointerArithBinOp(const BinaryOperator *E) { if (!LT || !RT) return false; + // Visit the given pointer expression and optionally convert to a PT_Ptr. 
+ auto visitAsPointer = [&](const Expr *E, PrimType T) -> bool { + if (!this->visit(E)) + return false; + if (T != PT_Ptr) + return this->emitDecayPtr(T, PT_Ptr, E); + return true; + }; + if (LHS->getType()->isPointerType() && RHS->getType()->isPointerType()) { if (Op != BO_Sub) return false; assert(E->getType()->isIntegerType()); - if (!visit(RHS) || !visit(LHS)) + if (!visitAsPointer(RHS, *RT) || !visitAsPointer(LHS, *LT)) return false; return this->emitSubPtr(classifyPrim(E->getType()), E); @@ -898,21 +907,38 @@ bool Compiler::VisitPointerArithBinOp(const BinaryOperator *E) { PrimType OffsetType; if (LHS->getType()->isIntegerType()) { - if (!visit(RHS) || !visit(LHS)) + if (!visitAsPointer(RHS, *RT)) + return false; + if (!this->visit(LHS)) return false; OffsetType = *LT; } else if (RHS->getType()->isIntegerType()) { - if (!visit(LHS) || !visit(RHS)) + if (!visitAsPointer(LHS, *LT)) + return false; + if (!this->visit(RHS)) return false; OffsetType = *RT; } else { return false; } - if (Op == BO_Add) - return this->emitAddOffset(OffsetType, E); - else if (Op == BO_Sub) - return this->emitSubOffset(OffsetType, E); + // Do the operation and optionally transform to + // result pointer type. 
+ if (Op == BO_Add) { + if (!this->emitAddOffset(OffsetType, E)) + return false; + + if (classifyPrim(E) != PT_Ptr) + return this->emitDecayPtr(PT_Ptr, classifyPrim(E), E); + return true; + } else if (Op == BO_Sub) { + if (!this->emitSubOffset(OffsetType, E)) + return false; + + if (classifyPrim(E) != PT_Ptr) + return this->emitDecayPtr(PT_Ptr, classifyPrim(E), E); + return true; + } return false; } diff --git a/clang/lib/AST/ByteCode/FunctionPointer.cpp b/clang/lib/AST/ByteCode/FunctionPointer.cpp new file mode 100644 index 00000000000000..6b0b559a63386e --- /dev/null +++ b/clang/lib/AST/ByteCode/FunctionPointer.cpp @@ -0,0 +1,43 @@ +//===----------------------- FunctionPointer.cpp ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FunctionPointer.h" + +namespace clang { +namespace interp { + +APValue FunctionPointer::toAPValue(const ASTContext &) const { + if (!Func) + return APValue(static_cast(nullptr), CharUnits::Zero(), {}, + /*OnePastTheEnd=*/false, /*IsNull=*/true); + + if (!Valid) + return APValue(static_cast(nullptr), + CharUnits::fromQuantity(getIntegerRepresentation()), {}, + /*OnePastTheEnd=*/false, /*IsNull=*/false); + + if (Func->getDecl()) + return APValue(Func->getDecl(), CharUnits::fromQuantity(Offset), {}, + /*OnePastTheEnd=*/false, /*IsNull=*/false); + return APValue(Func->getExpr(), CharUnits::fromQuantity(Offset), {}, + /*OnePastTheEnd=*/false, /*IsNull=*/false); +} + +void FunctionPointer::print(llvm::raw_ostream &OS) const { + OS << "FnPtr("; + if (Func && Valid) + OS << Func->getName(); + else if (Func) + OS << reinterpret_cast(Func); + else + OS << "nullptr"; + OS << ") + " << Offset; +} + +} // namespace interp +} // namespace clang diff --git 
a/clang/lib/AST/ByteCode/FunctionPointer.h b/clang/lib/AST/ByteCode/FunctionPointer.h index c9bdfbee55441a..e2b45b2344fdce 100644 --- a/clang/lib/AST/ByteCode/FunctionPointer.h +++ b/clang/lib/AST/ByteCode/FunctionPointer.h @@ -11,25 +11,29 @@ #include "Function.h" #include "Primitives.h" -#include "clang/AST/APValue.h" namespace clang { class ASTContext; +class APValue; namespace interp { class FunctionPointer final { private: const Function *Func; + uint64_t Offset; bool Valid; public: FunctionPointer() = default; - FunctionPointer(const Function *Func) : Func(Func), Valid(true) {} + FunctionPointer(const Function *Func, uint64_t Offset = 0) + : Func(Func), Offset(Offset), Valid(true) {} FunctionPointer(uintptr_t IntVal, const Descriptor *Desc = nullptr) - : Func(reinterpret_cast(IntVal)), Valid(false) {} + : Func(reinterpret_cast(IntVal)), Offset(0), + Valid(false) {} const Function *getFunction() const { return Func; } + uint64_t getOffset() const { return Offset; } bool isZero() const { return !Func; } bool isValid() const { return Valid; } bool isWeak() const { @@ -39,33 +43,8 @@ class FunctionPointer final { return Func->getDecl()->isWeak(); } - APValue toAPValue(const ASTContext &) const { - if (!Func) - return APValue(static_cast(nullptr), CharUnits::Zero(), {}, - /*OnePastTheEnd=*/false, /*IsNull=*/true); - - if (!Valid) - return APValue(static_cast(nullptr), - CharUnits::fromQuantity(getIntegerRepresentation()), {}, - /*OnePastTheEnd=*/false, /*IsNull=*/false); - - if (Func->getDecl()) - return APValue(Func->getDecl(), CharUnits::Zero(), {}, - /*OnePastTheEnd=*/false, /*IsNull=*/false); - return APValue(Func->getExpr(), CharUnits::Zero(), {}, - /*OnePastTheEnd=*/false, /*IsNull=*/false); - } - - void print(llvm::raw_ostream &OS) const { - OS << "FnPtr("; - if (Func && Valid) - OS << Func->getName(); - else if (Func) - OS << reinterpret_cast(Func); - else - OS << "nullptr"; - OS << ")"; - } + APValue toAPValue(const ASTContext &) const; + void 
print(llvm::raw_ostream &OS) const; std::string toDiagnosticString(const ASTContext &Ctx) const { if (!Func) @@ -79,7 +58,7 @@ class FunctionPointer final { } ComparisonCategoryResult compare(const FunctionPointer &RHS) const { - if (Func == RHS.Func) + if (Func == RHS.Func && Offset == RHS.Offset) return ComparisonCategoryResult::Equal; return ComparisonCategoryResult::Unordered; } diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index d8629881abc685..fd4406c0db2b88 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1857,8 +1857,23 @@ bool OffsetHelper(InterpState &S, CodePtr OpPC, const T &Offset, else S.Stk.push(V - O, Ptr.asIntPointer().Desc); return true; + } else if (Ptr.isFunctionPointer()) { + uint64_t O = static_cast(Offset); + uint64_t N; + if constexpr (Op == ArithOp::Add) + N = Ptr.getByteOffset() + O; + else + N = Ptr.getByteOffset() - O; + + if (N > 1) + S.CCEDiag(S.Current->getSource(OpPC), diag::note_constexpr_array_index) + << N << /*non-array*/ true << 0; + S.Stk.push(Ptr.asFunctionPointer().getFunction(), N); + return true; } + assert(Ptr.isBlockPointer()); + uint64_t MaxIndex = static_cast(Ptr.getNumElems()); uint64_t Index; if (Ptr.isOnePastEnd()) @@ -2024,10 +2039,15 @@ inline bool SubPtr(InterpState &S, CodePtr OpPC) { return true; } - T A = LHS.isElementPastEnd() ? T::from(LHS.getNumElems()) - : T::from(LHS.getIndex()); - T B = RHS.isElementPastEnd() ? T::from(RHS.getNumElems()) - : T::from(RHS.getIndex()); + T A = LHS.isBlockPointer() + ? (LHS.isElementPastEnd() ? T::from(LHS.getNumElems()) + : T::from(LHS.getIndex())) + : T::from(LHS.getIntegerRepresentation()); + T B = RHS.isBlockPointer() + ? (RHS.isElementPastEnd() ? 
T::from(RHS.getNumElems()) + : T::from(RHS.getIndex())) + : T::from(RHS.getIntegerRepresentation()); + return AddSubMulHelper(S, OpPC, A.bitWidth(), A, B); } @@ -2905,8 +2925,15 @@ inline bool DecayPtr(InterpState &S, CodePtr OpPC) { if constexpr (std::is_same_v && std::is_same_v) { - S.Stk.push(OldPtr.getFunction()); + S.Stk.push(OldPtr.getFunction(), OldPtr.getOffset()); return true; + } else if constexpr (std::is_same_v && + std::is_same_v) { + if (OldPtr.isFunctionPointer()) { + S.Stk.push(OldPtr.asFunctionPointer().getFunction(), + OldPtr.getByteOffset()); + return true; + } } S.Stk.push(ToT(OldPtr.getIntegerRepresentation(), nullptr)); diff --git a/clang/lib/AST/ByteCode/Pointer.h b/clang/lib/AST/ByteCode/Pointer.h index ba30449977376b..27ac33616f5a8b 100644 --- a/clang/lib/AST/ByteCode/Pointer.h +++ b/clang/lib/AST/ByteCode/Pointer.h @@ -137,7 +137,7 @@ class Pointer { if (isIntegralPointer()) return asIntPointer().Value + (Offset * elemSize()); if (isFunctionPointer()) - return asFunctionPointer().getIntegerRepresentation(); + return asFunctionPointer().getIntegerRepresentation() + Offset; return reinterpret_cast(asBlockPointer().Pointee) + Offset; } @@ -551,7 +551,7 @@ class Pointer { } /// Returns the byte offset from the start. - unsigned getByteOffset() const { + uint64_t getByteOffset() const { if (isIntegralPointer()) return asIntPointer().Value + Offset; if (isOnePastEnd()) @@ -614,6 +614,8 @@ class Pointer { /// Checks if the pointer is pointing to a zero-size array. 
bool isZeroSizeArray() const { + if (isFunctionPointer()) + return false; if (const auto *Desc = getFieldDesc()) return Desc->isZeroSizeArray(); return false; diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt index 041252b6830e0a..6195a16c2c68db 100644 --- a/clang/lib/AST/CMakeLists.txt +++ b/clang/lib/AST/CMakeLists.txt @@ -72,6 +72,7 @@ add_clang_library(clangAST ByteCode/EvalEmitter.cpp ByteCode/Frame.cpp ByteCode/Function.cpp + ByteCode/FunctionPointer.cpp ByteCode/InterpBuiltin.cpp ByteCode/Floating.cpp ByteCode/EvaluationResult.cpp diff --git a/clang/test/AST/ByteCode/c.c b/clang/test/AST/ByteCode/c.c index b38259d41130eb..60f4d6ad1b2967 100644 --- a/clang/test/AST/ByteCode/c.c +++ b/clang/test/AST/ByteCode/c.c @@ -297,3 +297,19 @@ void T1(void) { enum teste1 test1f(void), (*test1)(void) = test1f; // pedantic-warning {{ISO C forbids forward references to 'enum' types}} enum teste1 { TEST1 }; + + +void func(void) { + _Static_assert(func + 1 - func == 1, ""); // pedantic-warning {{arithmetic on a pointer to the function type}} \ + // pedantic-warning {{arithmetic on pointers to the function type}} \ + // pedantic-warning {{not an integer constant expression}} + _Static_assert(func + 0xdead000000000000UL - 0xdead000000000000UL == func, ""); // pedantic-warning 2{{arithmetic on a pointer to the function type}} \ + // pedantic-warning {{not an integer constant expression}} \ + // pedantic-note {{cannot refer to element 16045481047390945280 of non-array object in a constant expression}} + _Static_assert(func + 1 != func, ""); // pedantic-warning {{arithmetic on a pointer to the function type}} \ + // pedantic-warning {{expression is not an integer constant expression}} + func + 0xdead000000000000UL; // all-warning {{expression result unused}} \ + // pedantic-warning {{arithmetic on a pointer to the function type}} + func - 0xdead000000000000UL; // all-warning {{expression result unused}} \ + // pedantic-warning {{arithmetic on a pointer to 
the function type}} +} From 7e3f9dd21f82751ad93a54756f5f098560f31097 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 22 Aug 2024 15:23:03 +0200 Subject: [PATCH 195/426] [InstCombine] Add more tests for foldLogOpOfMaskedICmps transform (NFC) Tests for cases that would have been regressed by https://github.com/llvm/llvm-project/pull/104941. --- .../test/Transforms/InstCombine/bit-checks.ll | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/bit-checks.ll b/llvm/test/Transforms/InstCombine/bit-checks.ll index 208b2b16e99033..b1ab9d888954bb 100644 --- a/llvm/test/Transforms/InstCombine/bit-checks.ll +++ b/llvm/test/Transforms/InstCombine/bit-checks.ll @@ -1322,3 +1322,31 @@ define i32 @main15_logical(i32 %argc) { %retval.0 = select i1 %or.cond, i32 2, i32 1 ret i32 %retval.0 } + +define i1 @no_masks_with_logical_or(i32 %a, i32 %b, i32 noundef %c) { +; CHECK-LABEL: @no_masks_with_logical_or( +; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 63 +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[A:%.*]], [[C:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: [[OR2:%.*]] = select i1 [[TMP2]], i1 true, i1 [[CMP2]] +; CHECK-NEXT: ret i1 [[OR2]] +; + %cmp1 = icmp ne i32 %a, 0 + %cmp2 = icmp ne i32 %b, 63 + %or1 = select i1 %cmp1, i1 true, i1 %cmp2 + %cmp3 = icmp ne i32 %c, 0 + %or2 = or i1 %or1, %cmp3 + ret i1 %or2 +} + +define i1 @only_one_masked(i64 %a) { +; CHECK-LABEL: @only_one_masked( +; CHECK-NEXT: [[AND:%.*]] = icmp eq i64 [[A:%.*]], -9223372036854775808 +; CHECK-NEXT: ret i1 [[AND]] +; + %cmp1 = icmp ne i64 %a, 0 + %a.mask = and i64 %a, 9223372036854775807 + %cmp2 = icmp eq i64 %a.mask, 0 + %and = and i1 %cmp1, %cmp2 + ret i1 %and +} From dd3b43a455793e79b84171b8d522fc4d86dea61d Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Thu, 22 Aug 2024 14:28:35 +0100 Subject: [PATCH 196/426] [mlir][OpenMP][NFC] clean up optional reduction region parsing (#105644) This can be handled in ODS instead of 
writing custom parsing/printing code. Thanks for the idea @skatrak --- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 6 +-- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 46 ------------------- 2 files changed, 3 insertions(+), 49 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 739b1f67be7cb2..5a7dae0b5f3074 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -1576,11 +1576,11 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove, AnyRegion:$cleanupRegion); let assemblyFormat = "$sym_name `:` $type attr-dict-with-keyword " - "custom($allocRegion) " + "( `alloc` $allocRegion^ )? " "`init` $initializerRegion " "`combiner` $reductionRegion " - "custom($atomicReductionRegion) " - "custom($cleanupRegion)"; + "( `atomic` $atomicReductionRegion^ )? " + "( `cleanup` $cleanupRegion^ )? "; let extraClassDeclaration = [{ PointerLikeType getAccumulatorType() { diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 273f49b8b12b67..eb4f9cb041841b 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1883,52 +1883,6 @@ LogicalResult DistributeOp::verify() { // DeclareReductionOp //===----------------------------------------------------------------------===// -static ParseResult parseOptionalReductionRegion(OpAsmParser &parser, - Region ®ion, - StringRef keyword) { - if (parser.parseOptionalKeyword(keyword)) - return success(); - return parser.parseRegion(region); -} - -static void printOptionalReductionRegion(OpAsmPrinter &printer, Region ®ion, - StringRef keyword) { - if (region.empty()) - return; - printer << keyword << " "; - printer.printRegion(region); -} - -static ParseResult parseAllocReductionRegion(OpAsmParser &parser, - Region ®ion) { - return parseOptionalReductionRegion(parser, region, 
"alloc"); -} - -static void printAllocReductionRegion(OpAsmPrinter &printer, - DeclareReductionOp op, Region ®ion) { - printOptionalReductionRegion(printer, region, "alloc"); -} - -static ParseResult parseAtomicReductionRegion(OpAsmParser &parser, - Region ®ion) { - return parseOptionalReductionRegion(parser, region, "atomic"); -} - -static void printAtomicReductionRegion(OpAsmPrinter &printer, - DeclareReductionOp op, Region ®ion) { - printOptionalReductionRegion(printer, region, "atomic"); -} - -static ParseResult parseCleanupReductionRegion(OpAsmParser &parser, - Region ®ion) { - return parseOptionalReductionRegion(parser, region, "cleanup"); -} - -static void printCleanupReductionRegion(OpAsmPrinter &printer, - DeclareReductionOp op, Region ®ion) { - printOptionalReductionRegion(printer, region, "cleanup"); -} - LogicalResult DeclareReductionOp::verifyRegions() { if (!getAllocRegion().empty()) { for (YieldOp yieldOp : getAllocRegion().getOps()) { From 318b0678e3baac5723a3805d719c04b9cf1d95c3 Mon Sep 17 00:00:00 2001 From: Sirui Mu Date: Thu, 22 Aug 2024 21:32:55 +0800 Subject: [PATCH 197/426] [mlir][LLVM] Add support for constant struct with multiple fields (#102752) Currently `mlir.llvm.constant` of structure types restricts that the structure type effectively represents a complex type -- it must have exactly two fields of the same type and the field type must be either an integer type or a float type. This PR relaxes this restriction and it allows the structure type to have an arbitrary number of fields. 
--- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 37 +++++++++------ mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 47 +++++++++++--------- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 25 ++++++----- mlir/test/Dialect/LLVMIR/invalid.mlir | 10 ++--- mlir/test/Target/LLVMIR/llvmir-invalid.mlir | 26 ++++++++--- mlir/test/Target/LLVMIR/llvmir.mlir | 6 +++ 6 files changed, 96 insertions(+), 55 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index 71f249fa538ca9..46bf1c9640c174 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -1620,19 +1620,30 @@ def LLVM_ConstantOp let description = [{ Unlike LLVM IR, MLIR does not have first-class constant values. Therefore, all constants must be created as SSA values before being used in other - operations. `llvm.mlir.constant` creates such values for scalars and - vectors. It has a mandatory `value` attribute, which may be an integer, - floating point attribute; dense or sparse attribute containing integers or - floats. The type of the attribute is one of the corresponding MLIR builtin - types. It may be omitted for `i64` and `f64` types that are implied. - - The operation produces a new SSA value of the specified LLVM IR dialect - type. Certain builtin types such as integer, float and vector types are - also allowed. The result type _must_ correspond to the attribute type - converted to LLVM IR. In particular, the number of elements of a container - type must match the number of elements in the attribute. If the type is or - contains a scalable vector type, the attribute must be a splat elements - attribute. + operations. `llvm.mlir.constant` creates such values for scalars, vectors, + strings, and structs. It has a mandatory `value` attribute whose type + depends on the type of the constant value. The type of the constant value + must correspond to the attribute type converted to LLVM IR type. 
+ + When creating constant scalars, the `value` attribute must be either an + integer attribute or a floating point attribute. The type of the attribute + may be omitted for `i64` and `f64` types that are implied. + + When creating constant vectors, the `value` attribute must be either an + array attribute, a dense attribute, or a sparse attribute that contains + integers or floats. The number of elements in the result vector must match + the number of elements in the attribute. + + When creating constant strings, the `value` attribute must be a string + attribute. The type of the constant must be an LLVM array of `i8`s, and the + length of the array must match the length of the attribute. + + When creating constant structs, the `value` attribute must be an array + attribute that contains integers or floats. The type of the constant must be + an LLVM struct type. The number of fields in the struct must match the + number of elements in the attribute, and the type of each LLVM struct field + must correspond to the type of the corresponding attribute element converted + to LLVM IR. 
Examples: diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 92f3984e5e6db6..3870aab52f199d 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -16,6 +16,7 @@ #include "mlir/Dialect/LLVMIR/LLVMAttrs.h" #include "mlir/Dialect/LLVMIR/LLVMInterfaces.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" +#include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" @@ -2710,32 +2711,38 @@ LogicalResult LLVM::ConstantOp::verify() { } return success(); } - if (auto structType = llvm::dyn_cast(getType())) { - if (structType.getBody().size() != 2 || - structType.getBody()[0] != structType.getBody()[1]) { - return emitError() << "expected struct type with two elements of the " - "same type, the type of a complex constant"; + if (auto structType = dyn_cast(getType())) { + auto arrayAttr = dyn_cast(getValue()); + if (!arrayAttr) { + return emitOpError() << "expected array attribute for a struct constant"; } - auto arrayAttr = llvm::dyn_cast(getValue()); - if (!arrayAttr || arrayAttr.size() != 2) { - return emitOpError() << "expected array attribute with two elements, " - "representing a complex constant"; + ArrayRef elementTypes = structType.getBody(); + if (arrayAttr.size() != elementTypes.size()) { + return emitOpError() << "expected array attribute of size " + << elementTypes.size(); } - auto re = llvm::dyn_cast(arrayAttr[0]); - auto im = llvm::dyn_cast(arrayAttr[1]); - if (!re || !im || re.getType() != im.getType()) { - return emitOpError() - << "expected array attribute with two elements of the same type"; + for (auto elementTy : elementTypes) { + if (!isa(elementTy)) { + return emitOpError() << "expected struct element types to be floating " + "point type or integer type"; + } } - Type elementType = structType.getBody()[0]; - if (!llvm::isa( - elementType)) { - return emitError() - << "expected struct 
element types to be floating point type or " - "integer type"; + for (size_t i = 0; i < elementTypes.size(); ++i) { + Attribute element = arrayAttr[i]; + if (!isa(element)) { + return emitOpError() + << "expected struct element attribute types to be floating " + "point type or integer type"; + } + auto elementType = cast(element).getType(); + if (elementType != elementTypes[i]) { + return emitOpError() + << "struct element at index " << i << " is of wrong type"; + } } + return success(); } if (auto targetExtType = dyn_cast(getType())) { diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 2827713e2bf213..bb23da039e21f1 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -557,20 +557,21 @@ llvm::Constant *mlir::LLVM::detail::getLLVMConstant( return llvm::UndefValue::get(llvmType); if (auto *structType = dyn_cast<::llvm::StructType>(llvmType)) { auto arrayAttr = dyn_cast(attr); - if (!arrayAttr || arrayAttr.size() != 2) { - emitError(loc, "expected struct type to be a complex number"); + if (!arrayAttr) { + emitError(loc, "expected an array attribute for a struct constant"); return nullptr; } - llvm::Type *elementType = structType->getElementType(0); - llvm::Constant *real = - getLLVMConstant(elementType, arrayAttr[0], loc, moduleTranslation); - if (!real) - return nullptr; - llvm::Constant *imag = - getLLVMConstant(elementType, arrayAttr[1], loc, moduleTranslation); - if (!imag) - return nullptr; - return llvm::ConstantStruct::get(structType, {real, imag}); + SmallVector structElements; + structElements.reserve(structType->getNumElements()); + for (auto [elemType, elemAttr] : + zip_equal(structType->elements(), arrayAttr)) { + llvm::Constant *element = + getLLVMConstant(elemType, elemAttr, loc, moduleTranslation); + if (!element) + return nullptr; + structElements.push_back(element); + } + return llvm::ConstantStruct::get(structType, structElements); 
} // For integer types, we allow a mismatch in sizes as the index type in // MLIR might have a different size than the index type in the LLVM module. diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir index 62346ce0d2c4b1..6670e4b186c397 100644 --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -367,7 +367,7 @@ func.func @constant_wrong_type_string() { // ----- llvm.func @array_attribute_one_element() -> !llvm.struct<(f64, f64)> { - // expected-error @+1 {{expected array attribute with two elements, representing a complex constant}} + // expected-error @+1 {{expected array attribute of size 2}} %0 = llvm.mlir.constant([1.0 : f64]) : !llvm.struct<(f64, f64)> llvm.return %0 : !llvm.struct<(f64, f64)> } @@ -375,7 +375,7 @@ llvm.func @array_attribute_one_element() -> !llvm.struct<(f64, f64)> { // ----- llvm.func @array_attribute_two_different_types() -> !llvm.struct<(f64, f64)> { - // expected-error @+1 {{expected array attribute with two elements of the same type}} + // expected-error @+1 {{struct element at index 1 is of wrong type}} %0 = llvm.mlir.constant([1.0 : f64, 1.0 : f32]) : !llvm.struct<(f64, f64)> llvm.return %0 : !llvm.struct<(f64, f64)> } @@ -383,7 +383,7 @@ llvm.func @array_attribute_two_different_types() -> !llvm.struct<(f64, f64)> { // ----- llvm.func @struct_wrong_attribute_type() -> !llvm.struct<(f64, f64)> { - // expected-error @+1 {{expected array attribute with two elements, representing a complex constant}} + // expected-error @+1 {{expected array attribute}} %0 = llvm.mlir.constant(1.0 : f64) : !llvm.struct<(f64, f64)> llvm.return %0 : !llvm.struct<(f64, f64)> } @@ -391,7 +391,7 @@ llvm.func @struct_wrong_attribute_type() -> !llvm.struct<(f64, f64)> { // ----- llvm.func @struct_one_element() -> !llvm.struct<(f64)> { - // expected-error @+1 {{expected struct type with two elements of the same type, the type of a complex constant}} + // expected-error @+1 {{expected 
array attribute of size 1}} %0 = llvm.mlir.constant([1.0 : f64, 1.0 : f64]) : !llvm.struct<(f64)> llvm.return %0 : !llvm.struct<(f64)> } @@ -399,7 +399,7 @@ llvm.func @struct_one_element() -> !llvm.struct<(f64)> { // ----- llvm.func @struct_two_different_elements() -> !llvm.struct<(f64, f32)> { - // expected-error @+1 {{expected struct type with two elements of the same type, the type of a complex constant}} + // expected-error @+1 {{struct element at index 1 is of wrong type}} %0 = llvm.mlir.constant([1.0 : f64, 1.0 : f64]) : !llvm.struct<(f64, f32)> llvm.return %0 : !llvm.struct<(f64, f32)> } diff --git a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir index 9cf922ad490a92..0e2afe6fb004d8 100644 --- a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir @@ -15,24 +15,40 @@ llvm.func @vector_with_non_vector_type() -> f32 { // ----- -llvm.func @no_non_complex_struct() -> !llvm.array<2 x array<2 x array<2 x struct<(i32)>>>> { - // expected-error @below{{expected struct type to be a complex number}} +llvm.func @non_array_attr_for_struct() -> !llvm.array<2 x array<2 x array<2 x struct<(i32)>>>> { + // expected-error @below{{expected an array attribute for a struct constant}} %0 = llvm.mlir.constant(dense<[[[1, 2], [3, 4]], [[42, 43], [44, 45]]]> : tensor<2x2x2xi32>) : !llvm.array<2 x array<2 x array<2 x struct<(i32)>>>> llvm.return %0 : !llvm.array<2 x array<2 x array<2 x struct<(i32)>>>> } // ----- -llvm.func @no_non_complex_struct() -> !llvm.array<2 x array<2 x array<2 x struct<(i32, i32, i32)>>>> { - // expected-error @below{{expected struct type to be a complex number}} +llvm.func @non_array_attr_for_struct() -> !llvm.array<2 x array<2 x array<2 x struct<(i32, i32, i32)>>>> { + // expected-error @below{{expected an array attribute for a struct constant}} %0 = llvm.mlir.constant(dense<[[[1, 2], [3, 4]], [[42, 43], [44, 45]]]> : tensor<2x2x2xi32>) : !llvm.array<2 x array<2 x array<2 x 
struct<(i32, i32, i32)>>>> llvm.return %0 : !llvm.array<2 x array<2 x array<2 x struct<(i32, i32, i32)>>>> } // ----- +llvm.func @invalid_struct_element_type() -> !llvm.struct<(f64, array<2 x i32>)> { + // expected-error @below{{expected struct element types to be floating point type or integer type}} + %0 = llvm.mlir.constant([1.0 : f64, dense<[1, 2]> : tensor<2xi32>]) : !llvm.struct<(f64, array<2 x i32>)> + llvm.return %0 : !llvm.struct<(f64, array<2 x i32>)> +} + +// ----- + +llvm.func @wrong_struct_element_attr_type() -> !llvm.struct<(f64, f64)> { + // expected-error @below{{expected struct element attribute types to be floating point type or integer type}} + %0 = llvm.mlir.constant([dense<[1, 2]> : tensor<2xi32>, 2.0 : f64]) : !llvm.struct<(f64, f64)> + llvm.return %0 : !llvm.struct<(f64, f64)> +} + +// ----- + llvm.func @struct_wrong_attribute_element_type() -> !llvm.struct<(f64, f64)> { - // expected-error @below{{FloatAttr does not match expected type of the constant}} + // expected-error @below{{struct element at index 0 is of wrong type}} %0 = llvm.mlir.constant([1.0 : f32, 1.0 : f32]) : !llvm.struct<(f64, f64)> llvm.return %0 : !llvm.struct<(f64, f64)> } diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index 8453983aa07c33..df61fef605fde0 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -1312,6 +1312,12 @@ llvm.func @complexintconstantarray() -> !llvm.array<2 x !llvm.array<2 x !llvm.st llvm.return %1 : !llvm.array<2 x !llvm.array<2 x !llvm.struct<(i32, i32)>>> } +llvm.func @structconstant() -> !llvm.struct<(i32, f32)> { + %1 = llvm.mlir.constant([1 : i32, 2.000000e+00 : f32]) : !llvm.struct<(i32, f32)> + // CHECK: ret { i32, float } { i32 1, float 2.000000e+00 } + llvm.return %1 : !llvm.struct<(i32, f32)> +} + // CHECK-LABEL: @indexconstantsplat llvm.func @indexconstantsplat() -> vector<3xi32> { %1 = llvm.mlir.constant(dense<42> : vector<3xindex>) : vector<3xi32> From 
d46812a7be13cae9a9f4f3491cb60a20c57c8da6 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Thu, 22 Aug 2024 14:45:14 +0100 Subject: [PATCH 198/426] [Analysis] Teach ScalarEvolution::getRangeRef about more dereferenceable objects (#104778) Whilst dealing with review comments on https://github.com/llvm/llvm-project/pull/96752 I discovered that SCEV does not know about the dereferenceable attribute on function arguments so I have updated getRangeRef to make use of it by calling getPointerDereferenceableBytes. --- llvm/lib/Analysis/ScalarEvolution.cpp | 18 ++++---- .../ScalarEvolution/different-loops-recs.ll | 2 +- .../ScalarEvolution/no-wrap-add-exprs.ll | 41 ++++++++++++++++++- .../PhaseOrdering/scev-custom-dl.ll | 2 +- 4 files changed, 49 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index a19358dee8ef49..21a1c74eefc071 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -6855,20 +6855,18 @@ const ConstantRange &ScalarEvolution::getRangeRef( if (U->getType()->isPointerTy() && SignHint == HINT_RANGE_UNSIGNED) { // Strengthen the range if the underlying IR value is a // global/alloca/heap allocation using the size of the object. - ObjectSizeOpts Opts; - Opts.RoundToAlign = false; - Opts.NullIsUnknownSize = true; - uint64_t ObjSize; - if ((isa(V) || isa(V) || - isAllocationFn(V, &TLI)) && - getObjectSize(V, ObjSize, DL, &TLI, Opts) && ObjSize > 1) { - // The highest address the object can start is ObjSize bytes before the - // end (unsigned max value). If this value is not a multiple of the + bool CanBeNull, CanBeFreed; + uint64_t DerefBytes = + V->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed); + if (DerefBytes > 1) { + // The highest address the object can start is DerefBytes bytes before + // the end (unsigned max value). 
If this value is not a multiple of the // alignment, the last possible start value is the next lowest multiple // of the alignment. Note: The computations below cannot overflow, // because if they would there's no possible start address for the // object. - APInt MaxVal = APInt::getMaxValue(BitWidth) - APInt(BitWidth, ObjSize); + APInt MaxVal = + APInt::getMaxValue(BitWidth) - APInt(BitWidth, DerefBytes); uint64_t Align = U->getValue()->getPointerAlignment(DL).value(); uint64_t Rem = MaxVal.urem(Align); MaxVal -= APInt(BitWidth, Rem); diff --git a/llvm/test/Analysis/ScalarEvolution/different-loops-recs.ll b/llvm/test/Analysis/ScalarEvolution/different-loops-recs.ll index 359e22fa41bacd..41e1d059803b21 100644 --- a/llvm/test/Analysis/ScalarEvolution/different-loops-recs.ll +++ b/llvm/test/Analysis/ScalarEvolution/different-loops-recs.ll @@ -457,7 +457,7 @@ define void @test_05(i32 %N) { ; CHECK-NEXT: %"alloca point" = bitcast i32 0 to i32 ; CHECK-NEXT: --> 0 U: [0,1) S: [0,1) ; CHECK-NEXT: %tmp = getelementptr [1000 x i32], ptr @A, i32 0, i32 %i.0 -; CHECK-NEXT: --> {(8 + @A),+,4}<%bb3> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: (408 + @A) LoopDispositions: { %bb3: Computable } +; CHECK-NEXT: --> {(8 + @A),+,4}<%bb3> U: [40,-3623) S: [-9223372036854775808,9223372036854775805) Exits: (408 + @A) LoopDispositions: { %bb3: Computable } ; CHECK-NEXT: %tmp2 = add i32 %i.0, 1 ; CHECK-NEXT: --> {3,+,1}<%bb3> U: [3,104) S: [3,104) Exits: 103 LoopDispositions: { %bb3: Computable } ; CHECK-NEXT: %i.0 = phi i32 [ 2, %entry ], [ %tmp2, %bb ] diff --git a/llvm/test/Analysis/ScalarEvolution/no-wrap-add-exprs.ll b/llvm/test/Analysis/ScalarEvolution/no-wrap-add-exprs.ll index bd2ffddf396fe8..b096adc7c5eb40 100644 --- a/llvm/test/Analysis/ScalarEvolution/no-wrap-add-exprs.ll +++ b/llvm/test/Analysis/ScalarEvolution/no-wrap-add-exprs.ll @@ -183,7 +183,7 @@ define void @f3(ptr %x_addr, ptr %y_addr, ptr %tmp_addr) { ; CHECK-NEXT: %s3.zext = zext i8 %s3 to i16 ; 
CHECK-NEXT: --> (1 + (zext i8 (4 + (32 * %x) + (36 * %y)) to i16)) U: [1,254) S: [1,257) ; CHECK-NEXT: %ptr = bitcast ptr @z_addr to ptr -; CHECK-NEXT: --> @z_addr U: [0,-3) S: [-9223372036854775808,9223372036854775805) +; CHECK-NEXT: --> @z_addr U: [4,-19) S: [-9223372036854775808,9223372036854775805) ; CHECK-NEXT: %int0 = ptrtoint ptr %ptr to i32 ; CHECK-NEXT: --> (trunc i64 (ptrtoint ptr @z_addr to i64) to i32) U: [0,-3) S: [-2147483648,2147483645) ; CHECK-NEXT: %int5 = add i32 %int0, 5 @@ -191,7 +191,7 @@ define void @f3(ptr %x_addr, ptr %y_addr, ptr %tmp_addr) { ; CHECK-NEXT: %int.zext = zext i32 %int5 to i64 ; CHECK-NEXT: --> (1 + (zext i32 (4 + (trunc i64 (ptrtoint ptr @z_addr to i64) to i32)) to i64)) U: [1,4294967294) S: [1,4294967297) ; CHECK-NEXT: %ptr_noalign = bitcast ptr @z_addr_noalign to ptr -; CHECK-NEXT: --> @z_addr_noalign U: full-set S: full-set +; CHECK-NEXT: --> @z_addr_noalign U: [1,-16) S: full-set ; CHECK-NEXT: %int0_na = ptrtoint ptr %ptr_noalign to i32 ; CHECK-NEXT: --> (trunc i64 (ptrtoint ptr @z_addr_noalign to i64) to i32) U: full-set S: full-set ; CHECK-NEXT: %int5_na = add i32 %int0_na, 5 @@ -362,3 +362,40 @@ loop: exit2: ret i1 false } + + +define void @dereferenceable_arg(ptr dereferenceable(128) %len_addr, ptr dereferenceable(128) align(8) %len_addr2, ptr dereferenceable(13) align(1) %len_addr3) { +; CHECK-LABEL: 'dereferenceable_arg' +; CHECK-NEXT: Classifying expressions for: @dereferenceable_arg +; CHECK-NEXT: %ptr = bitcast ptr %len_addr to ptr +; CHECK-NEXT: --> %len_addr U: [1,-128) S: full-set +; CHECK-NEXT: %ptr2 = bitcast ptr %len_addr2 to ptr +; CHECK-NEXT: --> %len_addr2 U: [8,-135) S: [-9223372036854775808,9223372036854775801) +; CHECK-NEXT: %ptr3 = bitcast ptr %len_addr3 to ptr +; CHECK-NEXT: --> %len_addr3 U: [1,-13) S: full-set +; CHECK-NEXT: Determining loop execution counts for: @dereferenceable_arg +; + entry: + %ptr = bitcast ptr %len_addr to ptr + %ptr2 = bitcast ptr %len_addr2 to ptr + %ptr3 = bitcast ptr 
%len_addr3 to ptr + + ret void +} + + +define void @dereferenceable_or_null_arg(ptr dereferenceable_or_null(128) %len_addr, ptr dereferenceable_or_null(128) align(8) %len_addr2) { +; CHECK-LABEL: 'dereferenceable_or_null_arg' +; CHECK-NEXT: Classifying expressions for: @dereferenceable_or_null_arg +; CHECK-NEXT: %ptr = bitcast ptr %len_addr to ptr +; CHECK-NEXT: --> %len_addr U: [0,-128) S: full-set +; CHECK-NEXT: %ptr2 = bitcast ptr %len_addr2 to ptr +; CHECK-NEXT: --> %len_addr2 U: [0,-135) S: [-9223372036854775808,9223372036854775801) +; CHECK-NEXT: Determining loop execution counts for: @dereferenceable_or_null_arg +; + entry: + %ptr = bitcast ptr %len_addr to ptr + %ptr2 = bitcast ptr %len_addr2 to ptr + + ret void +} diff --git a/llvm/test/Transforms/PhaseOrdering/scev-custom-dl.ll b/llvm/test/Transforms/PhaseOrdering/scev-custom-dl.ll index aaea1a453664b9..d5a422ad41f559 100644 --- a/llvm/test/Transforms/PhaseOrdering/scev-custom-dl.ll +++ b/llvm/test/Transforms/PhaseOrdering/scev-custom-dl.ll @@ -112,7 +112,7 @@ define void @test_range_ref1a(i32 %x) { ; CHECK-NEXT: %i.01.0 = phi i32 [ 100, %entry ], [ %tmp4, %bb ] ; CHECK-NEXT: --> {100,+,-1}<%bb> U: [0,101) S: [0,101) Exits: 0 LoopDispositions: { %bb: Computable } ; CHECK-NEXT: %tmp1 = getelementptr [101 x i32], ptr @array, i32 0, i32 %i.01.0 -; CHECK-NEXT: --> {(400 + @array),+,-4}<%bb> U: [0,-3) S: [-2147483648,2147483645) Exits: @array LoopDispositions: { %bb: Computable } +; CHECK-NEXT: --> {(400 + @array),+,-4}<%bb> U: [0,-3) S: [-2147483648,2147483645) Exits: @array LoopDispositions: { %bb: Computable } ; CHECK-NEXT: %tmp4 = add nsw i32 %i.01.0, -1 ; CHECK-NEXT: --> {99,+,-1}<%bb> U: [-1,100) S: [-1,100) Exits: -1 LoopDispositions: { %bb: Computable } ; CHECK-NEXT: Determining loop execution counts for: @test_range_ref1a From 327edbe07ab4370ceb20ea7c805f64950871d835 Mon Sep 17 00:00:00 2001 From: Zaara Syeda Date: Thu, 22 Aug 2024 09:55:46 -0400 Subject: [PATCH 199/426] [PowerPC] Fix mask for 
__st[d/w/h/b]cx builtins (#104453) These builtins are currently returning CR0 which will have the format [0, 0, flag_true_if_saved, XER]. We only want to return flag_true_if_saved. This patch adds a shift to remove the XER bit before returning. --- llvm/lib/Target/PowerPC/PPCInstr64Bit.td | 4 ++-- llvm/lib/Target/PowerPC/PPCInstrInfo.td | 12 ++++++------ ...-ppc-xlcompat-LoadReserve-StoreCond-64bit-only.ll | 2 +- .../builtins-ppc-xlcompat-LoadReserve-StoreCond.ll | 9 ++++++--- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 0177034a5ae0f1..ae25f5c78a0e2d 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -2016,9 +2016,9 @@ def SLBSYNC : XForm_0<31, 338, (outs), (ins), "slbsync", IIC_SprSLBSYNC, []>; } // IsISA3_0 def : Pat<(int_ppc_stdcx ForceXForm:$dst, g8rc:$A), - (STDCX g8rc:$A, ForceXForm:$dst)>; + (RLWINM (STDCX g8rc:$A, ForceXForm:$dst), 31, 31, 31)>; def : Pat<(PPCStoreCond ForceXForm:$dst, g8rc:$A, 8), - (STDCX g8rc:$A, ForceXForm:$dst)>; + (RLWINM (STDCX g8rc:$A, ForceXForm:$dst), 31, 31, 31)>; def : Pat<(i64 (int_ppc_mfspr timm:$SPR)), (MFSPR8 $SPR)>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 411ea77afc0de3..b4a5e41c0107a3 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -5288,13 +5288,13 @@ def : Pat<(i64 (bitreverse i64:$A)), (OR8 (RLDICR DWBytes7654.DWord, 32, 31), DWBytes3210.DWord)>; def : Pat<(int_ppc_stwcx ForceXForm:$dst, gprc:$A), - (STWCX gprc:$A, ForceXForm:$dst)>; + (RLWINM (STWCX gprc:$A, ForceXForm:$dst), 31, 31, 31)>; def : Pat<(PPCStoreCond ForceXForm:$dst, gprc:$A, 4), - (STWCX gprc:$A, ForceXForm:$dst)>; + (RLWINM (STWCX gprc:$A, ForceXForm:$dst), 31, 31, 31)>; def : Pat<(int_ppc_stbcx ForceXForm:$dst, gprc:$A), - (STBCX gprc:$A, ForceXForm:$dst)>; + (RLWINM (STBCX 
gprc:$A, ForceXForm:$dst), 31, 31, 31)>; def : Pat<(PPCStoreCond ForceXForm:$dst, gprc:$A, 1), - (STBCX gprc:$A, ForceXForm:$dst)>; + (RLWINM (STBCX gprc:$A, ForceXForm:$dst), 31, 31, 31)>; def : Pat<(int_ppc_fcfid f64:$A), (XSCVSXDDP $A)>; @@ -5324,9 +5324,9 @@ def : Pat<(int_ppc_mtmsr gprc:$RS), let Predicates = [IsISA2_07] in { def : Pat<(int_ppc_sthcx ForceXForm:$dst, gprc:$A), - (STHCX gprc:$A, ForceXForm:$dst)>; + (RLWINM (STHCX gprc:$A, ForceXForm:$dst), 31, 31, 31)>; def : Pat<(PPCStoreCond ForceXForm:$dst, gprc:$A, 2), - (STHCX gprc:$A, ForceXForm:$dst)>; + (RLWINM (STHCX gprc:$A, ForceXForm:$dst), 31, 31, 31)>; } def : Pat<(int_ppc_dcbtstt ForceXForm:$dst), (DCBTST 16, ForceXForm:$dst)>; diff --git a/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-LoadReserve-StoreCond-64bit-only.ll b/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-LoadReserve-StoreCond-64bit-only.ll index ddfdcda7a61a75..d765f0845641c6 100644 --- a/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-LoadReserve-StoreCond-64bit-only.ll +++ b/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-LoadReserve-StoreCond-64bit-only.ll @@ -26,7 +26,7 @@ define dso_local i64 @test_stdcx(ptr %a, i64 %b) { ; CHECK-NEXT: stdcx. 4, 0, 3 ; CHECK-NEXT: mfocrf 3, 128 ; CHECK-NEXT: srwi 3, 3, 28 -; CHECK-NEXT: extsw 3, 3 +; CHECK-NEXT: rlwinm 3, 3, 31, 31, 31 ; CHECK-NEXT: blr entry: %0 = tail call i32 @llvm.ppc.stdcx(ptr %a, i64 %b) diff --git a/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-LoadReserve-StoreCond.ll b/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-LoadReserve-StoreCond.ll index 8d90c5cb882064..778fd0a37a1ede 100644 --- a/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-LoadReserve-StoreCond.ll +++ b/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-LoadReserve-StoreCond.ll @@ -36,7 +36,7 @@ define dso_local signext i32 @test_stwcx(ptr %a, i32 signext %b) { ; CHECK-64-NEXT: stwcx. 
4, 0, 3 ; CHECK-64-NEXT: mfocrf 3, 128 ; CHECK-64-NEXT: srwi 3, 3, 28 -; CHECK-64-NEXT: extsw 3, 3 +; CHECK-64-NEXT: rlwinm 3, 3, 31, 31, 31 ; CHECK-64-NEXT: blr ; ; CHECK-32-LABEL: test_stwcx: @@ -44,6 +44,7 @@ define dso_local signext i32 @test_stwcx(ptr %a, i32 signext %b) { ; CHECK-32-NEXT: stwcx. 4, 0, 3 ; CHECK-32-NEXT: mfocrf 3, 128 ; CHECK-32-NEXT: srwi 3, 3, 28 +; CHECK-32-NEXT: rlwinm 3, 3, 31, 31, 31 ; CHECK-32-NEXT: blr entry: %0 = tail call i32 @llvm.ppc.stwcx(ptr %a, i32 %b) @@ -57,7 +58,7 @@ define dso_local signext i32 @test_sthcx(ptr %a, i16 signext %val) { ; CHECK-64-NEXT: sthcx. 4, 0, 3 ; CHECK-64-NEXT: mfocrf 3, 128 ; CHECK-64-NEXT: srwi 3, 3, 28 -; CHECK-64-NEXT: extsw 3, 3 +; CHECK-64-NEXT: rlwinm 3, 3, 31, 31, 31 ; CHECK-64-NEXT: blr ; ; CHECK-32-LABEL: test_sthcx: @@ -65,6 +66,7 @@ define dso_local signext i32 @test_sthcx(ptr %a, i16 signext %val) { ; CHECK-32-NEXT: sthcx. 4, 0, 3 ; CHECK-32-NEXT: mfocrf 3, 128 ; CHECK-32-NEXT: srwi 3, 3, 28 +; CHECK-32-NEXT: rlwinm 3, 3, 31, 31, 31 ; CHECK-32-NEXT: blr entry: %0 = sext i16 %val to i32 @@ -79,7 +81,7 @@ define signext i32 @test_stbcx(ptr %addr, i8 signext %val) { ; CHECK-64-NEXT: stbcx. 4, 0, 3 ; CHECK-64-NEXT: mfocrf 3, 128 ; CHECK-64-NEXT: srwi 3, 3, 28 -; CHECK-64-NEXT: extsw 3, 3 +; CHECK-64-NEXT: rlwinm 3, 3, 31, 31, 31 ; CHECK-64-NEXT: blr ; ; CHECK-32-LABEL: test_stbcx: @@ -87,6 +89,7 @@ define signext i32 @test_stbcx(ptr %addr, i8 signext %val) { ; CHECK-32-NEXT: stbcx. 4, 0, 3 ; CHECK-32-NEXT: mfocrf 3, 128 ; CHECK-32-NEXT: srwi 3, 3, 28 +; CHECK-32-NEXT: rlwinm 3, 3, 31, 31, 31 ; CHECK-32-NEXT: blr entry: %conv = sext i8 %val to i32 From 11e1378e56ef78e51e4fe7180511c6f40ae8dc67 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Wed, 21 Aug 2024 17:23:53 +0000 Subject: [PATCH 200/426] [LLVM][CodeGen][SVE] Increase vector.insert test coverage. 
--- .../AArch64/sve-insert-scalable-vector.ll | 260 ++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/sve-insert-scalable-vector.ll diff --git a/llvm/test/CodeGen/AArch64/sve-insert-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-scalable-vector.ll new file mode 100644 index 00000000000000..484bed2b84d34e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-insert-scalable-vector.ll @@ -0,0 +1,260 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; half vectors + +define @insert_into_poison_nxv4f16_nxv2f16_0( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv4f16_nxv2f16_0: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z1.d, z0.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv4f16.nxv2f16( poison, %a, i64 0) + ret %res +} + +define @insert_into_poison_nxv4f16_nxv2f16_2( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv4f16_nxv2f16_2: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv4f16.nxv2f16( poison, %a, i64 2) + ret %res +} + +define @insert_into_poison_nxv8f16_nxv2f16_0( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv8f16_nxv2f16_0: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: st1h { z0.d }, p0, [sp] +; CHECK-NEXT: ld1h { z0.h }, p1/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8f16.nxv2f16( poison, %a, i64 0) + ret %res +} + +define @insert_into_poison_nxv8f16_nxv2f16_2( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv8f16_nxv2f16_2: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: st1h { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ld1h { z0.h }, p1/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8f16.nxv2f16( poison, %a, i64 2) + ret %res +} + +define @insert_into_poison_nxv8f16_nxv2f16_4( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv8f16_nxv2f16_4: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: st1h { z0.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: ld1h { z0.h }, p1/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8f16.nxv2f16( poison, %a, i64 4) + ret %res +} + +define @insert_into_poison_nxv8f16_nxv2f16_6( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv8f16_nxv2f16_6: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: st1h { z0.d }, p0, [sp, #3, mul vl] +; CHECK-NEXT: ld1h { z0.h }, p1/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8f16.nxv2f16( poison, %a, i64 6) + ret %res +} + +define @insert_into_poison_nxv8f16_nxv4f16_0( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv8f16_nxv4f16_0: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8f16.nxv4f16( poison, %a, i64 0) + ret %res +} + +define @insert_into_poison_nxv8f16_nxv4f16_4( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv8f16_nxv4f16_4: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8f16.nxv4f16( poison, %a, i64 4) + ret %res +} + +; float vectors +define @insert_into_poison_nxv4f32_nxv2f32_0( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv4f32_nxv2f32_0: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 
z0.s, z0.s, z0.s +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv4f32.nxv2f32( poison, %a, i64 0) + ret %res +} + +define @insert_into_poison_nxv4f32_nxv2f32_2( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv4f32_nxv2f32_2: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv4f32.nxv2f32( poison, %a, i64 2) + ret %res +} + +; bfloat vectors + +define @insert_into_poison_nxv4bf16_nxv2bf16_0( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv4bf16_nxv2bf16_0: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z1.d, z0.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv4bf16.nxv2bf16( poison, %a, i64 0) + ret %res +} + +define @insert_into_poison_nxv4bf16_nxv2bf16_2( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv4bf16_nxv2bf16_2: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv4bf16.nxv2bf16( poison, %a, i64 2) + ret %res +} + +define @insert_into_poison_nxv8bf16_nxv2bf16_0( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv8bf16_nxv2bf16_0: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: st1h { z0.d }, p0, [sp] +; CHECK-NEXT: ld1h { z0.h }, p1/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8bf16.nxv2bf16( poison, %a, i64 0) + ret %res +} + +define @insert_into_poison_nxv8bf16_nxv2bf16_2( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv8bf16_nxv2bf16_2: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: st1h { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ld1h { z0.h }, p1/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8bf16.nxv2bf16( poison, %a, i64 2) + ret %res +} + +define @insert_into_poison_nxv8bf16_nxv2bf16_4( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv8bf16_nxv2bf16_4: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: st1h { z0.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: ld1h { z0.h }, p1/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8bf16.nxv2bf16( poison, %a, i64 4) + ret %res +} + +define @insert_into_poison_nxv8bf16_nxv2bf16_6( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv8bf16_nxv2bf16_6: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: st1h { z0.d }, p0, [sp, #3, mul vl] +; CHECK-NEXT: ld1h { z0.h }, p1/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8bf16.nxv2bf16( poison, %a, i64 6) + ret %res +} + +define @insert_into_poison_nxv8bf16_nxv4bf16_0( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv8bf16_nxv4bf16_0: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8bf16.nxv4bf16( poison, %a, i64 0) + ret %res +} + +define @insert_into_poison_nxv8bf16_nxv4bf16_4( %a) #0 { +; CHECK-LABEL: insert_into_poison_nxv8bf16_nxv4bf16_4: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8bf16.nxv4bf16( poison, %a, i64 4) + ret %res +} + +attributes #0 = { "target-features"="+sve,+bf16" } + +declare @llvm.vector.insert.nxv4f16.nxv2f16(, , i64) +declare @llvm.vector.insert.nxv8f16.nxv2f16(, , i64) +declare @llvm.vector.insert.nxv8f16.nxv4f16(, , i64) + +declare @llvm.vector.insert.nxv4f32.nxv2f32(, , i64) + +declare @llvm.vector.insert.nxv4bf16.nxv2bf16(, , i64) +declare @llvm.vector.insert.nxv8bf16.nxv2bf16(, , i64) +declare @llvm.vector.insert.nxv8bf16.nxv4bf16(, , i64) + From c8f40e7cf546557e885fe06b0349753d78193872 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 22 Aug 2024 16:12:18 +0200 Subject: [PATCH 201/426] [InstCombine] Add more test variants with poison elements (NFC) --- .../test/Transforms/InstCombine/bit-checks.ll | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/bit-checks.ll b/llvm/test/Transforms/InstCombine/bit-checks.ll 
index b1ab9d888954bb..93cf09f530192f 100644 --- a/llvm/test/Transforms/InstCombine/bit-checks.ll +++ b/llvm/test/Transforms/InstCombine/bit-checks.ll @@ -1339,6 +1339,56 @@ define i1 @no_masks_with_logical_or(i32 %a, i32 %b, i32 noundef %c) { ret i1 %or2 } +define i1 @no_masks_with_logical_or2(i32 %a, i32 %b, i32 noundef %c) { +; CHECK-LABEL: @no_masks_with_logical_or2( +; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 63 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], [[C:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], -1 +; CHECK-NEXT: [[OR2:%.*]] = select i1 [[TMP2]], i1 true, i1 [[CMP2]] +; CHECK-NEXT: ret i1 [[OR2]] +; + %cmp1 = icmp ne i32 %a, -1 + %cmp2 = icmp ne i32 %b, 63 + %or1 = select i1 %cmp1, i1 true, i1 %cmp2 + %cmp3 = icmp ne i32 %c, -1 + %or2 = or i1 %or1, %cmp3 + ret i1 %or2 +} + +define <2 x i1> @no_masks_with_logical_or_vec_poison1(<2 x i32> %a, <2 x i32> %b, <2 x i32> noundef %c) { +; CHECK-LABEL: @no_masks_with_logical_or_vec_poison1( +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne <2 x i32> [[A:%.*]], +; CHECK-NEXT: [[CMP2:%.*]] = icmp ne <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[OR1:%.*]] = select <2 x i1> [[CMP1]], <2 x i1> , <2 x i1> [[CMP2]] +; CHECK-NEXT: [[CMP3:%.*]] = icmp ne <2 x i32> [[C:%.*]], +; CHECK-NEXT: [[OR2:%.*]] = or <2 x i1> [[OR1]], [[CMP3]] +; CHECK-NEXT: ret <2 x i1> [[OR2]] +; + %cmp1 = icmp ne <2 x i32> %a, + %cmp2 = icmp ne <2 x i32> %b, + %or1 = select <2 x i1> %cmp1, <2 x i1> , <2 x i1> %cmp2 + %cmp3 = icmp ne <2 x i32> %c, + %or2 = or <2 x i1> %or1, %cmp3 + ret <2 x i1> %or2 +} + +define <2 x i1> @no_masks_with_logical_or_vec_poison2(<2 x i32> %a, <2 x i32> %b, <2 x i32> noundef %c) { +; CHECK-LABEL: @no_masks_with_logical_or_vec_poison2( +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne <2 x i32> [[A:%.*]], +; CHECK-NEXT: [[CMP2:%.*]] = icmp ne <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[OR1:%.*]] = select <2 x i1> [[CMP1]], <2 x i1> , <2 x i1> [[CMP2]] +; CHECK-NEXT: [[CMP3:%.*]] = icmp ne <2 x i32> [[C:%.*]], +; CHECK-NEXT: 
[[OR2:%.*]] = or <2 x i1> [[OR1]], [[CMP3]] +; CHECK-NEXT: ret <2 x i1> [[OR2]] +; + %cmp1 = icmp ne <2 x i32> %a, + %cmp2 = icmp ne <2 x i32> %b, + %or1 = select <2 x i1> %cmp1, <2 x i1> , <2 x i1> %cmp2 + %cmp3 = icmp ne <2 x i32> %c, + %or2 = or <2 x i1> %or1, %cmp3 + ret <2 x i1> %or2 +} + define i1 @only_one_masked(i64 %a) { ; CHECK-LABEL: @only_one_masked( ; CHECK-NEXT: [[AND:%.*]] = icmp eq i64 [[A:%.*]], -9223372036854775808 From 32679e10a9b66405c340213993f65b2edf5a794a Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 22 Aug 2024 16:17:39 +0200 Subject: [PATCH 202/426] [InstCombine] Handle logical op for and/or of icmp 0/-1 This aligns the transform with what foldLogOpOfMaskedICmp() does. --- .../Transforms/InstCombine/InstCombineAndOrXor.cpp | 10 ++++++---- llvm/test/Transforms/InstCombine/bit-checks.ll | 14 ++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 2bba83b5cde3c7..b703bc7d04db58 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -3384,9 +3384,10 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, // (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0) // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0) // TODO: Remove this and below when foldLogOpOfMaskedICmps can handle undefs. - if (!IsLogical && PredL == (IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE) && + if (PredL == (IsAnd ? 
ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE) && PredL == PredR && match(LHS1, m_ZeroInt()) && match(RHS1, m_ZeroInt()) && - LHS0->getType() == RHS0->getType()) { + LHS0->getType() == RHS0->getType() && + (!IsLogical || isGuaranteedNotToBePoison(RHS0))) { Value *NewOr = Builder.CreateOr(LHS0, RHS0); return Builder.CreateICmp(PredL, NewOr, Constant::getNullValue(NewOr->getType())); @@ -3394,9 +3395,10 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, // (icmp ne A, -1) | (icmp ne B, -1) --> (icmp ne (A&B), -1) // (icmp eq A, -1) & (icmp eq B, -1) --> (icmp eq (A&B), -1) - if (!IsLogical && PredL == (IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE) && + if (PredL == (IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE) && PredL == PredR && match(LHS1, m_AllOnes()) && match(RHS1, m_AllOnes()) && - LHS0->getType() == RHS0->getType()) { + LHS0->getType() == RHS0->getType() && + (!IsLogical || isGuaranteedNotToBePoison(RHS0))) { Value *NewAnd = Builder.CreateAnd(LHS0, RHS0); return Builder.CreateICmp(PredL, NewAnd, Constant::getAllOnesValue(LHS0->getType())); diff --git a/llvm/test/Transforms/InstCombine/bit-checks.ll b/llvm/test/Transforms/InstCombine/bit-checks.ll index 93cf09f530192f..d1b61040853705 100644 --- a/llvm/test/Transforms/InstCombine/bit-checks.ll +++ b/llvm/test/Transforms/InstCombine/bit-checks.ll @@ -1357,11 +1357,10 @@ define i1 @no_masks_with_logical_or2(i32 %a, i32 %b, i32 noundef %c) { define <2 x i1> @no_masks_with_logical_or_vec_poison1(<2 x i32> %a, <2 x i32> %b, <2 x i32> noundef %c) { ; CHECK-LABEL: @no_masks_with_logical_or_vec_poison1( -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne <2 x i32> [[A:%.*]], ; CHECK-NEXT: [[CMP2:%.*]] = icmp ne <2 x i32> [[B:%.*]], -; CHECK-NEXT: [[OR1:%.*]] = select <2 x i1> [[CMP1]], <2 x i1> , <2 x i1> [[CMP2]] -; CHECK-NEXT: [[CMP3:%.*]] = icmp ne <2 x i32> [[C:%.*]], -; CHECK-NEXT: [[OR2:%.*]] = or <2 x i1> [[OR1]], [[CMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[A:%.*]], [[C:%.*]] +; CHECK-NEXT: 
[[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[OR2:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> , <2 x i1> [[CMP2]] ; CHECK-NEXT: ret <2 x i1> [[OR2]] ; %cmp1 = icmp ne <2 x i32> %a, @@ -1374,11 +1373,10 @@ define <2 x i1> @no_masks_with_logical_or_vec_poison1(<2 x i32> %a, <2 x i32> %b define <2 x i1> @no_masks_with_logical_or_vec_poison2(<2 x i32> %a, <2 x i32> %b, <2 x i32> noundef %c) { ; CHECK-LABEL: @no_masks_with_logical_or_vec_poison2( -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne <2 x i32> [[A:%.*]], ; CHECK-NEXT: [[CMP2:%.*]] = icmp ne <2 x i32> [[B:%.*]], -; CHECK-NEXT: [[OR1:%.*]] = select <2 x i1> [[CMP1]], <2 x i1> , <2 x i1> [[CMP2]] -; CHECK-NEXT: [[CMP3:%.*]] = icmp ne <2 x i32> [[C:%.*]], -; CHECK-NEXT: [[OR2:%.*]] = or <2 x i1> [[OR1]], [[CMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[A:%.*]], [[C:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], +; CHECK-NEXT: [[OR2:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> , <2 x i1> [[CMP2]] ; CHECK-NEXT: ret <2 x i1> [[OR2]] ; %cmp1 = icmp ne <2 x i32> %a, From 41dcdfbff12a9bc06af25457d603b6ec26b6b45f Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 22 Aug 2024 09:16:01 -0400 Subject: [PATCH 203/426] [libc++][docs] Major update to the documentation - Landing page: add link to the libc++ Discord channel - Landing page: reorder "Getting Involved" above "Design documents" - Landing page: remove "Notes and Known Issues" which was completely outdated - Rename "Using Libc++" to "User Documentation" and update contents - Rename "Building Libc++" to "Vendor Documentation" and update contents The "BuildingLibcxx" and "UsingLibcxx" pages have basically been used for vendor and user documentation respectively. However, they were named in a way that doesn't really make that clear. Renaming the pages now gives us a location to clearly document what we target at vendors and what we target at users, and to do that separately. 
--- libcxx/CMakeLists.txt | 3 - libcxx/docs/Contributing.rst | 22 +- libcxx/docs/TestingLibcxx.rst | 9 +- ...{UsingLibcxx.rst => UserDocumentation.rst} | 189 +++++---- ...dingLibcxx.rst => VendorDocumentation.rst} | 365 +++++++++--------- libcxx/docs/index.rst | 52 +-- 6 files changed, 299 insertions(+), 341 deletions(-) rename libcxx/docs/{UsingLibcxx.rst => UserDocumentation.rst} (87%) rename libcxx/docs/{BuildingLibcxx.rst => VendorDocumentation.rst} (89%) diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index 273b2238f34851..75c926f5432aea 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -1,6 +1,3 @@ -# See https://libcxx.llvm.org/docs/BuildingLibcxx.html for instructions on how -# to build libcxx with CMake. - #=============================================================================== # Setup Project #=============================================================================== diff --git a/libcxx/docs/Contributing.rst b/libcxx/docs/Contributing.rst index 90aabc9c4ff6fa..f81c537e00c243 100644 --- a/libcxx/docs/Contributing.rst +++ b/libcxx/docs/Contributing.rst @@ -4,9 +4,9 @@ Contributing to libc++ ====================== -This file contains notes about various tasks and processes specific to contributing -to libc++. If this is your first time contributing, please also read `this document -`__ on general rules for contributing to LLVM. +This file contains information useful when contributing to libc++. If this is your first time contributing, +please also read `this document `__ on general rules for +contributing to LLVM. If you plan on contributing to libc++, it can be useful to join the ``#libcxx`` channel on `LLVM's Discord server `__. @@ -24,16 +24,22 @@ RFCs for significant user-affecting changes =========================================== Before you start working on a change that can have significant impact on users of the library, -please consider creating a RFC on `libc++'s Discourse forum `__. 
+please consider creating a RFC on the `libc++ forum `_. This will ensure that you work in a direction that the project endorses and will ease reviewing your -contribution as directional questions can be raised early. Including a WIP patch is not mandatory, but -it can be useful to ground the discussion in something concrete. +contribution as directional questions can be raised early. Including a WIP patch is not mandatory, +but it can be useful to ground the discussion in something concrete. + +Writing tests and running the test suite +======================================== + +Every change in libc++ must come with appropriate tests. Libc++ has an extensive test suite that +should be run locally by developers before submitting patches and is also run as part of our CI +infrastructure. The documentation about writing tests and running them is :ref:`here `. Coding standards ================ -In general, libc++ follows the -`LLVM Coding Standards `_. +In general, libc++ follows the `LLVM Coding Standards `_. There are some deviations from these standards. Libc++ uses ``__ugly_names``. These names are reserved for implementations, so diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index 65a7e1ec30c962..55f84a312e85c8 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -1,3 +1,5 @@ +.. _testing: + ============== Testing libc++ ============== @@ -5,8 +7,6 @@ Testing libc++ .. contents:: :local: -.. _testing: - Getting Started =============== @@ -459,11 +459,6 @@ This will build all of the benchmarks under ``/test/benchmarks`` to be built against the just-built libc++. The compiled tests are output into ``build/libcxx/test/benchmarks``. 
-Also See: - - * :ref:`Building Libc++ ` - * :ref:`CMake Options` - Running Benchmarks ------------------ diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UserDocumentation.rst similarity index 87% rename from libcxx/docs/UsingLibcxx.rst rename to libcxx/docs/UserDocumentation.rst index df08875c13beae..165c3b7b049087 100644 --- a/libcxx/docs/UsingLibcxx.rst +++ b/libcxx/docs/UserDocumentation.rst @@ -1,19 +1,17 @@ -.. _using-libcxx: +.. _user-documentation: -============ -Using libc++ -============ +================== +User documentation +================== .. contents:: :local: -Usually, libc++ is packaged and shipped by a vendor through some delivery vehicle -(operating system distribution, SDK, toolchain, etc) and users don't need to do -anything special in order to use the library. - This page contains information about configuration knobs that can be used by users when they know libc++ is used by their toolchain, and how to use libc++ -when it is not the default library used by their toolchain. +when it is not the default library used by their toolchain. It is aimed at +users of libc++: a separate page contains documentation aimed at vendors who +build and ship libc++ as part of their toolchain. Using a different version of the C++ Standard @@ -28,10 +26,29 @@ matches that Standard in the library. $ clang++ -std=c++17 test.cpp -.. warning:: - Using ``-std=c++XY`` with a version of the Standard that has not been ratified yet - is considered unstable. Libc++ reserves the right to make breaking changes to the - library until the standard has been ratified. +Note that using ``-std=c++XY`` with a version of the Standard that has not been ratified +yet is considered unstable. While we strive to maintain stability, libc++ may be forced to +make breaking changes to features shipped in a Standard that hasn't been ratified yet. Use +these versions of the Standard at your own risk. 
+ + +Using libc++ when it is not the system default +============================================== + +Usually, libc++ is packaged and shipped by a vendor through some delivery vehicle +(operating system distribution, SDK, toolchain, etc) and users don't need to do +anything special in order to use the library. + +On systems where libc++ is provided but is not the default, Clang provides a flag +called ``-stdlib=`` that can be used to decide which standard library is used. +Using ``-stdlib=libc++`` will select libc++: + +.. code-block:: bash + + $ clang++ -stdlib=libc++ test.cpp + +On systems where libc++ is the library in use by default such as macOS and FreeBSD, +this flag is not required. Enabling experimental C++ Library features @@ -43,6 +60,11 @@ the Standard but whose implementation is not complete or stable yet in libc++. T are disabled by default because they are neither API nor ABI stable. However, the ``-fexperimental-library`` compiler flag can be defined to turn those features on. +On compilers that do not support the ``-fexperimental-library`` flag (such as GCC), +users can define the ``_LIBCPP_ENABLE_EXPERIMENTAL`` macro and manually link against +the appropriate static library (usually shipped as ``libc++experimental.a``) to get +access to experimental library features. + The following features are currently considered experimental and are only provided when ``-fexperimental-library`` is passed: @@ -51,7 +73,7 @@ when ``-fexperimental-library`` is passed: * ``std::jthread`` * ``std::chrono::tzdb`` and related time zone functionality -.. warning:: +.. note:: Experimental libraries are experimental. * The contents of the ```` headers and the associated static library will not remain compatible between versions. @@ -60,98 +82,18 @@ when ``-fexperimental-library`` is passed: the experimental feature is removed two releases after the non-experimental version has shipped. The full policy is explained :ref:`here `. -.. 
note:: - On compilers that do not support the ``-fexperimental-library`` flag, users can - define the ``_LIBCPP_ENABLE_EXPERIMENTAL`` macro and manually link against the - appropriate static library (usually shipped as ``libc++experimental.a``) to get - access to experimental library features. - - -Using libc++ when it is not the system default -============================================== - -On systems where libc++ is provided but is not the default, Clang provides a flag -called ``-stdlib=`` that can be used to decide which standard library is used. -Using ``-stdlib=libc++`` will select libc++: - -.. code-block:: bash - - $ clang++ -stdlib=libc++ test.cpp - -On systems where libc++ is the library in use by default such as macOS and FreeBSD, -this flag is not required. - - -.. _alternate libcxx: - -Using a custom built libc++ -=========================== - -Most compilers provide a way to disable the default behavior for finding the -standard library and to override it with custom paths. With Clang, this can -be done with: - -.. code-block:: bash - - $ clang++ -nostdinc++ -nostdlib++ \ - -isystem /include/c++/v1 \ - -L /lib \ - -Wl,-rpath,/lib \ - -lc++ \ - test.cpp - -The option ``-Wl,-rpath,/lib`` adds a runtime library search path, -which causes the system's dynamic linker to look for libc++ in ``/lib`` -whenever the program is loaded. - -GCC does not support the ``-nostdlib++`` flag, so one must use ``-nodefaultlibs`` -instead. Since that removes all the standard system libraries and not just libc++, -the system libraries must be re-added manually. For example: - -.. code-block:: bash - - $ g++ -nostdinc++ -nodefaultlibs \ - -isystem /include/c++/v1 \ - -L /lib \ - -Wl,-rpath,/lib \ - -lc++ -lc++abi -lm -lc -lgcc_s -lgcc \ - test.cpp - - -GDB Pretty printers for libc++ -============================== - -GDB does not support pretty-printing of libc++ symbols by default. However, libc++ does -provide pretty-printers itself. Those can be used as: - -.. 
code-block:: bash - - $ gdb -ex "source /utils/gdb/libcxx/printers.py" \ - -ex "python register_libcxx_printer_loader()" \ - - -.. _include-what-you-use: - -include-what-you-use (IWYU) -=========================== - -libc++ provides an IWYU `mapping file `_, -which drastically improves the accuracy of the tool when using libc++. To use the mapping file with -IWYU, you should run the tool like so: - -.. code-block:: bash - - $ include-what-you-use -Xiwyu --mapping_file=/path/to/libcxx/include/libcxx.imp file.cpp - -If you would prefer to not use that flag, then you can replace ``/path/to/include-what-you-use/share/libcxx.imp`` -file with the libc++-provided ``libcxx.imp`` file. Libc++ Configuration Macros =========================== -Libc++ provides a number of configuration macros which can be used to enable -or disable extended libc++ behavior, including enabling hardening or thread -safety annotations. +Libc++ provides a number of configuration macros that can be used by developers to +enable or disable extended libc++ behavior. + +.. warning:: + Configuration macros that are not documented here are not intended to be customized + by developers and should not be used. In particular, some configuration macros are + only intended to be used by vendors and changing their value from the one provided + in your toolchain can lead to unexpected behavior. **_LIBCPP_ENABLE_THREAD_SAFETY_ANNOTATIONS**: This macro is used to enable -Wthread-safety annotations on libc++'s @@ -193,6 +135,12 @@ safety annotations. warning saying that `std::auto_ptr` is deprecated. If the macro is defined, no warning will be emitted. By default, this macro is not defined. +**_LIBCPP_ENABLE_EXPERIMENTAL**: + This macro enables experimental features. This can be used on compilers that do + not support the ``-fexperimental-library`` flag. When used, users also need to + ensure that the appropriate experimental library (usually ``libc++experimental.a``) + is linked into their program. 
+ C++17 Specific Configuration Macros ----------------------------------- **_LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR**: @@ -307,7 +255,7 @@ Extensions to the C++23 modules ``std`` and ``std.compat`` ---------------------------------------------------------- Like other major implementations, libc++ provides C++23 modules ``std`` and -``std.compat`` in C++20 as an extension" +``std.compat`` in C++20 as an extension. Constant-initialized std::string -------------------------------- @@ -386,3 +334,38 @@ specific locale is imbued, the IO with the underlying stream happens with regular ``char`` elements, which are converted to/from wide characters according to the locale. Note that this doesn't behave as expected if the stream has been set in Unicode mode. + + +Third-party Integrations +======================== + +Libc++ provides integration with a few third-party tools. + +GDB Pretty printers for libc++ +------------------------------ + +GDB does not support pretty-printing of libc++ symbols by default. However, libc++ does +provide pretty-printers itself. Those can be used as: + +.. code-block:: bash + + $ gdb -ex "source /utils/gdb/libcxx/printers.py" \ + -ex "python register_libcxx_printer_loader()" \ + + + +.. _include-what-you-use: + +include-what-you-use (IWYU) +--------------------------- + +libc++ provides an IWYU `mapping file `_, +which drastically improves the accuracy of the tool when using libc++. To use the mapping file with +IWYU, you should run the tool like so: + +.. code-block:: bash + + $ include-what-you-use -Xiwyu --mapping_file=/path/to/libcxx/include/libcxx.imp file.cpp + +If you would prefer to not use that flag, then you can replace ``/path/to/include-what-you-use/share/libcxx.imp`` +file with the libc++-provided ``libcxx.imp`` file. 
diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/VendorDocumentation.rst similarity index 89% rename from libcxx/docs/BuildingLibcxx.rst rename to libcxx/docs/VendorDocumentation.rst index 5c224689e0f9f8..3a3d1cdb1ea7ff 100644 --- a/libcxx/docs/BuildingLibcxx.rst +++ b/libcxx/docs/VendorDocumentation.rst @@ -1,19 +1,17 @@ -.. _BuildingLibcxx: +.. _VendorDocumentation: -=============== -Building libc++ -=============== +==================== +Vendor Documentation +==================== .. contents:: :local: -.. _build instructions: - The instructions on this page are aimed at vendors who ship libc++ as part of an operating system distribution, a toolchain or similar shipping vehicles. If you are a user merely trying to use libc++ in your program, you most likely want to -refer to your vendor's documentation, or to the general documentation for using -libc++ :ref:`here `. +refer to your vendor's documentation, or to the general user documentation +:ref:`here `. .. warning:: If your operating system already provides libc++, it is important to be careful @@ -42,21 +40,37 @@ with the following CMake invocation: $ ninja -C build install-cxx install-cxxabi install-unwind # Install .. note:: - See :ref:`CMake Options` below for more configuration options. + See :ref:`Vendor Configuration Options` below for more configuration options. After building the various ``install-XXX`` targets, shared libraries for libc++, libc++abi and libunwind should now be present in ``/lib``, and headers in -``/include/c++/v1``. See :ref:`using an alternate libc++ installation -` for information on how to use this libc++ over the default one. +``/include/c++/v1``. See the instructions below for information on how +to use this libc++ over the default one. In the default configuration, the runtimes will be built using the compiler available by default on your system. Of course, you can change what compiler is being used with the usual CMake variables. 
If you wish to build the runtimes from a just-built Clang, the bootstrapping build explained below makes this task easy. +Using the just-built libc++ +--------------------------- -Bootstrapping build -=================== +Most compilers provide a way to disable the default behavior for finding the standard library and +to override it with custom paths. With Clang, this can be done with: + +.. code-block:: bash + + $ clang++ -nostdinc++ -isystem /include/c++/v1 \ + -nostdlib++ -L /lib -lc++ \ + -Wl,-rpath,/lib \ + test.cpp + +The option ``-Wl,-rpath,/lib`` adds a runtime library search path, which causes the system's +dynamic linker to look for libc++ in ``/lib`` whenever the program is loaded. + + +The Bootstrapping build +======================= It is possible to build Clang and then build the runtimes using that just-built compiler in a single CMake invocation. This is usually the correct way to build the runtimes when putting together @@ -75,123 +89,29 @@ CMake invocation at ``/llvm``: $ ninja -C build install-runtimes # Install .. note:: - This type of build is also commonly called a "Runtimes build", but we would like to move - away from that terminology, which is too confusing. - -.. warning:: - Adding the `--fresh` flag to the top-level cmake invocation in a bootstrapping build *will not* - freshen the cmake cache of any of the enabled runtimes. - -Support for Windows -=================== - -libcxx supports being built with clang-cl, but not with MSVC's cl.exe, as -cl doesn't support the ``#include_next`` extension. Furthermore, VS 2017 or -newer (19.14) is required. - -libcxx also supports being built with clang targeting MinGW environments. - -CMake + Visual Studio ---------------------- + - This type of build is also commonly called a "Runtimes build", but we would like to move + away from that terminology, which is too confusing. -Building with Visual Studio currently does not permit running tests. However, -it is the simplest way to build. 
+ - Adding the `--fresh` flag to the top-level cmake invocation in a bootstrapping build *will not* + freshen the cmake cache of any of the enabled runtimes. -.. code-block:: batch - > cmake -G "Visual Studio 16 2019" -S runtimes -B build ^ - -T "ClangCL" ^ - -DLLVM_ENABLE_RUNTIMES=libcxx ^ - -DLIBCXX_ENABLE_SHARED=YES ^ - -DLIBCXX_ENABLE_STATIC=NO - > cmake --build build +.. _Vendor Configuration Options: -CMake + ninja (MSVC) --------------------- +Vendor Configuration Options +============================ -Building with ninja is required for development to enable tests. -A couple of tests require Bash to be available, and a couple dozens -of tests require other posix tools (cp, grep and similar - LLVM's tests -require the same). Without those tools the vast majority of tests -can still be ran successfully. +This section documents configuration options that can be used by vendors when building the library. +These options provide a great deal of flexibility to customize libc++, such as selecting the ABI in +use, whether some features are provided, etc. -If Git for Windows is available, that can be used to provide the bash -shell by adding the right bin directory to the path, e.g. -``set PATH=%PATH%;C:\Program Files\Git\usr\bin``. - -Alternatively, one can also choose to run the whole build in a MSYS2 -shell. That can be set up e.g. by starting a Visual Studio Tools Command -Prompt (for getting the environment variables pointing to the headers and -import libraries), and making sure that clang-cl is available in the -path. From there, launch an MSYS2 shell via e.g. -``C:\msys64\msys2_shell.cmd -full-path -mingw64`` (preserving the earlier -environment, allowing the MSVC headers/libraries and clang-cl to be found). - -In either case, then run: - -.. 
code-block:: batch - - > cmake -G Ninja -S runtimes -B build ^ - -DCMAKE_C_COMPILER=clang-cl ^ - -DCMAKE_CXX_COMPILER=clang-cl ^ - -DLLVM_ENABLE_RUNTIMES=libcxx - > ninja -C build cxx - > ninja -C build check-cxx - -If you are running in an MSYS2 shell and you have installed the -MSYS2-provided clang package (which defaults to a non-MSVC target), you -should add e.g. ``-DCMAKE_CXX_COMPILER_TARGET=x86_64-windows-msvc`` (replacing -``x86_64`` with the architecture you're targeting) to the ``cmake`` command -line above. This will instruct ``check-cxx`` to use the right target triple -when invoking ``clang++``. - -CMake + ninja (MinGW) ---------------------- - -libcxx can also be built in MinGW environments, e.g. with the MinGW -compilers in MSYS2. This requires clang to be available (installed with -e.g. the ``mingw-w64-x86_64-clang`` package), together with CMake and ninja. - -.. code-block:: bash - - > cmake -G Ninja -S runtimes -B build \ - -DCMAKE_C_COMPILER=clang \ - -DCMAKE_CXX_COMPILER=clang++ \ - -DLLVM_ENABLE_LLD=ON \ - -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" \ - -DLIBCXXABI_ENABLE_SHARED=OFF \ - -DLIBCXX_ENABLE_STATIC_ABI_LIBRARY=ON - > ninja -C build cxx - > ninja -C build check-cxx - -.. _`libc++abi`: http://libcxxabi.llvm.org/ - - -.. _CMake Options: - -CMake Options -============= - -Here are some of the CMake variables that are used often, along with a -brief explanation and LLVM-specific notes. For full documentation, check the -CMake docs or execute ``cmake --help-variable VARIABLE_NAME``. - -**CMAKE_BUILD_TYPE**:STRING - Sets the build type for ``make`` based generators. Possible values are - Release, Debug, RelWithDebInfo and MinSizeRel. On systems like Visual Studio - the user sets the build type with the IDE settings. - -**CMAKE_INSTALL_PREFIX**:PATH - Path where LLVM will be installed if "make install" is invoked or the - "INSTALL" target is built. - -**CMAKE_CXX_COMPILER**:STRING - The C++ compiler to use when building and testing libc++. 
- - -.. _libcxx-specific options: +.. warning:: + Many of these CMake options are tied to configuration macros with a corresponding name in the source + code. However, these configuration macros are not intended to be customized by users directly, since + many of them require the library to be built with a matching configuration. If you don't build libc++ + yourself, you should not use the options documented here. -libc++ specific options +General purpose options ----------------------- .. option:: LIBCXX_INSTALL_LIBRARY:BOOL @@ -305,65 +225,6 @@ libc++ specific options Additional libraries libc++ is linked to which can be provided in cache. - -.. _ABI Library Specific Options: - -ABI Library Specific Options ----------------------------- - -.. option:: LIBCXX_CXX_ABI:STRING - - **Values**: ``none``, ``libcxxabi``, ``system-libcxxabi``, ``libcxxrt``, ``libstdc++``, ``libsupc++``, ``vcruntime``. - - Select the ABI library to build libc++ against. - -.. option:: LIBCXX_CXX_ABI_INCLUDE_PATHS:PATHS - - Provide additional search paths for the ABI library headers. - -.. option:: LIBCXX_CXX_ABI_LIBRARY_PATH:PATH - - Provide the path to the ABI library that libc++ should link against. This is only - useful when linking against an out-of-tree ABI library. - -.. option:: LIBCXX_ENABLE_STATIC_ABI_LIBRARY:BOOL - - **Default**: ``OFF`` - - If this option is enabled, libc++ will try and link the selected ABI library - statically. - -.. option:: LIBCXX_ENABLE_ABI_LINKER_SCRIPT:BOOL - - **Default**: ``ON`` by default on UNIX platforms other than Apple unless - 'LIBCXX_ENABLE_STATIC_ABI_LIBRARY' is ON. Otherwise the default value is ``OFF``. - - This option generate and installs a linker script as ``libc++.so`` which - links the correct ABI library. - -.. option:: LIBCXXABI_USE_LLVM_UNWINDER:BOOL - - **Default**: ``ON`` - - Build and use the LLVM unwinder. Note: This option can only be used when - libc++abi is the C++ ABI library used. - -.. 
option:: LIBCXXABI_ADDITIONAL_COMPILE_FLAGS:STRING - - **Default**: ``""`` - - Additional Compile only flags which can be provided in cache. - -.. option:: LIBCXXABI_ADDITIONAL_LIBRARIES:STRING - - **Default**: ``""`` - - Additional libraries libc++abi is linked to which can be provided in cache. - - -libc++ Feature Options ----------------------- - .. option:: LIBCXX_ENABLE_EXCEPTIONS:BOOL **Default**: ``ON`` @@ -409,9 +270,8 @@ libc++ Feature Options default assertion handler. If this is specified as a relative path, it is assumed to be relative to ``/libcxx``. - -libc++ ABI Feature Options --------------------------- +ABI Specific Options +-------------------- The following options allow building libc++ for a different ABI version. @@ -437,7 +297,7 @@ The following options allow building libc++ for a different ABI version. with other libc++ versions. .. warning:: - When providing a custom namespace, it's the user's responsibility to ensure the name won't cause + When providing a custom namespace, it's the vendor's responsibility to ensure the name won't cause conflicts with other names defined by libc++, both now and in the future. In particular, inline namespaces of the form ``__[0-9]+`` could cause conflicts with future versions of the library, and so should be avoided. @@ -449,8 +309,54 @@ The following options allow building libc++ for a different ABI version. A semicolon-separated list of ABI macros to persist in the site config header. See ``include/__config`` for the list of ABI macros. +.. option:: LIBCXX_CXX_ABI:STRING + + **Values**: ``none``, ``libcxxabi``, ``system-libcxxabi``, ``libcxxrt``, ``libstdc++``, ``libsupc++``, ``vcruntime``. + + Select the ABI library to build libc++ against. + +.. option:: LIBCXX_CXX_ABI_INCLUDE_PATHS:PATHS + + Provide additional search paths for the ABI library headers. + +.. option:: LIBCXX_CXX_ABI_LIBRARY_PATH:PATH + + Provide the path to the ABI library that libc++ should link against. 
This is only + useful when linking against an out-of-tree ABI library. + +.. option:: LIBCXX_ENABLE_STATIC_ABI_LIBRARY:BOOL + + **Default**: ``OFF`` + + If this option is enabled, libc++ will try and link the selected ABI library + statically. -.. _LLVM-specific variables: +.. option:: LIBCXX_ENABLE_ABI_LINKER_SCRIPT:BOOL + + **Default**: ``ON`` by default on UNIX platforms other than Apple unless + 'LIBCXX_ENABLE_STATIC_ABI_LIBRARY' is ON. Otherwise the default value is ``OFF``. + + This option generate and installs a linker script as ``libc++.so`` which + links the correct ABI library. + +.. option:: LIBCXXABI_USE_LLVM_UNWINDER:BOOL + + **Default**: ``ON`` + + Build and use the LLVM unwinder. Note: This option can only be used when + libc++abi is the C++ ABI library used. + +.. option:: LIBCXXABI_ADDITIONAL_COMPILE_FLAGS:STRING + + **Default**: ``""`` + + Additional Compile only flags which can be provided in cache. + +.. option:: LIBCXXABI_ADDITIONAL_LIBRARIES:STRING + + **Default**: ``""`` + + Additional libraries libc++abi is linked to which can be provided in cache. LLVM-specific options --------------------- @@ -473,6 +379,91 @@ LLVM-specific options others. +Support for Windows +=================== + +Libc++ supports being built with clang-cl, but not with MSVC's cl.exe, as +cl doesn't support the ``#include_next`` extension. Furthermore, VS 2017 or +newer (19.14) is required. + +Libc++ also supports being built with clang targeting MinGW environments. + +CMake + Visual Studio +--------------------- + +Building with Visual Studio currently does not permit running tests. However, +it is the simplest way to build. + +.. code-block:: batch + + > cmake -G "Visual Studio 16 2019" -S runtimes -B build ^ + -T "ClangCL" ^ + -DLLVM_ENABLE_RUNTIMES=libcxx ^ + -DLIBCXX_ENABLE_SHARED=YES ^ + -DLIBCXX_ENABLE_STATIC=NO + > cmake --build build + +CMake + ninja (MSVC) +-------------------- + +Building with ninja is required for development to enable tests. 
+A couple of tests require Bash to be available, and a couple dozens +of tests require other posix tools (cp, grep and similar - LLVM's tests +require the same). Without those tools the vast majority of tests +can still be ran successfully. + +If Git for Windows is available, that can be used to provide the bash +shell by adding the right bin directory to the path, e.g. +``set PATH=%PATH%;C:\Program Files\Git\usr\bin``. + +Alternatively, one can also choose to run the whole build in a MSYS2 +shell. That can be set up e.g. by starting a Visual Studio Tools Command +Prompt (for getting the environment variables pointing to the headers and +import libraries), and making sure that clang-cl is available in the +path. From there, launch an MSYS2 shell via e.g. +``C:\msys64\msys2_shell.cmd -full-path -mingw64`` (preserving the earlier +environment, allowing the MSVC headers/libraries and clang-cl to be found). + +In either case, then run: + +.. code-block:: batch + + > cmake -G Ninja -S runtimes -B build ^ + -DCMAKE_C_COMPILER=clang-cl ^ + -DCMAKE_CXX_COMPILER=clang-cl ^ + -DLLVM_ENABLE_RUNTIMES=libcxx + > ninja -C build cxx + > ninja -C build check-cxx + +If you are running in an MSYS2 shell and you have installed the +MSYS2-provided clang package (which defaults to a non-MSVC target), you +should add e.g. ``-DCMAKE_CXX_COMPILER_TARGET=x86_64-windows-msvc`` (replacing +``x86_64`` with the architecture you're targeting) to the ``cmake`` command +line above. This will instruct ``check-cxx`` to use the right target triple +when invoking ``clang++``. + +CMake + ninja (MinGW) +--------------------- + +libcxx can also be built in MinGW environments, e.g. with the MinGW +compilers in MSYS2. This requires clang to be available (installed with +e.g. the ``mingw-w64-x86_64-clang`` package), together with CMake and ninja. + +.. 
code-block:: bash + + > cmake -G Ninja -S runtimes -B build \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DLLVM_ENABLE_LLD=ON \ + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" \ + -DLIBCXXABI_ENABLE_SHARED=OFF \ + -DLIBCXX_ENABLE_STATIC_ABI_LIBRARY=ON + > ninja -C build cxx + > ninja -C build check-cxx + +.. _`libc++abi`: http://libcxxabi.llvm.org/ + + .. _assertion-handler: Overriding the default assertion handler diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index 2dc08563358aba..18af347a1217ff 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -35,10 +35,10 @@ Getting Started with libc++ :maxdepth: 1 ReleaseNotes - UsingLibcxx - BuildingLibcxx - TestingLibcxx + UserDocumentation + VendorDocumentation Contributing + TestingLibcxx ImplementationDefinedBehavior Modules Hardening @@ -185,18 +185,25 @@ this release is described in the pages below: * :ref:`C++ Feature Test Macro Status ` -Notes and Known Issues -====================== +Getting Involved +================ + +First please review our `Developer's Policy `__ +and `Getting started with LLVM `__. + +**Bug Reports** -This list contains known issues with libc++ +If you think you've found a bug in libc++, please report it using the `LLVM bug tracker`_. +If you're not sure, you can ask for support on the `libc++ forum`_ or in the `libc++ chat`_. -* Building libc++ with ``-fno-rtti`` is not supported. However - linking against it with ``-fno-rtti`` is supported. +**Patches** +If you want to contribute a patch to libc++, please start by reviewing our +:ref:`documentation about contributing `. -A full list of currently open libc++ bugs can be `found here`__. +**Discussion and Questions** -.. __: https://github.com/llvm/llvm-project/labels/libc%2B%2B +Send discussions and questions to the `libc++ forum`_. 
Design Documents @@ -232,33 +239,12 @@ Build Bots and Test Coverage * :ref:`Adding New CI Jobs ` -Getting Involved -================ - -First please review our `Developer's Policy `__ -and `Getting started with LLVM `__. - -**Bug Reports** - -If you think you've found a bug in libc++, please report it using -the `LLVM bug tracker`_. If you're not sure, you -can ask for support on the `libcxx forum`_ or on IRC. - -**Patches** - -If you want to contribute a patch to libc++, please start by reviewing our -:ref:`documentation about contributing `. - -**Discussion and Questions** - -Send discussions and questions to the `libcxx forum`_. - - Quick Links =========== * `LLVM Homepage `_ * `libc++abi Homepage `_ * `LLVM Bug Tracker `_ * `libcxx-commits Mailing List `_ -* `libcxx Forum `_ +* `libc++ forum `_ +* `libc++ chat `_ * `Browse libc++ Sources `_ From 00baa1af0f73f0e4c12edc12f57e62021ada7ccd Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 22 Aug 2024 07:30:39 -0700 Subject: [PATCH 204/426] [DAG][RISCV] Use vp_reduce_* when widening illegal types for reductions (#105455) This allows the use a single wider operation with a restricted EVL instead of padding the vector with the neutral element. For RISCV specifically, it's worth noting that an alternate padded lowering is available when VL is one less than a power of two, and LMUL <= m1. We could slide the vector operand up by one, and insert the padding via a vslide1up. We don't currently pattern match this, but we could. This form would arguably be better iff the surrounding code wanted VL=4. This patch will force a VL toggle in that case instead. Basically, it comes down to a question of whether we think odd sized vectors are going to appear clustered with odd size vector operations, or mixed in with larger power of two operations. Note there is a potential downside of using vp nodes; we loose any generic DAG combines which might have applied to the widened form. 
--- llvm/include/llvm/IR/VPIntrinsics.def | 29 ++-- .../SelectionDAG/LegalizeVectorTypes.cpp | 43 ++++- .../rvv/fixed-vectors-reduction-formation.ll | 160 ++++++------------ .../RISCV/rvv/fixed-vectors-reduction-fp.ll | 52 +++--- .../RISCV/rvv/fixed-vectors-reduction-int.ll | 34 ++-- .../RISCV/rvv/vreductions-fp-sdnode.ll | 42 ++--- 6 files changed, 158 insertions(+), 202 deletions(-) diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index a4a1000d37259e..9333f6be5b516d 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -651,63 +651,64 @@ END_REGISTER_VP(vp_gather, VP_GATHER) #error \ "The internal helper macro HELPER_REGISTER_REDUCTION_VP is already defined!" #endif -#define HELPER_REGISTER_REDUCTION_VP(VPID, VPSD, INTRIN) \ +#define HELPER_REGISTER_REDUCTION_VP(VPID, VPSD, INTRIN, SDOPC) \ BEGIN_REGISTER_VP(VPID, 2, 3, VPSD, 1) \ VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTRIN) \ + VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) \ VP_PROPERTY_REDUCTION(0, 1) \ END_REGISTER_VP(VPID, VPSD) // llvm.vp.reduce.add(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_VP(vp_reduce_add, VP_REDUCE_ADD, - vector_reduce_add) + vector_reduce_add, VECREDUCE_ADD) // llvm.vp.reduce.mul(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_VP(vp_reduce_mul, VP_REDUCE_MUL, - vector_reduce_mul) + vector_reduce_mul, VECREDUCE_MUL) // llvm.vp.reduce.and(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_VP(vp_reduce_and, VP_REDUCE_AND, - vector_reduce_and) + vector_reduce_and, VECREDUCE_AND) // llvm.vp.reduce.or(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_VP(vp_reduce_or, VP_REDUCE_OR, - vector_reduce_or) + vector_reduce_or, VECREDUCE_OR) // llvm.vp.reduce.xor(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_VP(vp_reduce_xor, VP_REDUCE_XOR, - vector_reduce_xor) + vector_reduce_xor, VECREDUCE_XOR) // llvm.vp.reduce.smax(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_VP(vp_reduce_smax, VP_REDUCE_SMAX, - vector_reduce_smax) + vector_reduce_smax, 
VECREDUCE_SMAX) // llvm.vp.reduce.smin(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_VP(vp_reduce_smin, VP_REDUCE_SMIN, - vector_reduce_smin) + vector_reduce_smin, VECREDUCE_SMIN) // llvm.vp.reduce.umax(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_VP(vp_reduce_umax, VP_REDUCE_UMAX, - vector_reduce_umax) + vector_reduce_umax, VECREDUCE_UMAX) // llvm.vp.reduce.umin(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_VP(vp_reduce_umin, VP_REDUCE_UMIN, - vector_reduce_umin) + vector_reduce_umin, VECREDUCE_UMIN) // llvm.vp.reduce.fmax(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmax, VP_REDUCE_FMAX, - vector_reduce_fmax) + vector_reduce_fmax, VECREDUCE_FMAX) // llvm.vp.reduce.fmin(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmin, VP_REDUCE_FMIN, - vector_reduce_fmin) + vector_reduce_fmin, VECREDUCE_FMIN) // llvm.vp.reduce.fmaximum(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmaximum, VP_REDUCE_FMAXIMUM, - vector_reduce_fmaximum) + vector_reduce_fmaximum, VECREDUCE_FMAXIMUM) // llvm.vp.reduce.fminimum(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_VP(vp_reduce_fminimum, VP_REDUCE_FMINIMUM, - vector_reduce_fminimum) + vector_reduce_fminimum, VECREDUCE_FMINIMUM) #undef HELPER_REGISTER_REDUCTION_VP diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 8315efcb6750f9..5745c147e3502d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -7271,9 +7271,29 @@ SDValue DAGTypeLegalizer::WidenVecOp_STRICT_FSETCC(SDNode *N) { return DAG.getBuildVector(VT, dl, Scalars); } +static unsigned getExtendForIntVecReduction(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Expected integer vector reduction"); + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + return ISD::ANY_EXTEND; + case ISD::VECREDUCE_SMAX: + 
case ISD::VECREDUCE_SMIN: + return ISD::SIGN_EXTEND; + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + return ISD::ZERO_EXTEND; + } +} + SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) { SDLoc dl(N); SDValue Op = GetWidenedVector(N->getOperand(0)); + EVT VT = N->getValueType(0); EVT OrigVT = N->getOperand(0).getValueType(); EVT WideVT = Op.getValueType(); EVT ElemVT = OrigVT.getVectorElementType(); @@ -7288,6 +7308,25 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) { unsigned OrigElts = OrigVT.getVectorMinNumElements(); unsigned WideElts = WideVT.getVectorMinNumElements(); + // Generate a vp.reduce_op if it is custom/legal for the target. This avoids + // needing to pad the source vector, because the inactive lanes can simply be + // disabled and not contribute to the result. + // TODO: VECREDUCE_FADD, VECREDUCE_FMUL aren't currently mapped correctly, + // and thus don't take this path. + if (auto VPOpcode = ISD::getVPForBaseOpcode(Opc); + VPOpcode && TLI.isOperationLegalOrCustom(*VPOpcode, WideVT)) { + SDValue Start = NeutralElem; + if (VT.isInteger()) + Start = DAG.getNode(getExtendForIntVecReduction(Opc), dl, VT, Start); + assert(Start.getValueType() == VT); + EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WideVT.getVectorElementCount()); + SDValue Mask = DAG.getAllOnesConstant(dl, WideMaskVT); + SDValue EVL = DAG.getElementCount(dl, TLI.getVPExplicitVectorLengthTy(), + OrigVT.getVectorElementCount()); + return DAG.getNode(*VPOpcode, dl, VT, {Start, Op, Mask, EVL}, Flags); + } + if (WideVT.isScalableVector()) { unsigned GCD = std::gcd(OrigElts, WideElts); EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, @@ -7296,14 +7335,14 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) { for (unsigned Idx = OrigElts; Idx < WideElts; Idx = Idx + GCD) Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Op, SplatNeutral, DAG.getVectorIdxConstant(Idx, dl)); - return DAG.getNode(Opc, dl, N->getValueType(0), Op, 
Flags); + return DAG.getNode(Opc, dl, VT, Op, Flags); } for (unsigned Idx = OrigElts; Idx < WideElts; Idx++) Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, Op, NeutralElem, DAG.getVectorIdxConstant(Idx, dl)); - return DAG.getNode(Opc, dl, N->getValueType(0), Op, Flags); + return DAG.getNode(Opc, dl, VT, Op, Flags); } SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE_SEQ(SDNode *N) { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll index c0bd49cc9c5cbf..fa56412e71c678 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll @@ -124,7 +124,7 @@ define i32 @reduce_sum_16xi32_prefix3(ptr %p) { ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vslideup.vi v8, v9, 3 +; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -160,16 +160,10 @@ define i32 @reduce_sum_16xi32_prefix4(ptr %p) { define i32 @reduce_sum_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, -32 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vmerge.vim v10, v10, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vsext.vf4 v12, v10 -; CHECK-NEXT: vand.vv v8, v8, v12 ; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -189,16 +183,10 @@ define i32 @reduce_sum_16xi32_prefix5(ptr %p) { define i32 @reduce_sum_16xi32_prefix6(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix6: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 192 -; 
CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vmerge.vim v10, v10, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vsext.vf4 v12, v10 -; CHECK-NEXT: vand.vv v8, v8, v12 ; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -223,7 +211,7 @@ define i32 @reduce_sum_16xi32_prefix7(ptr %p) { ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vslideup.vi v8, v10, 7 +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -277,15 +265,8 @@ define i32 @reduce_sum_16xi32_prefix9(ptr %p) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: li a0, -512 -; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v12, -1 -; CHECK-NEXT: vmerge.vim v12, v12, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vsext.vf4 v16, v12 -; CHECK-NEXT: vand.vv v8, v8, v16 ; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vsetivli zero, 9, e32, m4, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -315,15 +296,8 @@ define i32 @reduce_sum_16xi32_prefix13(ptr %p) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, 14 -; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v12, -1 -; CHECK-NEXT: vmerge.vim v12, v12, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vsext.vf4 v16, v12 -; CHECK-NEXT: vand.vv v8, v8, v16 ; CHECK-NEXT: vmv.s.x v12, zero +; 
CHECK-NEXT: vsetivli zero, 13, e32, m4, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -362,15 +336,8 @@ define i32 @reduce_sum_16xi32_prefix14(ptr %p) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, 12 -; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v12, -1 -; CHECK-NEXT: vmerge.vim v12, v12, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vsext.vf4 v16, v12 -; CHECK-NEXT: vand.vv v8, v8, v16 ; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vsetivli zero, 14, e32, m4, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -411,7 +378,7 @@ define i32 @reduce_sum_16xi32_prefix15(ptr %p) { ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vslideup.vi v8, v12, 15 +; CHECK-NEXT: vsetivli zero, 15, e32, m4, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -532,16 +499,10 @@ define i32 @reduce_xor_16xi32_prefix2(ptr %p) { define i32 @reduce_xor_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_xor_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, -32 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vmerge.vim v10, v10, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vsext.vf4 v12, v10 -; CHECK-NEXT: vand.vv v8, v8, v12 ; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vredxor.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -578,15 +539,10 @@ define i32 @reduce_and_16xi32_prefix5(ptr %p) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v 
v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 5, e32, m1, ta, ma ; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 5 -; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 6 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 7 -; CHECK-NEXT: vredand.vs v8, v8, v8 +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; CHECK-NEXT: vredand.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -620,16 +576,11 @@ define i32 @reduce_or_16xi32_prefix2(ptr %p) { define i32 @reduce_or_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_or_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, -32 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vmerge.vim v10, v10, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vsext.vf4 v12, v10 -; CHECK-NEXT: vand.vv v8, v8, v12 -; CHECK-NEXT: vredor.vs v8, v8, v8 +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; CHECK-NEXT: vredor.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -672,13 +623,8 @@ define i32 @reduce_smax_16xi32_prefix5(ptr %p) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 524288 ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 5 -; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 6 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 7 -; CHECK-NEXT: vredmax.vs v8, v8, v8 +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; CHECK-NEXT: vredmax.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v 
= load <16 x i32>, ptr %p, align 256 @@ -712,18 +658,13 @@ define i32 @reduce_smin_16xi32_prefix2(ptr %p) { define i32 @reduce_smin_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_smin_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, 524288 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vmv.s.x v10, a1 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 5 -; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 6 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 7 -; CHECK-NEXT: vredmin.vs v8, v8, v8 +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; CHECK-NEXT: vredmin.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -757,16 +698,11 @@ define i32 @reduce_umax_16xi32_prefix2(ptr %p) { define i32 @reduce_umax_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_umax_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, -32 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vmerge.vim v10, v10, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vsext.vf4 v12, v10 -; CHECK-NEXT: vand.vv v8, v8, v12 -; CHECK-NEXT: vredmaxu.vs v8, v8, v8 +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; CHECK-NEXT: vredmaxu.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -798,21 +734,27 @@ define i32 @reduce_umin_16xi32_prefix2(ptr %p) { } define i32 @reduce_umin_16xi32_prefix5(ptr %p) { -; CHECK-LABEL: reduce_umin_16xi32_prefix5: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; 
CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 5 -; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 6 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 7 -; CHECK-NEXT: vredminu.vs v8, v8, v8 -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: ret +; RV32-LABEL: reduce_umin_16xi32_prefix5: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 5, e32, m1, ta, ma +; RV32-NEXT: vmv.v.i v10, -1 +; RV32-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; RV32-NEXT: vredminu.vs v8, v8, v10 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: reduce_umin_16xi32_prefix5: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: li a0, -1 +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; RV64-NEXT: vredminu.vs v8, v8, v10 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 %e0 = extractelement <16 x i32> %v, i32 0 %e1 = extractelement <16 x i32> %v, i32 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index e9e147861df564..26dc11aef2805b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -1318,10 +1318,7 @@ define float @vreduce_fmin_v7f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vmv.v.v v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v10, 7 -; CHECK-NEXT: vfredmin.vs v8, v12, v8 +; CHECK-NEXT: vfredmin.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <7 x float>, ptr %x @@ 
-1568,10 +1565,7 @@ define float @vreduce_fmax_v7f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 1047552 ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vmv.v.v v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v10, 7 -; CHECK-NEXT: vfredmax.vs v8, v12, v8 +; CHECK-NEXT: vfredmax.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <7 x float>, ptr %x @@ -1771,20 +1765,20 @@ define float @vreduce_fminimum_v7f32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, 522240 -; CHECK-NEXT: vmv.s.x v12, a0 -; CHECK-NEXT: vmv.v.v v10, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v10, v12, 7 -; CHECK-NEXT: vmfne.vv v9, v10, v10 -; CHECK-NEXT: vcpop.m a0, v9 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vmfne.vv v10, v8, v8 +; CHECK-NEXT: vcpop.m a0, v10, v0.t ; CHECK-NEXT: beqz a0, .LBB108_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB108_2: -; CHECK-NEXT: vfredmin.vs v8, v10, v8 +; CHECK-NEXT: lui a0, 522240 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vfredmin.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <7 x float>, ptr %x @@ -1799,10 +1793,7 @@ define float @vreduce_fminimum_v7f32_nonans(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 522240 ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vmv.v.v v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v10, 7 -; CHECK-NEXT: vfredmin.vs v8, v12, v8 +; CHECK-NEXT: vfredmin.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <7 x float>, ptr %x @@ -2527,20 +2518,20 @@ define float @vreduce_fmaximum_v7f32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; 
CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, 1046528 -; CHECK-NEXT: vmv.s.x v12, a0 -; CHECK-NEXT: vmv.v.v v10, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v10, v12, 7 -; CHECK-NEXT: vmfne.vv v9, v10, v10 -; CHECK-NEXT: vcpop.m a0, v9 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vmfne.vv v10, v8, v8 +; CHECK-NEXT: vcpop.m a0, v10, v0.t ; CHECK-NEXT: beqz a0, .LBB136_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB136_2: -; CHECK-NEXT: vfredmax.vs v8, v10, v8 +; CHECK-NEXT: lui a0, 1046528 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vfredmax.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <7 x float>, ptr %x @@ -2555,10 +2546,7 @@ define float @vreduce_fmaximum_v7f32_nonans(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 1046528 ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vmv.v.v v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v10, 7 -; CHECK-NEXT: vfredmax.vs v8, v12, v8 +; CHECK-NEXT: vfredmax.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <7 x float>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll index 29d80979808a9c..56944e2aa5074d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll @@ -40,8 +40,6 @@ define i8 @vreduce_add_v3i8(ptr %x) { ; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 3 ; CHECK-NEXT: vredsum.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -1768,10 +1766,9 @@ define i8 @vreduce_and_v3i8(ptr %x) { ; 
CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: vredand.vs v8, v8, v8 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: vredand.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <3 x i8>, ptr %x @@ -2373,9 +2370,7 @@ define i8 @vreduce_or_v3i8(ptr %x) { ; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: vredor.vs v8, v8, v8 +; CHECK-NEXT: vredor.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <3 x i8>, ptr %x @@ -2977,8 +2972,6 @@ define i8 @vreduce_xor_v3i8(ptr %x) { ; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 3 ; CHECK-NEXT: vredxor.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -3613,9 +3606,7 @@ define i8 @vreduce_smin_v3i8(ptr %x) { ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, 127 ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: vredmin.vs v8, v8, v8 +; CHECK-NEXT: vredmin.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <3 x i8>, ptr %x @@ -4217,9 +4208,7 @@ define i8 @vreduce_smax_v3i8(ptr %x) { ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, -128 ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: vredmax.vs v8, v8, v8 +; CHECK-NEXT: vredmax.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <3 x i8>, ptr %x @@ -4819,10 +4808,9 @@ define i8 @vreduce_umin_v3i8(ptr %x) { ; CHECK: # %bb.0: ; 
CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: vredminu.vs v8, v8, v8 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: vredminu.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <3 x i8>, ptr %x @@ -5423,9 +5411,7 @@ define i8 @vreduce_umax_v3i8(ptr %x) { ; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: vredmaxu.vs v8, v8, v8 +; CHECK-NEXT: vredmaxu.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <3 x i8>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll index 30e31cecbf2c7b..5b140299070b94 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll @@ -1018,22 +1018,17 @@ declare half @llvm.vector.reduce.fmin.nxv10f16() define half @vreduce_fmin_nxv10f16( %v) { ; CHECK-LABEL: vreduce_fmin_nxv10f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI73_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI73_0)(a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vfmv.v.f v12, fa5 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v10, v12, a0 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma -; CHECK-NEXT: vmv.v.v v11, v12 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v11, v12, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfredmin.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: lui a1, %hi(.LCPI73_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI73_0) +; CHECK-NEXT: vsetivli zero, 
1, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: li a1, 10 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfredmin.vs v12, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fmin.nxv10f16( %v) ret half %red @@ -1044,12 +1039,17 @@ declare half @llvm.vector.reduce.fmax.nxv12f16() define half @vreduce_fmax_nxv12f16( %v) { ; CHECK-LABEL: vreduce_fmax_nxv12f16: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, -512 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.x v11, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfredmax.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: slli a1, a0, 2 +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: li a1, -512 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; CHECK-NEXT: vmv.s.x v12, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfredmax.vs v12, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fmax.nxv12f16( %v) ret half %red From 26a8a857dcdc219d57e39b495ff58aef7d746fdc Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 22 Aug 2024 07:34:41 -0700 Subject: [PATCH 205/426] [RISCV] Introduce local peephole to reduce VLs based on demanded VL (#104689) This is a fairly narrow transform (at the moment) to reduce the VLs of instructions feeding a store with a smaller VL. Note that the goal of this transform isn't really to reduce VL - it's to reduce VL *toggles*. To our knowledge, small reductions in VL without also changing LMUL are generally not profitable on existing hardware. For a single use instruction without side effects, fp exceptions, or a result dependency on VL, reducing VL is legal if only a subset of elements are legal. 
We'd already implemented this logic for vmv.v.v, and this patch simply applies it to stores as an alternate root. Longer term, I plan to extend this to other root instructions (i.e. different kind of stores, reduces, etc..), and add a more general recursive walkback through operands. One risk with the dataflow based approach is that we could be reducing VL of an instruction scheduled in a region with the wider VL (i.e. mixed mode computations) forcing an additional VL toggle. An example of this is the @insert_subvector_dag_loop test case, but it doesn't appear to happen widely. I think this is a risk we should accept. --- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 149 ++++++++++++------ .../CodeGen/RISCV/rvv/fixed-vectors-abs.ll | 2 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 64 +------- .../CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll | 4 - .../CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll | 4 - .../fixed-vectors-insert-subvector-shuffle.ll | 7 +- .../RISCV/rvv/fixed-vectors-int-buildvec.ll | 2 +- .../RISCV/rvv/fixed-vectors-int-splat.ll | 3 +- .../CodeGen/RISCV/rvv/fixed-vectors-int.ll | 44 ------ .../RISCV/rvv/fixed-vectors-non-power-of-2.ll | 16 -- .../rvv/fixed-vectors-strided-load-combine.ll | 3 +- .../RISCV/rvv/fixed-vectors-vselect.ll | 24 +-- .../RISCV/rvv/rvv-peephole-vmerge-vops.ll | 3 +- .../RISCV/rvv/vsetvli-insert-crossbb.ll | 10 +- llvm/test/CodeGen/RISCV/rvv/zve32-types.ll | 8 - 15 files changed, 123 insertions(+), 220 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 2abed1ac984e35..9772782ad3d6db 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -61,6 +61,7 @@ class RISCVVectorPeephole : public MachineFunctionPass { } private: + bool tryToReduceVL(MachineInstr &MI) const; bool convertToVLMAX(MachineInstr &MI) const; bool convertToWholeRegister(MachineInstr &MI) const; bool convertToUnmasked(MachineInstr &MI) 
const; @@ -81,6 +82,96 @@ char RISCVVectorPeephole::ID = 0; INITIALIZE_PASS(RISCVVectorPeephole, DEBUG_TYPE, "RISC-V Fold Masks", false, false) +/// Given two VL operands, do we know that LHS <= RHS? +static bool isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) { + if (LHS.isReg() && RHS.isReg() && LHS.getReg().isVirtual() && + LHS.getReg() == RHS.getReg()) + return true; + if (RHS.isImm() && RHS.getImm() == RISCV::VLMaxSentinel) + return true; + if (LHS.isImm() && LHS.getImm() == RISCV::VLMaxSentinel) + return false; + if (!LHS.isImm() || !RHS.isImm()) + return false; + return LHS.getImm() <= RHS.getImm(); +} + +static unsigned getSEWLMULRatio(const MachineInstr &MI) { + RISCVII::VLMUL LMUL = RISCVII::getLMul(MI.getDesc().TSFlags); + unsigned Log2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); + return RISCVVType::getSEWLMULRatio(1 << Log2SEW, LMUL); +} + +// Attempt to reduce the VL of an instruction whose sole use is feeding a +// instruction with a narrower VL. This currently works backwards from the +// user instruction (which might have a smaller VL). +bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const { + // Note that the goal here is a bit multifaceted. + // 1) For store's reducing the VL of the value being stored may help to + // reduce VL toggles. This is somewhat of an artifact of the fact we + // promote arithmetic instructions but VL predicate stores. + // 2) For vmv.v.v reducing VL eagerly on the source instruction allows us + // to share code with the foldVMV_V_V transform below. + // + // Note that to the best of our knowledge, reducing VL is generally not + // a significant win on real hardware unless we can also reduce LMUL which + // this code doesn't try to do. + // + // TODO: We can handle a bunch more instructions here, and probably + // recurse backwards through operands too. 
+ unsigned SrcIdx = 0; + switch (RISCV::getRVVMCOpcode(MI.getOpcode())) { + default: + return false; + case RISCV::VSE8_V: + case RISCV::VSE16_V: + case RISCV::VSE32_V: + case RISCV::VSE64_V: + break; + case RISCV::VMV_V_V: + SrcIdx = 2; + break; + } + + MachineOperand &VL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc())); + if (VL.isImm() && VL.getImm() == RISCV::VLMaxSentinel) + return false; + + Register SrcReg = MI.getOperand(SrcIdx).getReg(); + // Note: one *use*, not one *user*. + if (!MRI->hasOneUse(SrcReg)) + return false; + + MachineInstr *Src = MRI->getVRegDef(SrcReg); + if (!Src || Src->hasUnmodeledSideEffects() || + Src->getParent() != MI.getParent() || Src->getNumDefs() != 1 || + !RISCVII::hasVLOp(Src->getDesc().TSFlags) || + !RISCVII::hasSEWOp(Src->getDesc().TSFlags)) + return false; + + // Src needs to have the same VLMAX as MI + if (getSEWLMULRatio(MI) != getSEWLMULRatio(*Src)) + return false; + + bool ActiveElementsAffectResult = RISCVII::activeElementsAffectResult( + TII->get(RISCV::getRVVMCOpcode(Src->getOpcode())).TSFlags); + if (ActiveElementsAffectResult || Src->mayRaiseFPException()) + return false; + + MachineOperand &SrcVL = Src->getOperand(RISCVII::getVLOpNum(Src->getDesc())); + if (VL.isIdenticalTo(SrcVL) || !isVLKnownLE(VL, SrcVL)) + return false; + + if (VL.isImm()) + SrcVL.ChangeToImmediate(VL.getImm()); + else if (VL.isReg()) + SrcVL.ChangeToRegister(VL.getReg(), false); + + // TODO: For instructions with a passthru, we could clear the passthru + // and tail policy since we've just proven the tail is not demanded. + return true; +} + /// Check if an operand is an immediate or a materialized ADDI $x0, imm. std::optional RISCVVectorPeephole::getConstant(const MachineOperand &VL) const { @@ -325,22 +416,6 @@ bool RISCVVectorPeephole::convertToUnmasked(MachineInstr &MI) const { return true; } -/// Given two VL operands, returns the one known to be the smallest or nullptr -/// if unknown. 
-static const MachineOperand *getKnownMinVL(const MachineOperand *LHS, - const MachineOperand *RHS) { - if (LHS->isReg() && RHS->isReg() && LHS->getReg().isVirtual() && - LHS->getReg() == RHS->getReg()) - return LHS; - if (LHS->isImm() && LHS->getImm() == RISCV::VLMaxSentinel) - return RHS; - if (RHS->isImm() && RHS->getImm() == RISCV::VLMaxSentinel) - return LHS; - if (!LHS->isImm() || !RHS->isImm()) - return nullptr; - return LHS->getImm() <= RHS->getImm() ? LHS : RHS; -} - /// Check if it's safe to move From down to To, checking that no physical /// registers are clobbered. static bool isSafeToMove(const MachineInstr &From, const MachineInstr &To) { @@ -362,21 +437,16 @@ static bool isSafeToMove(const MachineInstr &From, const MachineInstr &To) { return From.isSafeToMove(SawStore); } -static unsigned getSEWLMULRatio(const MachineInstr &MI) { - RISCVII::VLMUL LMUL = RISCVII::getLMul(MI.getDesc().TSFlags); - unsigned Log2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); - return RISCVVType::getSEWLMULRatio(1 << Log2SEW, LMUL); -} - /// If a PseudoVMV_V_V is the only user of its input, fold its passthru and VL /// into it. /// /// %x = PseudoVADD_V_V_M1 %passthru, %a, %b, %vl1, sew, policy /// %y = PseudoVMV_V_V_M1 %passthru, %x, %vl2, sew, policy +/// (where %vl1 <= %vl2, see related tryToReduceVL) /// /// -> /// -/// %y = PseudoVADD_V_V_M1 %passthru, %a, %b, min(vl1, vl2), sew, policy +/// %y = PseudoVADD_V_V_M1 %passthru, %a, %b, vl1, sew, policy bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { if (RISCV::getRVVMCOpcode(MI.getOpcode()) != RISCV::VMV_V_V) return false; @@ -404,33 +474,16 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { SrcPassthru.getReg() != Passthru.getReg()) return false; - // Because Src and MI have the same passthru, we can use either AVL as long as - // it's the smaller of the two. - // - // (src pt, ..., vl=5) x x x x x|. . . - // (vmv.v.v pt, src, vl=3) x x x|. . . . . 
- // -> - // (src pt, ..., vl=3) x x x|. . . . . - // - // (src pt, ..., vl=3) x x x|. . . . . - // (vmv.v.v pt, src, vl=6) x x x . . .|. . - // -> - // (src pt, ..., vl=3) x x x|. . . . . + // Src VL will have already been reduced if legal (see tryToReduceVL), + // so we don't need to handle a smaller source VL here. However, the + // user's VL may be larger MachineOperand &SrcVL = Src->getOperand(RISCVII::getVLOpNum(Src->getDesc())); - const MachineOperand *MinVL = getKnownMinVL(&MI.getOperand(3), &SrcVL); - if (!MinVL) - return false; - - bool VLChanged = !MinVL->isIdenticalTo(SrcVL); - bool ActiveElementsAffectResult = RISCVII::activeElementsAffectResult( - TII->get(RISCV::getRVVMCOpcode(Src->getOpcode())).TSFlags); - - if (VLChanged && (ActiveElementsAffectResult || Src->mayRaiseFPException())) + if (!isVLKnownLE(SrcVL, MI.getOperand(3))) return false; // If Src ends up using MI's passthru/VL, move it so it can access it. // TODO: We don't need to do this if they already dominate Src. - if (!SrcVL.isIdenticalTo(*MinVL) || !SrcPassthru.isIdenticalTo(Passthru)) { + if (!SrcPassthru.isIdenticalTo(Passthru)) { if (!isSafeToMove(*Src, MI)) return false; Src->moveBefore(&MI); @@ -445,11 +498,6 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { *Src->getParent()->getParent())); } - if (MinVL->isImm()) - SrcVL.ChangeToImmediate(MinVL->getImm()); - else if (MinVL->isReg()) - SrcVL.ChangeToRegister(MinVL->getReg(), false); - // Use a conservative tu,mu policy, RISCVInsertVSETVLI will relax it if // passthru is undef. 
Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())) @@ -498,6 +546,7 @@ bool RISCVVectorPeephole::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : make_early_inc_range(MBB)) { Changed |= convertToVLMAX(MI); + Changed |= tryToReduceVL(MI); Changed |= convertToUnmasked(MI); Changed |= convertToWholeRegister(MI); Changed |= convertVMergeToVMv(MI); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll index f607add17b4b9d..ac7d3d9109e39c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs.ll @@ -41,8 +41,8 @@ define void @abs_v6i16(ptr %x) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vrsub.vi v9, v8, 0 -; CHECK-NEXT: vmax.vv v8, v8, v9 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vmax.vv v8, v8, v9 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index d25312268ada62..a6e224d475a312 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -47,9 +47,7 @@ define void @fadd_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfadd.vv v8, v8, v9 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -173,9 +171,7 @@ define void @fsub_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfsub.vv v8, v8, v9 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -299,9 
+295,7 @@ define void @fmul_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfmul.vv v8, v8, v9 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -425,9 +419,7 @@ define void @fdiv_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfdiv.vv v8, v8, v9 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -546,9 +538,7 @@ define void @fneg_v6f16(ptr %x) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfneg.v v8, v8 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -658,9 +648,7 @@ define void @fabs_v6f16(ptr %x) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v8, v8 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -778,9 +766,7 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfsgnj.vv v8, v8, v9 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -911,9 +897,7 @@ define void @copysign_vf_v6f16(ptr %x, half %y) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfsgnj.vf v8, v8, fa0 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, 
(a0) ; ZVFH-NEXT: ret ; @@ -1053,9 +1037,7 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfsgnjn.vv v8, v8, v9 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -1204,8 +1186,8 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFH-NEXT: vle16.v v9, (a0) ; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFH-NEXT: vfncvt.f.f.w v10, v8 -; ZVFH-NEXT: vfsgnjn.vv v8, v9, v10 ; ZVFH-NEXT: vsetivli zero, 3, e16, mf2, ta, ma +; ZVFH-NEXT: vfsgnjn.vv v8, v9, v10 ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -1334,9 +1316,7 @@ define void @sqrt_v6f16(ptr %x) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfsqrt.v v8, v8 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -1459,9 +1439,7 @@ define void @fma_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vle16.v v9, (a1) ; ZVFH-NEXT: vle16.v v10, (a2) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfmacc.vv v10, v8, v9 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v10, (a0) ; ZVFH-NEXT: ret ; @@ -1609,9 +1587,7 @@ define void @fmsub_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vle16.v v9, (a1) ; ZVFH-NEXT: vle16.v v10, (a2) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfmsac.vv v10, v8, v9 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v10, (a0) ; ZVFH-NEXT: ret ; @@ -2246,9 +2222,7 @@ define void @fadd_vf_v6f16(ptr %x, half %y) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfadd.vf v8, 
v8, fa0 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -2386,9 +2360,7 @@ define void @fadd_fv_v6f16(ptr %x, half %y) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfadd.vf v8, v8, fa0 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -2526,9 +2498,7 @@ define void @fsub_vf_v6f16(ptr %x, half %y) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfsub.vf v8, v8, fa0 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -2666,9 +2636,7 @@ define void @fsub_fv_v6f16(ptr %x, half %y) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfrsub.vf v8, v8, fa0 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -2806,9 +2774,7 @@ define void @fmul_vf_v6f16(ptr %x, half %y) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfmul.vf v8, v8, fa0 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -2946,9 +2912,7 @@ define void @fmul_fv_v6f16(ptr %x, half %y) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfmul.vf v8, v8, fa0 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -3086,9 +3050,7 @@ define void @fdiv_vf_v6f16(ptr %x, half %y) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vsetivli 
zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfdiv.vf v8, v8, fa0 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -3226,9 +3188,7 @@ define void @fdiv_fv_v6f16(ptr %x, half %y) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfrdiv.vf v8, v8, fa0 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -3371,9 +3331,7 @@ define void @fma_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfmacc.vf v9, fa0, v8 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v9, (a0) ; ZVFH-NEXT: ret ; @@ -3526,9 +3484,7 @@ define void @fma_fv_v6f16(ptr %x, ptr %y, half %z) { ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfmacc.vf v9, fa0, v8 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v9, (a0) ; ZVFH-NEXT: ret ; @@ -3687,9 +3643,7 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfmsac.vf v9, fa0, v8 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v9, (a0) ; ZVFH-NEXT: ret ; @@ -3893,9 +3847,8 @@ define void @trunc_v6f16(ptr %x) { ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t -; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret %a = load 
<6 x half>, ptr %x @@ -4023,9 +3976,8 @@ define void @ceil_v6f16(ptr %x) { ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a1 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t -; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -4210,9 +4162,8 @@ define void @floor_v6f16(ptr %x) { ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a1 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t -; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -4397,9 +4348,8 @@ define void @round_v6f16(ptr %x) { ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a1 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t -; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -4782,9 +4732,7 @@ define void @fmuladd_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vle16.v v9, (a1) ; ZVFH-NEXT: vle16.v v10, (a2) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfmacc.vv v10, v8, v9 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v10, (a0) ; ZVFH-NEXT: ret ; @@ -4942,9 +4890,7 @@ define void @fmsub_fmuladd_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: vle16.v v9, (a1) ; ZVFH-NEXT: vle16.v v10, (a2) -; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfmsac.vv v10, v8, v9 -; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vse16.v v10, (a0) ; ZVFH-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll 
index dbc65620b7f249..bfcc7017178e31 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll @@ -83,9 +83,7 @@ define void @fp2si_v3f32_v3i32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8 -; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a1) ; CHECK-NEXT: ret %a = load <3 x float>, ptr %x @@ -99,9 +97,7 @@ define void @fp2ui_v3f32_v3i32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8 -; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a1) ; CHECK-NEXT: ret %a = load <3 x float>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll index 9c76b83d0974af..7333067e9205e0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll @@ -89,9 +89,7 @@ define void @si2fp_v3i32_v3f32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.f.x.v v8, v8 -; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a1) ; CHECK-NEXT: ret %a = load <3 x i32>, ptr %x @@ -105,9 +103,7 @@ define void @ui2fp_v3i32_v3f32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a1) ; CHECK-NEXT: ret %a = load <3 x i32>, ptr %x diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll index 64e1bf3e6c0324..245b4a8a9c1005 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll @@ -30,9 +30,9 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>) define <4 x i32> @insert_subvector_load_unfoldable_passthru_v4i32_v4i32(<4 x i32> %v1, ptr %p, <4 x i1> %mask, <4 x i32> %passthru) { ; CHECK-LABEL: insert_subvector_load_unfoldable_passthru_v4i32_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v9, (a0), v0.t -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma ; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %v2 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %p, i32 4, <4 x i1> %mask, <4 x i32> %passthru) @@ -197,8 +197,9 @@ define <4 x i32> @insert_subvector_vp_add_v4i32_v8i32(<4 x i32> %v1, <8 x i32> % define <4 x i32> @insert_subvector_dag_loop(ptr %p, ptr %q) { ; CHECK-LABEL: insert_subvector_dag_loop: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; CHECK-NEXT: vmv.v.v v8, v9 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index f65431bf470aae..cbea842e28f0f2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -680,8 +680,8 @@ define void @buildvec_seq_v9i8(ptr %x) { ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 
; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v9, 2, v0 ; CHECK-NEXT: vsetivli zero, 9, e8, m1, ta, ma +; CHECK-NEXT: vmerge.vim v8, v9, 2, v0 ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret store <9 x i8> , ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll index 649aa067b01aff..336a64b1b89ca8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll @@ -279,9 +279,8 @@ define void @splat_zero_v2i32(ptr %p) { define void @splat_zero_v7i16(ptr %p) { ; CHECK-LABEL: splat_zero_v7i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetivli zero, 7, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret store <7 x i16> zeroinitializer, ptr %p diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index f411ddf41d903f..70bda8c2da0f27 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -40,9 +40,7 @@ define void @add_v6i16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -122,9 +120,7 @@ define void @sub_v6i16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -204,9 +200,7 @@ define void @mul_v6i16(ptr %x, ptr %y) { ; 
CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmul.vv v8, v8, v9 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -286,9 +280,7 @@ define void @and_v6i16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -368,9 +360,7 @@ define void @or_v6i16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -450,9 +440,7 @@ define void @xor_v6i16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vxor.vv v8, v8, v9 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -532,9 +520,7 @@ define void @lshr_v6i16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vsrl.vv v8, v8, v9 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -614,9 +600,7 @@ define void @ashr_v6i16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, 
(a1) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vsra.vv v8, v8, v9 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -696,9 +680,7 @@ define void @shl_v6i16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vsll.vv v8, v8, v9 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -1463,9 +1445,7 @@ define void @smin_v6i16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmin.vv v8, v8, v9 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -1549,9 +1529,7 @@ define void @smin_vx_v6i16(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmin.vx v8, v8, a1 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -1617,9 +1595,7 @@ define void @smin_xv_v6i16(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmin.vx v8, v8, a1 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -1686,9 +1662,7 @@ define void @smax_v6i16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmax.vv v8, v8, v9 -; CHECK-NEXT: vsetivli 
zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -1772,9 +1746,7 @@ define void @smax_vx_v6i16(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmax.vx v8, v8, a1 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -1840,9 +1812,7 @@ define void @smax_xv_v6i16(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmax.vx v8, v8, a1 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -1909,9 +1879,7 @@ define void @umin_v6i16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vminu.vv v8, v8, v9 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -1995,9 +1963,7 @@ define void @umin_vx_v6i16(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vminu.vx v8, v8, a1 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -2063,9 +2029,7 @@ define void @umin_xv_v6i16(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vminu.vx v8, v8, a1 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -2132,9 +2096,7 @@ 
define void @umax_v6i16(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmaxu.vv v8, v8, v9 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -2218,9 +2180,7 @@ define void @umax_vx_v6i16(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmaxu.vx v8, v8, a1 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -2286,9 +2246,7 @@ define void @umax_xv_v6i16(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmaxu.vx v8, v8, a1 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -2370,9 +2328,7 @@ define void @add_v6i32(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i32>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-non-power-of-2.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-non-power-of-2.ll index 4aa60897f5064f..0063a0d0f61241 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-non-power-of-2.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-non-power-of-2.ll @@ -7,9 +7,7 @@ define void @vls3i8(ptr align 8 %array) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, 
e8, mf4, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret entry: @@ -25,9 +23,7 @@ define void @vls3(ptr align 8 %array) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret entry: @@ -43,9 +39,7 @@ define void @vls5(ptr align 8 %array) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret entry: @@ -61,9 +55,7 @@ define void @vls6(ptr align 8 %array) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret entry: @@ -79,9 +71,7 @@ define void @vls7(ptr align 8 %array) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret entry: @@ -98,9 +88,7 @@ define void @vls9(ptr align 8 %array) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 9, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsetivli zero, 9, e32, m4, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret entry: @@ -117,9 +105,7 @@ define void @vls10(ptr align 8 %array) { ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetivli zero, 10, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsetivli zero, 10, e32, m4, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret entry: @@ -135,9 +121,7 @@ define void @vls11(ptr align 8 %array) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 11, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsetivli zero, 11, e32, m4, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll index cdf0d35843620d..b49e323478e8ca 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll @@ -32,9 +32,8 @@ define void @widen_3xv4i16(ptr %x, ptr %z) { ; CHECK-NEXT: vle16.v v10, (a0) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vi v8, v9, 4 -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 8 ; CHECK-NEXT: vsetivli zero, 12, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 8 ; CHECK-NEXT: vse16.v v8, (a1) ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll index ed2ed2a2ebfaa0..2194651a95e54f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll @@ -96,9 +96,8 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 ; RV32-NEXT: vmsne.vi v0, v10, 0 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma 
+; RV32-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV32-NEXT: vse32.v v8, (a3) ; RV32-NEXT: ret ; @@ -127,9 +126,8 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 ; RV64-NEXT: vmsne.vi v0, v10, 0 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64-NEXT: vse32.v v8, (a3) ; RV64-NEXT: ret %vb = load <6 x i32>, ptr %b @@ -167,9 +165,8 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 ; RV32-NEXT: vmsne.vi v0, v10, 0 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32-NEXT: vmerge.vim v8, v8, -1, v0 ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: vmerge.vim v8, v8, -1, v0 ; RV32-NEXT: vse32.v v8, (a2) ; RV32-NEXT: ret ; @@ -198,9 +195,8 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 ; RV64-NEXT: vmsne.vi v0, v10, 0 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64-NEXT: vmerge.vim v8, v8, -1, v0 ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: vmerge.vim v8, v8, -1, v0 ; RV64-NEXT: vse32.v v8, (a2) ; RV64-NEXT: ret %vb = load <6 x i32>, ptr %b @@ -305,9 +301,8 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 ; RV32-NEXT: vmsne.vi v0, v10, 0 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32-NEXT: vfmerge.vfm v8, v8, fa0, v0 ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: vfmerge.vfm v8, v8, fa0, v0 ; RV32-NEXT: vse32.v v8, (a2) ; RV32-NEXT: ret ; @@ -336,9 +331,8 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 ; RV64-NEXT: vmsne.vi v0, v10, 0 -; 
RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64-NEXT: vfmerge.vfm v8, v8, fa0, v0 ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: vfmerge.vfm v8, v8, fa0, v0 ; RV64-NEXT: vse32.v v8, (a2) ; RV64-NEXT: ret %vb = load <6 x float>, ptr %b @@ -376,9 +370,8 @@ define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) { ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 ; RV32-NEXT: vmsne.vi v0, v10, 0 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32-NEXT: vmerge.vim v8, v8, 0, v0 ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: vmerge.vim v8, v8, 0, v0 ; RV32-NEXT: vse32.v v8, (a2) ; RV32-NEXT: ret ; @@ -407,9 +400,8 @@ define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) { ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 ; RV64-NEXT: vmsne.vi v0, v10, 0 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64-NEXT: vmerge.vim v8, v8, 0, v0 ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: vmerge.vim v8, v8, 0, v0 ; RV64-NEXT: vse32.v v8, (a2) ; RV64-NEXT: ret %vb = load <6 x float>, ptr %b diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll index 259515f160048d..39055dc5adfcf7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll @@ -902,9 +902,8 @@ define void @test_dag_loop() { ; CHECK-NEXT: vmseq.vv v0, v12, v8 ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vsetivli zero, 1, e16, m8, tu, mu +; CHECK-NEXT: vsetivli zero, 0, e16, m8, tu, mu ; CHECK-NEXT: vle16.v v8, (zero), v0.t -; CHECK-NEXT: vsetivli zero, 0, e16, m8, ta, ma ; CHECK-NEXT: vse16.v v8, (zero) ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll index 44b152126942cb..027c81180d5f19 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll @@ -719,11 +719,9 @@ define void @vector_init_vsetvli_N(i64 %N, ptr %c) { ; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: vsetvli a3, a0, e64, m1, ta, ma ; CHECK-NEXT: slli a4, a3, 3 -; CHECK-NEXT: vsetvli a5, zero, e64, m1, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: .LBB14_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vse64.v v8, (a1) ; CHECK-NEXT: add a2, a2, a3 ; CHECK-NEXT: add a1, a1, a4 @@ -755,11 +753,9 @@ define void @vector_init_vsetvli_fv(i64 %N, ptr %c) { ; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: vsetivli a3, 4, e64, m1, ta, ma ; CHECK-NEXT: slli a4, a3, 3 -; CHECK-NEXT: vsetvli a5, zero, e64, m1, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: .LBB15_1: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vsetivli zero, 4, e64, m1, ta, ma ; CHECK-NEXT: vse64.v v8, (a1) ; CHECK-NEXT: add a2, a2, a3 ; CHECK-NEXT: add a1, a1, a4 @@ -789,11 +785,10 @@ define void @vector_init_vsetvli_fv2(i64 %N, ptr %c) { ; CHECK-LABEL: vector_init_vsetvli_fv2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: vsetvli a3, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m1, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: .LBB16_1: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vsetivli zero, 4, e64, m1, ta, ma ; CHECK-NEXT: vse64.v v8, (a1) ; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: addi a1, a1, 32 @@ -823,11 +818,10 @@ define void @vector_init_vsetvli_fv3(i64 %N, ptr %c) { ; CHECK-LABEL: vector_init_vsetvli_fv3: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: vsetvli a3, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m1, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: .LBB17_1: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vsetivli zero, 4, e64, 
m1, ta, ma ; CHECK-NEXT: vse64.v v8, (a1) ; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: addi a1, a1, 32 diff --git a/llvm/test/CodeGen/RISCV/rvv/zve32-types.ll b/llvm/test/CodeGen/RISCV/rvv/zve32-types.ll index 6fcd4dc2378532..e4dfdb2c72c026 100644 --- a/llvm/test/CodeGen/RISCV/rvv/zve32-types.ll +++ b/llvm/test/CodeGen/RISCV/rvv/zve32-types.ll @@ -18,9 +18,7 @@ define void @vadd_vv_nxv1i8(ptr %pa, ptr %pb) { ; CHECK-NEXT: vsetvli zero, a2, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vsetvli zero, a2, e8, mf4, ta, ma ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret %va = load , ptr %pa @@ -38,9 +36,7 @@ define void @vadd_vv_nxv1i16(ptr %pa, ptr %pb) { ; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %va = load , ptr %pa @@ -58,9 +54,7 @@ define void @vadd_vv_nxv1i32(ptr %pa, ptr %pb) { ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vle32.v v9, (a1) -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %va = load , ptr %pa @@ -78,9 +72,7 @@ define void @vfadd_vv_nxv1f32(ptr %pa, ptr %pb) { ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vle32.v v9, (a1) -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vfadd.vv v8, v8, v9 -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %va = load , ptr %pa From 29cb1e6b4fccb99d32eaa4b81af481d94be79242 Mon Sep 17 00:00:00 2001 From: Lukacma Date: Thu, 22 Aug 2024 15:51:51 +0100 Subject: [PATCH 
206/426] [AArch64] optimise SVE cmp intrinsics with no active lanes (#104779) This patch extends https://github.com/llvm/llvm-project/pull/73964 and optimises SVE cmp intrinsics to zero vector when predicate is zero. --- .../AArch64/AArch64TargetTransformInfo.cpp | 25 ++ .../sve-intrinsic-comb-no-active-lanes-cmp.ll | 245 ++++++++++++++++++ 2 files changed, 270 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cmp.ll diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index f31e1fa9ab3045..8c64822c474b61 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1160,6 +1160,10 @@ static std::optional instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II) { LLVMContext &Ctx = II.getContext(); + // Replace by zero constant when all lanes are inactive + if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II)) + return II_NA; + // Check that the predicate is all active auto *Pg = dyn_cast(II.getArgOperand(0)); if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) @@ -2131,6 +2135,27 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, case Intrinsic::aarch64_sve_st4: case Intrinsic::aarch64_sve_st4q: return instCombineSVENoActiveUnaryErase(IC, II, 4); + case Intrinsic::aarch64_sve_cmpeq: + case Intrinsic::aarch64_sve_cmpeq_wide: + case Intrinsic::aarch64_sve_cmpge: + case Intrinsic::aarch64_sve_cmpge_wide: + case Intrinsic::aarch64_sve_cmpgt: + case Intrinsic::aarch64_sve_cmpgt_wide: + case Intrinsic::aarch64_sve_cmphi: + case Intrinsic::aarch64_sve_cmphi_wide: + case Intrinsic::aarch64_sve_cmphs: + case Intrinsic::aarch64_sve_cmphs_wide: + case Intrinsic::aarch64_sve_cmple_wide: + case Intrinsic::aarch64_sve_cmplo_wide: + case Intrinsic::aarch64_sve_cmpls_wide: + case Intrinsic::aarch64_sve_cmplt_wide: + case Intrinsic::aarch64_sve_facge: + case 
Intrinsic::aarch64_sve_facgt: + case Intrinsic::aarch64_sve_fcmpeq: + case Intrinsic::aarch64_sve_fcmpge: + case Intrinsic::aarch64_sve_fcmpgt: + case Intrinsic::aarch64_sve_fcmpne: + case Intrinsic::aarch64_sve_fcmpuo: case Intrinsic::aarch64_sve_ld1_gather: case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: case Intrinsic::aarch64_sve_ld1_gather_sxtw: diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cmp.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cmp.ll new file mode 100644 index 00000000000000..1833bb6db248d9 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-cmp.ll @@ -0,0 +1,245 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s +target triple = "aarch64-unknown-linux-gnu" + +define @test_cmpeq( %a, %b){ +; CHECK-LABEL: define @test_cmpeq( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmpeq.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_cmpeq_wide( %a, %b){ +; CHECK-LABEL: define @test_cmpeq_wide( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmpeq.wide.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_cmpge( %a, %b){ +; CHECK-LABEL: define @test_cmpge( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmpge.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_cmpge_wide( %a, %b){ +; CHECK-LABEL: define @test_cmpge_wide( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmpge.wide.nxv16i8( 
zeroinitializer, %a, %b) + ret %0 +} + +define @test_cmpgt( %a, %b){ +; CHECK-LABEL: define @test_cmpgt( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmpgt.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_cmpgt_wide( %a, %b){ +; CHECK-LABEL: define @test_cmpgt_wide( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmpgt.wide.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_cmphi( %a, %b){ +; CHECK-LABEL: define @test_cmphi( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmphi.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_cmphi_wide( %a, %b){ +; CHECK-LABEL: define @test_cmphi_wide( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmphi.wide.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_cmphs( %a, %b){ +; CHECK-LABEL: define @test_cmphs( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmphs.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_cmphs_wide( %a, %b){ +; CHECK-LABEL: define @test_cmphs_wide( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmphs.wide.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_cmple_wide( %a, %b){ +; CHECK-LABEL: define @test_cmple_wide( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmple.wide.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define 
@test_cmplo_wide( %a, %b){ +; CHECK-LABEL: define @test_cmplo_wide( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmplo.wide.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_cmpls_wide( %a, %b){ +; CHECK-LABEL: define @test_cmpls_wide( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmpls.wide.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_cmplt_wide( %a, %b){ +; CHECK-LABEL: define @test_cmplt_wide( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmplt.wide.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_cmpne( %a, %b){ +; CHECK-LABEL: define @test_cmpne( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmpne.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_cmpne_wide( %a, %b){ +; CHECK-LABEL: define @test_cmpne_wide( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.cmpne.wide.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_facge( %a, %b){ry: + %0 = tail call @llvm.aarch64.sve.facge.nxv8f16( zeroinitializer, %a, %b) + ret %0 +} + +define @test_facgt( %a, %b){ry: + %0 = tail call @llvm.aarch64.sve.facgt.nxv8f16( zeroinitializer, %a, %b) + ret %0 +} + +define @test_fcmpeq( %a, %b){ +; CHECK-LABEL: define @test_fcmpeq( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.fcmpeq.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_fcmpge( %a, %b){ +; CHECK-LABEL: define @test_fcmpge( +; CHECK-SAME: 
[[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.fcmpge.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_fcmpgt( %a, %b){ +; CHECK-LABEL: define @test_fcmpgt( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.fcmpgt.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_fcmpne( %a, %b){ +; CHECK-LABEL: define @test_fcmpne( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.fcmpne.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + +define @test_fcmpuo( %a, %b){ +; CHECK-LABEL: define @test_fcmpuo( +; CHECK-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret zeroinitializer +; +entry: + %0 = tail call @llvm.aarch64.sve.fcmpuo.nxv16i8( zeroinitializer, %a, %b) + ret %0 +} + From 58ac764b013606a67043cde6a287db3648d87582 Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Thu, 22 Aug 2024 17:56:42 +0300 Subject: [PATCH 207/426] [libc++] Post-LLVM19-release docs cleanup (#99667) This patch removes obsolete status pages for projects that were completed: LLVM 18 release, C++20 Ranges and Spaceship support. 
Co-authored-by: Hristo Hristov --- libcxx/docs/ReleaseNotes.rst | 1 - libcxx/docs/ReleaseNotes/18.rst | 345 --------------------- libcxx/docs/ReleaseNotes/19.rst | 3 +- libcxx/docs/Status/Ranges.rst | 48 --- libcxx/docs/Status/RangesAlgorithms.csv | 18 -- libcxx/docs/Status/RangesMajorFeatures.csv | 5 - libcxx/docs/Status/RangesViews.csv | 38 --- libcxx/docs/Status/Spaceship.rst | 53 ---- libcxx/docs/Status/SpaceshipPapers.csv | 12 - libcxx/docs/Status/SpaceshipProjects.csv | 202 ------------ libcxx/docs/index.rst | 2 - 11 files changed, 1 insertion(+), 726 deletions(-) delete mode 100644 libcxx/docs/ReleaseNotes/18.rst delete mode 100644 libcxx/docs/Status/Ranges.rst delete mode 100644 libcxx/docs/Status/RangesAlgorithms.csv delete mode 100644 libcxx/docs/Status/RangesMajorFeatures.csv delete mode 100644 libcxx/docs/Status/RangesViews.csv delete mode 100644 libcxx/docs/Status/Spaceship.rst delete mode 100644 libcxx/docs/Status/SpaceshipPapers.csv delete mode 100644 libcxx/docs/Status/SpaceshipProjects.csv diff --git a/libcxx/docs/ReleaseNotes.rst b/libcxx/docs/ReleaseNotes.rst index 9a77a5c23f30bb..47042c4af1649d 100644 --- a/libcxx/docs/ReleaseNotes.rst +++ b/libcxx/docs/ReleaseNotes.rst @@ -4,7 +4,6 @@ .. toctree:: :hidden: - ReleaseNotes/18 ReleaseNotes/19 .. The release notes are in versioned files, but we make sure to keep publishing diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst deleted file mode 100644 index 4f7b9b362e5e66..00000000000000 --- a/libcxx/docs/ReleaseNotes/18.rst +++ /dev/null @@ -1,345 +0,0 @@ -=========================================== -Libc++ 18.0.0 (In-Progress) Release Notes -=========================================== - -.. contents:: - :local: - :depth: 2 - -Written by the `Libc++ Team `_ - -.. warning:: - - These are in-progress notes for the upcoming libc++ 18.0.0 release. - Release notes for previous releases can be found on - `the Download Page `_. 
- -Introduction -============ - -This document contains the release notes for the libc++ C++ Standard Library, -part of the LLVM Compiler Infrastructure, release 18.0.0. Here we describe the -status of libc++ in some detail, including major improvements from the previous -release and new feature work. For the general LLVM release notes, see `the LLVM -documentation `_. All LLVM releases may -be downloaded from the `LLVM releases web site `_. - -For more information about libc++, please see the `Libc++ Web Site -`_ or the `LLVM Web Site `_. - -Note that if you are reading this file from a Git checkout or the -main Libc++ web page, this document applies to the *next* release, not -the current one. To see the release notes for a specific release, please -see the `releases page `_. - -What's New in Libc++ 18.0.0? -============================== - -The main focus of the libc++ team has been to implement new C++20, C++23, -and C++26 features. - -New hardened modes for the library have been added, replacing the legacy debug mode that was -removed in the LLVM 17 release. Unlike the legacy debug mode, some of these hardening modes are -also intended to be used in production. See :ref:`hardening` for more details. - -Work on the ranges support has progressed. See -:ref:`ranges-status` for the current status. - -Work on the experimental C++23 module support has progressed. The ``std.compat`` -module is available and the feature is retroactively available in C++20. See -:ref:`ModulesInLibcxx` for more information. - -Work on the experimental C++17 Parallel STL has progressed. See -:ref:`pstl-status` for the current status. - -Work on the experimental C++17 SIMD support has progressed. See -:ref:`parallelism-status` for the current status. - - -Implemented Papers ------------------- -- P2093R14 - Formatted output -- P2539R4 - Should the output of ``std::print`` to a terminal be synchronized with the underlying stream? 
-- P2497R0 - Testing for success or failure of ```` functions -- P2697R1 - Interfacing ``bitset`` with ``string_view`` -- P2443R1 - ``views::chunk_by`` -- P2538R1 - ADL-proof ``std::projected`` -- P2614R2 - Deprecate ``numeric_limits::has_denorm`` -- P0053R7 - C++ Synchronized Buffered Ostream (in the experimental library) -- P2467R1 - Support exclusive mode for fstreams -- P0020R6 - Floating Point Atomic -- P2905R2 - Runtime format strings -- P2918R2 - Runtime format strings II -- P2871R3 - Remove Deprecated Unicode Conversion Facets from C++26 -- P2870R3 - Remove ``basic_string::reserve()`` -- P2909R4 - Fix formatting of code units as integers (Dude, where’s my ``char``?) -- P2821R5 - ``span.at()`` -- P0521R0 - Proposed Resolution for CA 14 (``shared_ptr`` ``use_count/unique``) -- P0543R3 - Saturation arithmetic -- P1759R6 - Native handles and file streams -- P2868R3 - Remove Deprecated ``std::allocator`` Typedef From C++26 -- P2517R1 - Add a conditional ``noexcept`` specification to ``std::apply`` -- P2447R6 - ``span`` over initializer list - - -Improvements and New Features ------------------------------ - -- ``std::ranges::count`` and ``std::ranges::find`` are now optimized for - ``std::vector::iterator``, which can lead up to 350x performance - improvements. - -- ``std::for_each`` has been optimized for segmented iterators like ``std::deque::iterator`` in C++23 and - later, which can lead up to 40x performance improvements. - -- The library now provides several hardening modes under which common cases of library undefined behavior will be turned - into a reliable program termination. The ``fast`` hardening mode enables a set of security-critical checks with - minimal runtime overhead; the ``extensive`` hardening mode additionally enables relatively cheap checks that catch - common logic errors but aren't necessarily security-critical; and the ``debug`` hardening mode enables all available - checks, some of which might be very expensive. 
Vendors can configure which hardening mode is enabled by default with - the ``LIBCXX_HARDENING_MODE`` variable at CMake configuration time. Users can control which hardening mode is enabled - on a per translation unit basis using the ``_LIBCPP_HARDENING_MODE`` macro. See :ref:`the hardening documentation - ` for more details. - -- The ``_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT`` macro has been added to make - the declarations in ```` available. - -- The ``_LIBCPP_ENABLE_CXX26_REMOVED_STRING_RESERVE`` macro has been added to make - the function ``std::basic_string<...>::reserve()`` available. - -- The ``_LIBCPP_ENABLE_CXX26_REMOVED_ALLOCATOR_MEMBERS`` macro has been added to make - the function ``allocator::is_always_equal`` available. - -- The ``_LIBCPP_ENABLE_CXX20_REMOVED_SHARED_PTR_UNIQUE`` macro has been added to make - the function ``std::shared_ptr<...>::unique()`` available. - -- The cmake option ``LIBCXX_ENABLE_STD_MODULES`` has been removed. The test - infrastructure no longer depends on a modern CMake, it works with the minimal - required LLVM version (3.20.0). - -- The ``.cppm`` files of experimental standard library modules can now be - installed. By default, they are not installed. This can be enabled by - configuring CMake with ``-DLIBCXX_INSTALL_MODULES=ON``. The installation - directory can be configured with the CMake option - ``-DLIBCXX_INSTALL_MODULE_DIR=``. The default location is - ``${PREFIX}/share/libc++/v1``. - -- AddressSanitizer annotations have been added to ``std::basic_string``. - These annotations are enabled for all allocators by default. - It's only enabled for long strings, strings using the small buffer optimization are not annotated. - -- The libc++ source code has been formatted with ``clang-format``. This - `discourse thread `_ - contains information how to rebase downstream patches. - -Deprecations and Removals -------------------------- - -- Availability macros which will never trigger an error have been removed. 
This includes anything that has been - introduced before macOS 10.13, iOS 12, tvOS 12 and watchOS 4. This shouldn't affect anybody, since AppleClang 15 - doesn't support any older OSes. If you are a vendor and make use of these macros, please inform the libc++ team so we - can re-introduce them and consider upstreaming support for your platform. - -- The non-conforming constructor ``std::future_error(std::error_code)`` has been removed. Please use the - ``std::future_error(std::future_errc)`` constructor provided in C++17 instead. - -- `P1957 `_ has been implemented in Clang and libc++ removed a code path that led to - narrowing conversions in ``std::variant`` behaving in a non-standard way. This may change how some uses of - ``std::variant``'s constructor behave in user code. The ``_LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT`` - macro is provided to restore the previous behavior, and it will be supported in the LLVM 18 release only. - In LLVM 19 and beyond, ``_LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT`` will not be honored anymore. - -- Overriding ``__libcpp_verbose_abort`` no longer has any effect on library assertions. The only supported way - to customize the assertion handler that gets invoked when a hardening assertion fails is now by setting the - ``LIBCXX_ASSERTION_HANDLER_FILE`` CMake variable and providing a custom header. See the documentation on - overriding the default assertion handler for details. The ability to override ``__libcpp_verbose_abort`` - will be removed in an upcoming release in favor of the new overriding mechanism. - -- In safe mode (which is now equivalent to the ``extensive`` hardening mode), a failed assertion will now - generate a trap rather than a call to verbose abort. - -- The ``_LIBCPP_AVAILABILITY_CUSTOM_VERBOSE_ABORT_PROVIDED`` macro is not honored anymore in LLVM 18. - Please see the updated documentation about the hardening modes in libc++ and in particular on - overriding the default assertion handler. 
- -- The headers ````, ````, ````, - ````, ````, ````, ````, - ````, ````, ````, - and ```` have been removed in LLVM 18, as all their contents will have been - implemented in namespace ``std`` for at least two releases. - -- The macro ``_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS`` has been deprecated and will be removed - in LLVM 19. This macro used to re-enable redundant members of ``std::allocator`` like ``pointer``, - ``reference``, ``rebind``, ``address``, ``max_size``, ``construct``, ``destroy``, and the two-argument - overload of ``allocate``. However, this led to the library being non-conforming due to incorrect - constexpr-ness. - -- The macros ``_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES`` and - ``_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES`` have been deprecated and - will be removed in LLVM 19. These macros used to re-enable all features - that were removed in the C++17 and C++20 standards. Instead of using these - macros, please use the macros to re-enable individual features. - -- The macro ``_LIBCPP_INLINE_VISIBILITY`` has been deprecated in LLVM 18 and - will be removed entirely in LLVM 19. The macro ``_LIBCPP_HIDE_FROM_ABI`` is - the drop-in replacement. - -- The macro ``_VSTD`` has been deprecated in LLVM 18 and will be removed - entirely in LLVM 19. The code ``std`` is the drop-in replacement. - - -Upcoming Deprecations and Removals ----------------------------------- - -- The ability to override ``__libcpp_verbose_abort`` will be removed in an upcoming release. - -LLVM 19 -~~~~~~~ - -- The ``LIBCXX_EXECUTOR`` CMake variable has been deprecated. LLVM 19 will - completely remove support for the ``*_EXECUTOR`` variables. - -- The ``LIBCXX_ENABLE_ASSERTIONS`` CMake variable that was used to enable the safe mode will be deprecated and setting - it will trigger an error; use the ``LIBCXX_HARDENING_MODE`` variable with the value ``extensive`` instead. 
Similarly, - the ``_LIBCPP_ENABLE_ASSERTIONS`` macro will be deprecated (setting it to ``1`` still enables the extensive mode in - the LLVM 19 release while also issuing a deprecation warning). See :ref:`the hardening documentation - ` for more details. - -- The base template for ``std::char_traits`` has been marked as deprecated and will be removed in LLVM 19. If you - are using ``std::char_traits`` with types other than ``char``, ``wchar_t``, ``char8_t``, ``char16_t``, ``char32_t`` - or a custom character type for which you specialized ``std::char_traits``, your code will stop working when we - remove the base template. The Standard does not mandate that a base template is provided, and such a base template - is bound to be incorrect for some types, which could currently cause unexpected behavior while going undetected. - Note that the ``_LIBCPP_CHAR_TRAITS_REMOVE_BASE_SPECIALIZATION`` macro can be defined in LLVM 18 to eagerly remove - the specialization and prepare code bases for the unconditional removal in LLVM 19. - -- The ``_LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT`` macro that changed the behavior for narrowing conversions - in ``std::variant`` will be removed in LLVM 19. - -- The ``_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS`` macro has been deprecated in LLVM 18 and will be removed - entirely in LLVM 19. - -- The ``_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES`` and - ``_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES`` macros have been deprecated - in LLVM 18 and will be removed entirely in LLVM 19. - -- The macro ``_LIBCPP_INLINE_VISIBILITY`` has been deprecated in LLVM 18 and - will be removed entirely in LLVM 19. - -- The macro ``_VSTD`` has been deprecated in LLVM 18 and will be removed - entirely in LLVM 19. - -LLVM 20 -~~~~~~~ - -- The ``LIBCXX_ENABLE_ASSERTIONS`` CMake variable and the ``_LIBCPP_ENABLE_ASSERTIONS`` macro that were used to enable - the safe mode will be removed. 
- - -ABI Affecting Changes ---------------------- - -- When the shared/static library is built with ``-fno-exceptions``, the behavior of ``operator new`` was changed - to make it standards-conforming. In LLVM 17 and before, the throwing versions of ``operator new`` would return - ``nullptr`` upon failure to allocate, when the shared/static library was built with exceptions disabled. This - was non-conforming, since the throwing versions of ``operator new`` are never expected to return ``nullptr``, and - this non-conformance could actually lead to miscompiles in subtle cases. - - Starting in LLVM 18, the throwing versions of ``operator new`` will abort the program when they fail to allocate - if the shared/static library has been built with ``-fno-exceptions``. This is consistent with the behavior of all - other potentially-throwing functions in the library, which abort the program instead of throwing when ``-fno-exceptions`` - is used. - - Furthermore, when the shared/static library is built with ``-fno-exceptions``, users who override the throwing - version of ``operator new`` will now need to also override the ``std::nothrow_t`` version of ``operator new`` if - they want to use it. Indeed, this is because there is no way to implement a conforming ``operator new(nothrow)`` - from a conforming potentially-throwing ``operator new`` when compiled with ``-fno-exceptions``. In that case, using - ``operator new(nothrow)`` without overriding it explicitly but after overriding the throwing ``operator new`` will - result in an error. - - Note that this change only impacts vendors/users that build the shared/static library themselves and pass - ``-DLIBCXX_ENABLE_EXCEPTIONS=OFF``, which is not the default configuration. If you are using the default - configuration of the library, the libc++ shared/static library will be built with exceptions enabled, and - there is no change between LLVM 17 and LLVM 18, even for users who build their own code using ``-fno-exceptions``. 
- -- The symbol of a non-visible function part of ``std::system_error`` was removed. - This is not a breaking change as the private function ``__init`` was never referenced internally outside of the dylib. - -- This release of libc++ added missing visibility annotations on some types in the library. Users compiling with - ``-fvisbility=hidden`` may notice that additional type infos from libc++ are being exported from their ABI. This is - the correct behavior in almost all cases since exporting the RTTI is required for these types to work properly with - ``dynamic_cast``, exceptions and other mechanisms across binaries. However, if you intend to use libc++ purely as an - internal implementation detail (i.e. you use libc++ as a static archive and never export libc++ symbols from your ABI) - and you notice changes to your exported symbols list, then this means that you were not properly preventing libc++ - symbols from being part of your ABI. - -- The name mangling for instantiations of ``std::projected`` has changed in order to implement P2538R1. This technically - results in an ABI break, however in practice we expect uses of ``std::projected`` in ABI-sensitive places to be - extremely rare. Any error resulting from this change should result in a link-time error. - -- The internal alignment requirements for heap allocations inside ``std::string`` has decreased from 16 to 8. This - saves memory since string requests fewer additional bytes than it did previously. However, this also changes the - return value of ``std::string::max_size`` and can cause code compiled against older libc++ versions but linked at - runtime to a new version to throw a different exception when attempting allocations that are too large - (``std::bad_alloc`` vs ``std::length_error``). 
- -- The layout of some range adaptors that use the ``movable-box`` exposition-only type as an implementation - detail has changed in order to fix a `bug `_ which could result in - overwriting user data following the ``movable-box``. - This bug was caused by incorrect usage of the ``[[no_unique_address]]`` attribute inside the implementation of ``movable-box``. - This fix affects the layout of the following views: ``take_while_view``, ``filter_view``, ``single_view``, ``drop_while_view``, - ``repeat_view``, ``transform_view``, ``chunk_by_view``. In order to avoid silent breakage as a result of this fix, an ABI tag has been added to - these views such that their mangled name will be different starting in this version of libc++. - As a result, attempting to call a function that expects one of these views will fail to link until the code has been rebuilt - against a matching version of libc++. In practice, we believe it is unusual for these views to appear at ABI boundaries so this - should not be a major problem for most users. However it is probably worth auditing ranges-heavy code for ABI boundaries that - would contain these views, or for types that contain these views as members and which are passed across ABI boundaries. - -- Some properties of libc++ may cause ODR-violations when mixing multiple libc++ - instances. To avoid these, often benign, ODR-violations the ODR-affecting - properties are now part of the ABI tag. The ODR-affecting properties are: - - - library version (This was part of the ABI tag prior to LLVM 18.) - - exceptions vs no-exceptions - - hardening mode - - This should not be ABI-affecting except that libc++ will be more robust - against different configurations of it being used in different translation - units. - -- The amount of padding bytes available for use at the end of certain ``std::expected`` instantiations has changed in this - release. 
This is an ABI break for any code that held a ``std::expected`` member with ``[[no_unique_address]]`` in an - ABI-facing type. In those cases, the layout of the enclosing type will change, breaking the ABI. However, the - ``std::expected`` member requires a few characteristics in order to be affected by this change: - - - A type equivalent to ``union {T ; E}`` needs to have more than one byte of padding available. - - The ``std::expected`` member must have been in a situation where its padding bytes were previously reused by - another object, which can happen in a few cases (this is probably not exhaustive): - - - It is a member with ``[[no_unique_address]]`` applied to it, and it is followed by another data member, or - - It is a member with ``[[no_unique_address]]`` applied to it, and it is the last member of the user-defined type, - and that user-defined type is used in ways that its padding bytes can be reused, or - - It is inherited from - - We expect that this will not be a very frequent occurrence. However, there is unfortunately no technique we can use - in the library to catch such misuse. Indeed, even applying an ABI tag to ``std::expected`` would not help since ABI - tags are not propagated to containing types. As a result, if you notice very difficult to explain bugs around the - usage of a ``std::expected``, you should consider checking whether you are hitting this ABI break. This change was - done to fix `#70494 `_ and the vendor communication is handled - in `#70820 `_. - - -Build System Changes --------------------- - -- The ``LIBCXX_EXECUTOR`` CMake variable has been deprecated. If you are relying on this, the new replacement is - passing ``-Dexecutor=...`` to ``llvm-lit``. Alternatively, this flag can be made persistent in the generated test - configuration file by passing ``-DLIBCXX_TEST_PARAMS=executor=...``. This also applies to the ``LIBUWIND_EXECTOR`` - and ``LIBCXXABI_EXECUTOR`` CMake variables. 
LLVM 19 will completely remove support for the ``*_EXECUTOR`` variables. - -- ``LIBCXXABI_USE_LLVM_UNWINDER`` and ``COMPILER_RT_USE_LLVM_UNWINDER`` switched defaults from ``OFF`` to ``ON``. - This means that by default, libc++abi and compiler-rt will link against the LLVM provided ``libunwind`` library - instead of the system-provided unwinding library. If you are building the LLVM runtimes with the goal of shipping - them so that they can interoperate with other system-provided libraries that might be using a different unwinding - library (such as ``libgcc_s``), you should pass ``LIBCXXABI_USE_LLVM_UNWINDER=OFF`` and ``COMPILER_RT_USE_LLVM_UNWINDER=OFF`` - to make sure the system-provided unwinding library is used by the LLVM runtimes. diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index 3d79def336a874..e167d21e39f93c 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -40,8 +40,7 @@ and C++26 features. Experimental support for the time zone database has progressed. -Work on the ranges support has progressed. See -:ref:`ranges-status` for the current status. +Work on the ranges support has progressed. Work on the experimental C++17 Parallel STL has progressed. See :ref:`pstl-status` for the current status. diff --git a/libcxx/docs/Status/Ranges.rst b/libcxx/docs/Status/Ranges.rst deleted file mode 100644 index cdbf68393b739d..00000000000000 --- a/libcxx/docs/Status/Ranges.rst +++ /dev/null @@ -1,48 +0,0 @@ -.. _ranges-status: - -================================ -libc++ Ranges Status -================================ - -.. include:: ../Helpers/Styles.rst - -.. contents:: - :local: - - -Overview -================================ - -This document contains the status of the Ranges library in libc++. It is used to track -both the status of the sub-projects of the ranges library and who is assigned to these -sub-projects. 
This is imperative to effective implementation so that work is not -duplicated and implementors are not blocked by each other. - -If you are interested in contributing to the libc++ Ranges library, please send a message -to the #libcxx channel in the LLVM discord. Please *do not* start working on any of the -*assigned* items below. - - -Major features -======================================= - -.. csv-table:: - :file: RangesMajorFeatures.csv - :header-rows: 1 - :widths: auto - -Views -======================================= - -.. csv-table:: - :file: RangesViews.csv - :header-rows: 1 - :widths: auto - -Algorithms -======================================= - -.. csv-table:: - :file: RangesAlgorithms.csv - :header-rows: 1 - :widths: auto diff --git a/libcxx/docs/Status/RangesAlgorithms.csv b/libcxx/docs/Status/RangesAlgorithms.csv deleted file mode 100644 index 469ea21a76aab9..00000000000000 --- a/libcxx/docs/Status/RangesAlgorithms.csv +++ /dev/null @@ -1,18 +0,0 @@ -Standard,Algorithm,Assignee,CL,Status -C++20,all C++20 algorithms,N/A,N/A,✅ -C++23,`find_last `_,Nicole Mazzuca,`#99312 `_,Complete -C++23,`find_last_if `_,Nicole Mazzuca,`#99312 `_,Complete -C++23,`find_last_if_not `_,Nicole Mazzuca,`#99312 `_,Complete -C++23,`starts_with `_,Zijun Zhao,`D150735 `_,Complete -C++23,`ends_with `_,Zijun Zhao, `D150831 `_,Complete -C++23,`shift_left `_,Unassigned,No patch yet,Not started -C++23,`shift_right `_,Unassigned,No patch yet,Not started -C++23,`iota (algorithm) `_,Unassigned,No patch yet,Not started -C++23,`fold `_,Unassigned,No patch yet,Not started -C++23,`contains `_,Zijun Zhao, `#65148 `_,Complete -C++23,`fold_left_with_iter `_,Christopher Di Bella,N/A,Complete -C++23,`fold_left `_,Christopher Di Bella,N/A,Complete -C++23,`fold_left_first_with_iter `_,Christopher Di Bella,N/A,In progress -C++23,`fold_left_first `_,Christopher Di Bella,N/A,In progress -C++23,`fold_right `_,Christopher Di Bella,N/A,In progress -C++23,`fold_right_last `_,Christopher Di Bella,N/A,In 
progress diff --git a/libcxx/docs/Status/RangesMajorFeatures.csv b/libcxx/docs/Status/RangesMajorFeatures.csv deleted file mode 100644 index d00fbce9edf489..00000000000000 --- a/libcxx/docs/Status/RangesMajorFeatures.csv +++ /dev/null @@ -1,5 +0,0 @@ -Standard,Name,Assignee,CL,Status -C++23,`ranges::to `_,Konstantin Varlamov,`D142335 `_,Complete -C++23,`Pipe support for user-defined range adaptors `_,"Louis Dionne, Jakub Mazurkiewicz, and Xiaoyang Liu",Various,Complete -C++23,`Formatting Ranges `_,Mark de Wever,Various,Complete -C++20,`Stashing stashing iterators for proper flattening `_,Jakub Mazurkiewicz,Various,In progress diff --git a/libcxx/docs/Status/RangesViews.csv b/libcxx/docs/Status/RangesViews.csv deleted file mode 100644 index f141656eb131a2..00000000000000 --- a/libcxx/docs/Status/RangesViews.csv +++ /dev/null @@ -1,38 +0,0 @@ -Standard,View,Assignee,CL,Status -C++20,`empty `_,Zoe Carver,`D103208 `_,✅ -C++20,`single `_,Zoe Carver,`D106840 `_,✅ -C++20,`iota (view) `_,Zoe Carver,`D107396 `_,✅ -C++20,`all `_,Zoe Carver,`D102028 `_,✅ -C++20,`ref_view `_,Zoe Carver,`D102020 `_,✅ -C++20,`owning_view `_,Arthur O'Dwyer,`D116894 `_,✅ -C++20,`filter `_,Louis Dionne,`D109086 `_,✅ -C++20,`transform `_,Zoe Carver,`D103056 `_,✅ -C++20,`take `_,Zoe Carver,`D106507 `_,✅ -C++20,`take_while `_,Hui Xie,`D134952 `_,✅ -C++20,`drop `_,Zoe Carver,`D102037 `_,✅ -C++20,`drop_while `_,Hui Xie,`D135460 `_,✅ -C++20,`join `_,Zoe Carver,`D107671 `_,✅ -C++20,`split `_,Hui Xie,`D142063 `_,✅ -C++20,`lazy_split `_,Zoe Carver and Konstantin Varlamov,`D107500 `_,✅ -C++20,`counted `_,Zoe Carver,`D106923 `_,✅ -C++20,`common `_,Zoe Carver,`D105753 `_,✅ -C++20,`reverse `_,Zoe Carver,`D107096 `_,✅ -C++20,`elements / keys / values `_,Hui Xie,`D136268 `_,✅ -C++20,`istream `_,Hui Xie,`D133317 `_,✅ -,,,, -,,,, -,,,, -C++23,`repeat `_,Yrong,`D141699 `_,✅ -C++23,`cartesian_product `_,Unassigned,No patch yet,Not started -C++23,`zip `_,Hui Xie,`D122806 `_,✅ -C++23,`zip_transform `_,Hui Xie,No patch 
yet,Not started -C++23,`adjacent `_,Hui Xie,No patch yet,Not started -C++23,`adjacent_transform `_,Hui Xie,No patch yet,Not started -C++23,`join_with `_,Jakub Mazurkiewicz,`65536 `_,In progress -C++23,`slide `_,Will Hawkins,`67146 `_,In Progress -C++23,`chunk `_,Unassigned,No patch yet,Not started -C++23,`chunk_by `_,Jakub Mazurkiewicz,`D144767 `_,✅ -C++23,`as_const `_,Unassigned,No patch yet,Not started -C++23,`as_rvalue `_,Nikolas Klauser,`D137637 `_,✅ -C++23,`stride `_,Hristo Hristov and Will Hawkins,`D156924 `_,In Progress -C++23,`enumerate `_,Hristo Hristov,`D157193 `_,In Progress diff --git a/libcxx/docs/Status/Spaceship.rst b/libcxx/docs/Status/Spaceship.rst deleted file mode 100644 index d596c1128dbf63..00000000000000 --- a/libcxx/docs/Status/Spaceship.rst +++ /dev/null @@ -1,53 +0,0 @@ -.. spaceship-status: - -============================================== -libc++ Spaceship Operator Status (operator<=>) -============================================== - -.. include:: ../Helpers/Styles.rst - -.. contents:: - :local: - - -Overview -================================ - -This document contains the status of the C++20 spaceship operator support -in libc++. It is used to track both the status of the sub-projects of the effort -and who is assigned to these sub-projects. This is imperative to effective -implementation so that work is not duplicated and implementors are not blocked -by each other. - -If you are interested in contributing to this effort, please send a message -to the #libcxx channel in the LLVM discord. Please *do not* start working on any -of the assigned items below. - - -Sub-Projects in the Implementation Effort -========================================= - -.. csv-table:: - :file: SpaceshipProjects.csv - :header-rows: 1 - :widths: auto - -.. note:: - - .. [#note-strongorder] ``std::strong_order(long double, long double)`` is not yet implemented. - - -Misc. 
Items and TODOs -==================================== - -(Note: files with required updates will contain the TODO at the beginning of the -list item so they can be easily found via global search.) - - -Paper and Issue Status -==================================== - -.. csv-table:: - :file: SpaceshipPapers.csv - :header-rows: 1 - :widths: auto diff --git a/libcxx/docs/Status/SpaceshipPapers.csv b/libcxx/docs/Status/SpaceshipPapers.csv deleted file mode 100644 index 1ab64a9caf86a3..00000000000000 --- a/libcxx/docs/Status/SpaceshipPapers.csv +++ /dev/null @@ -1,12 +0,0 @@ -"Number","Name","Status","First released version" -`P1614R2 `_,The Mothership has Landed,|Complete|,19.0 -`P2404R3 `_,"Relaxing ``equality_comparable_with``'s, ``totally_ordered_with``'s, and ``three_way_comparable_with``'s common reference requirements to support move-only types",, -`LWG3330 `_,Include ```` from most library headers,"|Complete|","13.0" -`LWG3347 `_,"``std::pair`` now requires ``T`` and ``U`` to be *less-than-comparable*",|Nothing To Do|, -`LWG3350 `_,Simplify return type of ``lexicographical_compare_three_way``,|Nothing To Do|, -`LWG3360 `_,``three_way_comparable_with`` is inconsistent with similar concepts,|Nothing To Do|, -`LWG3380 `_,``common_type`` and comparison categories,|Nothing To Do|, -`LWG3395 `_,Definition for *three-way* comparison needs to be updated,|Nothing To Do|, -`P0905R1 `_,Symmetry for spaceship,|Complete|,7.0 -`P1120R0 `_,Consistency improvements for ``<=>`` and other comparison operators,, -`LWG3431 `_,``<=>`` for containers should require ``three_way_comparable`` instead of ``<=>``,, diff --git a/libcxx/docs/Status/SpaceshipProjects.csv b/libcxx/docs/Status/SpaceshipProjects.csv deleted file mode 100644 index 4dc43cdbbd08fd..00000000000000 --- a/libcxx/docs/Status/SpaceshipProjects.csv +++ /dev/null @@ -1,202 +0,0 @@ -Section,Description,Dependencies,Assignee,Complete -- `5.1 Clause 16: Library Introduction `_,,,, -| `[expos.only.func] `_,"| `synth-three-way 
`_ -| `synth-three-way-result `_",[cmp.concept],Kent Ross,|Complete| -- `5.2 Clause 17: Language support library `_,,,, -| `[support.limits.general] `_,|,None,Unassigned,|Nothing To Do| -| `[type.info] `_,| remove ops `typeinfo `_,None,Adrian Vogelsgesang,|Complete| -| `[compare.syn] `_,,"| [cmp.concept] -| [cmp.result] -| [cmp.object] -| [cmp.alg]",Unassigned,|Complete| -"| `[cmp.weakeq] `_ -| `[cmp.strongeq] `_",| removed by `P1959R0 `_,None,Unassigned,|Nothing To Do| -"| `[cmp.partialord] `_ -| `[cmp.weakord] `_ -| `[cmp.strongord] `_",| remove ops `*_ordering `_,None,Christopher Di Bella,|Complete| -| `[cmp.concept] `_,"| `three_way_comparable `_ -| `three_way_comparable_with `_",None,Ruslan Arutyunyan,|Complete| -| `[cmp.result] `_,| `compare_three_way_result `_,None,Arthur O'Dwyer,|Complete| -| `[comparisons.three.way] `_,| `compare_three_way `_,[cmp.concept],Arthur O'Dwyer,|Complete| -| `[cmp.alg] `_,"| `strong_order `_ -| `weak_order `_ -| `partial_order `_ -| `strong_order_fallback `_ -| `weak_order_fallback `_ -| `partial_order_fallback `_",None,Arthur O'Dwyer,|Complete| [#note-strongorder]_ -"| `[coroutine.syn] `_ -| `[coroutine.handle.compare] `_",| `coroutine_handle `_,[comparisons.three.way],Chuanqi Xu,|Complete| -- `5.3 Clause 18: Concepts Library `_,,,, -- `5.4 Clause 19: Diagnostics Library `_,,,, -| `[system.error.syn] `_,|,"| [syserr.errcat.nonvirtuals] -| [syserr.compare]",Unassigned,|Complete| -"| `[syserr.errcat.overview] `_ -| `[syserr.errcat.nonvirtuals] `_",| `error_category `_,[comparisons.three.way],Adrian Vogelsgesang,|Complete| -| `[syserr.compare] `_,"| `error_code `_ -| `error_condition `_",None,Adrian Vogelsgesang,|Complete| -- `5.5 Clause 20: General utilities library `_,,,, -"| `[utility.syn] `_ -| `[pairs.pair] `_ -| `[pairs.spec] `_",| `pair `_,[expos.only.func],Kent Ross,|Complete| -"| `[tuple.syn] `_ -| `[tuple.rel] `_",| `tuple `_,[expos.only.func],Kent Ross,|Complete| -"| `[optional.syn] `_ -| `[optional.relops] `_ -| 
`[optional.nullops] `_ -| `[optional.comp.with.t] `_","| `optional `_ -| `nullopt `_",None,Hristo Hristov,|Complete| -"| `[variant.syn] `_ -| `[variant.relops] `_ -| `[variant.monostate.relops] `_","| `monostate `_ -| `variant `_",None,Kent Ross,|Complete| -"| `[template.bitset] `_ -| `[bitset.members] `_","| remove ops `bitset `_",None,Hristo Hristov,|Complete| -| `[memory.syn] `_,|,None,Unassigned,|Complete| -| `[allocator.globals] `_,| remove ops `allocator `_,None,Hristo Hristov,|Complete| -| `[unique.ptr.special] `_,| `unique_ptr `_,[comparisons.three.way],Adrian Vogelsgesang,|Complete| -| `[util.smartptr.shared.cmp] `_,| `shared_ptr `_,[comparisons.three.way],Adrian Vogelsgesang,|Complete| -"| `[mem.res.syn] `_ -| `[mem.res.eq] `_ -| `[mem.poly.allocator.eq] `_","| -| remove ops `memory_resource `_ -| remove ops `polymorphic_allocator `_",None,Hristo Hristov,|Complete| -"| `[allocator.adaptor.syn] `_ -| `[scoped.adaptor.operators] `_",| remove ops `scoped_allocator_adaptor `_,None,Hristo Hristov,|Complete| -"| `[functional.syn] `_ -| `[range.cmp] `_ -| `[func.wrap.func] `_ -| `[func.wrap.func.nullptr] `_",| remove ops `function `_,None,Hristo Hristov,|Complete| -| `[meta.unary.prop] `_,| replaced by `issue LWG3354 `_,None,Unassigned,|Nothing To Do| -| `[meta.trans.other] `_,| removed by `issue LWG3380 `_,None,Unassigned,|Nothing To Do| -"| `[type.index.overview] `_ -| `[type.index.members] `_",| `type_index `_,None,Adrian Vogelsgesang,|Complete| -| `[charconv.syn] `_,| `to_chars_result `_,None,Mark de Wever,|Complete| -| `[charconv.syn] `_,| `from_chars_result `_,None,Mark de Wever,|Complete| -- `5.6 Clause 21: Strings library `_,,,, -"| `[char.traits.specializations.char] `_ -| `[char.traits.specializations.char8.t] `_ -| `[char.traits.specializations.char16.t] `_ -| `[char.traits.specializations.char32.t] `_ -| `[char.traits.specializations.wchar.t] `_ -| `[string.syn] `_ -| `[string.cmp] `_",| `basic_string `_,None,Mark de Wever,|Complete| -"| 
`[string.view.synop] `_ -| `[string.view.comparison] `_",| `basic_string_view `_,None,Mark de Wever,|Complete| -- `5.7 Clause 22: Containers library `_,,,, -| `[container.requirements.general] `_,|,None,Mark de Wever,|Complete| -| `[array.syn] `_ (`general `_),| `array `_,[expos.only.func],"| Adrian Vogelsgesang -| Hristo Hristov",|Complete| -| `[deque.syn] `_ (`general `_),| `deque `_,[expos.only.func],Hristo Hristov,|Complete| -| `[forward.list.syn] `_ (`general `_),| `forward_list `_,[expos.only.func],Hristo Hristov,|Complete| -| `[list.syn] `_ (`general `_),| `list `_,[expos.only.func],Adrian Vogelsgesang,|Complete| -| `[vector.syn] `_ (`general `_),| `vector `_,[expos.only.func],"| Adrian Vogelsgesang -| Hristo Hristov",|Complete| -| `[array.overview] `_ (`general `_),| removed by `issue LWG3347 `_,None,"| Adrian Vogelsgesang -| Hristo Hristov",|Complete| -| `[associative.map.syn] `_ (`general `_),"| `map `_ -| `multimap `_",[expos.only.func],Hristo Hristov,|Complete| -| `[associative.set.syn] `_ (`general `_),"| `multiset `_ -| `set `_",[expos.only.func],Hristo Hristov,|Complete| -| `[unord.map.syn] `_,"| remove ops `unordered_map `_ -| remove ops `unordered_multimap `_",None,Hristo Hristov,|Complete| -| `[unordered.set.syn] `_,"| remove ops `unordered_set `_ -| remove ops `unordered_multiset `_",None,Hristo Hristov,|Complete| -| `[queue.syn] `_,| `queue `_,None,Hristo Hristov,|Complete| -| `[stack.syn] `_,| `stack `_,None,Hristo Hristov,|Complete| -| `[queue.ops] `_,| `queue `_,None,Hristo Hristov,|Complete| -| `[stack.ops] `_,| `stack `_,None,Hristo Hristov,|Complete| -- `5.8 Clause 23: Iterators library `_,,,, -| `[iterator.synopsis] `_,|,None,Unassigned,|Complete| -| `[reverse.iter.cmp] `_,| `reverse_iterator `_,None,Mikhail Maltsev,|Complete| -"| `[move.iterator] `_ -| `[move.iter.op.comp] `_",| `move_iterator `_,None,Arthur O'Dwyer,|Complete| -"| `[common.iterator] `_ -| `[common.iter.cmp] `_",| `common_iterator `_,None,Zoe Carver,|Complete| -"| 
`[counted.iterator] `_ -| `[counted.iter.cmp] `_",| `counted_iterator `_,None,Zoe Carver,|Complete| -| `[unreachable.sentinel] `_,| `unreachable_sentinel_t `_,None,Zoe Carver,|Complete| -"| `[istream.iterator] `_ -| `[istream.iterator.ops] `_",| remove ops `istream_iterator `_,None,Konstantin Varlamov,|Complete| -"| `[istreambuf.iterator] `_ -| `[istreambuf.iterator.ops] `_",| remove ops `istreambuf_iterator `_,None,Konstantin Varlamov,|Complete| -- `5.9 Clause 24: Ranges library `_,,,, -| `[range.iota.iterator] `_,| `ranges::iota_view::iterator `_,[concepts.cmp],Arthur O'Dwyer,|Complete| -| `[range.iota.sentinel] `_,| remove ops `iota_view::sentinel `_,None,Zoe Carver,|Complete| -| `[range.filter.iterator] `_,| remove ops `filter_view::iterator `_,None,Louis Dionne,|Complete| -| `[range.filter.sentinel] `_,| remove ops `filter_view::sentinel `_,None,Louis Dionne,|Complete| -| `[range.transform.iterator] `_,| `ranges::transform_view::iterator `_,[concepts.cmp],Arthur O'Dwyer,|Complete| -| `[range.transform.sentinel] `_,| remove ops `transform_view::sentinel `_,None,Zoe Carver,|Complete| -| `[range.take.sentinel] `_,| remove ops `take_view::sentinel `_,None,Konstantin Varlamov,|Complete| -| `[range.join.iterator] `_,| remove ops `join_view::iterator `_,None,Zoe Carver,|Complete| -| `[range.join.sentinel] `_,| remove ops `join_view::sentinel `_,None,Zoe Carver,|Complete| -| `[range.split.outer] `_,| remove ops `split_view::outer_iterator `_,None,Hui Xie,|Complete| -| `[range.split.inner] `_,| remove ops `split_view::inner_iterator `_,None,Hui Xie,|Complete| -- `5.10 Clause 25: Algorithms library `_,,,, -"| `[algorithm.syn] `_ -| `[alg.three.way] `_",| `lexicographical_compare_three_way `_,[comparisons.three.way],Adrian Vogelsgesang,|Complete| -- `5.11 Clause 26: Numerics library `_,,,, -"| `[complex.syn] `_ -| `[complex.ops] `_",| remove ops `complex `_,None,Hristo Hristov,|Complete| -"| `[class.slice.overview] `_ -| `[slice.ops] `_",| `slice `_,None,Hristo 
Hristov,|Complete| -- `5.12 Clause 27: Time library `_,,,, -| `[time.syn] `_,|,None,Mark de Wever,|Complete| -| `[time.duration.comparisons] `_, `chrono::duration `_, None, Hristo Hristov, |Complete| -| `[time.point.comparisons] `_, `chrono::time_point `_, None, Hristo Hristov, |Complete| -"| `[time.cal.day.nonmembers] `_ -| `[time.cal.month.nonmembers] `_ -| `[time.cal.year.nonmembers] `_ -| `[time.cal.md.nonmembers] `_ -| `[time.cal.mdlast] `_ -| `[time.cal.ym.nonmembers] `_ -| `[time.cal.ymd.nonmembers] `_ -| `[time.cal.ymdlast.nonmembers] `_","| `chrono::day `_ -| `chrono::month `_ -| `chrono::year `_ -| `chrono::month_day `_ -| `chrono::month_day_last `_ -| `chrono::year_month `_ -| `chrono::year_month_day `_ -| `chrono::year_month_day_last `_",None,Mark de Wever,|Complete| -"| `[time.cal.wd] `_ -| `[time.cal.wdidx] `_ -| `[time.cal.wdlast] `_ -| `[time.cal.mwd] `_ -| `[time.cal.mwdlast] `_ -| `[time.cal.ymwd] `_ -| `[time.cal.ymwdlast] `_","| `weekday `_ -| `weekday_indexed `_ -| `weekday_last `_ -| `month_weekday `_ -| `month_weekday_last `_ -| `year_month_weekday `_ -| `year_month_weekday_last `_",None,Hristo Hristov,|Complete| -`[time.zone.nonmembers] `_,"`chrono::time_zone`",,Mark de Wever,|Complete| -`[time.zone.zonedtime.nonmembers] `_,"`chrono::zoned_time`",,Mark de Wever,|Complete| -`[time.zone.leap.nonmembers] `_,"`chrono::time_leap_seconds`",,Mark de Wever,|Complete| -`[time.zone.link.nonmembers] `_,"`chrono::time_zone_link`",,Mark de Wever,|Complete| -- `5.13 Clause 28: Localization library `_,,,, -"| `[locale] `_ -| `[locale.operators] `_",| remove ops `locale `_,None,Hristo Hristov,|Complete| -- `5.14 Clause 29: Input/output library `_,,,, -| `[fs.filesystem.syn] `_,| `filesystem::space_info `_,None,Adrian Vogelsgesang,|Complete| -"| `[fs.class.path] `_ -| `[fs.path.nonmember] `_",| `filesystem::path `_,None,Adrian Vogelsgesang,|Complete| -| `[fs.class.file.status] `_,| `file_status `_,None,Hristo Hristov,|Complete| -"| 
`[fs.class.directory.entry] `_ -| `[fs.dir.entry.obs] `_",| `filesystem::directory_entry `_,None,Adrian Vogelsgesang,|Complete| -- `5.15 Clause 30: Regular expressions library `_,,,, -| `[re.syn] `_,|,None,Mark de Wever,|Complete| -| `[re.submatch.op] `_,| `sub_match `_,None,Mark de Wever,|Complete| -| `[re.results.nonmember] `_,| remove ops `match_results`,None,Mark de Wever,|Complete| -"| `[re.regiter] `_, -| `[re.regiter.comp] `_",| remove ops `regex_iterator`,None,Mark de Wever,|Complete| -"| `[re.tokiter] `_ -| `[re.tokiter.comp] `_",| remove ops `regex_token_iterator`,None,Mark de Wever,|Complete| -- `5.16 Clause 31: Atomic operations library `_,,,, -- `5.17 Clause 32: Thread support library `_,,,, -| `[thread.thread.id] `_,| `thread::id `_,None,Adrian Vogelsgesang,|Complete| -Misc (Not part of R1614),,,, -| `[range.elements.iterator] `_,| `ranges::elements_view::iterator `_,[concepts.cmp],Hui Xie,|Complete| -| `[stacktrace.entry.cmp] `_,| `stacktrace_entry `_,None,Nikolas Klauser,|In Progress| -| `[stacktrace.basic.cmp] `_,| `basic_stacktrace `_,[alg.three.way],Nikolas Klauser,|In Progress| diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index 18af347a1217ff..a77405eb138124 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -50,8 +50,6 @@ Getting Started with libc++ Status/Format Status/Parallelism Status/PSTL - Status/Ranges - Status/Spaceship Status/SpecialMath Status/Zip From 4d85285ff68d11fcb8c6b296799a11074e7ff7d7 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 22 Aug 2024 16:57:09 +0200 Subject: [PATCH 208/426] [SimplifyCFG] Fold switch over ucmp/scmp to icmp and br (#105636) If we switch over ucmp/scmp and have two switch cases going to the same destination, we can convert into icmp+br. Fixes https://github.com/llvm/llvm-project/issues/105632. 
--- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 116 +++++ .../Transforms/SimplifyCFG/switch-on-cmp.ll | 416 ++++++++++++++++-- 2 files changed, 486 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 00efd3c0eb72ec..da4d57f808e9bf 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -7131,6 +7131,119 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder, return true; } +/// Fold switch over ucmp/scmp intrinsic to br if two of the switch arms have +/// the same destination. +static bool simplifySwitchOfCmpIntrinsic(SwitchInst *SI, IRBuilderBase &Builder, + DomTreeUpdater *DTU) { + auto *Cmp = dyn_cast(SI->getCondition()); + if (!Cmp || !Cmp->hasOneUse()) + return false; + + SmallVector Weights; + bool HasWeights = extractBranchWeights(getBranchWeightMDNode(*SI), Weights); + if (!HasWeights) + Weights.resize(4); // Avoid checking HasWeights everywhere. + + // Normalize to [us]cmp == Res ? Succ : OtherSucc. + int64_t Res; + BasicBlock *Succ, *OtherSucc; + uint32_t SuccWeight = 0, OtherSuccWeight = 0; + BasicBlock *Unreachable = nullptr; + + if (SI->getNumCases() == 2) { + // Find which of 1, 0 or -1 is missing (handled by default dest). 
+ SmallSet Missing; + Missing.insert(1); + Missing.insert(0); + Missing.insert(-1); + + Succ = SI->getDefaultDest(); + SuccWeight = Weights[0]; + OtherSucc = nullptr; + for (auto &Case : SI->cases()) { + std::optional Val = + Case.getCaseValue()->getValue().trySExtValue(); + if (!Val) + return false; + if (!Missing.erase(*Val)) + return false; + if (OtherSucc && OtherSucc != Case.getCaseSuccessor()) + return false; + OtherSucc = Case.getCaseSuccessor(); + OtherSuccWeight += Weights[Case.getSuccessorIndex()]; + } + + assert(Missing.size() == 1 && "Should have one case left"); + Res = *Missing.begin(); + } else if (SI->getNumCases() == 3 && SI->defaultDestUndefined()) { + // Normalize so that Succ is taken once and OtherSucc twice. + Unreachable = SI->getDefaultDest(); + Succ = OtherSucc = nullptr; + for (auto &Case : SI->cases()) { + BasicBlock *NewSucc = Case.getCaseSuccessor(); + uint32_t Weight = Weights[Case.getSuccessorIndex()]; + if (!OtherSucc || OtherSucc == NewSucc) { + OtherSucc = NewSucc; + OtherSuccWeight += Weight; + } else if (!Succ) { + Succ = NewSucc; + SuccWeight = Weight; + } else if (Succ == NewSucc) { + std::swap(Succ, OtherSucc); + std::swap(SuccWeight, OtherSuccWeight); + } else + return false; + } + for (auto &Case : SI->cases()) { + std::optional Val = + Case.getCaseValue()->getValue().trySExtValue(); + if (!Val || (Val != 1 && Val != 0 && Val != -1)) + return false; + if (Case.getCaseSuccessor() == Succ) { + Res = *Val; + break; + } + } + } else { + return false; + } + + // Determine predicate for the missing case. 
+ ICmpInst::Predicate Pred; + switch (Res) { + case 1: + Pred = ICmpInst::ICMP_UGT; + break; + case 0: + Pred = ICmpInst::ICMP_EQ; + break; + case -1: + Pred = ICmpInst::ICMP_ULT; + break; + } + if (Cmp->isSigned()) + Pred = ICmpInst::getSignedPredicate(Pred); + + MDNode *NewWeights = nullptr; + if (HasWeights) + NewWeights = MDBuilder(SI->getContext()) + .createBranchWeights(SuccWeight, OtherSuccWeight); + + BasicBlock *BB = SI->getParent(); + Builder.SetInsertPoint(SI->getIterator()); + Value *ICmp = Builder.CreateICmp(Pred, Cmp->getLHS(), Cmp->getRHS()); + Builder.CreateCondBr(ICmp, Succ, OtherSucc, NewWeights, + SI->getMetadata(LLVMContext::MD_unpredictable)); + OtherSucc->removePredecessor(BB); + if (Unreachable) + Unreachable->removePredecessor(BB); + SI->eraseFromParent(); + Cmp->eraseFromParent(); + if (DTU && Unreachable) + DTU->applyUpdates({{DominatorTree::Delete, BB, Unreachable}}); + return true; +} + bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { BasicBlock *BB = SI->getParent(); @@ -7163,6 +7276,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (eliminateDeadSwitchCases(SI, DTU, Options.AC, DL)) return requestResimplify(); + if (simplifySwitchOfCmpIntrinsic(SI, Builder, DTU)) + return requestResimplify(); + if (trySwitchToSelect(SI, Builder, DTU, DL, TTI)) return requestResimplify(); diff --git a/llvm/test/Transforms/SimplifyCFG/switch-on-cmp.ll b/llvm/test/Transforms/SimplifyCFG/switch-on-cmp.ll index 1ce18533d156d0..6230a319495dba 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-on-cmp.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-on-cmp.ll @@ -4,11 +4,8 @@ define void @ucmp_gt1(i32 %a, i32 %b) { ; CHECK-LABEL: define void @ucmp_gt1( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { -; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) -; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ -; CHECK-NEXT: i8 -1, label %[[BB2:.*]] -; CHECK-NEXT: i8 0, label 
%[[BB2]] -; CHECK-NEXT: ] +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB1:.*]], label %[[BB2:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label %[[BB2]] @@ -32,11 +29,8 @@ bb2: define void @ucmp_gt2(i32 %a, i32 %b) { ; CHECK-LABEL: define void @ucmp_gt2( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { -; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) -; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ -; CHECK-NEXT: i8 0, label %[[BB2:.*]] -; CHECK-NEXT: i8 -1, label %[[BB2]] -; CHECK-NEXT: ] +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB1:.*]], label %[[BB2:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label %[[BB2]] @@ -60,11 +54,8 @@ bb2: define void @ucmp_lt1(i32 %a, i32 %b) { ; CHECK-LABEL: define void @ucmp_lt1( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { -; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) -; CHECK-NEXT: switch i8 [[RES]], label %[[BB2:.*]] [ -; CHECK-NEXT: i8 1, label %[[BB1:.*]] -; CHECK-NEXT: i8 0, label %[[BB1]] -; CHECK-NEXT: ] +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB1:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label %[[BB2]] @@ -88,11 +79,8 @@ bb2: define void @ucmp_lt2(i32 %a, i32 %b) { ; CHECK-LABEL: define void @ucmp_lt2( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { -; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) -; CHECK-NEXT: switch i8 [[RES]], label %[[BB2:.*]] [ -; CHECK-NEXT: i8 0, label %[[BB1:.*]] -; CHECK-NEXT: i8 1, label %[[BB1]] -; CHECK-NEXT: ] +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB1:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label %[[BB2]] @@ -116,11 +104,8 @@ bb2: define void 
@ucmp_eq1(i32 %a, i32 %b) { ; CHECK-LABEL: define void @ucmp_eq1( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { -; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) -; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ -; CHECK-NEXT: i8 -1, label %[[BB2:.*]] -; CHECK-NEXT: i8 1, label %[[BB2]] -; CHECK-NEXT: ] +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB1:.*]], label %[[BB2:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label %[[BB2]] @@ -144,11 +129,8 @@ bb2: define void @ucmp_eq2(i32 %a, i32 %b) { ; CHECK-LABEL: define void @ucmp_eq2( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { -; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) -; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ -; CHECK-NEXT: i8 1, label %[[BB2:.*]] -; CHECK-NEXT: i8 -1, label %[[BB2]] -; CHECK-NEXT: ] +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB1:.*]], label %[[BB2:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label %[[BB2]] @@ -172,11 +154,8 @@ bb2: define void @scmp_gt1(i32 %a, i32 %b) { ; CHECK-LABEL: define void @scmp_gt1( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { -; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A]], i32 [[B]]) -; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ -; CHECK-NEXT: i8 -1, label %[[BB2:.*]] -; CHECK-NEXT: i8 0, label %[[BB2]] -; CHECK-NEXT: ] +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB1:.*]], label %[[BB2:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label %[[BB2]] @@ -200,11 +179,8 @@ bb2: define void @scmp_gt2(i32 %a, i32 %b) { ; CHECK-LABEL: define void @scmp_gt2( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { -; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A]], i32 [[B]]) -; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ -; CHECK-NEXT: i8 0, 
label %[[BB2:.*]] -; CHECK-NEXT: i8 -1, label %[[BB2]] -; CHECK-NEXT: ] +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB1:.*]], label %[[BB2:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label %[[BB2]] @@ -259,16 +235,13 @@ define i32 @ucmp_gt_phi(i32 %a, i32 %b) { ; CHECK-LABEL: define i32 @ucmp_gt_phi( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) -; CHECK-NEXT: switch i8 [[RES]], label %[[BB1:.*]] [ -; CHECK-NEXT: i8 -1, label %[[BB2:.*]] -; CHECK-NEXT: i8 0, label %[[BB2]] -; CHECK-NEXT: ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[TMP0]], label %[[BB1:.*]], label %[[BB2:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label %[[BB2]] ; CHECK: [[BB2]]: -; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB1]] ], [ 1, %[[ENTRY]] ], [ 1, %[[ENTRY]] ] +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB1]] ], [ 1, %[[ENTRY]] ] ; CHECK-NEXT: ret i32 [[PHI]] ; entry: @@ -380,5 +353,356 @@ bb2: ret void } +define void @ucmp_gt_unpredictable(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_gt_unpredictable( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB1:.*]], label %[[BB2:.*]], !unpredictable [[META0:![0-9]+]] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %bb1 [ + i8 -1, label %bb2 + i8 0, label %bb2 + ], !unpredictable !{} + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void +} + +define void @ucmp_gt_weights(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_gt_weights( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[A]], [[B]] +; 
CHECK-NEXT: br i1 [[TMP1]], label %[[BB1:.*]], label %[[BB2:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %bb1 [ + i8 -1, label %bb2 + i8 0, label %bb2 + ], !prof !{!"branch_weights", i32 5, i32 10, i32 20} + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void +} + +define void @ucmp_gt_unreachable(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_gt_unreachable( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB1:.*]], label %[[BB2:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %unreachable [ + i8 -1, label %bb2 + i8 0, label %bb2 + i8 1, label %bb1 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void + +unreachable: + unreachable +} + +define void @ucmp_lt_unreachable(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_lt_unreachable( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB1:.*]], label %[[BB2:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %unreachable [ + i8 -1, label %bb1 + i8 0, label %bb2 + i8 1, label %bb2 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void + +unreachable: + unreachable +} + +define void @ucmp_eq_unreachable(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_eq_unreachable( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[A]], [[B]] +; CHECK-NEXT: br i1 
[[TMP1]], label %[[BB1:.*]], label %[[BB2:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %unreachable [ + i8 -1, label %bb2 + i8 0, label %bb1 + i8 1, label %bb2 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void + +unreachable: + unreachable +} + +define void @ucmp_gt_unreachable_multi_edge(i8 %x, i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_gt_unreachable_multi_edge( +; CHECK-SAME: i8 [[X:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: switch i8 [[X]], label %[[UNREACHABLE:.*]] [ +; CHECK-NEXT: i8 0, label %[[SW:.*]] +; CHECK-NEXT: i8 1, label %[[BB1:.*]] +; CHECK-NEXT: ] +; CHECK: [[SW]]: +; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[TMP0]], label %[[BB1]], label %[[BB2:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; CHECK: [[UNREACHABLE]]: +; CHECK-NEXT: unreachable +; +entry: + switch i8 %x, label %unreachable [ + i8 0, label %sw + i8 1, label %bb1 + ] + +sw: + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %unreachable [ + i8 -1, label %bb2 + i8 0, label %bb2 + i8 1, label %bb1 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void + +unreachable: + %phi = phi i32 [ 0, %entry ], [ 1, %sw ] + unreachable +} + +define void @ucmp_gt_unreachable_wrong_case(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_gt_unreachable_wrong_case( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[UNREACHABLE:.*]] [ +; CHECK-NEXT: i8 -2, label %[[BB2:.*]] +; CHECK-NEXT: i8 0, label %[[BB2]] +; CHECK-NEXT: i8 1, label %[[BB1:.*]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void 
@foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; CHECK: [[UNREACHABLE]]: +; CHECK-NEXT: unreachable +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %unreachable [ + i8 -2, label %bb2 + i8 0, label %bb2 + i8 1, label %bb1 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void + +unreachable: + unreachable +} + +define void @ucmp_gt_unreachable_no_two_equal_cases(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_gt_unreachable_no_two_equal_cases( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[UNREACHABLE:.*]] [ +; CHECK-NEXT: i8 -1, label %[[BB3:.*]] +; CHECK-NEXT: i8 0, label %[[BB2:.*]] +; CHECK-NEXT: i8 1, label %[[BB1:.*]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; CHECK: [[UNREACHABLE]]: +; CHECK-NEXT: unreachable +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %unreachable [ + i8 -1, label %bb3 + i8 0, label %bb2 + i8 1, label %bb1 + ] + +bb1: + call void @foo() + br label %bb2 + +bb3: + call void @foo() + br label %bb2 + +bb2: + ret void + +unreachable: + unreachable +} + +define void @ucmp_gt_unreachable_three_equal_cases(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_gt_unreachable_three_equal_cases( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[BB1:.*:]] +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %unreachable [ + i8 -1, label %bb1 + i8 0, label %bb1 + i8 1, label %bb1 + ] + +bb1: + call void @foo() + ret void + +unreachable: + unreachable +} + +define void @ucmp_gt_unreachable_default_not_unreachable(i32 %a, i32 %b) { +; CHECK-LABEL: 
define void @ucmp_gt_unreachable_default_not_unreachable( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: switch i8 [[RES]], label %[[NOT_UNREACHABLE:.*]] [ +; CHECK-NEXT: i8 -1, label %[[BB2:.*]] +; CHECK-NEXT: i8 0, label %[[BB2]] +; CHECK-NEXT: i8 1, label %[[BB1:.*]] +; CHECK-NEXT: ] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; CHECK: [[NOT_UNREACHABLE]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %not.unreachable [ + i8 -1, label %bb2 + i8 0, label %bb2 + i8 1, label %bb1 + ] + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void + +not.unreachable: + call void @foo() + br label %bb2 +} + +define void @ucmp_gt_unreachable_weights(i32 %a, i32 %b) { +; CHECK-LABEL: define void @ucmp_gt_unreachable_weights( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB1:.*]], label %[[BB2:.*]], !prof [[PROF1]] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %[[BB2]] +; CHECK: [[BB2]]: +; CHECK-NEXT: ret void +; + %res = call i8 @llvm.ucmp.i8.i32(i32 %a, i32 %b) + switch i8 %res, label %unreachable [ + i8 -1, label %bb2 + i8 0, label %bb2 + i8 1, label %bb1 + ], !prof !{!"branch_weights", i32 0, i32 10, i32 20, i32 5} + +bb1: + call void @foo() + br label %bb2 + +bb2: + ret void + +unreachable: + unreachable +} + declare void @use(i8) declare void @foo() +;. +; CHECK: [[META0]] = !{} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 5, i32 30} +;. From 9402bb090824312882d47c8e52a1b1aeacbcfd3c Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 22 Aug 2024 11:03:34 -0400 Subject: [PATCH 209/426] [SLP]Do not count extractelement costs in unreachable/landing pad blocks. 
If the external user of the scalar to be extract is in unreachable/landing pad block, we can skip counting their cost. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/105667 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 16 ++++++++++------ .../X86/same-scalar-in-same-phi-extract.ll | 3 ++- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 8f70a43465b8ac..d7763a022f3b6e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10768,17 +10768,21 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { std::optional> ValueToExtUses; DenseMap> ExtractsCount; for (ExternalUser &EU : ExternalUses) { + // Uses by ephemeral values are free (because the ephemeral value will be + // removed prior to code generation, and so the extraction will be + // removed as well) as well as uses in unreachable blocks or in landing pads + // (rarely executed). + if (EphValues.count(EU.User) || + (EU.User && + (!DT->isReachableFromEntry(cast(EU.User)->getParent()) || + cast(EU.User)->getParent()->isLandingPad()))) + continue; + // We only add extract cost once for the same scalar. if (!isa_and_nonnull(EU.User) && !ExtractCostCalculated.insert(EU.Scalar).second) continue; - // Uses by ephemeral values are free (because the ephemeral value will be - // removed prior to code generation, and so the extraction will be - // removed as well). 
- if (EphValues.count(EU.User)) - continue; - // No extract cost for vector "scalar" if (isa(EU.Scalar->getType())) continue; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll index 8bcf650d41d931..f1be11d0d0fc51 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll @@ -5,7 +5,6 @@ define void @test(i32 %arg) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: i32 [[ARG:%.*]]) { ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[ARG]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[ARG]], i32 0 ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: @@ -15,6 +14,8 @@ define void @test(i32 %arg) { ; CHECK-NEXT: i32 1, label [[BB4:%.*]] ; CHECK-NEXT: ] ; CHECK: bb3: +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: switch i32 0, label [[BB10]] [ ; CHECK-NEXT: i32 18, label [[BB7:%.*]] ; CHECK-NEXT: i32 1, label [[BB7]] From ec5e58519d24010beea937fccf5fc4541db3ec21 Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Thu, 22 Aug 2024 16:04:39 +0100 Subject: [PATCH 210/426] [NFC] Replace bool <= bool comparison (#102948) Static analyser tool cppcheck flags ordered comparison with `bool`s. Replace with equivalent logical operators to prevent this. 
Closes #102912 --- clang/lib/Sema/SemaOverload.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 52f640eb96b73b..1ce0fa091938d7 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -509,7 +509,7 @@ NarrowingKind StandardConversionSequence::getNarrowingKind( constexpr auto CanRepresentAll = [](bool FromSigned, unsigned FromWidth, bool ToSigned, unsigned ToWidth) { return (FromWidth < ToWidth + (FromSigned == ToSigned)) && - (FromSigned <= ToSigned); + !(FromSigned && !ToSigned); }; if (CanRepresentAll(FromSigned, FromWidth, ToSigned, ToWidth)) @@ -542,7 +542,7 @@ NarrowingKind StandardConversionSequence::getNarrowingKind( // If the bit-field width was dependent, it might end up being small // enough to fit in the target type (unless the target type is unsigned // and the source type is signed, in which case it will never fit) - if (DependentBitField && (FromSigned <= ToSigned)) + if (DependentBitField && !(FromSigned && !ToSigned)) return NK_Dependent_Narrowing; // Otherwise, such a conversion is always narrowing From c4c5fdd933fa2d1f7624d863d05a4fb982b4c074 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 22 Aug 2024 16:11:12 +0100 Subject: [PATCH 211/426] [AMDGPU] Generate checks for vector indexing. NFC. (#105668) This allows combining some test files that were only split because adding new RUN lines introduced too much churn in the checks. 
--- .../AMDGPU/indirect-addressing-si-gfx9.ll | 67 - .../AMDGPU/indirect-addressing-si-noopt.ll | 63 - .../AMDGPU/indirect-addressing-si-pregfx9.ll | 53 - .../CodeGen/AMDGPU/indirect-addressing-si.ll | 8379 ++++++++++++++++- 4 files changed, 8066 insertions(+), 496 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll index 31fa32b3475cb7..872a457a3b5c34 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll @@ -2,70 +2,6 @@ ; indexing of vectors. -; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll -; to avoid gfx9 scheduling induced issues. - - -; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block: -; GCN-DAG: s_load_dwordx16 s[[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]] -; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] -; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 - -; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]] -; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] - -; GCN: v_cmp_eq_u32_e32 -; GCN-COUNT-32: v_cndmask_b32 - -; GCN-COUNT-4: buffer_store_dwordx4 -define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) #0 { -entry: - %id = call i32 @llvm.amdgcn.workitem.id.x() #1 - %id.ext = zext i32 %id to i64 - %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext - %idx0 = load volatile i32, ptr addrspace(1) %gep - %idx1 = add i32 %idx0, 1 - %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"() - %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0 - %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1 - store volatile <16 x i32> %vec2, ptr addrspace(1) 
%out0 - %cmp = icmp eq i32 %id, 0 - br i1 %cmp, label %bb1, label %bb2 - -bb1: - store volatile i32 %live.out.val, ptr addrspace(1) undef - br label %bb2 - -bb2: - ret void -} - -; Avoid inserting extra v_mov from copies within the vgpr indexing sequence. The -; gpr_idx mode switching sequence is expanded late for this reason. - -; GCN-LABEL: {{^}}insert_w_offset_multiple_in_block - -; GCN: s_set_gpr_idx_on -; GCN-NEXT: v_mov_b32_e32 -; GCN-NEXT: s_set_gpr_idx_off - -; GCN: s_set_gpr_idx_on -; GCN-NEXT: v_mov_b32_e32 -; GCN-NOT: v_mov_b32_e32 -; GCN-NEXT: s_set_gpr_idx_off -define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) #0 { -entry: - %add1 = add i32 %in, 1 - %ins1 = insertelement <16 x float> , float 17.0, i32 %add1 - %add2 = add i32 %in, 2 - %ins2 = insertelement <16 x float> %ins1, float 17.0, i32 %add2 - store <16 x float> %ins1, ptr addrspace(1) %out1 - %out2 = getelementptr <16 x float>, ptr addrspace(1) %out1, i32 1 - store <16 x float> %ins2, ptr addrspace(1) %out2 - - ret void -} - declare hidden void @foo() ; For functions with calls, we were not accounting for m0_lo16/m0_hi16 @@ -83,7 +19,4 @@ define amdgpu_kernel void @insertelement_with_call(ptr addrspace(1) %ptr, i32 %i ret void } -declare i32 @llvm.amdgcn.workitem.id.x() #1 -declare void @llvm.amdgcn.s.barrier() #2 - attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll deleted file mode 100644 index 1a72140963d696..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc -O0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s - -; FIXME: Merge into indirect-addressing-si.ll - -; Make sure that TwoAddressInstructions keeps src0 as subregister sub0 -; of the tied implicit use and def of the super register. 
- -; CHECK-LABEL: {{^}}insert_wo_offset: -; CHECK: s_load_dword [[IN:s[0-9]+]] -; CHECK: s_mov_b32 m0, [[IN]] -; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]] -; CHECK: buffer_store_dwordx4 -; CHECK: buffer_store_dwordx4 -; CHECK: buffer_store_dwordx4 -; CHECK: buffer_store_dwordx4 -define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { -entry: - %ins = insertelement <16 x float> , float 17.0, i32 %in - store <16 x float> %ins, ptr addrspace(1) %out - ret void -} - -; Make sure we don't hit use of undefined register errors when expanding an -; extract with undef index. - -; CHECK-LABEL: {{^}}extract_adjacent_blocks: -; CHECK: s_load_dword [[ARG:s[0-9]+]] -; CHECK: s_cmp_lg_u32 -; CHECK: s_cbranch_scc1 [[BB4:.LBB[0-9]+_[0-9]+]] - -; CHECK: buffer_load_dwordx4 - -; CHECK: s_branch [[ENDBB:.LBB[0-9]+_[0-9]+]] - -; CHECK: [[BB4]]: -; CHECK: buffer_load_dwordx4 - -; CHECK: [[ENDBB]]: -; CHECK: buffer_store_dword -; CHECK: s_endpgm - -define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) #0 { -bb: - %tmp = icmp eq i32 %arg, 0 - br i1 %tmp, label %bb1, label %bb4 - -bb1: - %tmp2 = load volatile <4 x float>, ptr addrspace(1) undef - %tmp3 = extractelement <4 x float> %tmp2, i32 undef - call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) #0 ; Prevent block optimize out - br label %bb7 - -bb4: - %tmp5 = load volatile <4 x float>, ptr addrspace(1) undef - %tmp6 = extractelement <4 x float> %tmp5, i32 undef - call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) #0 ; Prevent block optimize out - br label %bb7 - -bb7: - %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ] - store volatile float %tmp8, ptr addrspace(1) undef - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll deleted file mode 100644 index cbb5d9e1692843..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: 
llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s - -; Tests for indirect addressing on SI, which is implemented using dynamic -; indexing of vectors. - -; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll -; to avoid gfx9 scheduling induced issues. - - -; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block: -; GCN-DAG: s_load_dwordx16 s[[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]] -; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] -; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 - -; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]] -; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] - -; GCN-DAG: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], vcc, 1, [[IDX0]] - -; GCN: v_cmp_eq_u32_e32 -; GCN-COUNT-32: v_cndmask_b32 - -; GCN-COUNT-4: buffer_store_dwordx4 -define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) #0 { -entry: - %id = call i32 @llvm.amdgcn.workitem.id.x() #1 - %id.ext = zext i32 %id to i64 - %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext - %idx0 = load volatile i32, ptr addrspace(1) %gep - %idx1 = add i32 %idx0, 1 - %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"() - %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0 - %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1 - store volatile <16 x i32> %vec2, ptr addrspace(1) %out0 - %cmp = icmp eq i32 %id, 0 - br i1 %cmp, label %bb1, label %bb2 - -bb1: - store volatile i32 %live.out.val, ptr 
addrspace(1) undef - br label %bb2 - -bb2: - ret void -} - -declare i32 @llvm.amdgcn.workitem.id.x() #1 -declare void @llvm.amdgcn.s.barrier() #2 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind convergent } diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index f095aef7a0cc81..c130eb04d02370 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1,26 +1,197 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -O0 < %s | FileCheck -check-prefix=NOOPT %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI-MOVREL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI,VI-MOVREL %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-vgpr-index-mode < %s | FileCheck -check-prefixes=VI,VI-IDXMODE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-IDXMODE %s ; Tests for indirect addressing on SI, which is implemented using dynamic ; indexing of vectors. 
- -; GCN-LABEL: {{^}}extract_w_offset: -; GCN-DAG: s_load_dword [[IN0:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 -; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0 -; GCN-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1 - -; MOVREL-DAG: s_mov_b32 m0, [[IN]] -; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]] - -; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}} -; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]] -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { +; NOOPT-LABEL: extract_w_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 1 +; NOOPT-NEXT: s_add_i32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s17, 4.0 +; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s19, 2.0 +; NOOPT-NEXT: s_mov_b32 s20, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v0, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: 
v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v22, s11 +; NOOPT-NEXT: v_mov_b32_e32 v21, s10 +; NOOPT-NEXT: v_mov_b32_e32 v20, s9 +; NOOPT-NEXT: v_mov_b32_e32 v19, s8 +; NOOPT-NEXT: v_mov_b32_e32 v18, s7 +; NOOPT-NEXT: v_mov_b32_e32 v17, s6 +; NOOPT-NEXT: v_mov_b32_e32 v16, s5 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_w_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1 +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; 
SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extract_w_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 s4, s4, 1 +; VI-MOVREL-NEXT: s_mov_b32 m0, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extract_w_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; 
VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_i32 s4, s4, 1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1 +; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_w_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_add_i32 s4, s4, 1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 
0x41200000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %idx = add i32 %in, 1 %elt = extractelement <16 x float> , i32 %idx @@ -29,24 +200,291 @@ entry: } ; XXX: Could do v_or_b32 directly -; GCN-LABEL: {{^}}extract_w_offset_salu_use_vector: -; GCN-DAG: s_or_b32 -; GCN-DAG: s_or_b32 -; GCN-DAG: s_or_b32 -; GCN-DAG: s_or_b32 -; MOVREL: s_mov_b32 m0 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} - - -; MOVREL: v_movrels_b32_e32 - -; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(SRC0){{$}} -; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %out, i32 %in, <16 x i32> %or.val) { +; NOOPT-LABEL: extract_w_offset_salu_use_vector: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x19 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s21, 1 +; 
NOOPT-NEXT: s_add_i32 s4, s4, s21 +; NOOPT-NEXT: s_mov_b32 s5, s51 +; NOOPT-NEXT: s_mov_b32 s6, 16 +; NOOPT-NEXT: s_or_b32 s5, s5, s6 +; NOOPT-NEXT: s_mov_b32 s6, s50 +; NOOPT-NEXT: s_mov_b32 s7, 15 +; NOOPT-NEXT: s_or_b32 s6, s6, s7 +; NOOPT-NEXT: s_mov_b32 s7, s49 +; NOOPT-NEXT: s_mov_b32 s8, 14 +; NOOPT-NEXT: s_or_b32 s7, s7, s8 +; NOOPT-NEXT: s_mov_b32 s8, s48 +; NOOPT-NEXT: s_mov_b32 s9, 13 +; NOOPT-NEXT: s_or_b32 s8, s8, s9 +; NOOPT-NEXT: s_mov_b32 s9, s47 +; NOOPT-NEXT: s_mov_b32 s10, 12 +; NOOPT-NEXT: s_or_b32 s9, s9, s10 +; NOOPT-NEXT: s_mov_b32 s10, s46 +; NOOPT-NEXT: s_mov_b32 s11, 11 +; NOOPT-NEXT: s_or_b32 s10, s10, s11 +; NOOPT-NEXT: s_mov_b32 s11, s45 +; NOOPT-NEXT: s_mov_b32 s12, 10 +; NOOPT-NEXT: s_or_b32 s11, s11, s12 +; NOOPT-NEXT: s_mov_b32 s12, s44 +; NOOPT-NEXT: s_mov_b32 s13, 9 +; NOOPT-NEXT: s_or_b32 s12, s12, s13 +; NOOPT-NEXT: s_mov_b32 s13, s43 +; NOOPT-NEXT: s_mov_b32 s14, 8 +; NOOPT-NEXT: s_or_b32 s13, s13, s14 +; NOOPT-NEXT: s_mov_b32 s14, s42 +; NOOPT-NEXT: s_mov_b32 s15, 7 +; NOOPT-NEXT: s_or_b32 s14, s14, s15 +; NOOPT-NEXT: s_mov_b32 s15, s41 +; NOOPT-NEXT: s_mov_b32 s16, 6 +; NOOPT-NEXT: s_or_b32 s15, s15, s16 +; NOOPT-NEXT: s_mov_b32 s16, s40 +; NOOPT-NEXT: s_mov_b32 s17, 5 +; NOOPT-NEXT: s_or_b32 s16, s16, s17 +; NOOPT-NEXT: s_mov_b32 s17, s39 +; NOOPT-NEXT: s_mov_b32 s18, 4 +; NOOPT-NEXT: s_or_b32 s17, s17, s18 +; NOOPT-NEXT: s_mov_b32 s18, s38 +; NOOPT-NEXT: s_mov_b32 s19, 3 +; NOOPT-NEXT: s_or_b32 s18, s18, s19 +; NOOPT-NEXT: s_mov_b32 s19, s37 +; NOOPT-NEXT: s_mov_b32 s20, 2 +; NOOPT-NEXT: s_or_b32 s19, s19, s20 +; NOOPT-NEXT: s_mov_b32 s20, s36 +; NOOPT-NEXT: s_or_b32 s20, s20, s21 +; NOOPT-NEXT: v_mov_b32_e32 v0, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; 
NOOPT-NEXT: v_mov_b32_e32 v22, s11 +; NOOPT-NEXT: v_mov_b32_e32 v21, s10 +; NOOPT-NEXT: v_mov_b32_e32 v20, s9 +; NOOPT-NEXT: v_mov_b32_e32 v19, s8 +; NOOPT-NEXT: v_mov_b32_e32 v18, s7 +; NOOPT-NEXT: v_mov_b32_e32 v17, s6 +; NOOPT-NEXT: v_mov_b32_e32 v16, s5 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_w_offset_salu_use_vector: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 s20, s20, 1 +; SI-MOVREL-NEXT: s_or_b32 s4, s4, 1 +; SI-MOVREL-NEXT: s_or_b32 s19, s19, 16 +; SI-MOVREL-NEXT: s_or_b32 s18, s18, 15 +; SI-MOVREL-NEXT: s_or_b32 s17, s17, 14 +; SI-MOVREL-NEXT: s_or_b32 s16, s16, 13 +; SI-MOVREL-NEXT: s_or_b32 s15, s15, 12 +; SI-MOVREL-NEXT: s_or_b32 s14, s14, 11 +; SI-MOVREL-NEXT: s_or_b32 s13, s13, 10 +; SI-MOVREL-NEXT: s_or_b32 s12, s12, 9 +; SI-MOVREL-NEXT: s_or_b32 s11, s11, 8 +; SI-MOVREL-NEXT: s_or_b32 s10, s10, 7 +; 
SI-MOVREL-NEXT: s_or_b32 s9, s9, 6 +; SI-MOVREL-NEXT: s_or_b32 s8, s8, 5 +; SI-MOVREL-NEXT: s_or_b32 s7, s7, 4 +; SI-MOVREL-NEXT: s_or_b32 s6, s6, 3 +; SI-MOVREL-NEXT: s_or_b32 s5, s5, 2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; SI-MOVREL-NEXT: s_mov_b32 m0, s20 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extract_w_offset_salu_use_vector: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 s20, s20, 1 +; VI-MOVREL-NEXT: s_or_b32 s6, s6, 3 +; VI-MOVREL-NEXT: s_or_b32 s5, s5, 2 +; VI-MOVREL-NEXT: s_or_b32 s4, s4, 1 +; VI-MOVREL-NEXT: s_or_b32 s2, s19, 16 +; VI-MOVREL-NEXT: s_or_b32 s3, s18, 15 +; VI-MOVREL-NEXT: s_or_b32 s17, s17, 14 +; VI-MOVREL-NEXT: s_or_b32 s16, s16, 13 +; VI-MOVREL-NEXT: s_or_b32 s15, s15, 12 +; VI-MOVREL-NEXT: s_or_b32 s14, s14, 11 +; VI-MOVREL-NEXT: s_or_b32 s13, s13, 10 +; VI-MOVREL-NEXT: s_or_b32 s12, s12, 9 +; VI-MOVREL-NEXT: s_or_b32 s11, s11, 8 +; VI-MOVREL-NEXT: s_or_b32 s10, s10, 7 +; VI-MOVREL-NEXT: s_or_b32 s9, s9, 6 +; VI-MOVREL-NEXT: s_or_b32 s8, s8, 5 +; VI-MOVREL-NEXT: s_or_b32 s7, s7, 4 +; VI-MOVREL-NEXT: 
v_mov_b32_e32 v0, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; VI-MOVREL-NEXT: s_mov_b32 m0, s20 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s2 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extract_w_offset_salu_use_vector: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_i32 s20, s20, 1 +; VI-IDXMODE-NEXT: s_or_b32 s6, s6, 3 +; VI-IDXMODE-NEXT: s_or_b32 s5, s5, 2 +; VI-IDXMODE-NEXT: s_or_b32 s4, s4, 1 +; VI-IDXMODE-NEXT: s_or_b32 s2, s19, 16 +; VI-IDXMODE-NEXT: s_or_b32 s3, s18, 15 +; VI-IDXMODE-NEXT: s_or_b32 s17, s17, 14 +; VI-IDXMODE-NEXT: s_or_b32 s16, s16, 13 +; VI-IDXMODE-NEXT: s_or_b32 s15, s15, 12 +; VI-IDXMODE-NEXT: s_or_b32 s14, s14, 11 +; VI-IDXMODE-NEXT: s_or_b32 s13, s13, 10 +; VI-IDXMODE-NEXT: s_or_b32 s12, s12, 9 +; VI-IDXMODE-NEXT: s_or_b32 s11, s11, 8 +; VI-IDXMODE-NEXT: s_or_b32 s10, s10, 7 +; VI-IDXMODE-NEXT: s_or_b32 s9, s9, 6 +; VI-IDXMODE-NEXT: s_or_b32 s8, s8, 5 +; VI-IDXMODE-NEXT: s_or_b32 s7, s7, 4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 +; 
VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s2 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1 +; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_w_offset_salu_use_vector: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_add_i32 s20, s20, 1 +; GFX9-IDXMODE-NEXT: s_or_b32 s4, s4, 1 +; GFX9-IDXMODE-NEXT: s_or_b32 s2, s19, 16 +; GFX9-IDXMODE-NEXT: s_or_b32 s3, s18, 15 +; GFX9-IDXMODE-NEXT: s_or_b32 s17, s17, 14 +; GFX9-IDXMODE-NEXT: s_or_b32 s16, s16, 13 +; GFX9-IDXMODE-NEXT: s_or_b32 s15, s15, 12 +; GFX9-IDXMODE-NEXT: s_or_b32 s14, s14, 11 +; GFX9-IDXMODE-NEXT: s_or_b32 s13, s13, 10 +; GFX9-IDXMODE-NEXT: s_or_b32 s12, s12, 9 +; GFX9-IDXMODE-NEXT: s_or_b32 s11, s11, 8 +; GFX9-IDXMODE-NEXT: s_or_b32 s10, s10, 7 +; GFX9-IDXMODE-NEXT: s_or_b32 s9, s9, 6 +; GFX9-IDXMODE-NEXT: s_or_b32 s8, s8, 5 +; GFX9-IDXMODE-NEXT: s_or_b32 s7, s7, 4 +; GFX9-IDXMODE-NEXT: s_or_b32 s6, s6, 3 +; GFX9-IDXMODE-NEXT: s_or_b32 s5, s5, 2 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 
s5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s3 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s2 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %idx = add i32 %in, 1 %vec = or <16 x i32> %or.val, @@ -55,38 +493,371 @@ entry: ret void } -; GCN-LABEL: {{^}}extract_wo_offset: -; GCN-DAG: s_load_dword [[IN:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 -; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0 - -; MOVREL-DAG: s_mov_b32 m0, [[IN]] -; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]] - -; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}} -; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]] -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) { +; NOOPT-LABEL: extract_wo_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, 
s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s17, 4.0 +; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s19, 2.0 +; NOOPT-NEXT: s_mov_b32 s20, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v0, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v22, s11 +; NOOPT-NEXT: v_mov_b32_e32 v21, s10 +; NOOPT-NEXT: v_mov_b32_e32 v20, s9 +; NOOPT-NEXT: v_mov_b32_e32 v19, s8 +; NOOPT-NEXT: v_mov_b32_e32 v18, s7 +; NOOPT-NEXT: v_mov_b32_e32 v17, s6 +; NOOPT-NEXT: v_mov_b32_e32 v16, s5 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; 
NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_wo_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extract_wo_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 
0x41000000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extract_wo_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1 +; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_wo_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %elt = extractelement <16 x float> , i32 %in store float %elt, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}extract_neg_offset_sgpr: -; The offset depends on the register that holds the first element of the vector. 
-; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0 - -; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}} -; IDXMODE: v_mov_b32_e32 v14, 15 -; IDXMODE: v_mov_b32_e32 v15, 16 -; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}} -; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %offset) { +; NOOPT-LABEL: extract_neg_offset_sgpr: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 16 +; NOOPT-NEXT: s_mov_b32 s6, 15 +; NOOPT-NEXT: s_mov_b32 s7, 14 +; NOOPT-NEXT: s_mov_b32 s8, 13 +; NOOPT-NEXT: s_mov_b32 s9, 12 +; NOOPT-NEXT: s_mov_b32 s10, 11 +; NOOPT-NEXT: s_mov_b32 s11, 10 +; NOOPT-NEXT: s_mov_b32 s12, 9 +; NOOPT-NEXT: s_mov_b32 s13, 8 +; NOOPT-NEXT: s_mov_b32 s14, 7 +; NOOPT-NEXT: s_mov_b32 s15, 6 +; NOOPT-NEXT: s_mov_b32 s16, 5 +; NOOPT-NEXT: s_mov_b32 s17, 3 +; NOOPT-NEXT: s_mov_b32 s18, 2 +; NOOPT-NEXT: s_mov_b32 s19, 1 +; NOOPT-NEXT: s_mov_b32 s20, 0 +; NOOPT-NEXT: v_mov_b32_e32 v0, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v22, s11 +; NOOPT-NEXT: v_mov_b32_e32 v21, s10 +; NOOPT-NEXT: v_mov_b32_e32 v20, s9 +; NOOPT-NEXT: 
v_mov_b32_e32 v19, s8 +; NOOPT-NEXT: v_mov_b32_e32 v18, s7 +; NOOPT-NEXT: v_mov_b32_e32 v17, s6 +; NOOPT-NEXT: v_mov_b32_e32 v16, s5 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_add_i32 m0, s4, 0xfffffe00 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_neg_offset_sgpr: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 0 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 1 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 m0, s4, 0xfffffe00 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 3 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 15 +; SI-MOVREL-NEXT: v_mov_b32_e32 
v15, 16 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extract_neg_offset_sgpr: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 3 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 10 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 16 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extract_neg_offset_sgpr: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 3 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 
12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 15 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1 +; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_neg_offset_sgpr: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 2 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_addk_i32 s4, 0xfe00 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 16 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %index = add i32 %offset, -512 %value = extractelement <16 x i32> , i32 %index @@ -94,32 +865,293 @@ entry: ret void } -; GCN-LABEL: {{^}}extract_neg_offset_sgpr_loaded: -; The offset depends on the register that holds the first element of the vector. 
-; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0 - -; IDXMODE-DAG: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}} -; IDXMODE-DAG: v_mov_b32_e32 v0, -; IDXMODE: v_mov_b32_e32 v1, -; IDXMODE: v_mov_b32_e32 v2, -; IDXMODE: v_mov_b32_e32 v3, -; IDXMODE: v_mov_b32_e32 v4, -; IDXMODE: v_mov_b32_e32 v5, -; IDXMODE: v_mov_b32_e32 v6, -; IDXMODE: v_mov_b32_e32 v7, -; IDXMODE: v_mov_b32_e32 v8, -; IDXMODE: v_mov_b32_e32 v9, -; IDXMODE: v_mov_b32_e32 v10, -; IDXMODE: v_mov_b32_e32 v11, -; IDXMODE: v_mov_b32_e32 v12, -; IDXMODE: v_mov_b32_e32 v13, -; IDXMODE: v_mov_b32_e32 v14, -; IDXMODE: v_mov_b32_e32 v15, -; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}} -; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) { +; NOOPT-LABEL: extract_neg_offset_sgpr_loaded: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x19 +; NOOPT-NEXT: s_load_dwordx16 s[52:67], s[2:3], 0x29 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0x39 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s6, s67 +; NOOPT-NEXT: s_mov_b32 s5, s51 +; NOOPT-NEXT: s_or_b32 s5, s5, s6 +; NOOPT-NEXT: s_mov_b32 s7, s66 +; NOOPT-NEXT: s_mov_b32 s6, s50 +; NOOPT-NEXT: s_or_b32 s6, s6, s7 +; NOOPT-NEXT: s_mov_b32 s8, s65 +; NOOPT-NEXT: s_mov_b32 s7, s49 +; NOOPT-NEXT: s_or_b32 s7, s7, s8 +; NOOPT-NEXT: s_mov_b32 s9, s64 +; NOOPT-NEXT: s_mov_b32 s8, s48 +; NOOPT-NEXT: s_or_b32 s8, 
s8, s9 +; NOOPT-NEXT: s_mov_b32 s10, s63 +; NOOPT-NEXT: s_mov_b32 s9, s47 +; NOOPT-NEXT: s_or_b32 s9, s9, s10 +; NOOPT-NEXT: s_mov_b32 s11, s62 +; NOOPT-NEXT: s_mov_b32 s10, s46 +; NOOPT-NEXT: s_or_b32 s10, s10, s11 +; NOOPT-NEXT: s_mov_b32 s12, s61 +; NOOPT-NEXT: s_mov_b32 s11, s45 +; NOOPT-NEXT: s_or_b32 s11, s11, s12 +; NOOPT-NEXT: s_mov_b32 s13, s60 +; NOOPT-NEXT: s_mov_b32 s12, s44 +; NOOPT-NEXT: s_or_b32 s12, s12, s13 +; NOOPT-NEXT: s_mov_b32 s14, s59 +; NOOPT-NEXT: s_mov_b32 s13, s43 +; NOOPT-NEXT: s_or_b32 s13, s13, s14 +; NOOPT-NEXT: s_mov_b32 s15, s58 +; NOOPT-NEXT: s_mov_b32 s14, s42 +; NOOPT-NEXT: s_or_b32 s14, s14, s15 +; NOOPT-NEXT: s_mov_b32 s16, s57 +; NOOPT-NEXT: s_mov_b32 s15, s41 +; NOOPT-NEXT: s_or_b32 s15, s15, s16 +; NOOPT-NEXT: s_mov_b32 s17, s56 +; NOOPT-NEXT: s_mov_b32 s16, s40 +; NOOPT-NEXT: s_or_b32 s16, s16, s17 +; NOOPT-NEXT: s_mov_b32 s18, s55 +; NOOPT-NEXT: s_mov_b32 s17, s39 +; NOOPT-NEXT: s_or_b32 s17, s17, s18 +; NOOPT-NEXT: s_mov_b32 s19, s54 +; NOOPT-NEXT: s_mov_b32 s18, s38 +; NOOPT-NEXT: s_or_b32 s18, s18, s19 +; NOOPT-NEXT: s_mov_b32 s20, s53 +; NOOPT-NEXT: s_mov_b32 s19, s37 +; NOOPT-NEXT: s_or_b32 s19, s19, s20 +; NOOPT-NEXT: s_mov_b32 s21, s52 +; NOOPT-NEXT: s_mov_b32 s20, s36 +; NOOPT-NEXT: s_or_b32 s20, s20, s21 +; NOOPT-NEXT: v_mov_b32_e32 v0, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v22, s11 +; NOOPT-NEXT: v_mov_b32_e32 v21, s10 +; NOOPT-NEXT: v_mov_b32_e32 v20, s9 +; NOOPT-NEXT: v_mov_b32_e32 v19, s8 +; NOOPT-NEXT: v_mov_b32_e32 v18, s7 +; NOOPT-NEXT: v_mov_b32_e32 v17, s6 +; NOOPT-NEXT: v_mov_b32_e32 v16, s5 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_add_i32 m0, s4, 0xfffffe00 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_neg_offset_sgpr_loaded: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-MOVREL-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x29 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0x39 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_or_b32 s4, s4, s36 +; SI-MOVREL-NEXT: s_or_b32 s19, s19, s51 +; SI-MOVREL-NEXT: s_or_b32 s18, s18, s50 +; SI-MOVREL-NEXT: s_or_b32 s17, s17, s49 +; SI-MOVREL-NEXT: s_or_b32 s16, s16, s48 +; SI-MOVREL-NEXT: s_or_b32 s15, s15, s47 +; SI-MOVREL-NEXT: s_or_b32 s14, s14, s46 +; SI-MOVREL-NEXT: s_or_b32 s13, s13, s45 +; SI-MOVREL-NEXT: s_or_b32 s12, s12, s44 +; SI-MOVREL-NEXT: s_or_b32 s11, s11, s43 +; SI-MOVREL-NEXT: s_or_b32 s10, s10, s42 +; SI-MOVREL-NEXT: s_or_b32 s9, s9, s41 +; SI-MOVREL-NEXT: s_or_b32 s8, s8, s40 +; SI-MOVREL-NEXT: s_or_b32 s7, s7, s39 +; SI-MOVREL-NEXT: s_or_b32 s6, s6, s38 +; SI-MOVREL-NEXT: s_or_b32 s5, s5, s37 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; SI-MOVREL-NEXT: s_add_i32 m0, s20, 
0xfffffe00 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extract_neg_offset_sgpr_loaded: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-MOVREL-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0xe4 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_or_b32 s6, s6, s38 +; VI-MOVREL-NEXT: s_or_b32 s5, s5, s37 +; VI-MOVREL-NEXT: s_or_b32 s4, s4, s36 +; VI-MOVREL-NEXT: s_or_b32 s3, s19, s51 +; VI-MOVREL-NEXT: s_or_b32 s18, s18, s50 +; VI-MOVREL-NEXT: s_or_b32 s17, s17, s49 +; VI-MOVREL-NEXT: s_or_b32 s16, s16, s48 +; VI-MOVREL-NEXT: s_or_b32 s15, s15, s47 +; VI-MOVREL-NEXT: s_or_b32 s14, s14, s46 +; VI-MOVREL-NEXT: s_or_b32 s13, s13, s45 +; VI-MOVREL-NEXT: s_or_b32 s12, s12, s44 +; VI-MOVREL-NEXT: s_or_b32 s11, s11, s43 +; VI-MOVREL-NEXT: s_or_b32 s10, s10, s42 +; VI-MOVREL-NEXT: s_or_b32 s9, s9, s41 +; VI-MOVREL-NEXT: s_or_b32 s8, s8, s40 +; VI-MOVREL-NEXT: s_or_b32 s7, s7, s39 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; VI-MOVREL-NEXT: 
v_mov_b32_e32 v4, s8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s3 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extract_neg_offset_sgpr_loaded: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-IDXMODE-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0xe4 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_or_b32 s6, s6, s38 +; VI-IDXMODE-NEXT: s_or_b32 s5, s5, s37 +; VI-IDXMODE-NEXT: s_or_b32 s4, s4, s36 +; VI-IDXMODE-NEXT: s_or_b32 s3, s19, s51 +; VI-IDXMODE-NEXT: s_or_b32 s18, s18, s50 +; VI-IDXMODE-NEXT: s_or_b32 s17, s17, s49 +; VI-IDXMODE-NEXT: s_or_b32 s16, s16, s48 +; VI-IDXMODE-NEXT: s_or_b32 s15, s15, s47 +; VI-IDXMODE-NEXT: s_or_b32 s14, s14, s46 +; VI-IDXMODE-NEXT: s_or_b32 s13, s13, s45 +; VI-IDXMODE-NEXT: s_or_b32 s12, s12, s44 +; VI-IDXMODE-NEXT: s_or_b32 s11, s11, s43 +; VI-IDXMODE-NEXT: s_or_b32 s10, s10, s42 +; VI-IDXMODE-NEXT: s_or_b32 s9, s9, s41 +; VI-IDXMODE-NEXT: s_or_b32 s8, s8, s40 +; VI-IDXMODE-NEXT: s_or_b32 s7, s7, s39 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 +; VI-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; 
VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s18 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s3 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1 +; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_neg_offset_sgpr_loaded: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xe4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_or_b32 s4, s4, s36 +; GFX9-IDXMODE-NEXT: s_or_b32 s2, s19, s51 +; GFX9-IDXMODE-NEXT: s_or_b32 s3, s18, s50 +; GFX9-IDXMODE-NEXT: s_or_b32 s17, s17, s49 +; GFX9-IDXMODE-NEXT: s_or_b32 s16, s16, s48 +; GFX9-IDXMODE-NEXT: s_or_b32 s15, s15, s47 +; GFX9-IDXMODE-NEXT: s_or_b32 s14, s14, s46 +; GFX9-IDXMODE-NEXT: s_or_b32 s13, s13, s45 +; GFX9-IDXMODE-NEXT: s_or_b32 s12, s12, s44 +; GFX9-IDXMODE-NEXT: s_or_b32 s11, s11, s43 +; GFX9-IDXMODE-NEXT: s_or_b32 s10, s10, s42 +; GFX9-IDXMODE-NEXT: s_or_b32 s9, s9, s41 +; GFX9-IDXMODE-NEXT: s_or_b32 s8, s8, s40 +; GFX9-IDXMODE-NEXT: s_or_b32 s7, s7, s39 +; GFX9-IDXMODE-NEXT: s_or_b32 s6, s6, s38 +; GFX9-IDXMODE-NEXT: s_or_b32 s5, s5, s37 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-IDXMODE-NEXT: s_addk_i32 s20, 0xfe00 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 
v2, s6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s3 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s2 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %index = add i32 %offset, -512 %or = or <16 x i32> %vec0, %vec1 @@ -128,25 +1160,350 @@ entry: ret void } -; GCN-LABEL: {{^}}extract_neg_offset_vgpr: -; The offset depends on the register that holds the first element of the vector. 
- -; GCN: v_cmp_eq_u32_e32 -; GCN-COUNT-14: v_cndmask_b32 -; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 16 -; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { +; NOOPT-LABEL: extract_neg_offset_vgpr: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s22, -1 +; NOOPT-NEXT: s_mov_b32 s23, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s20, s20, s9 +; NOOPT-NEXT: s_addc_u32 s21, s21, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane +; NOOPT-NEXT: v_mov_b32_e32 v1, v0 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: s_waitcnt vmcnt(1) +; NOOPT-NEXT: v_writelane_b32 v0, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 1 +; NOOPT-NEXT: v_writelane_b32 v0, s2, 2 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 3 +; NOOPT-NEXT: s_mov_b32 s0, 16 +; NOOPT-NEXT: s_mov_b32 s1, 15 +; NOOPT-NEXT: s_mov_b32 s2, 14 +; NOOPT-NEXT: s_mov_b32 s3, 13 +; NOOPT-NEXT: s_mov_b32 s4, 12 +; NOOPT-NEXT: s_mov_b32 s5, 11 +; NOOPT-NEXT: s_mov_b32 s6, 10 +; NOOPT-NEXT: s_mov_b32 s7, 9 +; NOOPT-NEXT: s_mov_b32 s8, 8 +; NOOPT-NEXT: s_mov_b32 s9, 7 +; NOOPT-NEXT: s_mov_b32 s10, 6 +; NOOPT-NEXT: s_mov_b32 s11, 5 +; NOOPT-NEXT: s_mov_b32 s12, 3 +; NOOPT-NEXT: s_mov_b32 s13, 2 +; NOOPT-NEXT: s_mov_b32 s14, 1 +; 
NOOPT-NEXT: s_mov_b32 s15, 0 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v1, s15 +; NOOPT-NEXT: v_mov_b32_e32 v31, s14 +; NOOPT-NEXT: v_mov_b32_e32 v30, s13 +; NOOPT-NEXT: v_mov_b32_e32 v29, s12 +; NOOPT-NEXT: v_mov_b32_e32 v28, s11 +; NOOPT-NEXT: v_mov_b32_e32 v27, s10 +; NOOPT-NEXT: v_mov_b32_e32 v26, s9 +; NOOPT-NEXT: v_mov_b32_e32 v25, s8 +; NOOPT-NEXT: v_mov_b32_e32 v24, s7 +; NOOPT-NEXT: v_mov_b32_e32 v23, s6 +; NOOPT-NEXT: v_mov_b32_e32 v22, s5 +; NOOPT-NEXT: v_mov_b32_e32 v21, s4 +; NOOPT-NEXT: v_mov_b32_e32 v20, s3 +; NOOPT-NEXT: v_mov_b32_e32 v19, s2 +; NOOPT-NEXT: v_mov_b32_e32 v18, s1 +; NOOPT-NEXT: v_mov_b32_e32 v17, s0 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v2, v31 +; NOOPT-NEXT: v_mov_b32_e32 v3, v30 +; NOOPT-NEXT: v_mov_b32_e32 v4, v29 +; NOOPT-NEXT: v_mov_b32_e32 v5, v28 +; NOOPT-NEXT: v_mov_b32_e32 v6, v27 +; NOOPT-NEXT: v_mov_b32_e32 v7, v26 +; NOOPT-NEXT: v_mov_b32_e32 v8, v25 +; NOOPT-NEXT: v_mov_b32_e32 v9, v24 +; NOOPT-NEXT: v_mov_b32_e32 v10, v23 +; NOOPT-NEXT: v_mov_b32_e32 v11, v22 +; NOOPT-NEXT: v_mov_b32_e32 v12, v21 +; NOOPT-NEXT: v_mov_b32_e32 v13, v20 +; NOOPT-NEXT: v_mov_b32_e32 v14, v19 +; NOOPT-NEXT: v_mov_b32_e32 v15, v18 +; NOOPT-NEXT: v_mov_b32_e32 v16, v17 +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 
0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v0, s0, 4 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 5 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: ; implicit-def: $vgpr0 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 +; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:72 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload +; 
NOOPT-NEXT: buffer_load_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_add_i32 m0, s2, 0xfffffe00 +; NOOPT-NEXT: v_movrels_b32_e32 v1, v1 +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 6 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 7 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB5_1 +; NOOPT-NEXT: ; %bb.2: +; NOOPT-NEXT: s_or_saveexec_b64 
s[16:17], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 5 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 +; NOOPT-NEXT: v_readlane_b32 s2, v0, 2 +; NOOPT-NEXT: v_readlane_b32 s3, v0, 3 +; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_neg_offset_vgpr: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffe00, v0 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 2, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 6, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 7, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 8, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 9, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 10, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 
vcc, 10, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 11, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 12, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 13, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 13, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 14, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 14, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 15, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 15, v0 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v0, 16, v1, vcc +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-LABEL: extract_neg_offset_vgpr: +; VI: ; %bb.0: ; %entry +; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfffffe00, v0 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 2, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 6, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 7, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 8, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 9, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 10, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_cndmask_b32_e32 v1, 11, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 12, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v0 +; VI-NEXT: v_cndmask_b32_e32 
v1, 13, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 13, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 14, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 15, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 15, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, 16, v1, vcc +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_neg_offset_vgpr: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: v_add_u32_e32 v0, 0xfffffe00, v0 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 2, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 6, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 7, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 8, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 9, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 10, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 11, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 12, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 13, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 13, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 
14, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 15, v2, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 15, v0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v0, 16, v2, vcc +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: - %id = call i32 @llvm.amdgcn.workitem.id.x() #1 + %id = call i32 @llvm.amdgcn.workitem.id.x() %index = add i32 %id, -512 %value = extractelement <16 x i32> , i32 %index store i32 %value, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}extract_undef_offset_sgpr: ; undefined behavior, but shouldn't crash compiler define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; NOOPT-LABEL: extract_undef_offset_sgpr: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_undef_offset_sgpr: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s6, -1 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_mov_b32 s4, s2 +; SI-MOVREL-NEXT: s_mov_b32 s5, s3 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_endpgm +; +; VI-LABEL: extract_undef_offset_sgpr: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_undef_offset_sgpr: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_endpgm entry: %ld = load volatile <4 x i32>, ptr addrspace(1) %in %value = extractelement <4 x i32> %ld, i32 undef @@ -154,9 +1511,23 @@ entry: ret void } -; GCN-LABEL: {{^}}insert_undef_offset_sgpr_vector_src: ; undefined behavior, but shouldn't crash compiler define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; NOOPT-LABEL: insert_undef_offset_sgpr_vector_src: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_undef_offset_sgpr_vector_src: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_endpgm +; +; VI-LABEL: insert_undef_offset_sgpr_vector_src: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_undef_offset_sgpr_vector_src: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_endpgm entry: %ld = load <4 x i32>, ptr addrspace(1) %in %value = insertelement <4 x i32> %ld, i32 5, i32 undef @@ -164,20 +1535,276 @@ entry: ret void } -; GCN-LABEL: {{^}}insert_w_offset: -; GCN-DAG: s_load_dword [[IN0:s[0-9]+]] -; MOVREL-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1 -; MOVREL-DAG: s_mov_b32 m0, [[IN]] -; GCN-DAG: v_mov_b32_e32 v[[ELT0:[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000 -; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0 -; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000 -; GCN-DAG: v_mov_b32_e32 
v[[INS:[0-9]+]], 0x41880000 - -; MOVREL: v_movreld_b32_e32 v[[ELT0]], v[[INS]] -; MOVREL: buffer_store_dwordx4 v[[[ELT0]]:[[ELT3]]] define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { +; NOOPT-LABEL: insert_w_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 1 +; NOOPT-NEXT: s_add_i32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s17, 4.0 +; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s19, 2.0 +; NOOPT-NEXT: s_mov_b32 s20, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v7, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v6, s11 +; NOOPT-NEXT: v_mov_b32_e32 v5, s10 +; NOOPT-NEXT: v_mov_b32_e32 v4, s9 +; NOOPT-NEXT: v_mov_b32_e32 v3, s8 +; NOOPT-NEXT: v_mov_b32_e32 v2, s7 +; NOOPT-NEXT: v_mov_b32_e32 v1, s6 +; 
NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v8, v30 +; NOOPT-NEXT: v_mov_b32_e32 v9, v29 +; NOOPT-NEXT: v_mov_b32_e32 v10, v28 +; NOOPT-NEXT: v_mov_b32_e32 v11, v27 +; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: v_mov_b32_e32 v15, v23 +; NOOPT-NEXT: v_mov_b32_e32 v16, v6 +; NOOPT-NEXT: v_mov_b32_e32 v17, v5 +; NOOPT-NEXT: v_mov_b32_e32 v18, v4 +; NOOPT-NEXT: v_mov_b32_e32 v19, v3 +; NOOPT-NEXT: v_mov_b32_e32 v20, v2 +; NOOPT-NEXT: v_mov_b32_e32 v21, v1 +; NOOPT-NEXT: v_mov_b32_e32 v22, v0 +; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movreld_b32_e32 v7, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v22 +; NOOPT-NEXT: v_mov_b32_e32 v5, v21 +; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: v_mov_b32_e32 v0, v19 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: v_mov_b32_e32 v4, v14 +; NOOPT-NEXT: v_mov_b32_e32 v5, v13 +; NOOPT-NEXT: v_mov_b32_e32 v6, v12 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v11 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: v_mov_b32_e32 v4, v10 +; NOOPT-NEXT: v_mov_b32_e32 v5, v9 +; NOOPT-NEXT: v_mov_b32_e32 v6, v8 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v7 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_w_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: 
buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insert_w_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 s4, s4, 1 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-MOVREL-NEXT: s_mov_b32 m0, s4 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: 
v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_w_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_i32 s4, s4, 1 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: 
flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_w_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_add_i32 s4, s4, 1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x41880000 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v17 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; 
GFX9-IDXMODE-NEXT: s_endpgm entry: %add = add i32 %in, 1 %ins = insertelement <16 x float> , float 17.0, i32 %add @@ -185,19 +1812,276 @@ entry: ret void } -; GCN-LABEL: {{^}}insert_unsigned_base_plus_offset: -; GCN-DAG: s_load_dword [[IN:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0 -; GCN-DAG: s_and_b32 [[BASE:s[0-9]+]], [[IN]], 0xffff - -; MOVREL: s_mov_b32 m0, [[BASE]] -; MOVREL: v_movreld_b32_e32 [[ELT1]], v{{[0-9]+}} - -; IDXMODE: s_set_gpr_idx_on [[BASE]], gpr_idx(DST) -; IDXMODE-NEXT: v_mov_b32_e32 [[ELT1]], v{{[0-9]+}} -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %out, i16 %in) { +; NOOPT-LABEL: insert_unsigned_base_plus_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 0xffff +; NOOPT-NEXT: s_and_b32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s17, 4.0 +; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s19, 2.0 +; NOOPT-NEXT: s_mov_b32 s20, 1.0 +; NOOPT-NEXT: 
v_mov_b32_e32 v7, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v6, s11 +; NOOPT-NEXT: v_mov_b32_e32 v5, s10 +; NOOPT-NEXT: v_mov_b32_e32 v4, s9 +; NOOPT-NEXT: v_mov_b32_e32 v3, s8 +; NOOPT-NEXT: v_mov_b32_e32 v2, s7 +; NOOPT-NEXT: v_mov_b32_e32 v1, s6 +; NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v8, v30 +; NOOPT-NEXT: v_mov_b32_e32 v9, v29 +; NOOPT-NEXT: v_mov_b32_e32 v10, v28 +; NOOPT-NEXT: v_mov_b32_e32 v11, v27 +; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: v_mov_b32_e32 v15, v23 +; NOOPT-NEXT: v_mov_b32_e32 v16, v6 +; NOOPT-NEXT: v_mov_b32_e32 v17, v5 +; NOOPT-NEXT: v_mov_b32_e32 v18, v4 +; NOOPT-NEXT: v_mov_b32_e32 v19, v3 +; NOOPT-NEXT: v_mov_b32_e32 v20, v2 +; NOOPT-NEXT: v_mov_b32_e32 v21, v1 +; NOOPT-NEXT: v_mov_b32_e32 v22, v0 +; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movreld_b32_e32 v8, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v22 +; NOOPT-NEXT: v_mov_b32_e32 v5, v21 +; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: v_mov_b32_e32 v0, v19 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; 
NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: v_mov_b32_e32 v4, v14 +; NOOPT-NEXT: v_mov_b32_e32 v5, v13 +; NOOPT-NEXT: v_mov_b32_e32 v6, v12 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v11 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: v_mov_b32_e32 v4, v10 +; NOOPT-NEXT: v_mov_b32_e32 v5, v9 +; NOOPT-NEXT: v_mov_b32_e32 v6, v8 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v7 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_unsigned_base_plus_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_and_b32 s4, s4, 0xffff +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; 
SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insert_unsigned_base_plus_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_and_b32 s2, s4, 0xffff +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v1, 
v16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_unsigned_base_plus_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_and_b32 s2, s4, 0xffff +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; VI-IDXMODE-NEXT: 
v_mov_b32_e32 v1, v16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_unsigned_base_plus_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 
0x41600000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x41880000 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v17 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %base = zext i16 %in to i32 %add = add i32 %base, 1 @@ -206,21 +2090,281 @@ entry: ret void } -; GCN-LABEL: {{^}}insert_signed_base_plus_offset: -; GCN-DAG: s_load_dword [[IN:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0 -; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0 - -; GCN-DAG: s_sext_i32_i16 [[BASE:s[0-9]+]], [[IN]] -; GCN-DAG: s_add_i32 [[BASE_PLUS_OFFSET:s[0-9]+]], [[BASE]], 1 - -; MOVREL: s_mov_b32 m0, [[BASE_PLUS_OFFSET]] -; MOVREL: v_movreld_b32_e32 [[ELT0]], v{{[0-9]+}} - -; IDXMODE: s_set_gpr_idx_on [[BASE_PLUS_OFFSET]], gpr_idx(DST) -; IDXMODE-NEXT: v_mov_b32_e32 [[ELT0]], v{{[0-9]+}} -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, i16 %in) { +; NOOPT-LABEL: insert_signed_base_plus_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_sext_i32_i16 s4, s4 +; NOOPT-NEXT: s_mov_b32 s5, 1 +; 
NOOPT-NEXT: s_add_i32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s17, 4.0 +; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s19, 2.0 +; NOOPT-NEXT: s_mov_b32 s20, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v7, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v6, s11 +; NOOPT-NEXT: v_mov_b32_e32 v5, s10 +; NOOPT-NEXT: v_mov_b32_e32 v4, s9 +; NOOPT-NEXT: v_mov_b32_e32 v3, s8 +; NOOPT-NEXT: v_mov_b32_e32 v2, s7 +; NOOPT-NEXT: v_mov_b32_e32 v1, s6 +; NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v8, v30 +; NOOPT-NEXT: v_mov_b32_e32 v9, v29 +; NOOPT-NEXT: v_mov_b32_e32 v10, v28 +; NOOPT-NEXT: v_mov_b32_e32 v11, v27 +; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: v_mov_b32_e32 v15, v23 +; NOOPT-NEXT: v_mov_b32_e32 v16, v6 +; NOOPT-NEXT: v_mov_b32_e32 v17, v5 +; NOOPT-NEXT: v_mov_b32_e32 v18, v4 +; NOOPT-NEXT: v_mov_b32_e32 v19, v3 +; NOOPT-NEXT: v_mov_b32_e32 v20, v2 +; NOOPT-NEXT: v_mov_b32_e32 v21, v1 +; NOOPT-NEXT: v_mov_b32_e32 v22, v0 +; 
NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movreld_b32_e32 v7, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v22 +; NOOPT-NEXT: v_mov_b32_e32 v5, v21 +; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: v_mov_b32_e32 v0, v19 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: v_mov_b32_e32 v4, v14 +; NOOPT-NEXT: v_mov_b32_e32 v5, v13 +; NOOPT-NEXT: v_mov_b32_e32 v6, v12 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v11 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: v_mov_b32_e32 v4, v10 +; NOOPT-NEXT: v_mov_b32_e32 v5, v9 +; NOOPT-NEXT: v_mov_b32_e32 v6, v8 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v7 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_signed_base_plus_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: 
s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_sext_i32_i16 s4, s4 +; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insert_signed_base_plus_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_sext_i32_i16 s2, s4 +; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1 +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; 
VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_signed_base_plus_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_sext_i32_i16 s2, s4 +; VI-IDXMODE-NEXT: 
s_add_i32 s2, s2, 1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_signed_base_plus_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_sext_i32_i16 s2, s4 +; GFX9-IDXMODE-NEXT: s_add_i32 s2, s2, 1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x41880000 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v17 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %base = sext i16 %in to i32 %add = add i32 %base, 1 @@ -229,35 +2373,553 @@ entry: ret void } - -; GCN-LABEL: {{^}}insert_wo_offset: -; GCN: s_load_dword [[IN:s[0-9]+]] - -; MOVREL: s_mov_b32 m0, [[IN]] -; MOVREL: v_movreld_b32_e32 v[[ELT0:[0-9]+]] - -; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(DST) -; IDXMODE-NEXT: v_mov_b32_e32 v[[ELT0:[0-9]+]], v{{[0-9]+}} -; IDXMODE-NEXT: s_set_gpr_idx_off - -; GCN: buffer_store_dwordx4 v[[[ELT0]]: +; Make sure that TwoAddressInstructions keeps src0 as subregister sub0 +; of the tied implicit use and def of the 
super register. define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { +; NOOPT-LABEL: insert_wo_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s6, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s7, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s8, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s17, 4.0 +; NOOPT-NEXT: s_mov_b32 s18, 0x40400000 +; NOOPT-NEXT: s_mov_b32 s19, 2.0 +; NOOPT-NEXT: s_mov_b32 s20, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v7, s20 +; NOOPT-NEXT: v_mov_b32_e32 v30, s19 +; NOOPT-NEXT: v_mov_b32_e32 v29, s18 +; NOOPT-NEXT: v_mov_b32_e32 v28, s17 +; NOOPT-NEXT: v_mov_b32_e32 v27, s16 +; NOOPT-NEXT: v_mov_b32_e32 v26, s15 +; NOOPT-NEXT: v_mov_b32_e32 v25, s14 +; NOOPT-NEXT: v_mov_b32_e32 v24, s13 +; NOOPT-NEXT: v_mov_b32_e32 v23, s12 +; NOOPT-NEXT: v_mov_b32_e32 v6, s11 +; NOOPT-NEXT: v_mov_b32_e32 v5, s10 +; NOOPT-NEXT: v_mov_b32_e32 v4, s9 +; NOOPT-NEXT: v_mov_b32_e32 v3, s8 +; NOOPT-NEXT: v_mov_b32_e32 v2, s7 +; NOOPT-NEXT: v_mov_b32_e32 v1, s6 +; NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def 
$vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v8, v30 +; NOOPT-NEXT: v_mov_b32_e32 v9, v29 +; NOOPT-NEXT: v_mov_b32_e32 v10, v28 +; NOOPT-NEXT: v_mov_b32_e32 v11, v27 +; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: v_mov_b32_e32 v15, v23 +; NOOPT-NEXT: v_mov_b32_e32 v16, v6 +; NOOPT-NEXT: v_mov_b32_e32 v17, v5 +; NOOPT-NEXT: v_mov_b32_e32 v18, v4 +; NOOPT-NEXT: v_mov_b32_e32 v19, v3 +; NOOPT-NEXT: v_mov_b32_e32 v20, v2 +; NOOPT-NEXT: v_mov_b32_e32 v21, v1 +; NOOPT-NEXT: v_mov_b32_e32 v22, v0 +; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movreld_b32_e32 v7, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v22 +; NOOPT-NEXT: v_mov_b32_e32 v5, v21 +; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: v_mov_b32_e32 v0, v19 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: v_mov_b32_e32 v4, v14 +; NOOPT-NEXT: v_mov_b32_e32 v5, v13 +; NOOPT-NEXT: v_mov_b32_e32 v6, v12 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v11 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: 
v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: v_mov_b32_e32 v4, v10 +; NOOPT-NEXT: v_mov_b32_e32 v5, v9 +; NOOPT-NEXT: v_mov_b32_e32 v6, v8 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v7 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_wo_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; 
SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insert_wo_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_wo_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x41880000 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: 
s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_wo_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x41880000 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v17 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %ins = insertelement <16 x float> , float 17.0, i32 %in store <16 x float> %ins, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}insert_neg_offset_sgpr: -; The offset depends on the register that holds the first element of the 
vector. -; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; MOVREL: v_movreld_b32_e32 v0, 16 - -; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}} -; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST) -; IDXMODE-NEXT: v_mov_b32_e32 v0, 16 -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %offset) { +; NOOPT-LABEL: insert_neg_offset_sgpr: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 15 +; NOOPT-NEXT: s_mov_b32 s6, 14 +; NOOPT-NEXT: s_mov_b32 s7, 13 +; NOOPT-NEXT: s_mov_b32 s8, 12 +; NOOPT-NEXT: s_mov_b32 s9, 11 +; NOOPT-NEXT: s_mov_b32 s10, 10 +; NOOPT-NEXT: s_mov_b32 s11, 9 +; NOOPT-NEXT: s_mov_b32 s12, 8 +; NOOPT-NEXT: s_mov_b32 s13, 7 +; NOOPT-NEXT: s_mov_b32 s14, 6 +; NOOPT-NEXT: s_mov_b32 s15, 5 +; NOOPT-NEXT: s_mov_b32 s16, 4 +; NOOPT-NEXT: s_mov_b32 s17, 3 +; NOOPT-NEXT: s_mov_b32 s18, 2 +; NOOPT-NEXT: s_mov_b32 s19, 1 +; NOOPT-NEXT: s_mov_b32 s20, 0 +; NOOPT-NEXT: v_mov_b32_e32 v15, s20 +; NOOPT-NEXT: v_mov_b32_e32 v14, s19 +; NOOPT-NEXT: v_mov_b32_e32 v13, s18 +; NOOPT-NEXT: v_mov_b32_e32 v12, s17 +; NOOPT-NEXT: v_mov_b32_e32 v11, s16 +; NOOPT-NEXT: v_mov_b32_e32 v10, s15 +; NOOPT-NEXT: v_mov_b32_e32 v9, s14 +; NOOPT-NEXT: v_mov_b32_e32 v8, s13 +; NOOPT-NEXT: v_mov_b32_e32 v7, s12 +; NOOPT-NEXT: v_mov_b32_e32 v6, s11 +; NOOPT-NEXT: v_mov_b32_e32 v5, s10 +; NOOPT-NEXT: v_mov_b32_e32 v4, s9 +; NOOPT-NEXT: v_mov_b32_e32 v3, s8 +; NOOPT-NEXT: v_mov_b32_e32 v2, s7 +; NOOPT-NEXT: 
v_mov_b32_e32 v1, s6 +; NOOPT-NEXT: v_mov_b32_e32 v0, s5 +; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v16, v14 +; NOOPT-NEXT: v_mov_b32_e32 v17, v13 +; NOOPT-NEXT: v_mov_b32_e32 v18, v12 +; NOOPT-NEXT: v_mov_b32_e32 v19, v11 +; NOOPT-NEXT: v_mov_b32_e32 v20, v10 +; NOOPT-NEXT: v_mov_b32_e32 v21, v9 +; NOOPT-NEXT: v_mov_b32_e32 v22, v8 +; NOOPT-NEXT: v_mov_b32_e32 v23, v7 +; NOOPT-NEXT: v_mov_b32_e32 v24, v6 +; NOOPT-NEXT: v_mov_b32_e32 v25, v5 +; NOOPT-NEXT: v_mov_b32_e32 v26, v4 +; NOOPT-NEXT: v_mov_b32_e32 v27, v3 +; NOOPT-NEXT: v_mov_b32_e32 v28, v2 +; NOOPT-NEXT: v_mov_b32_e32 v29, v1 +; NOOPT-NEXT: v_mov_b32_e32 v30, v0 +; NOOPT-NEXT: v_mov_b32_e32 v0, 16 +; NOOPT-NEXT: s_add_i32 m0, s4, 0xfffffe00 +; NOOPT-NEXT: v_movreld_b32_e32 v15, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: v_mov_b32_e32 v1, v22 +; NOOPT-NEXT: v_mov_b32_e32 v2, v21 +; NOOPT-NEXT: v_mov_b32_e32 v3, v20 +; NOOPT-NEXT: v_mov_b32_e32 v7, v19 +; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v30 +; NOOPT-NEXT: v_mov_b32_e32 v10, v29 +; NOOPT-NEXT: v_mov_b32_e32 v11, v28 +; NOOPT-NEXT: v_mov_b32_e32 v15, v27 +; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v16, v11 +; NOOPT-NEXT: v_mov_b32_e32 v17, v10 +; NOOPT-NEXT: v_mov_b32_e32 v18, v9 +; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v9, v14 +; NOOPT-NEXT: v_mov_b32_e32 v10, v13 +; NOOPT-NEXT: v_mov_b32_e32 v11, 
v12 +; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v8, v3 +; NOOPT-NEXT: v_mov_b32_e32 v9, v2 +; NOOPT-NEXT: v_mov_b32_e32 v10, v1 +; NOOPT-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_neg_offset_sgpr: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 1 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 3 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 4 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 15 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 m0, s4, 0xfffffe00 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, 16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm 
+; +; VI-MOVREL-LABEL: insert_neg_offset_sgpr: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 3 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 10 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_neg_offset_sgpr: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-IDXMODE-NEXT: 
s_load_dword s2, s[2:3], 0x34 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 15 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_neg_offset_sgpr: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dword s4, 
s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 2 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_addk_i32 s4, 0xfe00 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, v14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, v13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, v11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, v10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 16 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %index = add i32 %offset, -512 %value = insertelement 
<16 x i32> , i32 16, i32 %index @@ -267,17 +2929,239 @@ entry: ; The vector indexed into is originally loaded into an SGPR rather ; than built with a reg_sequence - -; GCN-LABEL: {{^}}insert_neg_offset_sgpr_loadreg: -; The offset depends on the register that holds the first element of the vector. -; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; MOVREL: v_movreld_b32_e32 v0, 5 - -; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}} -; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST) -; IDXMODE-NEXT: v_mov_b32_e32 v0, 5 -; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ptr addrspace(1) %out, <16 x i32> %vec, i32 %offset) { +; NOOPT-LABEL: insert_neg_offset_sgpr_loadreg: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x19 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0x29 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: v_mov_b32_e32 v0, 5 +; NOOPT-NEXT: v_mov_b32_e32 v30, s23 +; NOOPT-NEXT: v_mov_b32_e32 v29, s22 +; NOOPT-NEXT: v_mov_b32_e32 v28, s21 +; NOOPT-NEXT: v_mov_b32_e32 v27, s20 +; NOOPT-NEXT: v_mov_b32_e32 v26, s19 +; NOOPT-NEXT: v_mov_b32_e32 v25, s18 +; NOOPT-NEXT: v_mov_b32_e32 v24, s17 +; NOOPT-NEXT: v_mov_b32_e32 v23, s16 +; NOOPT-NEXT: v_mov_b32_e32 v22, s15 +; NOOPT-NEXT: v_mov_b32_e32 v21, s14 +; NOOPT-NEXT: v_mov_b32_e32 v20, s13 +; NOOPT-NEXT: v_mov_b32_e32 v19, s12 +; NOOPT-NEXT: v_mov_b32_e32 v18, s11 +; NOOPT-NEXT: v_mov_b32_e32 v17, s10 +; NOOPT-NEXT: v_mov_b32_e32 v16, s9 +; NOOPT-NEXT: v_mov_b32_e32 v15, s8 +; NOOPT-NEXT: s_add_i32 m0, s4, 0xfffffe00 
+; NOOPT-NEXT: v_movreld_b32_e32 v15, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: v_mov_b32_e32 v1, v22 +; NOOPT-NEXT: v_mov_b32_e32 v2, v21 +; NOOPT-NEXT: v_mov_b32_e32 v3, v20 +; NOOPT-NEXT: v_mov_b32_e32 v7, v19 +; NOOPT-NEXT: v_mov_b32_e32 v12, v26 +; NOOPT-NEXT: v_mov_b32_e32 v13, v25 +; NOOPT-NEXT: v_mov_b32_e32 v14, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v30 +; NOOPT-NEXT: v_mov_b32_e32 v10, v29 +; NOOPT-NEXT: v_mov_b32_e32 v11, v28 +; NOOPT-NEXT: v_mov_b32_e32 v15, v27 +; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v16, v11 +; NOOPT-NEXT: v_mov_b32_e32 v17, v10 +; NOOPT-NEXT: v_mov_b32_e32 v18, v9 +; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v9, v14 +; NOOPT-NEXT: v_mov_b32_e32 v10, v13 +; NOOPT-NEXT: v_mov_b32_e32 v11, v12 +; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v8, v3 +; NOOPT-NEXT: v_mov_b32_e32 v9, v2 +; NOOPT-NEXT: v_mov_b32_e32 v10, v1 +; NOOPT-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_neg_offset_sgpr_loadreg: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x29 +; SI-MOVREL-NEXT: 
s_load_dwordx2 s[20:21], s[2:3], 0xb +; SI-MOVREL-NEXT: s_mov_b32 s23, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s22, -1 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; SI-MOVREL-NEXT: s_add_i32 m0, s0, 0xfffffe00 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, 5 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insert_neg_offset_sgpr_loadreg: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0xa4 +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; VI-MOVREL-NEXT: s_add_i32 m0, s20, 0xfffffe00 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 
s11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 5 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_neg_offset_sgpr_loadreg: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4 +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; VI-IDXMODE-NEXT: s_addk_i32 s20, 0xfe00 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; 
VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s18 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s19 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 5 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_neg_offset_sgpr_loadreg: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 
s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-IDXMODE-NEXT: s_addk_i32 s20, 0xfe00 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 5 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %index = add i32 %offset, -512 %value = insertelement <16 x i32> %vec, i32 5, i32 %index @@ -285,29 +3169,885 @@ entry: ret void } -; GCN-LABEL: {{^}}insert_neg_offset_vgpr: -; The offset depends on the register that holds the first element of the vector. 
- -; GCN: v_cmp_eq_u32_e32 -; GCN-COUNT-16: v_cndmask_b32 -; GCN-COUNT-4: buffer_store_dwordx4 define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; NOOPT-LABEL: insert_neg_offset_vgpr: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s22, -1 +; NOOPT-NEXT: s_mov_b32 s23, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s20, s20, s9 +; NOOPT-NEXT: s_addc_u32 s21, s21, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: v_writelane_b32 v16, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 1 +; NOOPT-NEXT: v_writelane_b32 v16, s2, 2 +; NOOPT-NEXT: v_writelane_b32 v16, s3, 3 +; NOOPT-NEXT: s_mov_b32 s0, 16 +; NOOPT-NEXT: s_mov_b32 s1, 15 +; NOOPT-NEXT: s_mov_b32 s2, 14 +; NOOPT-NEXT: s_mov_b32 s3, 13 +; NOOPT-NEXT: s_mov_b32 s4, 12 +; NOOPT-NEXT: s_mov_b32 s5, 11 +; NOOPT-NEXT: s_mov_b32 s6, 10 +; NOOPT-NEXT: s_mov_b32 s7, 9 +; NOOPT-NEXT: s_mov_b32 s8, 8 +; NOOPT-NEXT: s_mov_b32 s9, 7 +; NOOPT-NEXT: s_mov_b32 s10, 6 +; NOOPT-NEXT: s_mov_b32 s11, 5 +; NOOPT-NEXT: s_mov_b32 s12, 4 +; NOOPT-NEXT: s_mov_b32 s13, 3 +; NOOPT-NEXT: s_mov_b32 s14, 2 +; NOOPT-NEXT: s_mov_b32 s15, 1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, s15 +; NOOPT-NEXT: v_mov_b32_e32 v31, s14 +; NOOPT-NEXT: v_mov_b32_e32 v30, s13 +; NOOPT-NEXT: v_mov_b32_e32 v29, s12 +; NOOPT-NEXT: v_mov_b32_e32 v28, s11 +; 
NOOPT-NEXT: v_mov_b32_e32 v27, s10 +; NOOPT-NEXT: v_mov_b32_e32 v26, s9 +; NOOPT-NEXT: v_mov_b32_e32 v25, s8 +; NOOPT-NEXT: v_mov_b32_e32 v24, s7 +; NOOPT-NEXT: v_mov_b32_e32 v23, s6 +; NOOPT-NEXT: v_mov_b32_e32 v22, s5 +; NOOPT-NEXT: v_mov_b32_e32 v21, s4 +; NOOPT-NEXT: v_mov_b32_e32 v20, s3 +; NOOPT-NEXT: v_mov_b32_e32 v19, s2 +; NOOPT-NEXT: v_mov_b32_e32 v18, s1 +; NOOPT-NEXT: v_mov_b32_e32 v17, s0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v31 +; NOOPT-NEXT: v_mov_b32_e32 v2, v30 +; NOOPT-NEXT: v_mov_b32_e32 v3, v29 +; NOOPT-NEXT: v_mov_b32_e32 v4, v28 +; NOOPT-NEXT: v_mov_b32_e32 v5, v27 +; NOOPT-NEXT: v_mov_b32_e32 v6, v26 +; NOOPT-NEXT: v_mov_b32_e32 v7, v25 +; NOOPT-NEXT: v_mov_b32_e32 v8, v24 +; NOOPT-NEXT: v_mov_b32_e32 v9, v23 +; NOOPT-NEXT: v_mov_b32_e32 v10, v22 +; NOOPT-NEXT: v_mov_b32_e32 v11, v21 +; NOOPT-NEXT: v_mov_b32_e32 v12, v20 +; NOOPT-NEXT: v_mov_b32_e32 v13, v19 +; NOOPT-NEXT: v_mov_b32_e32 v14, v18 +; NOOPT-NEXT: v_mov_b32_e32 v15, v17 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:80 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:84 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:88 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:92 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:96 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:100 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:104 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, 
s[20:23], 0 offset:108 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:112 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:116 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:120 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:124 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:132 ; 4-byte Folded Spill +; NOOPT-NEXT: v_mov_b32_e32 v17, 33 +; NOOPT-NEXT: buffer_store_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v16, s0, 4 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 5 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt expcnt(2) +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:40 ; 
4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, 
s[20:23], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_add_i32 m0, s2, 0xfffffe00 +; NOOPT-NEXT: v_movreld_b32_e32 v1, v17 +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:148 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:152 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:160 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:164 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:168 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:172 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:176 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:180 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:184 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:188 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:192 ; 
4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:196 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:200 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 6 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 7 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; 
NOOPT-NEXT: s_cbranch_execnz .LBB14_1 +; NOOPT-NEXT: ; %bb.2: +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 5 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 +; NOOPT-NEXT: v_readlane_b32 s2, v0, 2 +; NOOPT-NEXT: v_readlane_b32 s3, v0, 3 +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v19, off, s[20:23], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v20, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v22, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v23, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v24, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v26, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v27, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v28, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: 
buffer_load_dword v29, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v30, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v5, v19 +; NOOPT-NEXT: v_mov_b32_e32 v6, v18 +; NOOPT-NEXT: v_mov_b32_e32 v7, v17 +; NOOPT-NEXT: v_mov_b32_e32 v1, v16 +; NOOPT-NEXT: v_mov_b32_e32 v2, v23 +; NOOPT-NEXT: v_mov_b32_e32 v3, v22 +; NOOPT-NEXT: v_mov_b32_e32 v4, v21 +; NOOPT-NEXT: v_mov_b32_e32 v8, v20 +; NOOPT-NEXT: v_mov_b32_e32 v13, v27 +; NOOPT-NEXT: v_mov_b32_e32 v14, v26 +; NOOPT-NEXT: v_mov_b32_e32 v15, v25 +; NOOPT-NEXT: v_mov_b32_e32 v9, v24 +; NOOPT-NEXT: v_mov_b32_e32 v10, v31 +; NOOPT-NEXT: v_mov_b32_e32 v11, v30 +; NOOPT-NEXT: v_mov_b32_e32 v12, v29 +; NOOPT-NEXT: v_mov_b32_e32 v16, v28 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17_vgpr18_vgpr19 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v17, v12 +; NOOPT-NEXT: v_mov_b32_e32 v18, v11 +; NOOPT-NEXT: v_mov_b32_e32 v19, v10 +; NOOPT-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v10, v15 +; NOOPT-NEXT: v_mov_b32_e32 v11, v14 +; NOOPT-NEXT: v_mov_b32_e32 v12, v13 +; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; 
NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v9, v4 +; NOOPT-NEXT: v_mov_b32_e32 v10, v3 +; NOOPT-NEXT: v_mov_b32_e32 v11, v2 +; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v2, v7 +; NOOPT-NEXT: v_mov_b32_e32 v3, v6 +; NOOPT-NEXT: v_mov_b32_e32 v4, v5 +; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_neg_offset_vgpr: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: v_add_i32_e32 v12, vcc, 0xfffffe00, v0 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 4, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 3, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v1, 2, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v0, 1, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v7, 8, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v6, 7, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v5, 6, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v4, 5, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v11, 12, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v10, 11, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v9, 10, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 +; SI-MOVREL-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v14, 15, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v13, 14, 33, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v12, 13, 33, vcc +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-LABEL: insert_neg_offset_vgpr: +; VI: ; %bb.0: ; %entry +; VI-NEXT: v_add_u32_e32 v12, vcc, 0xfffffe00, v0 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; VI-NEXT: v_cndmask_b32_e64 v3, 4, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12 +; VI-NEXT: v_cndmask_b32_e64 v2, 3, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; VI-NEXT: v_cndmask_b32_e64 v1, 2, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; VI-NEXT: v_cndmask_b32_e64 v0, 1, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12 +; VI-NEXT: v_cndmask_b32_e64 v7, 8, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 +; VI-NEXT: v_cndmask_b32_e64 v6, 7, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: v_cndmask_b32_e64 v5, 6, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; VI-NEXT: v_cndmask_b32_e64 v4, 5, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 +; VI-NEXT: v_cndmask_b32_e64 v11, 12, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 +; VI-NEXT: v_cndmask_b32_e64 v10, 11, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 
+; VI-NEXT: v_cndmask_b32_e64 v9, 10, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 +; VI-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s0, 48 +; VI-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_cndmask_b32_e64 v14, 15, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 +; VI-NEXT: v_mov_b32_e32 v17, s3 +; VI-NEXT: v_cndmask_b32_e64 v13, 14, 33, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 +; VI-NEXT: v_mov_b32_e32 v16, s2 +; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: v_cndmask_b32_e64 v12, 13, 33, vcc +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v13, s3 +; VI-NEXT: v_mov_b32_e32 v12, s2 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v9, s3 +; VI-NEXT: v_mov_b32_e32 v8, s2 +; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_neg_offset_vgpr: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: v_add_u32_e32 v12, 0xfffffe00, v0 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 4, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v2, 3, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v1, 2, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v0, 1, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v7, 8, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 +; 
GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v6, 7, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v5, 6, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 5, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v11, 12, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v10, 11, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v9, 10, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v14, 15, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v13, 14, 33, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v12, 13, 33, vcc +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: - %id = call i32 @llvm.amdgcn.workitem.id.x() #1 + %id = call i32 @llvm.amdgcn.workitem.id.x() %index = add i32 %id, -512 %value = insertelement <16 x i32> , i32 33, i32 %index store <16 x i32> %value, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}insert_neg_inline_offset_vgpr: - -; GCN: v_cmp_eq_u32_e32 -; GCN-COUNT-16: v_cndmask_b32 -; GCN-COUNT-4: buffer_store_dwordx4 define amdgpu_kernel 
void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; NOOPT-LABEL: insert_neg_inline_offset_vgpr: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s22, -1 +; NOOPT-NEXT: s_mov_b32 s23, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s20, s20, s9 +; NOOPT-NEXT: s_addc_u32 s21, s21, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: v_writelane_b32 v16, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 1 +; NOOPT-NEXT: v_writelane_b32 v16, s2, 2 +; NOOPT-NEXT: v_writelane_b32 v16, s3, 3 +; NOOPT-NEXT: s_mov_b32 s0, 16 +; NOOPT-NEXT: s_mov_b32 s1, 15 +; NOOPT-NEXT: s_mov_b32 s2, 14 +; NOOPT-NEXT: s_mov_b32 s3, 13 +; NOOPT-NEXT: s_mov_b32 s4, 12 +; NOOPT-NEXT: s_mov_b32 s5, 11 +; NOOPT-NEXT: s_mov_b32 s6, 10 +; NOOPT-NEXT: s_mov_b32 s7, 9 +; NOOPT-NEXT: s_mov_b32 s8, 8 +; NOOPT-NEXT: s_mov_b32 s9, 7 +; NOOPT-NEXT: s_mov_b32 s10, 6 +; NOOPT-NEXT: s_mov_b32 s11, 5 +; NOOPT-NEXT: s_mov_b32 s12, 4 +; NOOPT-NEXT: s_mov_b32 s13, 3 +; NOOPT-NEXT: s_mov_b32 s14, 2 +; NOOPT-NEXT: s_mov_b32 s15, 1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, s15 +; NOOPT-NEXT: v_mov_b32_e32 v31, s14 +; NOOPT-NEXT: v_mov_b32_e32 v30, s13 +; NOOPT-NEXT: v_mov_b32_e32 v29, s12 +; NOOPT-NEXT: v_mov_b32_e32 v28, s11 +; NOOPT-NEXT: v_mov_b32_e32 v27, s10 +; NOOPT-NEXT: v_mov_b32_e32 v26, s9 +; NOOPT-NEXT: v_mov_b32_e32 v25, s8 
+; NOOPT-NEXT: v_mov_b32_e32 v24, s7 +; NOOPT-NEXT: v_mov_b32_e32 v23, s6 +; NOOPT-NEXT: v_mov_b32_e32 v22, s5 +; NOOPT-NEXT: v_mov_b32_e32 v21, s4 +; NOOPT-NEXT: v_mov_b32_e32 v20, s3 +; NOOPT-NEXT: v_mov_b32_e32 v19, s2 +; NOOPT-NEXT: v_mov_b32_e32 v18, s1 +; NOOPT-NEXT: v_mov_b32_e32 v17, s0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v31 +; NOOPT-NEXT: v_mov_b32_e32 v2, v30 +; NOOPT-NEXT: v_mov_b32_e32 v3, v29 +; NOOPT-NEXT: v_mov_b32_e32 v4, v28 +; NOOPT-NEXT: v_mov_b32_e32 v5, v27 +; NOOPT-NEXT: v_mov_b32_e32 v6, v26 +; NOOPT-NEXT: v_mov_b32_e32 v7, v25 +; NOOPT-NEXT: v_mov_b32_e32 v8, v24 +; NOOPT-NEXT: v_mov_b32_e32 v9, v23 +; NOOPT-NEXT: v_mov_b32_e32 v10, v22 +; NOOPT-NEXT: v_mov_b32_e32 v11, v21 +; NOOPT-NEXT: v_mov_b32_e32 v12, v20 +; NOOPT-NEXT: v_mov_b32_e32 v13, v19 +; NOOPT-NEXT: v_mov_b32_e32 v14, v18 +; NOOPT-NEXT: v_mov_b32_e32 v15, v17 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:80 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:84 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:88 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:92 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:96 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:100 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:104 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:108 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:112 
; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:116 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:120 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:124 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:132 ; 4-byte Folded Spill +; NOOPT-NEXT: v_mov_b32_e32 v17, 0x1f4 +; NOOPT-NEXT: buffer_store_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v16, s0, 4 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 5 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt expcnt(2) +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill +; 
NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:52 
; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_add_i32 m0, s2, -16 +; NOOPT-NEXT: v_movreld_b32_e32 v1, v17 +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:148 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:152 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:160 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:164 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:168 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:172 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:176 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:180 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:184 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:188 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:192 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:196 ; 4-byte Folded Spill +; 
NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:200 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 6 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 7 +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB15_1 +; NOOPT-NEXT: ; %bb.2: +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; 
NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 5 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[16:17] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 +; NOOPT-NEXT: v_readlane_b32 s2, v0, 2 +; NOOPT-NEXT: v_readlane_b32 s3, v0, 3 +; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v19, off, s[20:23], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v20, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v22, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v23, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v24, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v26, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v27, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v28, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v29, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v30, off, 
s[20:23], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v5, v19 +; NOOPT-NEXT: v_mov_b32_e32 v6, v18 +; NOOPT-NEXT: v_mov_b32_e32 v7, v17 +; NOOPT-NEXT: v_mov_b32_e32 v1, v16 +; NOOPT-NEXT: v_mov_b32_e32 v2, v23 +; NOOPT-NEXT: v_mov_b32_e32 v3, v22 +; NOOPT-NEXT: v_mov_b32_e32 v4, v21 +; NOOPT-NEXT: v_mov_b32_e32 v8, v20 +; NOOPT-NEXT: v_mov_b32_e32 v13, v27 +; NOOPT-NEXT: v_mov_b32_e32 v14, v26 +; NOOPT-NEXT: v_mov_b32_e32 v15, v25 +; NOOPT-NEXT: v_mov_b32_e32 v9, v24 +; NOOPT-NEXT: v_mov_b32_e32 v10, v31 +; NOOPT-NEXT: v_mov_b32_e32 v11, v30 +; NOOPT-NEXT: v_mov_b32_e32 v12, v29 +; NOOPT-NEXT: v_mov_b32_e32 v16, v28 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17_vgpr18_vgpr19 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v17, v12 +; NOOPT-NEXT: v_mov_b32_e32 v18, v11 +; NOOPT-NEXT: v_mov_b32_e32 v19, v10 +; NOOPT-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v10, v15 +; NOOPT-NEXT: v_mov_b32_e32 v11, v14 +; NOOPT-NEXT: v_mov_b32_e32 v12, v13 +; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v9, v4 +; NOOPT-NEXT: v_mov_b32_e32 v10, v3 +; NOOPT-NEXT: 
v_mov_b32_e32 v11, v2 +; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v2, v7 +; NOOPT-NEXT: v_mov_b32_e32 v3, v6 +; NOOPT-NEXT: v_mov_b32_e32 v4, v5 +; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_neg_inline_offset_vgpr: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: v_add_i32_e32 v12, vcc, -16, v0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x1f4 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, 4, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, 3, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 2, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v0, 1, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 8, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 7, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 6, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, 5, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, 12, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 11, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 10, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-MOVREL-NEXT: v_cndmask_b32_e32 
v8, 9, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, 16, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 15, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 14, v16, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 13, v16, vcc +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-LABEL: insert_neg_inline_offset_vgpr: +; VI: ; %bb.0: ; %entry +; VI-NEXT: v_add_u32_e32 v12, vcc, -16, v0 +; VI-NEXT: v_mov_b32_e32 v16, 0x1f4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; VI-NEXT: v_cndmask_b32_e32 v3, 4, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12 +; VI-NEXT: v_cndmask_b32_e32 v2, 3, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; VI-NEXT: v_cndmask_b32_e32 v1, 2, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; VI-NEXT: v_cndmask_b32_e32 v0, 1, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12 +; VI-NEXT: v_cndmask_b32_e32 v7, 8, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 +; VI-NEXT: v_cndmask_b32_e32 v6, 7, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 +; VI-NEXT: v_cndmask_b32_e32 v5, 6, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; VI-NEXT: v_cndmask_b32_e32 v4, 5, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: v_cndmask_b32_e32 v11, 12, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 +; VI-NEXT: v_cndmask_b32_e32 v10, 11, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 +; VI-NEXT: 
v_cndmask_b32_e32 v9, 10, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 +; VI-NEXT: v_cndmask_b32_e32 v8, 9, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 +; VI-NEXT: v_cndmask_b32_e32 v15, 16, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 +; VI-NEXT: v_cndmask_b32_e32 v14, 15, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s0, 48 +; VI-NEXT: v_cndmask_b32_e32 v13, 14, v16, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_cndmask_b32_e32 v12, 13, v16, vcc +; VI-NEXT: v_mov_b32_e32 v17, s3 +; VI-NEXT: v_mov_b32_e32 v16, s2 +; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v13, s3 +; VI-NEXT: v_mov_b32_e32 v12, s2 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v9, s3 +; VI-NEXT: v_mov_b32_e32 v8, s2 +; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_neg_inline_offset_vgpr: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: v_add_u32_e32 v12, -16, v0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x1f4 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, 4, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 3, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v1, 2, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v0, 1, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 8, v17, vcc +; GFX9-IDXMODE-NEXT: 
v_cmp_eq_u32_e32 vcc, 6, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 7, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, 6, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, 5, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, 12, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, 11, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 10, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 9, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 16, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 15, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 14, v17, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, 13, v17, vcc +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm entry: - %id = call i32 @llvm.amdgcn.workitem.id.x() #1 + %id = call i32 @llvm.amdgcn.workitem.id.x() %index = add i32 %id, -16 %value = insertelement <16 x i32> , i32 500, i32 %index store <16 x i32> %value, ptr addrspace(1) %out @@ -316,19 +4056,646 @@ entry: ; When the block is split to insert the loop, make sure any other ; places that need to be expanded in 
the same block are also handled. - -; GCN-LABEL: {{^}}extract_vgpr_offset_multiple_in_block: - -; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] -; GCN: v_cmp_eq_u32 -; GCN: v_cndmask_b32_e64 [[RESULT0:v[0-9]+]], 16, -; GCN: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 16, - -; GCN: buffer_store_dword [[RESULT0]] -; GCN: buffer_store_dword [[RESULT1]] -define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) #0 { +define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) { +; NOOPT-LABEL: extract_vgpr_offset_multiple_in_block: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s38, -1 +; NOOPT-NEXT: s_mov_b32 s39, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s36, s36, s9 +; NOOPT-NEXT: s_addc_u32 s37, s37, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane +; NOOPT-NEXT: s_mov_b64 s[0:1], s[2:3] +; NOOPT-NEXT: v_mov_b32_e32 v1, v0 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s8, s3 +; NOOPT-NEXT: s_mov_b32 s4, s2 +; NOOPT-NEXT: s_mov_b32 s2, 0xf000 +; NOOPT-NEXT: s_mov_b32 s3, -1 +; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b32 s5, s8 +; NOOPT-NEXT: s_mov_b32 s6, s3 +; NOOPT-NEXT: s_mov_b32 s7, s2 +; NOOPT-NEXT: s_waitcnt vmcnt(1) +; NOOPT-NEXT: v_writelane_b32 v0, s4, 0 +; NOOPT-NEXT: v_writelane_b32 v0, s5, 1 +; NOOPT-NEXT: v_writelane_b32 v0, s6, 2 +; NOOPT-NEXT: v_writelane_b32 v0, s7, 3 
+; NOOPT-NEXT: s_mov_b32 s4, 0 +; NOOPT-NEXT: v_writelane_b32 v0, s4, 4 +; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; NOOPT-NEXT: s_mov_b32 s5, s2 +; NOOPT-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] +; NOOPT-NEXT: s_mov_b32 s4, 2 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_lshlrev_b32_e64 v1, s4, v1 +; NOOPT-NEXT: s_mov_b32 s4, 0 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: v_mov_b32_e32 v3, 0 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v2, v3 +; NOOPT-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b32 s0, 1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_add_i32_e64 v1, s[0:1], v1, s0 +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b32 s16, 16 +; NOOPT-NEXT: s_mov_b32 s17, 15 +; NOOPT-NEXT: s_mov_b32 s18, 14 +; NOOPT-NEXT: s_mov_b32 s20, 12 +; NOOPT-NEXT: s_mov_b32 s22, 10 +; NOOPT-NEXT: s_mov_b32 s24, 8 +; NOOPT-NEXT: s_mov_b32 s26, 6 +; NOOPT-NEXT: s_mov_b32 s27, 5 +; NOOPT-NEXT: s_mov_b32 s19, 13 +; NOOPT-NEXT: s_mov_b32 s21, 11 +; NOOPT-NEXT: s_mov_b32 s23, 9 +; NOOPT-NEXT: s_mov_b32 s25, 7 +; NOOPT-NEXT: s_mov_b32 s0, s25 +; NOOPT-NEXT: s_mov_b32 s1, s23 +; NOOPT-NEXT: s_mov_b32 s2, s21 +; NOOPT-NEXT: s_mov_b32 s3, s19 +; NOOPT-NEXT: s_mov_b32 s4, s27 +; NOOPT-NEXT: s_mov_b32 s5, s26 +; NOOPT-NEXT: s_mov_b32 s6, s25 +; NOOPT-NEXT: s_mov_b32 s7, s24 +; NOOPT-NEXT: s_mov_b32 s8, s23 +; NOOPT-NEXT: s_mov_b32 s9, s22 +; NOOPT-NEXT: s_mov_b32 s10, s21 +; NOOPT-NEXT: s_mov_b32 s11, s20 +; NOOPT-NEXT: s_mov_b32 s12, s19 +; NOOPT-NEXT: s_mov_b32 s13, s18 +; NOOPT-NEXT: s_mov_b32 s14, s17 +; NOOPT-NEXT: s_mov_b32 s15, s16 +; NOOPT-NEXT: v_writelane_b32 v0, s0, 5 +; NOOPT-NEXT: 
v_writelane_b32 v0, s1, 6 +; NOOPT-NEXT: v_writelane_b32 v0, s2, 7 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 8 +; NOOPT-NEXT: v_writelane_b32 v0, s4, 9 +; NOOPT-NEXT: v_writelane_b32 v0, s5, 10 +; NOOPT-NEXT: v_writelane_b32 v0, s6, 11 +; NOOPT-NEXT: v_writelane_b32 v0, s7, 12 +; NOOPT-NEXT: v_writelane_b32 v0, s8, 13 +; NOOPT-NEXT: v_writelane_b32 v0, s9, 14 +; NOOPT-NEXT: v_writelane_b32 v0, s10, 15 +; NOOPT-NEXT: v_writelane_b32 v0, s11, 16 +; NOOPT-NEXT: v_writelane_b32 v0, s12, 17 +; NOOPT-NEXT: v_writelane_b32 v0, s13, 18 +; NOOPT-NEXT: v_writelane_b32 v0, s14, 19 +; NOOPT-NEXT: v_writelane_b32 v0, s15, 20 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v16, s15 +; NOOPT-NEXT: v_mov_b32_e32 v15, s14 +; NOOPT-NEXT: v_mov_b32_e32 v14, s13 +; NOOPT-NEXT: v_mov_b32_e32 v13, s12 +; NOOPT-NEXT: v_mov_b32_e32 v12, s11 +; NOOPT-NEXT: v_mov_b32_e32 v11, s10 +; NOOPT-NEXT: v_mov_b32_e32 v10, s9 +; NOOPT-NEXT: v_mov_b32_e32 v9, s8 +; NOOPT-NEXT: v_mov_b32_e32 v8, s7 +; NOOPT-NEXT: v_mov_b32_e32 v7, s6 +; NOOPT-NEXT: v_mov_b32_e32 v6, s5 +; NOOPT-NEXT: v_mov_b32_e32 v5, s4 +; NOOPT-NEXT: v_mov_b32_e32 v4, s3 +; NOOPT-NEXT: v_mov_b32_e32 v3, s2 +; NOOPT-NEXT: v_mov_b32_e32 v2, s1 +; NOOPT-NEXT: v_mov_b32_e32 v1, s0 +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[36:39], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[36:39], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[36:39], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[36:39], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[36:39], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[36:39], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[36:39], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: 
buffer_store_dword v9, off, s[36:39], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[36:39], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[36:39], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[36:39], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[36:39], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[36:39], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[36:39], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[36:39], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v0, s0, 21 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 22 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: ; implicit-def: $vgpr0 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 23 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 24 +; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 offset:80 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[36:39], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[36:39], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, 
s[36:39], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[36:39], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[36:39], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[36:39], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[36:39], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[36:39], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[36:39], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:72 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: v_movrels_b32_e32 v1, v1 +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:84 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:80 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 23 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 24 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB16_1 +; NOOPT-NEXT: ; %bb.2: +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: 
s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 21 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 22 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: ;;#ASMSTART +; NOOPT-NEXT: s_mov_b32 s4, 17 +; NOOPT-NEXT: ;;#ASMEND +; NOOPT-NEXT: s_mov_b32 s16, s4 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 5 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 6 +; NOOPT-NEXT: v_readlane_b32 s2, v0, 7 +; NOOPT-NEXT: v_readlane_b32 s3, v0, 8 +; NOOPT-NEXT: v_readlane_b32 s4, v0, 9 +; NOOPT-NEXT: v_readlane_b32 s5, v0, 10 +; NOOPT-NEXT: v_readlane_b32 s6, v0, 11 +; NOOPT-NEXT: v_readlane_b32 s7, v0, 12 +; NOOPT-NEXT: v_readlane_b32 s8, v0, 13 +; NOOPT-NEXT: v_readlane_b32 s9, v0, 14 +; NOOPT-NEXT: v_readlane_b32 s10, v0, 15 +; NOOPT-NEXT: v_readlane_b32 s11, v0, 16 +; NOOPT-NEXT: v_readlane_b32 s12, v0, 17 +; NOOPT-NEXT: v_readlane_b32 s13, v0, 18 +; NOOPT-NEXT: v_readlane_b32 s14, v0, 19 +; NOOPT-NEXT: v_readlane_b32 s15, v0, 20 +; NOOPT-NEXT: v_writelane_b32 v0, s16, 25 +; NOOPT-NEXT: v_mov_b32_e32 v16, s15 +; NOOPT-NEXT: v_mov_b32_e32 v15, s14 +; NOOPT-NEXT: v_mov_b32_e32 v14, s13 +; NOOPT-NEXT: v_mov_b32_e32 v13, s12 +; NOOPT-NEXT: v_mov_b32_e32 v12, s11 +; NOOPT-NEXT: v_mov_b32_e32 v11, s10 +; NOOPT-NEXT: v_mov_b32_e32 v10, s9 +; NOOPT-NEXT: v_mov_b32_e32 v9, s8 +; NOOPT-NEXT: v_mov_b32_e32 v8, s7 +; NOOPT-NEXT: v_mov_b32_e32 v7, s6 +; NOOPT-NEXT: v_mov_b32_e32 v6, s5 +; NOOPT-NEXT: v_mov_b32_e32 v5, s4 +; NOOPT-NEXT: v_mov_b32_e32 v4, s3 +; NOOPT-NEXT: v_mov_b32_e32 v3, s2 +; NOOPT-NEXT: v_mov_b32_e32 v2, s1 +; NOOPT-NEXT: v_mov_b32_e32 v1, s0 +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:88 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[36:39], 0 offset:92 ; 
4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[36:39], 0 offset:96 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[36:39], 0 offset:100 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[36:39], 0 offset:104 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[36:39], 0 offset:108 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[36:39], 0 offset:112 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[36:39], 0 offset:116 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[36:39], 0 offset:120 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[36:39], 0 offset:124 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[36:39], 0 offset:128 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[36:39], 0 offset:132 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[36:39], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[36:39], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[36:39], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[36:39], 0 offset:148 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v0, s0, 26 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 27 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: ; implicit-def: $vgpr0 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB16_4: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 28 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 29 
+; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:88 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:92 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:96 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[36:39], 0 offset:100 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[36:39], 0 offset:104 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[36:39], 0 offset:108 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[36:39], 0 offset:112 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[36:39], 0 offset:116 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[36:39], 0 offset:120 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[36:39], 0 offset:124 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[36:39], 0 offset:128 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[36:39], 0 offset:132 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v17 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: v_movrels_b32_e32 v1, v1 +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:152 ; 4-byte 
Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 28 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 29 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB16_4 +; NOOPT-NEXT: ; %bb.5: +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 26 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 27 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.6: +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s4, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s5, v0, 1 +; NOOPT-NEXT: v_readlane_b32 s6, v0, 2 +; NOOPT-NEXT: v_readlane_b32 s7, v0, 3 +; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:84 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v3, off, s[4:7], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v0, s0, 30 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 31 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: 
s_and_b64 s[0:1], s[0:1], s[2:3] +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execz .LBB16_8 +; NOOPT-NEXT: ; %bb.7: ; %bb1 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s4, v0, 25 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: v_mov_b32_e32 v0, s4 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: .LBB16_8: ; %bb2 +; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[28:29] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 30 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 31 +; NOOPT-NEXT: s_or_b64 exec, exec, s[0:1] +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_vgpr_offset_multiple_in_block: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-MOVREL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_mov_b32 s11, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s6, 0 +; SI-MOVREL-NEXT: s_mov_b32 s7, s11 +; SI-MOVREL-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-MOVREL-NEXT: s_mov_b32 s10, -1 +; SI-MOVREL-NEXT: ;;#ASMSTART +; SI-MOVREL-NEXT: s_mov_b32 s4, 17 +; SI-MOVREL-NEXT: ;;#ASMEND +; SI-MOVREL-NEXT: v_add_i32_e64 v0, s[0:1], 1, v1 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 7, 9, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 11, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 7, 9, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 13, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 11, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 5, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 13, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 6, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 5, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 7, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 6, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 8, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 7, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 9, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 8, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 10, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 9, v3, s[0:1] 
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 11, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 10, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 12, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 11, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 13, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 12, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 14, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 13, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 15, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 14, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v1 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v1, 16, v2, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 15, v3, s[0:1] +; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v0 +; SI-MOVREL-NEXT: v_cndmask_b32_e64 v0, 16, v2, s[0:1] +; SI-MOVREL-NEXT: buffer_store_dword v1, off, s[8:11], 0 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-MOVREL-NEXT: s_cbranch_execz .LBB16_2 +; SI-MOVREL-NEXT: ; %bb.1: ; %bb1 +; SI-MOVREL-NEXT: s_waitcnt expcnt(0) +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: .LBB16_2: ; %bb2 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-LABEL: 
extract_vgpr_offset_multiple_in_block: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dword v2, v[1:2] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: ;;#ASMSTART +; VI-NEXT: s_mov_b32 s4, 17 +; VI-NEXT: ;;#ASMEND +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e64 v3, s[0:1], 1, v2 +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 7, 9, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 11, v4, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 7, 9, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 13, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 11, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 5, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 13, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 6, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 5, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 7, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 6, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 8, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 7, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 9, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 
7, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 8, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 10, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 9, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 11, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 10, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 12, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 11, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 13, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 12, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 14, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 13, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 15, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 14, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v2 +; VI-NEXT: v_cndmask_b32_e64 v2, 16, v4, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v3 +; VI-NEXT: v_cndmask_b32_e64 v4, 15, v5, s[0:1] +; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v3 +; VI-NEXT: v_cndmask_b32_e64 v3, 16, v4, s[0:1] +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: ; %bb.1: ; %bb1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: flat_store_dword v[0:1], v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: .LBB16_2: ; %bb2 +; VI-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_vgpr_offset_multiple_in_block: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dword v2, v1, s[0:1] glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-IDXMODE-NEXT: ;;#ASMSTART +; GFX9-IDXMODE-NEXT: s_mov_b32 s4, 17 +; GFX9-IDXMODE-NEXT: ;;#ASMEND +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2 +; GFX9-IDXMODE-NEXT: v_add_u32_e32 v0, 1, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 7, 9, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 11, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 7, 9, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 13, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 11, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 5, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 13, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 6, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 5, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 7, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 6, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 8, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 7, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 
8, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 9, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 8, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 10, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 9, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 11, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 10, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 12, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 11, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 13, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 12, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 14, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 13, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 15, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 14, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v2 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v2, 16, v3, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 15, v4, s[0:1] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v0 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v0, 16, v3, s[0:1] +; GFX9-IDXMODE-NEXT: global_store_dword v1, v2, s[6:7] +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: 
global_store_dword v1, v0, s[6:7] +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-IDXMODE-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-IDXMODE-NEXT: ; %bb.1: ; %bb1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-IDXMODE-NEXT: global_store_dword v[0:1], v0, off +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: .LBB16_2: ; %bb2 +; GFX9-IDXMODE-NEXT: s_endpgm entry: - %id = call i32 @llvm.amdgcn.workitem.id.x() #1 + %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = zext i32 %id to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext %idx0 = load volatile i32, ptr addrspace(1) %gep @@ -349,62 +4716,1870 @@ bb2: ret void } -; Moved subtest for insert_vgpr_offset_multiple_in_block to separate file to -; avoid very different schedule induced isses with gfx9. -; test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll +define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) { +; NOOPT-LABEL: insert_vgpr_offset_multiple_in_block: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_mov_b32 s28, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s29, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s30, -1 +; NOOPT-NEXT: s_mov_b32 s31, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s28, s28, s9 +; NOOPT-NEXT: s_addc_u32 s29, s29, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:84 ; 4-byte Folded Spill +; NOOPT-NEXT: s_load_dwordx2 s[18:19], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0xd +; NOOPT-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x19 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s24, s19 +; NOOPT-NEXT: s_mov_b32 s20, s18 +; NOOPT-NEXT: s_mov_b32 s18, 0xf000 +; NOOPT-NEXT: s_mov_b32 s19, -1 +; NOOPT-NEXT: ; kill: def $sgpr20 killed $sgpr20 def $sgpr20_sgpr21_sgpr22_sgpr23 +; NOOPT-NEXT: s_mov_b32 s21, s24 +; 
NOOPT-NEXT: s_mov_b32 s22, s19 +; NOOPT-NEXT: s_mov_b32 s23, s18 +; NOOPT-NEXT: v_writelane_b32 v16, s20, 0 +; NOOPT-NEXT: v_writelane_b32 v16, s21, 1 +; NOOPT-NEXT: v_writelane_b32 v16, s22, 2 +; NOOPT-NEXT: v_writelane_b32 v16, s23, 3 +; NOOPT-NEXT: s_mov_b32 s20, 0 +; NOOPT-NEXT: v_writelane_b32 v16, s20, 4 +; NOOPT-NEXT: ; kill: def $sgpr20 killed $sgpr20 def $sgpr20_sgpr21 +; NOOPT-NEXT: s_mov_b32 s21, s18 +; NOOPT-NEXT: ; kill: def $sgpr16_sgpr17 killed $sgpr16_sgpr17 def $sgpr16_sgpr17_sgpr18_sgpr19 +; NOOPT-NEXT: s_mov_b64 s[18:19], s[20:21] +; NOOPT-NEXT: s_mov_b32 s20, 2 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_lshlrev_b32_e64 v0, s20, v0 +; NOOPT-NEXT: s_mov_b32 s20, 0 +; NOOPT-NEXT: ; implicit-def: $sgpr20 +; NOOPT-NEXT: v_mov_b32_e32 v2, 0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; NOOPT-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:80 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b32 s16, 1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_add_i32_e64 v0, s[16:17], v0, s16 +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: ;;#ASMSTART +; NOOPT-NEXT: v_mov_b32 v0, 62 +; NOOPT-NEXT: ;;#ASMEND +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, s0 +; NOOPT-NEXT: v_mov_b32_e32 v1, s1 +; NOOPT-NEXT: v_mov_b32_e32 v2, s2 +; NOOPT-NEXT: v_mov_b32_e32 v3, s3 +; NOOPT-NEXT: v_mov_b32_e32 v4, s4 +; NOOPT-NEXT: v_mov_b32_e32 v5, s5 +; NOOPT-NEXT: v_mov_b32_e32 v6, s6 +; NOOPT-NEXT: v_mov_b32_e32 v7, s7 +; NOOPT-NEXT: v_mov_b32_e32 v8, s8 +; NOOPT-NEXT: v_mov_b32_e32 v9, s9 +; NOOPT-NEXT: v_mov_b32_e32 
v10, s10 +; NOOPT-NEXT: v_mov_b32_e32 v11, s11 +; NOOPT-NEXT: v_mov_b32_e32 v12, s12 +; NOOPT-NEXT: v_mov_b32_e32 v13, s13 +; NOOPT-NEXT: v_mov_b32_e32 v14, s14 +; NOOPT-NEXT: v_mov_b32_e32 v15, s15 +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v16, s0, 5 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 6 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB17_1: ; =>This Inner 
Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 7 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 8 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[28:31], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[28:31], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[28:31], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:72 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:80 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: 
v_readfirstlane_b32 s2, v18 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: v_movreld_b32_e32 v1, v17 +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:88 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:92 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:96 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:100 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:104 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:108 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:112 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:116 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:120 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:124 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:128 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:132 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:148 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, 
off, s[28:31], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 7 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 8 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB17_1 +; NOOPT-NEXT: ; %bb.2: +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 5 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 6 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:88 ; 4-byte Folded Reload +; NOOPT-NEXT: 
buffer_load_dword v1, off, s[28:31], 0 offset:92 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:96 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[28:31], 0 offset:100 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[28:31], 0 offset:104 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[28:31], 0 offset:108 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:112 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:116 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:120 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:124 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:128 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:132 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: v_mov_b32_e32 v17, 63 +; NOOPT-NEXT: buffer_store_dword v17, off, s[28:31], 0 offset:216 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: s_waitcnt vmcnt(1) +; NOOPT-NEXT: v_writelane_b32 v16, s0, 9 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 10 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 
offset:152 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:160 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:164 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:168 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:172 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:176 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:180 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:184 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:188 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:192 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:196 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:200 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:204 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:208 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Spill +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB17_4: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 11 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 12 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 
offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[28:31], 0 offset:160 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[28:31], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[28:31], 0 offset:168 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:172 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:176 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:184 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:192 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:200 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:204 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:208 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:212 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:216 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: v_movreld_b32_e32 v1, v17 +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:220 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:224 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:228 ; 4-byte Folded Spill +; 
NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:232 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:236 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:240 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:244 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:248 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:252 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:256 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:260 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:264 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:268 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:272 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:276 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:280 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:152 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:160 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:164 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:168 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:172 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:176 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:180 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:184 ; 4-byte 
Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:188 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:192 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:196 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:200 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:204 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:208 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:212 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 11 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 12 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB17_4 +; NOOPT-NEXT: ; %bb.5: +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 9 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 10 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.6: +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s4, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s5, v0, 1 +; NOOPT-NEXT: v_readlane_b32 s6, v0, 2 +; NOOPT-NEXT: v_readlane_b32 s7, v0, 3 +; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:84 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:220 ; 4-byte 
Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:224 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v19, off, s[28:31], 0 offset:228 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v20, off, s[28:31], 0 offset:232 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[28:31], 0 offset:236 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v22, off, s[28:31], 0 offset:240 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v23, off, s[28:31], 0 offset:244 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v24, off, s[28:31], 0 offset:248 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[28:31], 0 offset:252 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v26, off, s[28:31], 0 offset:256 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v27, off, s[28:31], 0 offset:260 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v28, off, s[28:31], 0 offset:264 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v29, off, s[28:31], 0 offset:268 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v30, off, s[28:31], 0 offset:272 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[28:31], 0 offset:276 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 offset:280 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: v_mov_b32_e32 v7, v19 +; NOOPT-NEXT: v_mov_b32_e32 v8, v18 +; NOOPT-NEXT: v_mov_b32_e32 v2, v17 +; NOOPT-NEXT: v_mov_b32_e32 v3, v24 +; NOOPT-NEXT: v_mov_b32_e32 v4, v23 +; NOOPT-NEXT: v_mov_b32_e32 v5, v22 +; NOOPT-NEXT: v_mov_b32_e32 v9, v21 +; NOOPT-NEXT: v_mov_b32_e32 v14, v28 +; NOOPT-NEXT: v_mov_b32_e32 v15, v27 +; NOOPT-NEXT: v_mov_b32_e32 v16, v26 +; NOOPT-NEXT: v_mov_b32_e32 v10, v25 +; NOOPT-NEXT: v_mov_b32_e32 v11, v32 +; NOOPT-NEXT: v_mov_b32_e32 v12, v31 +; NOOPT-NEXT: v_mov_b32_e32 v13, v30 +; NOOPT-NEXT: v_mov_b32_e32 v17, v29 +; NOOPT-NEXT: ; 
implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18_vgpr19_vgpr20 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v18, v13 +; NOOPT-NEXT: v_mov_b32_e32 v19, v12 +; NOOPT-NEXT: v_mov_b32_e32 v20, v11 +; NOOPT-NEXT: buffer_store_dwordx4 v[17:20], off, s[4:7], 0 offset:48 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11_vgpr12_vgpr13 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v11, v16 +; NOOPT-NEXT: v_mov_b32_e32 v12, v15 +; NOOPT-NEXT: v_mov_b32_e32 v13, v14 +; NOOPT-NEXT: buffer_store_dwordx4 v[10:13], off, s[4:7], 0 offset:32 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v10, v5 +; NOOPT-NEXT: v_mov_b32_e32 v11, v4 +; NOOPT-NEXT: v_mov_b32_e32 v12, v3 +; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[4:7], 0 offset:16 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v3, v8 +; NOOPT-NEXT: v_mov_b32_e32 v4, v7 +; NOOPT-NEXT: v_mov_b32_e32 v5, v6 +; NOOPT-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v0, s0, 13 +; NOOPT-NEXT: 
v_writelane_b32 v0, s1, 14 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execz .LBB17_8 +; NOOPT-NEXT: ; %bb.7: ; %bb1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: .LBB17_8: ; %bb2 +; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[26:27] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 13 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 14 +; NOOPT-NEXT: s_or_b64 exec, exec, s[0:1] +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_vgpr_offset_multiple_in_block: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-MOVREL-NEXT: s_mov_b32 s23, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s6, 0 +; SI-MOVREL-NEXT: s_mov_b32 s7, s23 +; SI-MOVREL-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dword v2, v[1:2], s[4:7], 0 addr64 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; 
SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-MOVREL-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 +; SI-MOVREL-NEXT: ;;#ASMSTART +; SI-MOVREL-NEXT: v_mov_b32 v1, 62 +; SI-MOVREL-NEXT: ;;#ASMEND +; SI-MOVREL-NEXT: s_mov_b32 s22, -1 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s17 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s19 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s4 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, s5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v18, s7 +; SI-MOVREL-NEXT: v_add_i32_e32 v19, vcc, 1, v2 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, v12, 
v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19 +; 
SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[6:9], off, s[20:23], 0 offset:16 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[2:5], off, s[20:23], 0 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-MOVREL-NEXT: s_cbranch_execz .LBB17_2 +; SI-MOVREL-NEXT: ; %bb.1: ; %bb1 +; SI-MOVREL-NEXT: buffer_store_dword v1, off, s[20:23], 0 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: .LBB17_2: ; %bb2 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-LABEL: insert_vgpr_offset_multiple_in_block: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dword v2, v[1:2] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: ;;#ASMSTART +; VI-NEXT: v_mov_b32 v1, 62 +; VI-NEXT: ;;#ASMEND +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s18 +; VI-NEXT: v_mov_b32_e32 v6, s19 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s13 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s15 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s9 +; VI-NEXT: 
v_mov_b32_e32 v13, s10 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: v_mov_b32_e32 v16, s5 +; VI-NEXT: v_mov_b32_e32 v17, s6 +; VI-NEXT: v_mov_b32_e32 v18, s7 +; VI-NEXT: s_add_u32 s2, s0, 48 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 1, v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 +; VI-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2 +; VI-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 +; VI-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 +; VI-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 +; VI-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 +; VI-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 +; VI-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; VI-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; VI-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; VI-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; VI-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; VI-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19 +; VI-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19 +; VI-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19 +; VI-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, 
v19 +; VI-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19 +; VI-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19 +; VI-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19 +; VI-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19 +; VI-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19 +; VI-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19 +; VI-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19 +; VI-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19 +; VI-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19 +; VI-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19 +; VI-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19 +; VI-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 +; VI-NEXT: v_mov_b32_e32 v19, s3 +; VI-NEXT: v_mov_b32_e32 v18, s2 +; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[18:19], v[14:17] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_mov_b32_e32 v15, s3 +; VI-NEXT: v_mov_b32_e32 v14, s2 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, s3 +; VI-NEXT: v_mov_b32_e32 v10, s2 +; VI-NEXT: flat_store_dwordx4 v[10:11], v[6:9] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: flat_store_dwordx4 v[6:7], v[2:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: ; %bb.1: ; %bb1 +; VI-NEXT: 
flat_store_dword v[0:1], v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: .LBB17_2: ; %bb2 +; VI-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_vgpr_offset_multiple_in_block: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dword v3, v1, s[0:1] glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: ;;#ASMSTART +; GFX9-IDXMODE-NEXT: v_mov_b32 v1, 62 +; GFX9-IDXMODE-NEXT: ;;#ASMEND +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, s6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, s7 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 
v25, v8, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v27, v10, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, v11, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, v16, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v3 +; GFX9-IDXMODE-NEXT: v_add_u32_e32 v20, 1, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, v18, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, v19, v1, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 63, v3, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 1, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, 63, v12, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc 
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 63, v11, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 63, v27, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 15, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v18, 63, v24, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 13, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[15:18], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[11:14], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[7:10], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[3:6], s[0:1] +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-IDXMODE-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-IDXMODE-NEXT: ; %bb.1: ; %bb1 +; GFX9-IDXMODE-NEXT: global_store_dword v[0:1], v1, off +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: .LBB17_2: ; %bb2 +; GFX9-IDXMODE-NEXT: s_endpgm +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %id.ext = zext i32 %id to i64 + %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext + %idx0 = load volatile i32, ptr addrspace(1) %gep + 
%idx1 = add i32 %idx0, 1 + %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"() + %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0 + %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1 + store volatile <16 x i32> %vec2, ptr addrspace(1) %out0 + %cmp = icmp eq i32 %id, 0 + br i1 %cmp, label %bb1, label %bb2 +bb1: + store volatile i32 %live.out.val, ptr addrspace(1) undef + br label %bb2 -; GCN-LABEL: {{^}}insert_adjacent_blocks: -define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 { +bb2: + ret void +} + +; Avoid inserting extra v_mov from copies within the vgpr indexing sequence. The +; gpr_idx mode switching sequence is expanded late for this reason. +define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) { +; NOOPT-LABEL: insert_w_offset_multiple_in_block: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 1 +; NOOPT-NEXT: s_add_i32 s5, s4, s5 +; NOOPT-NEXT: s_mov_b32 s6, 0x41800000 +; NOOPT-NEXT: s_mov_b32 s7, 0x41700000 +; NOOPT-NEXT: s_mov_b32 s8, 0x41600000 +; NOOPT-NEXT: s_mov_b32 s9, 0x41500000 +; NOOPT-NEXT: s_mov_b32 s10, 0x41400000 +; NOOPT-NEXT: s_mov_b32 s11, 0x41300000 +; NOOPT-NEXT: s_mov_b32 s12, 0x41200000 +; NOOPT-NEXT: s_mov_b32 s13, 0x41100000 +; NOOPT-NEXT: s_mov_b32 s14, 0x41000000 +; NOOPT-NEXT: s_mov_b32 s15, 0x40e00000 +; NOOPT-NEXT: s_mov_b32 s16, 0x40c00000 +; NOOPT-NEXT: s_mov_b32 s17, 0x40a00000 +; NOOPT-NEXT: s_mov_b32 s18, 4.0 +; NOOPT-NEXT: s_mov_b32 s19, 
0x40400000 +; NOOPT-NEXT: s_mov_b32 s20, 2.0 +; NOOPT-NEXT: s_mov_b32 s21, 1.0 +; NOOPT-NEXT: v_mov_b32_e32 v23, s21 +; NOOPT-NEXT: v_mov_b32_e32 v14, s20 +; NOOPT-NEXT: v_mov_b32_e32 v13, s19 +; NOOPT-NEXT: v_mov_b32_e32 v12, s18 +; NOOPT-NEXT: v_mov_b32_e32 v11, s17 +; NOOPT-NEXT: v_mov_b32_e32 v10, s16 +; NOOPT-NEXT: v_mov_b32_e32 v9, s15 +; NOOPT-NEXT: v_mov_b32_e32 v8, s14 +; NOOPT-NEXT: v_mov_b32_e32 v7, s13 +; NOOPT-NEXT: v_mov_b32_e32 v6, s12 +; NOOPT-NEXT: v_mov_b32_e32 v5, s11 +; NOOPT-NEXT: v_mov_b32_e32 v4, s10 +; NOOPT-NEXT: v_mov_b32_e32 v3, s9 +; NOOPT-NEXT: v_mov_b32_e32 v2, s8 +; NOOPT-NEXT: v_mov_b32_e32 v1, s7 +; NOOPT-NEXT: v_mov_b32_e32 v0, s6 +; NOOPT-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v24, v14 +; NOOPT-NEXT: v_mov_b32_e32 v25, v13 +; NOOPT-NEXT: v_mov_b32_e32 v26, v12 +; NOOPT-NEXT: v_mov_b32_e32 v27, v11 +; NOOPT-NEXT: v_mov_b32_e32 v28, v10 +; NOOPT-NEXT: v_mov_b32_e32 v29, v9 +; NOOPT-NEXT: v_mov_b32_e32 v30, v8 +; NOOPT-NEXT: v_mov_b32_e32 v31, v7 +; NOOPT-NEXT: v_mov_b32_e32 v32, v6 +; NOOPT-NEXT: v_mov_b32_e32 v33, v5 +; NOOPT-NEXT: v_mov_b32_e32 v34, v4 +; NOOPT-NEXT: v_mov_b32_e32 v35, v3 +; NOOPT-NEXT: v_mov_b32_e32 v36, v2 +; NOOPT-NEXT: v_mov_b32_e32 v37, v1 +; NOOPT-NEXT: v_mov_b32_e32 v38, v0 +; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000 +; NOOPT-NEXT: s_mov_b32 m0, s5 +; NOOPT-NEXT: v_movreld_b32_e32 v23, v0 +; NOOPT-NEXT: s_mov_b32 s5, 2 +; NOOPT-NEXT: s_add_i32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_mov_b32_e32 v7, v23 +; NOOPT-NEXT: v_mov_b32_e32 v8, v24 +; NOOPT-NEXT: v_mov_b32_e32 v9, v25 +; NOOPT-NEXT: v_mov_b32_e32 v10, v26 +; NOOPT-NEXT: v_mov_b32_e32 v11, v27 +; NOOPT-NEXT: v_mov_b32_e32 v12, v28 +; NOOPT-NEXT: v_mov_b32_e32 v13, v29 +; NOOPT-NEXT: v_mov_b32_e32 v14, v30 +; NOOPT-NEXT: v_mov_b32_e32 v15, v31 +; NOOPT-NEXT: v_mov_b32_e32 
v16, v32 +; NOOPT-NEXT: v_mov_b32_e32 v17, v33 +; NOOPT-NEXT: v_mov_b32_e32 v18, v34 +; NOOPT-NEXT: v_mov_b32_e32 v19, v35 +; NOOPT-NEXT: v_mov_b32_e32 v20, v36 +; NOOPT-NEXT: v_mov_b32_e32 v21, v37 +; NOOPT-NEXT: v_mov_b32_e32 v22, v38 +; NOOPT-NEXT: v_movreld_b32_e32 v7, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v38 +; NOOPT-NEXT: v_mov_b32_e32 v5, v37 +; NOOPT-NEXT: v_mov_b32_e32 v6, v36 +; NOOPT-NEXT: v_mov_b32_e32 v0, v35 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: v_mov_b32_e32 v4, v34 +; NOOPT-NEXT: v_mov_b32_e32 v5, v33 +; NOOPT-NEXT: v_mov_b32_e32 v6, v32 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v31 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: v_mov_b32_e32 v4, v30 +; NOOPT-NEXT: v_mov_b32_e32 v5, v29 +; NOOPT-NEXT: v_mov_b32_e32 v6, v28 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v27 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: v_mov_b32_e32 v4, v26 +; NOOPT-NEXT: v_mov_b32_e32 v5, v25 +; NOOPT-NEXT: v_mov_b32_e32 v6, v24 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v23 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v22 +; NOOPT-NEXT: v_mov_b32_e32 v5, v21 +; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v19 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; NOOPT-NEXT: v_mov_b32_e32 v4, v14 +; NOOPT-NEXT: v_mov_b32_e32 v5, v13 +; NOOPT-NEXT: v_mov_b32_e32 v6, v12 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v11 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; NOOPT-NEXT: v_mov_b32_e32 v4, v10 +; NOOPT-NEXT: v_mov_b32_e32 v5, v9 +; NOOPT-NEXT: v_mov_b32_e32 v6, v8 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v7 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_w_offset_multiple_in_block: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 
0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 s2, s4, 1 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 +; SI-MOVREL-NEXT: s_mov_b32 m0, s2 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 +; SI-MOVREL-NEXT: s_add_i32 s4, s4, 2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v28, v12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v27, v11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v26, v10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v25, v9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v24, v8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v23, v7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v22, v6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v21, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v20, v4 +; SI-MOVREL-NEXT: v_mov_b32_e32 v19, v3 +; SI-MOVREL-NEXT: v_mov_b32_e32 v18, v2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v17, v1 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, v0 +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 
v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insert_w_offset_multiple_in_block: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 s2, s4, 1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_i32 s4, s4, 2 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v28, v12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v27, v11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v26, v10 +; 
VI-MOVREL-NEXT: v_mov_b32_e32 v25, v9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v24, v8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v23, v7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v22, v6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v21, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v20, v4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v19, v3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v18, v2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, v1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, v0 +; VI-MOVREL-NEXT: s_mov_b32 m0, s4 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v33, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v32, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[32:33], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 64 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x70 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x60 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[28:31] +; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; VI-MOVREL-NEXT: s_add_u32 s0, s0, 0x50 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[24:27] +; VI-MOVREL-NEXT: s_addc_u32 s1, s1, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; VI-MOVREL-NEXT: flat_store_dwordx4 
v[0:1], v[20:23] +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s3 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_w_offset_multiple_in_block: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_i32 s2, s4, 1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; VI-IDXMODE-NEXT: s_add_i32 s4, s4, 2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v32, 0x41880000 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v32 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v30, v14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v29, v13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v28, v12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v27, v11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v26, v10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v25, v9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v24, v8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v23, v7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v22, v6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v21, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v20, 
v4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v19, v3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v18, v2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, v1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v0 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v32 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: v_mov_b32_e32 v33, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v32, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[32:33], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 64 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x70 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x60 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[28:31] +; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; VI-IDXMODE-NEXT: s_add_u32 s0, s0, 0x50 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[24:27] +; VI-IDXMODE-NEXT: s_addc_u32 s1, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[20:23] +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], 
v[16:19] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_w_offset_multiple_in_block: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_add_i32 s2, s4, 1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v32, 0x41880000 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 +; GFX9-IDXMODE-NEXT: s_add_i32 s4, s4, 2 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v30, v14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v29, v13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v28, v12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v27, v11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v26, v10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v25, v9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v23, v7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, v2 +; 
GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v32 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v32, 0 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX9-IDXMODE-NEXT: s_endpgm +entry: + %add1 = add i32 %in, 1 + %ins1 = insertelement <16 x float> , float 17.0, i32 %add1 + %add2 = add i32 %in, 2 + %ins2 = insertelement <16 x float> %ins1, float 17.0, i32 %add2 + store <16 x float> %ins1, ptr addrspace(1) %out1 + %out2 = getelementptr <16 x float>, ptr addrspace(1) %out1, i32 1 + store <16 x float> %ins2, ptr addrspace(1) %out2 + + ret void +} + +; Make sure we don't hit use of undefined register errors when expanding an +; extract with undef index. 
+define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { +; NOOPT-LABEL: extract_adjacent_blocks: +; NOOPT: ; %bb.0: ; %bb +; NOOPT-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s14, -1 +; NOOPT-NEXT: s_mov_b32 s15, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s12, s12, s9 +; NOOPT-NEXT: s_addc_u32 s13, s13, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; NOOPT-NEXT: s_load_dword s2, s[2:3], 0x9 +; NOOPT-NEXT: s_mov_b64 s[0:1], -1 +; NOOPT-NEXT: ; implicit-def: $sgpr3 +; NOOPT-NEXT: s_mov_b32 s3, 0 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_cmp_lg_u32 s2, s3 +; NOOPT-NEXT: v_writelane_b32 v0, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 1 +; NOOPT-NEXT: s_mov_b64 s[8:9], exec +; NOOPT-NEXT: s_mov_b64 exec, -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[8:9] +; NOOPT-NEXT: s_cbranch_scc1 .LBB19_3 +; NOOPT-NEXT: .LBB19_1: ; %Flow +; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[8:9] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 +; NOOPT-NEXT: ; implicit-def: $sgpr2 +; NOOPT-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; NOOPT-NEXT: s_mov_b32 s0, 1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0 +; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_vccnz .LBB19_4 +; NOOPT-NEXT: ; %bb.2: ; %bb1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; 
NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; implicit-def: $sgpr0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ;;#ASMSTART +; NOOPT-NEXT: ; reg use v[0:3] +; NOOPT-NEXT: ;;#ASMEND +; NOOPT-NEXT: s_branch .LBB19_4 +; NOOPT-NEXT: .LBB19_3: ; %bb4 +; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[8:9] +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: buffer_load_dwordx4 v[1:4], off, s[0:3], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; implicit-def: $sgpr0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ;;#ASMSTART +; NOOPT-NEXT: ; reg use v[1:4] +; NOOPT-NEXT: ;;#ASMEND +; NOOPT-NEXT: s_mov_b64 s[0:1], 0 +; NOOPT-NEXT: v_writelane_b32 v0, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 1 +; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[8:9] +; NOOPT-NEXT: s_branch .LBB19_1 +; NOOPT-NEXT: .LBB19_4: ; %bb7 +; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[8:9] +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: 
s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; NOOPT-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_adjacent_blocks: +; SI-MOVREL: ; %bb.0: ; %bb +; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_cmp_lg_u32 s0, 0 +; SI-MOVREL-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-MOVREL-NEXT: ; %bb.1: ; %bb4 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: ;;#ASMSTART +; SI-MOVREL-NEXT: ; reg use v[0:3] +; SI-MOVREL-NEXT: ;;#ASMEND +; SI-MOVREL-NEXT: s_cbranch_execnz .LBB19_3 +; SI-MOVREL-NEXT: .LBB19_2: ; %bb1 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: ;;#ASMSTART +; SI-MOVREL-NEXT: ; reg use v[0:3] +; SI-MOVREL-NEXT: ;;#ASMEND +; SI-MOVREL-NEXT: .LBB19_3: ; %bb7 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_endpgm +; SI-MOVREL-NEXT: .LBB19_4: +; SI-MOVREL-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: extract_adjacent_blocks: +; VI: ; %bb.0: ; %bb +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %bb4 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ;;#ASMSTART +; VI-NEXT: ; reg use v[0:3] +; 
VI-NEXT: ;;#ASMEND +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %bb1 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ;;#ASMSTART +; VI-NEXT: ; reg use v[0:3] +; VI-NEXT: ;;#ASMEND +; VI-NEXT: .LBB19_3: ; %bb7 +; VI-NEXT: flat_store_dword v[0:1], v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; VI-NEXT: .LBB19_4: +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-IDXMODE-LABEL: extract_adjacent_blocks: +; GFX9-IDXMODE: ; %bb.0: ; %bb +; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-IDXMODE-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-IDXMODE-NEXT: ; %bb.1: ; %bb4 +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v[0:1], off glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: ;;#ASMSTART +; GFX9-IDXMODE-NEXT: ; reg use v[0:3] +; GFX9-IDXMODE-NEXT: ;;#ASMEND +; GFX9-IDXMODE-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-IDXMODE-NEXT: .LBB19_2: ; %bb1 +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v[0:1], off glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: ;;#ASMSTART +; GFX9-IDXMODE-NEXT: ; reg use v[0:3] +; GFX9-IDXMODE-NEXT: ;;#ASMEND +; GFX9-IDXMODE-NEXT: .LBB19_3: ; %bb7 +; GFX9-IDXMODE-NEXT: global_store_dword v[0:1], v0, off +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_endpgm +; GFX9-IDXMODE-NEXT: .LBB19_4: +; GFX9-IDXMODE-NEXT: s_branch .LBB19_2 bb: %tmp = icmp eq i32 %arg, 0 br i1 %tmp, label %bb1, label %bb4 -bb1: ; preds = %bb +bb1: + %tmp2 = load volatile <4 x float>, ptr addrspace(1) undef + %tmp3 = extractelement <4 x float> %tmp2, i32 undef + call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) ; Prevent block optimize out + br label %bb7 + +bb4: + %tmp5 = load volatile <4 x float>, ptr addrspace(1) undef + %tmp6 = extractelement <4 x float> %tmp5, i32 undef + call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) ; Prevent block optimize 
out + br label %bb7 + +bb7: + %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ] + store volatile float %tmp8, ptr addrspace(1) undef + ret void +} + +define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { +; NOOPT-LABEL: insert_adjacent_blocks: +; NOOPT: ; %bb.0: ; %bb +; NOOPT-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s18, -1 +; NOOPT-NEXT: s_mov_b32 s19, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s16, s16, s9 +; NOOPT-NEXT: s_addc_u32 s17, s17, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; NOOPT-NEXT: s_mov_b64 s[0:1], s[2:3] +; NOOPT-NEXT: s_load_dword s2, s[0:1], 0x9 +; NOOPT-NEXT: s_load_dword s0, s[0:1], 0xa +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b64 s[0:1], -1 +; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b32 s3, 0 +; NOOPT-NEXT: s_cmp_lg_u32 s2, s3 +; NOOPT-NEXT: v_writelane_b32 v0, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 1 +; NOOPT-NEXT: s_mov_b64 s[12:13], exec +; NOOPT-NEXT: s_mov_b64 exec, -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_cbranch_scc1 .LBB20_3 +; NOOPT-NEXT: .LBB20_1: ; %Flow +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 0 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 1 +; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; NOOPT-NEXT: s_mov_b32 s0, 1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0 +; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_vccnz .LBB20_4 +; NOOPT-NEXT: ; %bb.2: ; %bb1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s6, s1 +; 
NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ;;#ASMSTART +; NOOPT-NEXT: ; reg use v[0:3] +; NOOPT-NEXT: ;;#ASMEND +; NOOPT-NEXT: s_branch .LBB20_4 +; NOOPT-NEXT: .LBB20_3: ; %bb4 +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s6, s1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s4, 0xf000 +; NOOPT-NEXT: s_mov_b32 s5, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s6 +; NOOPT-NEXT: s_mov_b32 s2, s5 +; NOOPT-NEXT: s_mov_b32 s3, s4 +; NOOPT-NEXT: buffer_load_dwordx4 v[1:4], off, s[0:3], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ;;#ASMSTART +; NOOPT-NEXT: ; reg use v[1:4] +; NOOPT-NEXT: ;;#ASMEND +; NOOPT-NEXT: s_mov_b64 s[0:1], 0 +; NOOPT-NEXT: v_writelane_b32 v0, s0, 0 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 1 +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_branch 
.LBB20_1 +; NOOPT-NEXT: .LBB20_4: ; %bb7 +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s10, s1 +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s8, 0xf000 +; NOOPT-NEXT: s_mov_b32 s9, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s10 +; NOOPT-NEXT: s_mov_b32 s2, s9 +; NOOPT-NEXT: s_mov_b32 s3, s8 +; NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; NOOPT-NEXT: v_mov_b32_e32 v2, s5 +; NOOPT-NEXT: v_mov_b32_e32 v3, s6 +; NOOPT-NEXT: v_mov_b32_e32 v4, s7 +; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_adjacent_blocks: +; SI-MOVREL: ; %bb.0: ; %bb +; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_cmp_lg_u32 s0, 0 +; SI-MOVREL-NEXT: s_cbranch_scc0 .LBB20_4 +; SI-MOVREL-NEXT: ; %bb.1: ; %bb4 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: ;;#ASMSTART +; SI-MOVREL-NEXT: ; reg use v[0:3] +; SI-MOVREL-NEXT: ;;#ASMEND +; SI-MOVREL-NEXT: s_cbranch_execnz .LBB20_3 +; SI-MOVREL-NEXT: .LBB20_2: ; %bb1 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: ;;#ASMSTART +; SI-MOVREL-NEXT: ; reg use v[0:3] +; SI-MOVREL-NEXT: ;;#ASMEND +; SI-MOVREL-NEXT: .LBB20_3: ; %bb7 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: 
s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_endpgm +; SI-MOVREL-NEXT: .LBB20_4: +; SI-MOVREL-NEXT: s_branch .LBB20_2 +; +; VI-LABEL: insert_adjacent_blocks: +; VI: ; %bb.0: ; %bb +; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cbranch_scc0 .LBB20_4 +; VI-NEXT: ; %bb.1: ; %bb4 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ;;#ASMSTART +; VI-NEXT: ; reg use v[0:3] +; VI-NEXT: ;;#ASMEND +; VI-NEXT: s_cbranch_execnz .LBB20_3 +; VI-NEXT: .LBB20_2: ; %bb1 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ;;#ASMSTART +; VI-NEXT: ; reg use v[0:3] +; VI-NEXT: ;;#ASMEND +; VI-NEXT: .LBB20_3: ; %bb7 +; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; VI-NEXT: .LBB20_4: +; VI-NEXT: s_branch .LBB20_2 +; +; GFX9-IDXMODE-LABEL: insert_adjacent_blocks: +; GFX9-IDXMODE: ; %bb.0: ; %bb +; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-IDXMODE-NEXT: s_cbranch_scc0 .LBB20_4 +; GFX9-IDXMODE-NEXT: ; %bb.1: ; %bb4 +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v[0:1], off glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: ;;#ASMSTART +; GFX9-IDXMODE-NEXT: ; reg use v[0:3] +; GFX9-IDXMODE-NEXT: ;;#ASMEND +; GFX9-IDXMODE-NEXT: s_cbranch_execnz .LBB20_3 +; GFX9-IDXMODE-NEXT: .LBB20_2: ; %bb1 +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v[0:1], off glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: ;;#ASMSTART +; GFX9-IDXMODE-NEXT: ; reg use v[0:3] +; GFX9-IDXMODE-NEXT: ;;#ASMEND +; GFX9-IDXMODE-NEXT: .LBB20_3: ; %bb7 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_endpgm +; 
GFX9-IDXMODE-NEXT: .LBB20_4: +; GFX9-IDXMODE-NEXT: s_branch .LBB20_2 +bb: + %tmp = icmp eq i32 %arg, 0 + br i1 %tmp, label %bb1, label %bb4 + +bb1: %tmp2 = load volatile <4 x float>, ptr addrspace(1) undef %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef - call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) #0 ; Prevent block optimize out + call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) ; Prevent block optimize out br label %bb7 -bb4: ; preds = %bb +bb4: %tmp5 = load volatile <4 x float>, ptr addrspace(1) undef %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef - call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) #0 ; Prevent block optimize out + call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) ; Prevent block optimize out br label %bb7 -bb7: ; preds = %bb4, %bb1 +bb7: %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ] store volatile <4 x float> %tmp8, ptr addrspace(1) undef ret void } ; FIXME: Should be able to fold zero input to movreld to inline imm? 
- -; GCN-LABEL: {{^}}multi_same_block: - -; GCN: s_load_dword [[ARG:s[0-9]+]] - -; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000 -; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd -; MOVREL: s_waitcnt -; MOVREL: s_add_i32 m0, [[ARG]], -16 -; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0 -; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0 -; MOVREL: s_mov_b32 m0, -1 - - -; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000 -; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd -; IDXMODE: s_waitcnt -; IDXMODE: s_add_i32 [[ARG]], [[ARG]], -16 -; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 4.0 -; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, -4.0 -; IDXMODE: s_set_gpr_idx_off - -; GCN: ds_write_b32 -; GCN: ds_write_b32 -; GCN: s_endpgm -define amdgpu_kernel void @multi_same_block(i32 %arg) #0 { +define amdgpu_kernel void @multi_same_block(i32 %arg) { +; NOOPT-LABEL: multi_same_block: +; NOOPT: ; %bb.0: ; %bb +; NOOPT-NEXT: s_load_dword s0, s[2:3], 0x9 +; NOOPT-NEXT: s_mov_b32 s8, 0x41900000 +; NOOPT-NEXT: ; implicit-def: $sgpr9 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr7 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr6 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr5 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr3 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr2 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr10 +; NOOPT-NEXT: v_mov_b32_e32 v12, s9 +; NOOPT-NEXT: v_mov_b32_e32 v7, s8 +; NOOPT-NEXT: v_mov_b32_e32 v6, s7 +; NOOPT-NEXT: v_mov_b32_e32 v5, s6 +; NOOPT-NEXT: v_mov_b32_e32 v4, s5 +; NOOPT-NEXT: v_mov_b32_e32 v3, s4 +; NOOPT-NEXT: v_mov_b32_e32 v2, s3 +; NOOPT-NEXT: v_mov_b32_e32 v1, s2 +; NOOPT-NEXT: v_mov_b32_e32 v0, s1 +; NOOPT-NEXT: ; kill: def $vgpr12 killed $vgpr12 def 
$vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v13, v7 +; NOOPT-NEXT: v_mov_b32_e32 v14, v6 +; NOOPT-NEXT: v_mov_b32_e32 v15, v5 +; NOOPT-NEXT: v_mov_b32_e32 v16, v4 +; NOOPT-NEXT: v_mov_b32_e32 v17, v3 +; NOOPT-NEXT: v_mov_b32_e32 v18, v2 +; NOOPT-NEXT: v_mov_b32_e32 v19, v1 +; NOOPT-NEXT: v_mov_b32_e32 v20, v0 +; NOOPT-NEXT: v_mov_b32_e32 v0, 4.0 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_add_i32 m0, s0, -16 +; NOOPT-NEXT: v_movreld_b32_e32 v12, v0 +; NOOPT-NEXT: s_mov_b32 s4, 0x41b0cccd +; NOOPT-NEXT: ; implicit-def: $sgpr9 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr8 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr7 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr6 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr5 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr3 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr2 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: ; implicit-def: $sgpr10 +; NOOPT-NEXT: v_mov_b32_e32 v3, s9 +; NOOPT-NEXT: v_mov_b32_e32 v25, s8 +; NOOPT-NEXT: v_mov_b32_e32 v24, s7 +; NOOPT-NEXT: v_mov_b32_e32 v23, s6 +; NOOPT-NEXT: v_mov_b32_e32 v22, s5 +; NOOPT-NEXT: v_mov_b32_e32 v21, s4 +; NOOPT-NEXT: v_mov_b32_e32 v2, s3 +; NOOPT-NEXT: v_mov_b32_e32 v1, s2 +; NOOPT-NEXT: v_mov_b32_e32 v0, s1 +; NOOPT-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v4, v25 +; NOOPT-NEXT: v_mov_b32_e32 v5, v24 +; NOOPT-NEXT: v_mov_b32_e32 v6, v23 +; NOOPT-NEXT: v_mov_b32_e32 v7, v22 +; NOOPT-NEXT: v_mov_b32_e32 v8, v21 +; NOOPT-NEXT: v_mov_b32_e32 v9, v2 +; NOOPT-NEXT: v_mov_b32_e32 v10, v1 +; NOOPT-NEXT: v_mov_b32_e32 v11, v0 +; NOOPT-NEXT: v_mov_b32_e32 v0, -4.0 +; NOOPT-NEXT: s_add_i32 m0, s0, -16 +; 
NOOPT-NEXT: v_movreld_b32_e32 v3, v0 +; NOOPT-NEXT: v_mov_b32_e32 v2, v13 +; NOOPT-NEXT: v_mov_b32_e32 v1, v8 +; NOOPT-NEXT: s_mov_b32 m0, -1 +; NOOPT-NEXT: ; implicit-def: $sgpr0 +; NOOPT-NEXT: v_mov_b32_e32 v0, s0 +; NOOPT-NEXT: ds_write_b32 v0, v2 +; NOOPT-NEXT: s_mov_b32 m0, -1 +; NOOPT-NEXT: ; implicit-def: $sgpr0 +; NOOPT-NEXT: v_mov_b32_e32 v0, s0 +; NOOPT-NEXT: ds_write_b32 v0, v1 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: multi_same_block: +; SI-MOVREL: ; %bb.0: ; %bb +; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 0x41900000 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41b0cccd +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 m0, s0, -16 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, 4.0 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v4, -4.0 +; SI-MOVREL-NEXT: s_mov_b32 m0, -1 +; SI-MOVREL-NEXT: ds_write_b32 v0, v1 +; SI-MOVREL-NEXT: ds_write_b32 v0, v9 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: multi_same_block: +; VI-MOVREL: ; %bb.0: ; %bb +; VI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 0x41900000 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41b0cccd +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 m0, s0, -16 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 4.0 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v4, -4.0 +; VI-MOVREL-NEXT: s_mov_b32 m0, -1 +; VI-MOVREL-NEXT: ds_write_b32 v0, v1 +; VI-MOVREL-NEXT: ds_write_b32 v0, v9 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: multi_same_block: +; VI-IDXMODE: ; %bb.0: ; %bb +; VI-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 0x41900000 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41b0cccd +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_i32 s0, s0, -16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 4.0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, -4.0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_mov_b32 m0, -1 +; 
VI-IDXMODE-NEXT: ds_write_b32 v0, v1 +; VI-IDXMODE-NEXT: ds_write_b32 v0, v9 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: multi_same_block: +; GFX9-IDXMODE: ; %bb.0: ; %bb +; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 0x41900000 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41b0cccd +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_add_i32 s0, s0, -16 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, -4.0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: ds_write_b32 v0, v1 +; GFX9-IDXMODE-NEXT: ds_write_b32 v0, v9 +; GFX9-IDXMODE-NEXT: s_endpgm bb: %tmp1 = add i32 %arg, -16 %tmp2 = insertelement <9 x float> , float 4.000000e+00, i32 %tmp1 @@ -420,20 +6595,192 @@ bb: } ; offset puts outside of superegister bounaries, so clamp to 1st element. -; GCN-LABEL: {{^}}extract_largest_inbounds_offset: -; GCN-DAG: buffer_load_dwordx4 v[[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]] -; GCN-DAG: s_load_dword [[IDX0:s[0-9]+]] -; GCN-DAG: s_add_i32 [[IDX:s[0-9]+]], [[IDX0]], 15 - -; MOVREL: s_mov_b32 m0, [[IDX]] -; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] - -; IDXMODE: s_set_gpr_idx_on [[IDX]], gpr_idx(SRC0) -; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] -; IDXMODE: s_set_gpr_idx_off - -; GCN: buffer_store_dword [[EXTRACT]] define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) { +; NOOPT-LABEL: extract_largest_inbounds_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 
killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s7, s9 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; NOOPT-NEXT: s_mov_b32 s9, s7 +; NOOPT-NEXT: s_mov_b32 s10, s6 +; NOOPT-NEXT: s_mov_b32 s11, s5 +; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v16, v15 +; NOOPT-NEXT: v_mov_b32_e32 v17, v14 +; NOOPT-NEXT: v_mov_b32_e32 v18, v13 +; NOOPT-NEXT: v_mov_b32_e32 v19, v12 +; NOOPT-NEXT: v_mov_b32_e32 v20, v11 +; NOOPT-NEXT: v_mov_b32_e32 v21, v10 +; NOOPT-NEXT: v_mov_b32_e32 v22, v9 +; NOOPT-NEXT: v_mov_b32_e32 v23, v8 +; NOOPT-NEXT: v_mov_b32_e32 v24, v7 +; NOOPT-NEXT: v_mov_b32_e32 v25, v6 +; NOOPT-NEXT: v_mov_b32_e32 v26, v5 +; NOOPT-NEXT: v_mov_b32_e32 v27, v4 +; NOOPT-NEXT: v_mov_b32_e32 v28, v3 +; NOOPT-NEXT: v_mov_b32_e32 v29, v2 +; NOOPT-NEXT: v_mov_b32_e32 v30, v1 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; 
NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_mov_b32 s5, 15 +; NOOPT-NEXT: s_add_i32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_largest_inbounds_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s6, -1 +; SI-MOVREL-NEXT: s_mov_b32 s2, s6 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_mov_b32 s0, s10 +; SI-MOVREL-NEXT: s_mov_b32 s1, s11 +; SI-MOVREL-NEXT: s_mov_b32 s3, s7 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 s12, s12, 15 +; SI-MOVREL-NEXT: s_mov_b32 m0, s12 +; SI-MOVREL-NEXT: s_mov_b32 s4, s8 +; SI-MOVREL-NEXT: s_mov_b32 s5, s9 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extract_largest_inbounds_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s7 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; 
VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 48 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 32 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 16 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: s_add_i32 s2, s2, 15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s5 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extract_largest_inbounds_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s7 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 
0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s5 +; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 15 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: flat_store_dword v[16:17], v0 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_largest_inbounds_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_add_i32 s0, s0, 15 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[4:5] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %ld = load volatile <16 x i32>, ptr addrspace(1) %in %offset = add i32 %idx, 15 @@ -442,20 +6789,192 @@ entry: ret void } -; GCN-LABEL: {{^}}extract_out_of_bounds_offset: -; GCN-DAG: buffer_load_dwordx4 v[[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]] -; GCN-DAG: s_load_dword [[IDX:s[0-9]+]] -; GCN: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16 - -; MOVREL: 
s_mov_b32 m0, [[ADD_IDX]] -; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] - -; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0) -; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]] -; IDXMODE: s_set_gpr_idx_off - -; GCN: buffer_store_dword [[EXTRACT]] define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) { +; NOOPT-LABEL: extract_out_of_bounds_offset: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s7, s9 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; NOOPT-NEXT: s_mov_b32 s9, s7 +; NOOPT-NEXT: s_mov_b32 s10, s6 +; NOOPT-NEXT: s_mov_b32 s11, s5 +; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v16, v15 +; NOOPT-NEXT: v_mov_b32_e32 v17, v14 +; NOOPT-NEXT: v_mov_b32_e32 v18, v13 +; NOOPT-NEXT: v_mov_b32_e32 v19, v12 +; NOOPT-NEXT: v_mov_b32_e32 v20, v11 +; NOOPT-NEXT: v_mov_b32_e32 v21, v10 +; NOOPT-NEXT: v_mov_b32_e32 v22, v9 +; NOOPT-NEXT: v_mov_b32_e32 v23, v8 +; NOOPT-NEXT: 
v_mov_b32_e32 v24, v7 +; NOOPT-NEXT: v_mov_b32_e32 v25, v6 +; NOOPT-NEXT: v_mov_b32_e32 v26, v5 +; NOOPT-NEXT: v_mov_b32_e32 v27, v4 +; NOOPT-NEXT: v_mov_b32_e32 v28, v3 +; NOOPT-NEXT: v_mov_b32_e32 v29, v2 +; NOOPT-NEXT: v_mov_b32_e32 v30, v1 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_mov_b32 s5, 16 +; NOOPT-NEXT: s_add_i32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v0 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: extract_out_of_bounds_offset: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s6, -1 +; SI-MOVREL-NEXT: s_mov_b32 s2, s6 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_mov_b32 s0, s10 +; SI-MOVREL-NEXT: s_mov_b32 s1, s11 +; SI-MOVREL-NEXT: s_mov_b32 s3, s7 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], 
off, s[0:3], 0 offset:32 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 s12, s12, 16 +; SI-MOVREL-NEXT: s_mov_b32 m0, s12 +; SI-MOVREL-NEXT: s_mov_b32 s4, s8 +; SI-MOVREL-NEXT: s_mov_b32 s5, s9 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extract_out_of_bounds_offset: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s7 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 48 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 32 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 16 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: s_add_i32 s2, s2, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s5 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extract_out_of_bounds_offset: +; VI-IDXMODE: ; %bb.0: ; %entry +; 
VI-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s7 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s5 +; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: flat_store_dword v[16:17], v0 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extract_out_of_bounds_offset: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: 
global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_add_i32 s0, s0, 16 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[4:5] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %ld = load volatile <16 x i32>, ptr addrspace(1) %in %offset = add i32 %idx, 16 @@ -464,17 +6983,192 @@ entry: ret void } -; GCN-LABEL: {{^}}extractelement_v16i32_or_index: -; GCN: s_load_dword [[IDX_IN:s[0-9]+]] -; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]] - -; MOVREL: s_mov_b32 m0, [[IDX_SHL]] -; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} - -; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(SRC0) -; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; IDXMODE: s_set_gpr_idx_off define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx.in) { +; NOOPT-LABEL: extractelement_v16i32_or_index: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s7, s9 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; NOOPT-NEXT: s_mov_b32 s9, s7 +; NOOPT-NEXT: s_mov_b32 s10, s6 +; NOOPT-NEXT: s_mov_b32 s11, s5 +; NOOPT-NEXT: 
buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v16, v15 +; NOOPT-NEXT: v_mov_b32_e32 v17, v14 +; NOOPT-NEXT: v_mov_b32_e32 v18, v13 +; NOOPT-NEXT: v_mov_b32_e32 v19, v12 +; NOOPT-NEXT: v_mov_b32_e32 v20, v11 +; NOOPT-NEXT: v_mov_b32_e32 v21, v10 +; NOOPT-NEXT: v_mov_b32_e32 v22, v9 +; NOOPT-NEXT: v_mov_b32_e32 v23, v8 +; NOOPT-NEXT: v_mov_b32_e32 v24, v7 +; NOOPT-NEXT: v_mov_b32_e32 v25, v6 +; NOOPT-NEXT: v_mov_b32_e32 v26, v5 +; NOOPT-NEXT: v_mov_b32_e32 v27, v4 +; NOOPT-NEXT: v_mov_b32_e32 v28, v3 +; NOOPT-NEXT: v_mov_b32_e32 v29, v2 +; NOOPT-NEXT: v_mov_b32_e32 v30, v1 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v30 +; NOOPT-NEXT: v_mov_b32_e32 v2, v29 +; NOOPT-NEXT: v_mov_b32_e32 v3, v28 +; NOOPT-NEXT: v_mov_b32_e32 v4, v27 +; NOOPT-NEXT: v_mov_b32_e32 v5, v26 +; NOOPT-NEXT: v_mov_b32_e32 v6, v25 +; NOOPT-NEXT: v_mov_b32_e32 v7, v24 +; NOOPT-NEXT: v_mov_b32_e32 v8, v23 +; NOOPT-NEXT: v_mov_b32_e32 v9, v22 +; NOOPT-NEXT: v_mov_b32_e32 v10, v21 +; NOOPT-NEXT: v_mov_b32_e32 v11, v20 +; NOOPT-NEXT: v_mov_b32_e32 v12, v19 +; NOOPT-NEXT: v_mov_b32_e32 v13, v18 +; NOOPT-NEXT: v_mov_b32_e32 v14, v17 +; NOOPT-NEXT: v_mov_b32_e32 v15, v16 +; NOOPT-NEXT: s_mov_b32 s5, 2 +; NOOPT-NEXT: s_lshl_b32 s4, s4, s5 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movrels_b32_e32 v0, v1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; NOOPT-NEXT: 
s_endpgm +; +; SI-MOVREL-LABEL: extractelement_v16i32_or_index: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s6, -1 +; SI-MOVREL-NEXT: s_mov_b32 s2, s6 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_mov_b32 s0, s10 +; SI-MOVREL-NEXT: s_mov_b32 s1, s11 +; SI-MOVREL-NEXT: s_mov_b32 s3, s7 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_lshl_b32 s0, s12, 2 +; SI-MOVREL-NEXT: s_mov_b32 m0, s0 +; SI-MOVREL-NEXT: s_mov_b32 s4, s8 +; SI-MOVREL-NEXT: s_mov_b32 s5, s9 +; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: extractelement_v16i32_or_index: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s7 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 48 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 32 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: 
v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_add_u32 s0, s6, 16 +; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: s_lshl_b32 s0, s2, 2 +; VI-MOVREL-NEXT: s_mov_b32 m0, s0 +; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s5 +; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1 +; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: extractelement_v16i32_or_index: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s7 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s5 +; VI-IDXMODE-NEXT: s_lshl_b32 s0, s2, 2 +; VI-IDXMODE-NEXT: 
s_set_gpr_idx_on s0, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v1 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: flat_store_dword v[16:17], v0 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: extractelement_v16i32_or_index: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[4:5] +; GFX9-IDXMODE-NEXT: s_endpgm entry: %ld = load volatile <16 x i32>, ptr addrspace(1) %in %idx.shl = shl i32 %idx.in, 2 @@ -484,17 +7178,249 @@ entry: ret void } -; GCN-LABEL: {{^}}insertelement_v16f32_or_index: -; GCN: s_load_dword [[IDX_IN:s[0-9]+]] -; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]] - -; MOVREL: s_mov_b32 m0, [[IDX_SHL]] -; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} - -; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(DST) -; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; IDXMODE: s_set_gpr_idx_off define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, <16 x float> %a, i32 %idx.in) nounwind { +; NOOPT-LABEL: insertelement_v16f32_or_index: +; NOOPT: ; %bb.0: +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dwordx16 s[8:23], 
s[2:3], 0x19 +; NOOPT-NEXT: s_load_dword s4, s[2:3], 0x29 +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; NOOPT-NEXT: s_mov_b32 s5, 0xf000 +; NOOPT-NEXT: s_mov_b32 s6, -1 +; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; NOOPT-NEXT: s_mov_b32 s1, s7 +; NOOPT-NEXT: s_mov_b32 s2, s6 +; NOOPT-NEXT: s_mov_b32 s3, s5 +; NOOPT-NEXT: s_mov_b32 s5, 2 +; NOOPT-NEXT: s_lshl_b32 s4, s4, s5 +; NOOPT-NEXT: v_mov_b32_e32 v0, 0x40a00000 +; NOOPT-NEXT: v_mov_b32_e32 v7, s8 +; NOOPT-NEXT: v_mov_b32_e32 v8, s9 +; NOOPT-NEXT: v_mov_b32_e32 v9, s10 +; NOOPT-NEXT: v_mov_b32_e32 v10, s11 +; NOOPT-NEXT: v_mov_b32_e32 v11, s12 +; NOOPT-NEXT: v_mov_b32_e32 v12, s13 +; NOOPT-NEXT: v_mov_b32_e32 v13, s14 +; NOOPT-NEXT: v_mov_b32_e32 v14, s15 +; NOOPT-NEXT: v_mov_b32_e32 v15, s16 +; NOOPT-NEXT: v_mov_b32_e32 v16, s17 +; NOOPT-NEXT: v_mov_b32_e32 v17, s18 +; NOOPT-NEXT: v_mov_b32_e32 v18, s19 +; NOOPT-NEXT: v_mov_b32_e32 v19, s20 +; NOOPT-NEXT: v_mov_b32_e32 v20, s21 +; NOOPT-NEXT: v_mov_b32_e32 v21, s22 +; NOOPT-NEXT: v_mov_b32_e32 v22, s23 +; NOOPT-NEXT: s_mov_b32 m0, s4 +; NOOPT-NEXT: v_movreld_b32_e32 v8, v0 +; NOOPT-NEXT: v_mov_b32_e32 v4, v22 +; NOOPT-NEXT: v_mov_b32_e32 v5, v21 +; NOOPT-NEXT: v_mov_b32_e32 v6, v20 +; NOOPT-NEXT: v_mov_b32_e32 v0, v19 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; NOOPT-NEXT: v_mov_b32_e32 v4, v18 +; NOOPT-NEXT: v_mov_b32_e32 v5, v17 +; NOOPT-NEXT: v_mov_b32_e32 v6, v16 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v15 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: 
v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; NOOPT-NEXT: v_mov_b32_e32 v4, v14 +; NOOPT-NEXT: v_mov_b32_e32 v5, v13 +; NOOPT-NEXT: v_mov_b32_e32 v6, v12 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v11 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; NOOPT-NEXT: v_mov_b32_e32 v4, v10 +; NOOPT-NEXT: v_mov_b32_e32 v5, v9 +; NOOPT-NEXT: v_mov_b32_e32 v6, v8 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, v7 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v6 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v4 +; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insertelement_v16f32_or_index: +; SI-MOVREL: ; %bb.0: +; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x29 +; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; SI-MOVREL-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; SI-MOVREL-NEXT: s_mov_b32 s23, 0xf000 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_lshl_b32 s0, s0, 2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; 
SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; SI-MOVREL-NEXT: s_mov_b32 m0, s0 +; SI-MOVREL-NEXT: s_mov_b32 s22, -1 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insertelement_v16f32_or_index: +; VI-MOVREL: ; %bb.0: +; VI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0xa4 +; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_lshl_b32 s2, s20, 2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; 
VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_nop 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insertelement_v16f32_or_index: +; VI-IDXMODE: ; %bb.0: +; VI-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4 +; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: s_lshl_b32 s3, s20, 2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s18 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s19 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, v16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 
v[16:17], v[12:15] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_nop 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insertelement_v16f32_or_index: +; GFX9-IDXMODE: ; %bb.0: +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x40a00000 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-IDXMODE-NEXT: s_lshl_b32 s2, s20, 2 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v17 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; 
GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX9-IDXMODE-NEXT: s_endpgm %idx.shl = shl i32 %idx.in, 2 %idx = or i32 %idx.shl, 1 %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %idx @@ -502,57 +7428,891 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ret void } -; GCN-LABEL: {{^}}broken_phi_bb: -; GCN: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8 - -; GCN: {{.LBB[0-9]+_[0-9]+}}: -; GCN: [[BB2:.LBB[0-9]+_[0-9]+]]: -; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]] -; GCN: buffer_load_dword - -; GCN: [[REGLOOP:.LBB[0-9]+_[0-9]+]]: -; MOVREL: v_movreld_b32_e32 - -; IDXMODE: s_set_gpr_idx_on -; IDXMODE: v_mov_b32_e32 -; IDXMODE: s_set_gpr_idx_off - -; GCN: s_cbranch_execnz [[REGLOOP]] - -; GCN: {{^; %bb.[0-9]}}: -; GCN: s_mov_b64 exec, -; GCN: s_cbranch_execnz [[BB2]] - -define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 { +define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { +; NOOPT-LABEL: broken_phi_bb: +; NOOPT: ; %bb.0: ; %bb +; NOOPT-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s26, -1 +; NOOPT-NEXT: s_mov_b32 s27, 0xe8f000 +; NOOPT-NEXT: s_add_u32 s24, s24, s9 +; NOOPT-NEXT: s_addc_u32 s25, s25, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; NOOPT-NEXT: s_load_dword s1, s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s0, s[2:3], 0xa +; NOOPT-NEXT: s_waitcnt lgkmcnt(0) +; NOOPT-NEXT: v_writelane_b32 v0, s1, 0 +; NOOPT-NEXT: s_mov_b32 s1, 8 +; NOOPT-NEXT: v_writelane_b32 v0, s0, 1 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, 8 +; NOOPT-NEXT: 
buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: .LBB26_1: ; %bb2 +; NOOPT-NEXT: ; =>This Loop Header: Depth=1 +; NOOPT-NEXT: ; Child Loop BB26_3 Depth 2 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s2, v0, 0 +; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 s[0:1], -1 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_cmp_ge_i32_e64 s[2:3], v1, s2 +; NOOPT-NEXT: v_mov_b32_e32 v1, s4 +; NOOPT-NEXT: s_and_b64 vcc, exec, s[2:3] +; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: v_writelane_b32 v0, s0, 2 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 3 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_cbranch_vccnz .LBB26_6 +; NOOPT-NEXT: ; %bb.2: ; %bb4 +; NOOPT-NEXT: ; in Loop: Header=BB26_1 Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v16, 1 +; NOOPT-NEXT: ; implicit-def: $sgpr2_sgpr3 +; NOOPT-NEXT: ; kill: def $sgpr3 killed $sgpr3 killed $sgpr2_sgpr3 +; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5 +; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; NOOPT-NEXT: s_mov_b32 s1, 0xf000 +; NOOPT-NEXT: s_mov_b32 s2, -1 +; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b32 s5, s3 +; NOOPT-NEXT: s_mov_b32 s6, s2 +; NOOPT-NEXT: s_mov_b32 s7, s1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: 
buffer_load_dword v0, off, s[4:7], 0 glc +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:80 ; 4-byte Folded Spill +; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, s4 +; NOOPT-NEXT: v_mov_b32_e32 v1, s5 +; NOOPT-NEXT: v_mov_b32_e32 v2, s6 +; NOOPT-NEXT: v_mov_b32_e32 v3, s7 +; NOOPT-NEXT: v_mov_b32_e32 v4, s8 +; NOOPT-NEXT: v_mov_b32_e32 v5, s9 +; NOOPT-NEXT: v_mov_b32_e32 v6, s10 +; NOOPT-NEXT: v_mov_b32_e32 v7, s11 +; NOOPT-NEXT: v_mov_b32_e32 v8, s12 +; NOOPT-NEXT: v_mov_b32_e32 v9, s13 +; NOOPT-NEXT: v_mov_b32_e32 v10, s14 +; NOOPT-NEXT: v_mov_b32_e32 v11, s15 +; NOOPT-NEXT: v_mov_b32_e32 v12, s16 +; NOOPT-NEXT: v_mov_b32_e32 v13, s17 +; NOOPT-NEXT: v_mov_b32_e32 v14, s18 +; NOOPT-NEXT: v_mov_b32_e32 v15, s19 +; NOOPT-NEXT: v_mov_b32_e32 v17, s0 +; NOOPT-NEXT: buffer_store_dword v17, off, s[24:27], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v16, s0, 4 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 5 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_store_dword v16, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill +; 
NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB26_3: ; Parent Loop BB26_1 Depth=1 +; NOOPT-NEXT: ; => This Inner Loop Header: Depth=2 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[24:27], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[24:27], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[24:27], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[24:27], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[24:27], 0 offset:40 ; 4-byte Folded Reload +; 
NOOPT-NEXT: buffer_load_dword v9, off, s[24:27], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[24:27], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[24:27], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[24:27], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:68 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: v_movreld_b32_e32 v1, v17 +; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:84 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:88 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:92 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:96 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:100 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:104 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:108 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:112 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:116 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 
0 offset:120 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:124 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:132 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[24:27], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, 
s[24:27], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 6 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 7 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB26_3 +; NOOPT-NEXT: ; %bb.4: ; in Loop: Header=BB26_1 Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 5 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.5: ; in Loop: Header=BB26_1 Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:84 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:88 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[24:27], 0 offset:92 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[24:27], 0 offset:96 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[24:27], 0 offset:100 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[24:27], 0 offset:104 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[24:27], 0 offset:108 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[24:27], 0 offset:112 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[24:27], 0 offset:116 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[24:27], 0 offset:120 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[24:27], 0 
offset:124 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:132 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 s[0:1], 0 +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: v_writelane_b32 v0, s0, 2 +; NOOPT-NEXT: v_writelane_b32 v0, s1, 3 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: .LBB26_6: ; %Flow +; NOOPT-NEXT: ; in Loop: Header=BB26_1 Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: s_waitcnt expcnt(1) +; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[20:21] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v1, 2 +; NOOPT-NEXT: v_readlane_b32 s1, v1, 3 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; NOOPT-NEXT: s_mov_b32 s0, 1 +; NOOPT-NEXT: ; implicit-def: $sgpr1 +; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 +; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_cbranch_vccnz .LBB26_1 +; NOOPT-NEXT: ; %bb.7: ; %bb8 +; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, 
s[20:21] +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: broken_phi_bb: +; SI-MOVREL: ; %bb.0: ; %bb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 8 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: s_branch .LBB26_2 +; SI-MOVREL-NEXT: .LBB26_1: +; SI-MOVREL-NEXT: ; implicit-def: $vgpr0 +; SI-MOVREL-NEXT: s_branch .LBB26_6 +; SI-MOVREL-NEXT: .LBB26_2: ; %bb2 +; SI-MOVREL-NEXT: ; =>This Loop Header: Depth=1 +; SI-MOVREL-NEXT: ; Child Loop BB26_4 Depth 2 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: v_cmp_le_i32_e32 vcc, s0, v0 +; SI-MOVREL-NEXT: s_cbranch_vccnz .LBB26_1 +; SI-MOVREL-NEXT: ; %bb.3: ; %bb4 +; SI-MOVREL-NEXT: ; in Loop: Header=BB26_2 Depth=1 +; SI-MOVREL-NEXT: buffer_load_dword v16, off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s1 +; SI-MOVREL-NEXT: s_mov_b64 s[4:5], exec +; SI-MOVREL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-MOVREL-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1 +; SI-MOVREL-NEXT: ; => This Inner Loop Header: Depth=2 +; SI-MOVREL-NEXT: v_readfirstlane_b32 s6, v16 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v16 +; SI-MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; SI-MOVREL-NEXT: s_mov_b32 m0, s6 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v17 +; SI-MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; SI-MOVREL-NEXT: s_cbranch_execnz .LBB26_4 +; SI-MOVREL-NEXT: ; %bb.5: ; in Loop: Header=BB26_2 Depth=1 +; SI-MOVREL-NEXT: s_mov_b64 exec, s[4:5] +; SI-MOVREL-NEXT: s_cbranch_execnz .LBB26_2 +; SI-MOVREL-NEXT: .LBB26_6: ; %bb8 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: broken_phi_bb: +; VI-MOVREL: ; %bb.0: ; %bb +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 8 +; VI-MOVREL-NEXT: s_branch .LBB26_2 +; VI-MOVREL-NEXT: .LBB26_1: +; 
VI-MOVREL-NEXT: ; implicit-def: $vgpr0 +; VI-MOVREL-NEXT: s_branch .LBB26_6 +; VI-MOVREL-NEXT: .LBB26_2: ; %bb2 +; VI-MOVREL-NEXT: ; =>This Loop Header: Depth=1 +; VI-MOVREL-NEXT: ; Child Loop BB26_4 Depth 2 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: v_cmp_le_i32_e32 vcc, s0, v0 +; VI-MOVREL-NEXT: s_cbranch_vccnz .LBB26_1 +; VI-MOVREL-NEXT: ; %bb.3: ; %bb4 +; VI-MOVREL-NEXT: ; in Loop: Header=BB26_2 Depth=1 +; VI-MOVREL-NEXT: flat_load_dword v16, v[0:1] glc +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1 +; VI-MOVREL-NEXT: s_mov_b64 s[2:3], exec +; VI-MOVREL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-MOVREL-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1 +; VI-MOVREL-NEXT: ; => This Inner Loop Header: Depth=2 +; VI-MOVREL-NEXT: v_readfirstlane_b32 s4, v16 +; VI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16 +; VI-MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; VI-MOVREL-NEXT: s_mov_b32 m0, s4 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v17 +; VI-MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; VI-MOVREL-NEXT: s_cbranch_execnz .LBB26_4 +; VI-MOVREL-NEXT: ; %bb.5: ; in Loop: Header=BB26_2 Depth=1 +; VI-MOVREL-NEXT: s_mov_b64 exec, s[2:3] +; VI-MOVREL-NEXT: s_cbranch_execnz .LBB26_2 +; VI-MOVREL-NEXT: .LBB26_6: ; %bb8 +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: broken_phi_bb: +; VI-IDXMODE: ; %bb.0: ; %bb +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 8 +; VI-IDXMODE-NEXT: s_branch .LBB26_2 +; VI-IDXMODE-NEXT: .LBB26_1: +; VI-IDXMODE-NEXT: ; implicit-def: $vgpr0 +; VI-IDXMODE-NEXT: s_branch .LBB26_6 +; VI-IDXMODE-NEXT: .LBB26_2: ; %bb2 +; VI-IDXMODE-NEXT: ; =>This Loop Header: Depth=1 +; VI-IDXMODE-NEXT: ; Child Loop BB26_4 Depth 2 +; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; VI-IDXMODE-NEXT: v_cmp_le_i32_e32 vcc, s0, v0 +; VI-IDXMODE-NEXT: s_cbranch_vccnz .LBB26_1 +; VI-IDXMODE-NEXT: ; %bb.3: ; %bb4 +; 
VI-IDXMODE-NEXT: ; in Loop: Header=BB26_2 Depth=1 +; VI-IDXMODE-NEXT: flat_load_dword v16, v[0:1] glc +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s1 +; VI-IDXMODE-NEXT: s_mov_b64 s[2:3], exec +; VI-IDXMODE-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-IDXMODE-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1 +; VI-IDXMODE-NEXT: ; => This Inner Loop Header: Depth=2 +; VI-IDXMODE-NEXT: v_readfirstlane_b32 s4, v16 +; VI-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16 +; VI-IDXMODE-NEXT: s_and_saveexec_b64 vcc, vcc +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v17 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_xor_b64 exec, exec, vcc +; VI-IDXMODE-NEXT: s_cbranch_execnz .LBB26_4 +; VI-IDXMODE-NEXT: ; %bb.5: ; in Loop: Header=BB26_2 Depth=1 +; VI-IDXMODE-NEXT: s_mov_b64 exec, s[2:3] +; VI-IDXMODE-NEXT: s_cbranch_execnz .LBB26_2 +; VI-IDXMODE-NEXT: .LBB26_6: ; %bb8 +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: broken_phi_bb: +; GFX9-IDXMODE: ; %bb.0: ; %bb +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-IDXMODE-NEXT: s_branch .LBB26_2 +; GFX9-IDXMODE-NEXT: .LBB26_1: +; GFX9-IDXMODE-NEXT: ; implicit-def: $vgpr0 +; GFX9-IDXMODE-NEXT: s_branch .LBB26_6 +; GFX9-IDXMODE-NEXT: .LBB26_2: ; %bb2 +; GFX9-IDXMODE-NEXT: ; =>This Loop Header: Depth=1 +; GFX9-IDXMODE-NEXT: ; Child Loop BB26_4 Depth 2 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: v_cmp_le_i32_e32 vcc, s0, v0 +; GFX9-IDXMODE-NEXT: s_cbranch_vccnz .LBB26_1 +; GFX9-IDXMODE-NEXT: ; %bb.3: ; %bb4 +; GFX9-IDXMODE-NEXT: ; in Loop: Header=BB26_2 Depth=1 +; GFX9-IDXMODE-NEXT: global_load_dword v16, v[0:1], off glc +; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s1 +; GFX9-IDXMODE-NEXT: s_mov_b64 s[2:3], exec +; GFX9-IDXMODE-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-IDXMODE-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1 +; GFX9-IDXMODE-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX9-IDXMODE-NEXT: v_readfirstlane_b32 s4, v16 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16 +; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 vcc, vcc +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v17 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: s_xor_b64 exec, exec, vcc +; GFX9-IDXMODE-NEXT: s_cbranch_execnz .LBB26_4 +; GFX9-IDXMODE-NEXT: ; %bb.5: ; in Loop: Header=BB26_2 Depth=1 +; GFX9-IDXMODE-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-IDXMODE-NEXT: s_cbranch_execnz .LBB26_2 +; GFX9-IDXMODE-NEXT: .LBB26_6: ; %bb8 +; GFX9-IDXMODE-NEXT: s_endpgm bb: br label %bb2 -bb2: ; preds = %bb4, %bb +bb2: %tmp = phi i32 [ 8, %bb ], [ %tmp7, %bb4 ] %tmp3 = icmp slt i32 %tmp, %arg br i1 %tmp3, label %bb4, label %bb8 -bb4: ; preds = %bb2 +bb4: %vgpr = load volatile i32, ptr addrspace(1) undef %tmp5 = insertelement <16 x i32> undef, i32 undef, i32 %vgpr %tmp6 = insertelement <16 x i32> %tmp5, i32 %arg1, i32 %vgpr %tmp7 = extractelement <16 x i32> %tmp6, i32 0 br label %bb2 -bb8: ; preds = %bb2 +bb8: ret void } -; GCN-LABEL: {{^}}insert_or_disj_index: -; GCN: v_mov_b32_e32 v[[#VIDX:]], 0 - -; MOVREL: s_mov_b32 m0, s{{[0-9]+}} -; MOVREL: v_movreld_b32_e32 v[[#VIDX + 1]], v{{[0-9]+}} - -; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(DST) -; IDXMODE: v_mov_b32_e32 v[[#VIDX + 1]], v{{[0-9]+}} -; IDXMODE: s_set_gpr_idx_off define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace(4) %in, i32 %val, <4 x i32> inreg %desc, i32 inreg %A) { +; NOOPT-LABEL: insert_or_disj_index: +; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; NOOPT-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; NOOPT-NEXT: s_mov_b32 s18, -1 +; NOOPT-NEXT: s_mov_b32 s19, 
0xe8f000 +; NOOPT-NEXT: s_add_u32 s16, s16, s5 +; NOOPT-NEXT: s_addc_u32 s17, s17, 0 +; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane +; NOOPT-NEXT: v_writelane_b32 v16, s4, 0 +; NOOPT-NEXT: s_mov_b32 s4, s1 +; NOOPT-NEXT: v_readlane_b32 s1, v16, 0 +; NOOPT-NEXT: v_writelane_b32 v16, s4, 1 +; NOOPT-NEXT: s_mov_b32 s4, s0 +; NOOPT-NEXT: v_readlane_b32 s0, v16, 1 +; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:144 ; 4-byte Folded Spill +; NOOPT-NEXT: v_mov_b32_e32 v2, v1 +; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: s_mov_b32 s5, s0 +; NOOPT-NEXT: s_mov_b32 s6, s2 +; NOOPT-NEXT: s_mov_b32 s7, s3 +; NOOPT-NEXT: ; implicit-def: $sgpr0 +; NOOPT-NEXT: ; implicit-def: $sgpr0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v2 +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:136 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:140 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b32 s8, 0xf000 +; NOOPT-NEXT: s_mov_b32 s0, 0 +; NOOPT-NEXT: v_writelane_b32 v16, s0, 2 +; NOOPT-NEXT: s_mov_b32 s2, s0 +; NOOPT-NEXT: s_mov_b32 s3, s8 +; NOOPT-NEXT: s_mov_b32 s8, s0 +; NOOPT-NEXT: s_mov_b32 s9, s0 +; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9 killed $sgpr8_sgpr9 def $sgpr8_sgpr9_sgpr10_sgpr11 +; NOOPT-NEXT: s_mov_b64 s[10:11], s[2:3] +; NOOPT-NEXT: v_writelane_b32 v16, s8, 3 +; NOOPT-NEXT: v_writelane_b32 v16, s9, 4 +; NOOPT-NEXT: v_writelane_b32 v16, s10, 5 +; NOOPT-NEXT: v_writelane_b32 v16, s11, 6 +; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7 +; NOOPT-NEXT: ; implicit-def: $sgpr2_sgpr3 +; NOOPT-NEXT: s_waitcnt expcnt(1) +; NOOPT-NEXT: v_mov_b32_e32 v0, s1 +; NOOPT-NEXT: buffer_load_dword v0, v0, s[4:7], s0 offen +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:132 ; 4-byte Folded Spill 
+; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v0, s0 +; NOOPT-NEXT: v_mov_b32_e32 v31, s0 +; NOOPT-NEXT: v_mov_b32_e32 v30, s0 +; NOOPT-NEXT: v_mov_b32_e32 v29, s0 +; NOOPT-NEXT: v_mov_b32_e32 v28, s0 +; NOOPT-NEXT: v_mov_b32_e32 v27, s0 +; NOOPT-NEXT: v_mov_b32_e32 v26, s0 +; NOOPT-NEXT: v_mov_b32_e32 v25, s0 +; NOOPT-NEXT: v_mov_b32_e32 v24, s0 +; NOOPT-NEXT: v_mov_b32_e32 v23, s0 +; NOOPT-NEXT: v_mov_b32_e32 v22, s0 +; NOOPT-NEXT: v_mov_b32_e32 v21, s0 +; NOOPT-NEXT: v_mov_b32_e32 v20, s0 +; NOOPT-NEXT: v_mov_b32_e32 v19, s0 +; NOOPT-NEXT: v_mov_b32_e32 v18, s0 +; NOOPT-NEXT: v_mov_b32_e32 v17, s0 +; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v1, v31 +; NOOPT-NEXT: v_mov_b32_e32 v2, v30 +; NOOPT-NEXT: v_mov_b32_e32 v3, v29 +; NOOPT-NEXT: v_mov_b32_e32 v4, v28 +; NOOPT-NEXT: v_mov_b32_e32 v5, v27 +; NOOPT-NEXT: v_mov_b32_e32 v6, v26 +; NOOPT-NEXT: v_mov_b32_e32 v7, v25 +; NOOPT-NEXT: v_mov_b32_e32 v8, v24 +; NOOPT-NEXT: v_mov_b32_e32 v9, v23 +; NOOPT-NEXT: v_mov_b32_e32 v10, v22 +; NOOPT-NEXT: v_mov_b32_e32 v11, v21 +; NOOPT-NEXT: v_mov_b32_e32 v12, v20 +; NOOPT-NEXT: v_mov_b32_e32 v13, v19 +; NOOPT-NEXT: v_mov_b32_e32 v14, v18 +; NOOPT-NEXT: v_mov_b32_e32 v15, v17 +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:68 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:72 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:76 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:80 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:84 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:88 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:92 ; 4-byte Folded Spill 
+; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:96 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:100 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:104 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:108 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:112 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:116 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:120 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:124 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:128 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[0:1], exec +; NOOPT-NEXT: v_writelane_b32 v16, s0, 7 +; NOOPT-NEXT: v_writelane_b32 v16, s1, 8 +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: buffer_store_dword v16, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_waitcnt expcnt(1) +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:36 ; 
4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1 +; NOOPT-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1 +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 9 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 10 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:20 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:24 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v8, off, s[16:19], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v9, off, s[16:19], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v10, off, s[16:19], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v11, off, s[16:19], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v12, 
off, s[16:19], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v13, off, s[16:19], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v14, off, s[16:19], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v15, off, s[16:19], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v16, off, s[16:19], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readfirstlane_b32 s2, v18 +; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18 +; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; NOOPT-NEXT: s_mov_b32 m0, s2 +; NOOPT-NEXT: v_movreld_b32_e32 v2, v17 +; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:148 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:152 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:156 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:160 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:164 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:168 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:172 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:176 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:180 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:184 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:188 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:192 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:196 ; 4-byte 
Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:200 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:204 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[16:19], 0 offset:208 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 ; 4-byte Folded Spill +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:24 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:28 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:32 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:36 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:40 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:44 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:48 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:52 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:56 ; 4-byte Folded Spill +; NOOPT-NEXT: buffer_store_dword v16, off, s[16:19], 0 offset:60 ; 4-byte Folded Spill +; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1] +; NOOPT-NEXT: v_writelane_b32 v0, s2, 9 +; NOOPT-NEXT: v_writelane_b32 v0, s3, 10 +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill +; 
NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1] +; NOOPT-NEXT: s_cbranch_execnz .LBB27_1 +; NOOPT-NEXT: ; %bb.2: +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 7 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 8 +; NOOPT-NEXT: s_mov_b64 exec, s[0:1] +; NOOPT-NEXT: ; %bb.3: +; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 +; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_mov_b64 exec, s[12:13] +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_readlane_b32 s0, v0, 3 +; NOOPT-NEXT: v_readlane_b32 s1, v0, 4 +; NOOPT-NEXT: v_readlane_b32 s2, v0, 5 +; NOOPT-NEXT: v_readlane_b32 s3, v0, 6 +; NOOPT-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v18, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v19, off, s[16:19], 0 offset:152 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v20, off, s[16:19], 0 offset:156 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v21, off, s[16:19], 0 offset:160 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v22, off, s[16:19], 0 offset:164 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v23, off, s[16:19], 0 offset:168 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v24, off, s[16:19], 0 offset:172 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v25, off, s[16:19], 0 offset:176 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v26, off, s[16:19], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v27, off, s[16:19], 0 offset:184 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword 
v28, off, s[16:19], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v29, off, s[16:19], 0 offset:192 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v30, off, s[16:19], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v31, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v32, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload +; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:208 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt vmcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v7, v21 +; NOOPT-NEXT: v_mov_b32_e32 v8, v20 +; NOOPT-NEXT: v_mov_b32_e32 v9, v19 +; NOOPT-NEXT: v_mov_b32_e32 v1, v18 +; NOOPT-NEXT: v_mov_b32_e32 v2, v25 +; NOOPT-NEXT: v_mov_b32_e32 v3, v24 +; NOOPT-NEXT: v_mov_b32_e32 v4, v23 +; NOOPT-NEXT: v_mov_b32_e32 v10, v22 +; NOOPT-NEXT: v_mov_b32_e32 v15, v29 +; NOOPT-NEXT: v_mov_b32_e32 v16, v28 +; NOOPT-NEXT: v_mov_b32_e32 v17, v27 +; NOOPT-NEXT: v_mov_b32_e32 v11, v26 +; NOOPT-NEXT: v_mov_b32_e32 v12, v33 +; NOOPT-NEXT: v_mov_b32_e32 v13, v32 +; NOOPT-NEXT: v_mov_b32_e32 v14, v31 +; NOOPT-NEXT: v_mov_b32_e32 v18, v30 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19_vgpr20_vgpr21 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v19, v14 +; NOOPT-NEXT: v_mov_b32_e32 v20, v13 +; NOOPT-NEXT: v_mov_b32_e32 v21, v12 +; NOOPT-NEXT: v_mov_b32_e32 v13, v6 +; NOOPT-NEXT: v_mov_b32_e32 v12, v5 +; NOOPT-NEXT: buffer_store_dwordx4 v[18:21], v[12:13], s[0:3], 0 addr64 offset:48 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12_vgpr13_vgpr14 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v12, v17 +; NOOPT-NEXT: v_mov_b32_e32 v13, v16 +; NOOPT-NEXT: 
v_mov_b32_e32 v14, v15 +; NOOPT-NEXT: v_mov_b32_e32 v16, v6 +; NOOPT-NEXT: v_mov_b32_e32 v15, v5 +; NOOPT-NEXT: buffer_store_dwordx4 v[11:14], v[15:16], s[0:3], 0 addr64 offset:32 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11_vgpr12_vgpr13 killed $exec +; NOOPT-NEXT: s_waitcnt expcnt(0) +; NOOPT-NEXT: v_mov_b32_e32 v11, v4 +; NOOPT-NEXT: v_mov_b32_e32 v12, v3 +; NOOPT-NEXT: v_mov_b32_e32 v13, v2 +; NOOPT-NEXT: v_mov_b32_e32 v2, v5 +; NOOPT-NEXT: v_mov_b32_e32 v3, v6 +; NOOPT-NEXT: buffer_store_dwordx4 v[10:13], v[2:3], s[0:3], 0 addr64 offset:16 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; implicit-def: $sgpr4 +; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; NOOPT-NEXT: v_mov_b32_e32 v2, v9 +; NOOPT-NEXT: v_mov_b32_e32 v3, v8 +; NOOPT-NEXT: v_mov_b32_e32 v4, v7 +; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], v[5:6], s[0:3], 0 addr64 +; NOOPT-NEXT: ; kill: killed $vgpr0 +; NOOPT-NEXT: s_endpgm +; +; SI-MOVREL-LABEL: insert_or_disj_index: +; SI-MOVREL: ; %bb.0: ; %entry +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s4 +; SI-MOVREL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; SI-MOVREL-NEXT: s_mov_b32 s2, 0 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s0, s2 +; SI-MOVREL-NEXT: s_mov_b32 s1, s2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, v5 +; SI-MOVREL-NEXT: 
v_mov_b32_e32 v16, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v17, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v18, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v19, v5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v20, v5 +; SI-MOVREL-NEXT: s_mov_b64 s[4:5], exec +; SI-MOVREL-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1 +; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: v_readfirstlane_b32 s6, v2 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2 +; SI-MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; SI-MOVREL-NEXT: s_mov_b32 m0, s6 +; SI-MOVREL-NEXT: v_movreld_b32_e32 v6, v4 +; SI-MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; SI-MOVREL-NEXT: s_cbranch_execnz .LBB27_1 +; SI-MOVREL-NEXT: ; %bb.2: +; SI-MOVREL-NEXT: s_mov_b64 exec, s[4:5] +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[17:20], v[0:1], s[0:3], 0 addr64 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[13:16], v[0:1], s[0:3], 0 addr64 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[9:12], v[0:1], s[0:3], 0 addr64 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[5:8], v[0:1], s[0:3], 0 addr64 +; SI-MOVREL-NEXT: s_endpgm +; +; VI-MOVREL-LABEL: insert_or_disj_index: +; VI-MOVREL: ; %bb.0: ; %entry +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s4 +; VI-MOVREL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v18, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v19, v5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v20, v5 +; VI-MOVREL-NEXT: s_mov_b64 s[0:1], exec +; VI-MOVREL-NEXT: .LBB27_1: ; =>This Inner Loop Header: 
Depth=1 +; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_readfirstlane_b32 s2, v2 +; VI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; VI-MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: v_movreld_b32_e32 v6, v4 +; VI-MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; VI-MOVREL-NEXT: s_cbranch_execnz .LBB27_1 +; VI-MOVREL-NEXT: ; %bb.2: +; VI-MOVREL-NEXT: s_mov_b64 exec, s[0:1] +; VI-MOVREL-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-MOVREL-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-MOVREL-NEXT: flat_store_dwordx4 v[2:3], v[17:20] +; VI-MOVREL-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-MOVREL-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-MOVREL-NEXT: flat_store_dwordx4 v[2:3], v[13:16] +; VI-MOVREL-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-MOVREL-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-MOVREL-NEXT: flat_store_dwordx4 v[2:3], v[9:12] +; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; VI-MOVREL-NEXT: s_endpgm +; +; VI-IDXMODE-LABEL: insert_or_disj_index: +; VI-IDXMODE: ; %bb.0: ; %entry +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s4 +; VI-IDXMODE-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v18, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v19, v5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v20, v5 +; VI-IDXMODE-NEXT: s_mov_b64 s[0:1], exec +; VI-IDXMODE-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1 +; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; VI-IDXMODE-NEXT: 
v_readfirstlane_b32 s2, v2 +; VI-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; VI-IDXMODE-NEXT: s_and_saveexec_b64 vcc, vcc +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, v4 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_xor_b64 exec, exec, vcc +; VI-IDXMODE-NEXT: s_cbranch_execnz .LBB27_1 +; VI-IDXMODE-NEXT: ; %bb.2: +; VI-IDXMODE-NEXT: s_mov_b64 exec, s[0:1] +; VI-IDXMODE-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-IDXMODE-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[2:3], v[17:20] +; VI-IDXMODE-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-IDXMODE-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[2:3], v[13:16] +; VI-IDXMODE-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-IDXMODE-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[2:3], v[9:12] +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; VI-IDXMODE-NEXT: s_endpgm +; +; GFX9-IDXMODE-LABEL: insert_or_disj_index: +; GFX9-IDXMODE: ; %bb.0: ; %entry +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-IDXMODE-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-IDXMODE-NEXT: s_mov_b64 s[0:1], exec +; GFX9-IDXMODE-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1 +; 
GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 vcc, vcc +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off +; GFX9-IDXMODE-NEXT: s_xor_b64 exec, exec, vcc +; GFX9-IDXMODE-NEXT: s_cbranch_execnz .LBB27_1 +; GFX9-IDXMODE-NEXT: ; %bb.2: +; GFX9-IDXMODE-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v[0:1], v[17:20], off offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v[0:1], v[13:16], off offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v[0:1], v[9:12], off offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v[0:1], v[5:8], off +; GFX9-IDXMODE-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %A, i32 0, i32 0) %off = or disjoint i32 %idx, 1 @@ -560,10 +8320,3 @@ entry: store <16 x i32> %v, ptr addrspace(1) %out ret void } - -declare i32 @llvm.amdgcn.workitem.id.x() #1 -declare void @llvm.amdgcn.s.barrier() #2 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind convergent } From 8ba2ae31fa6a386d42aec5dedd685e99747dbf0f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 22 Aug 2024 08:12:51 -0700 Subject: [PATCH 212/426] [RISCV][GISel] Implement canLowerReturn. (#105465) This allows us to handle return values that are too large to fit in x10 and x11. They will be converted to a sret by passing a pointer to where to store the return value. 
--- .../Target/RISCV/GISel/RISCVCallLowering.cpp | 87 ++++++---- .../Target/RISCV/GISel/RISCVCallLowering.h | 4 + ...calling-conv-ilp32-ilp32f-ilp32d-common.ll | 151 ++++++++++++++++++ .../calling-conv-lp64-lp64f-lp64d-common.ll | 151 ++++++++++++++++++ 4 files changed, 363 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index 33371512706469..b274a8fc45c5ce 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -17,6 +17,7 @@ #include "RISCVMachineFunctionInfo.h" #include "RISCVSubtarget.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -360,13 +361,7 @@ static bool isSupportedArgumentType(Type *T, const RISCVSubtarget &Subtarget, // lowerCall. static bool isSupportedReturnType(Type *T, const RISCVSubtarget &Subtarget, bool IsLowerRetVal = false) { - // TODO: Integers larger than 2*XLen are passed indirectly which is not - // supported yet. 
- if (T->isIntegerTy()) - return T->getIntegerBitWidth() <= Subtarget.getXLen() * 2; - if (T->isHalfTy() || T->isFloatTy() || T->isDoubleTy()) - return true; - if (T->isPointerTy()) + if (T->isIntegerTy() || T->isFloatingPointTy() || T->isPointerTy()) return true; if (T->isArrayTy()) @@ -394,10 +389,13 @@ bool RISCVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, assert(!Val == VRegs.empty() && "Return value without a vreg"); MachineInstrBuilder Ret = MIRBuilder.buildInstrNoInsert(RISCV::PseudoRET); - if (!VRegs.empty()) { + if (!FLI.CanLowerReturn) { + insertSRetStores(MIRBuilder, Val->getType(), VRegs, FLI.DemoteRegister); + } else if (!VRegs.empty()) { const RISCVSubtarget &Subtarget = MIRBuilder.getMF().getSubtarget(); - if (!isSupportedReturnType(Val->getType(), Subtarget, /*IsLowerRetVal=*/true)) + if (!isSupportedReturnType(Val->getType(), Subtarget, + /*IsLowerRetVal=*/true)) return false; MachineFunction &MF = MIRBuilder.getMF(); @@ -418,7 +416,7 @@ bool RISCVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, /*IsRet=*/true, Dispatcher); RISCVOutgoingValueHandler Handler(MIRBuilder, MF.getRegInfo(), Ret); if (!determineAndHandleAssignments(Handler, Assigner, SplitRetInfos, - MIRBuilder, CC, F.isVarArg())) + MIRBuilder, CC, F.isVarArg())) return false; } @@ -426,6 +424,30 @@ bool RISCVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, return true; } +bool RISCVCallLowering::canLowerReturn(MachineFunction &MF, + CallingConv::ID CallConv, + SmallVectorImpl &Outs, + bool IsVarArg) const { + SmallVector ArgLocs; + const auto &TLI = *getTLI(); + CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, + MF.getFunction().getContext()); + + RVVArgDispatcher Dispatcher{&MF, &TLI, + ArrayRef(MF.getFunction().getReturnType())}; + + RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); + + for (unsigned I = 0, E = Outs.size(); I < E; ++I) { + MVT VT = MVT::getVT(Outs[I].Ty); + if (RISCV::CC_RISCV(MF.getDataLayout(), ABI, I, VT, VT, CCValAssign::Full, + 
Outs[I].Flags[0], CCInfo, /*IsFixed=*/true, + /*isRet=*/true, nullptr, TLI, Dispatcher)) + return false; + } + return true; +} + /// If there are varargs that were passed in a0-a7, the data in those registers /// must be copied to the varargs save area on the stack. void RISCVCallLowering::saveVarArgRegisters( @@ -498,24 +520,26 @@ bool RISCVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef> VRegs, FunctionLoweringInfo &FLI) const { - // Early exit if there are no arguments. varargs are not part of F.args() but - // must be lowered. - if (F.arg_empty() && !F.isVarArg()) - return true; + MachineFunction &MF = MIRBuilder.getMF(); - const RISCVSubtarget &Subtarget = - MIRBuilder.getMF().getSubtarget(); + const RISCVSubtarget &Subtarget = MF.getSubtarget(); for (auto &Arg : F.args()) { if (!isSupportedArgumentType(Arg.getType(), Subtarget, /*IsLowerArgs=*/true)) return false; } - MachineFunction &MF = MIRBuilder.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); const DataLayout &DL = MF.getDataLayout(); CallingConv::ID CC = F.getCallingConv(); SmallVector SplitArgInfos; + + // Insert the hidden sret parameter if the return value won't fit in the + // return registers. 
+ if (!FLI.CanLowerReturn) + insertSRetIncomingArgument(F, SplitArgInfos, FLI.DemoteRegister, MRI, DL); + SmallVector TypeList; unsigned Index = 0; for (auto &Arg : F.args()) { @@ -625,21 +649,24 @@ bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, *Subtarget.getRegBankInfo(), *Call, Call->getDesc(), Call->getOperand(0), 0); - if (Info.OrigRet.Ty->isVoidTy()) - return true; + if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) { + SmallVector SplitRetInfos; + splitToValueTypes(Info.OrigRet, SplitRetInfos, DL, CC); - SmallVector SplitRetInfos; - splitToValueTypes(Info.OrigRet, SplitRetInfos, DL, CC); + RVVArgDispatcher RetDispatcher{&MF, getTLI(), + ArrayRef(F.getReturnType())}; + RISCVIncomingValueAssigner RetAssigner( + CC == CallingConv::Fast ? RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, + /*IsRet=*/true, RetDispatcher); + RISCVCallReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), Call); + if (!determineAndHandleAssignments(RetHandler, RetAssigner, SplitRetInfos, + MIRBuilder, CC, Info.IsVarArg)) + return false; + } - RVVArgDispatcher RetDispatcher{&MF, getTLI(), - ArrayRef(F.getReturnType())}; - RISCVIncomingValueAssigner RetAssigner( - CC == CallingConv::Fast ? 
RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, - /*IsRet=*/true, RetDispatcher); - RISCVCallReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), Call); - if (!determineAndHandleAssignments(RetHandler, RetAssigner, SplitRetInfos, - MIRBuilder, CC, Info.IsVarArg)) - return false; + if (!Info.CanLowerReturn) + insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs, + Info.DemoteRegister, Info.DemoteStackIndex); return true; } diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.h b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.h index ec7fdbc26e24e8..1154449a580e7e 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.h +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.h @@ -32,6 +32,10 @@ class RISCVCallLowering : public CallLowering { ArrayRef VRegs, FunctionLoweringInfo &FLI) const override; + bool canLowerReturn(MachineFunction &MF, CallingConv::ID CallConv, + SmallVectorImpl &Outs, + bool IsVarArg) const override; + bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef> VRegs, FunctionLoweringInfo &FLI) const override; diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-ilp32-ilp32f-ilp32d-common.ll index 5ca1bf7467858e..b7eb4574cc4dde 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-ilp32-ilp32f-ilp32d-common.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-ilp32-ilp32f-ilp32d-common.ll @@ -945,6 +945,54 @@ define i32 @caller_small_struct_ret() nounwind { ret i32 %5 } +; Check return of >2x xlen scalars + +define fp128 @callee_large_scalar_ret() nounwind { + ; RV32I-LABEL: name: callee_large_scalar_ret + ; RV32I: bb.1 (%ir-block.0): + ; RV32I-NEXT: liveins: $x10 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; RV32I-NEXT: [[C:%[0-9]+]]:_(s128) = G_FCONSTANT fp128 0xL00000000000000007FFF000000000000 + ; RV32I-NEXT: G_STORE 
[[C]](s128), [[COPY]](p0) :: (store (s128)) + ; RV32I-NEXT: PseudoRET + ret fp128 0xL00000000000000007FFF000000000000 +} + +define void @caller_large_scalar_ret() nounwind { + ; ILP32-LABEL: name: caller_large_scalar_ret + ; ILP32: bb.1 (%ir-block.0): + ; ILP32-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; ILP32-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; ILP32-NEXT: $x10 = COPY [[FRAME_INDEX]](p0) + ; ILP32-NEXT: PseudoCALL target-flags(riscv-call) @callee_large_scalar_ret, csr_ilp32_lp64, implicit-def $x1, implicit $x10 + ; ILP32-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; ILP32-NEXT: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s128) from %stack.0) + ; ILP32-NEXT: PseudoRET + ; + ; ILP32F-LABEL: name: caller_large_scalar_ret + ; ILP32F: bb.1 (%ir-block.0): + ; ILP32F-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; ILP32F-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; ILP32F-NEXT: $x10 = COPY [[FRAME_INDEX]](p0) + ; ILP32F-NEXT: PseudoCALL target-flags(riscv-call) @callee_large_scalar_ret, csr_ilp32f_lp64f, implicit-def $x1, implicit $x10 + ; ILP32F-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; ILP32F-NEXT: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s128) from %stack.0) + ; ILP32F-NEXT: PseudoRET + ; + ; ILP32D-LABEL: name: caller_large_scalar_ret + ; ILP32D: bb.1 (%ir-block.0): + ; ILP32D-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; ILP32D-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; ILP32D-NEXT: $x10 = COPY [[FRAME_INDEX]](p0) + ; ILP32D-NEXT: PseudoCALL target-flags(riscv-call) @callee_large_scalar_ret, csr_ilp32d_lp64d, implicit-def $x1, implicit $x10 + ; ILP32D-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; ILP32D-NEXT: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s128) from %stack.0) + ; ILP32D-NEXT: PseudoRET + %1 = call fp128 
@callee_large_scalar_ret() + ret void +} + ; Check return of >2x xlen structs %struct.large = type { i32, i32, i32, i32 } @@ -1033,3 +1081,106 @@ define i32 @caller_large_struct_ret() nounwind { %5 = add i32 %2, %4 ret i32 %5 } + +%struct.large2 = type { i32, float, i16, i32 } + +define %struct.large2 @callee_large_struct_ret2() nounwind { + ; RV32I-LABEL: name: callee_large_struct_ret2 + ; RV32I: bb.1 (%ir-block.0): + ; RV32I-NEXT: liveins: $x10 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; RV32I-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; RV32I-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; RV32I-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV32I-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 + ; RV32I-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 3 + ; RV32I-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; RV32I-NEXT: G_STORE [[C]](s32), [[COPY]](p0) :: (store (s32), align 8) + ; RV32I-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; RV32I-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; RV32I-NEXT: G_STORE [[C1]](s32), [[PTR_ADD]](p0) :: (store (s32)) + ; RV32I-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; RV32I-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; RV32I-NEXT: G_STORE [[C2]](s16), [[PTR_ADD1]](p0) :: (store (s16), align 8) + ; RV32I-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; RV32I-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s32) + ; RV32I-NEXT: G_STORE [[C3]](s32), [[PTR_ADD2]](p0) :: (store (s32)) + ; RV32I-NEXT: PseudoRET + %a = insertvalue %struct.large2 poison, i32 1, 0 + %b = insertvalue %struct.large2 %a, float 2.0, 1 + %c = insertvalue %struct.large2 %b, i16 3, 2 + %d = insertvalue %struct.large2 %c, i32 4, 3 + ret %struct.large2 %d +} + +define i32 @caller_large_struct_ret2() nounwind { + ; ILP32-LABEL: name: caller_large_struct_ret2 + ; ILP32: bb.1 
(%ir-block.0): + ; ILP32-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; ILP32-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; ILP32-NEXT: $x10 = COPY [[FRAME_INDEX]](p0) + ; ILP32-NEXT: PseudoCALL target-flags(riscv-call) @callee_large_struct_ret, csr_ilp32_lp64, implicit-def $x1, implicit $x10 + ; ILP32-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; ILP32-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s32) from %stack.0, align 8) + ; ILP32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; ILP32-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C]](s32) + ; ILP32-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from %stack.0) + ; ILP32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; ILP32-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s32) + ; ILP32-NEXT: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from %stack.0, align 8) + ; ILP32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; ILP32-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C2]](s32) + ; ILP32-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s32) from %stack.0) + ; ILP32-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD]], [[LOAD3]] + ; ILP32-NEXT: $x10 = COPY [[ADD]](s32) + ; ILP32-NEXT: PseudoRET implicit $x10 + ; + ; ILP32F-LABEL: name: caller_large_struct_ret2 + ; ILP32F: bb.1 (%ir-block.0): + ; ILP32F-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; ILP32F-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; ILP32F-NEXT: $x10 = COPY [[FRAME_INDEX]](p0) + ; ILP32F-NEXT: PseudoCALL target-flags(riscv-call) @callee_large_struct_ret, csr_ilp32f_lp64f, implicit-def $x1, implicit $x10 + ; ILP32F-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; ILP32F-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s32) from %stack.0, align 8) + ; ILP32F-NEXT: [[C:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 4 + ; ILP32F-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C]](s32) + ; ILP32F-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from %stack.0) + ; ILP32F-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; ILP32F-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s32) + ; ILP32F-NEXT: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from %stack.0, align 8) + ; ILP32F-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; ILP32F-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C2]](s32) + ; ILP32F-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s32) from %stack.0) + ; ILP32F-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD]], [[LOAD3]] + ; ILP32F-NEXT: $x10 = COPY [[ADD]](s32) + ; ILP32F-NEXT: PseudoRET implicit $x10 + ; + ; ILP32D-LABEL: name: caller_large_struct_ret2 + ; ILP32D: bb.1 (%ir-block.0): + ; ILP32D-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; ILP32D-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; ILP32D-NEXT: $x10 = COPY [[FRAME_INDEX]](p0) + ; ILP32D-NEXT: PseudoCALL target-flags(riscv-call) @callee_large_struct_ret, csr_ilp32d_lp64d, implicit-def $x1, implicit $x10 + ; ILP32D-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; ILP32D-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s32) from %stack.0, align 8) + ; ILP32D-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; ILP32D-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C]](s32) + ; ILP32D-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from %stack.0) + ; ILP32D-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; ILP32D-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s32) + ; ILP32D-NEXT: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from %stack.0, align 8) + ; ILP32D-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; ILP32D-NEXT: 
[[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C2]](s32) + ; ILP32D-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s32) from %stack.0) + ; ILP32D-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD]], [[LOAD3]] + ; ILP32D-NEXT: $x10 = COPY [[ADD]](s32) + ; ILP32D-NEXT: PseudoRET implicit $x10 + %1 = call %struct.large2 @callee_large_struct_ret() + %2 = extractvalue %struct.large2 %1, 0 + %3 = extractvalue %struct.large2 %1, 3 + %4 = add i32 %2, %3 + ret i32 %4 +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-lp64-lp64f-lp64d-common.ll index 2499f8caf02bcf..6750954a53708b 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-lp64-lp64f-lp64d-common.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/calling-conv-lp64-lp64f-lp64d-common.ll @@ -600,6 +600,54 @@ define i64 @caller_small_struct_ret() nounwind { ret i64 %5 } +; Check return of >2x xlen scalars + +define i256 @callee_large_scalar_ret() nounwind { + ; RV64I-LABEL: name: callee_large_scalar_ret + ; RV64I: bb.1 (%ir-block.0): + ; RV64I-NEXT: liveins: $x10 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; RV64I-NEXT: [[C:%[0-9]+]]:_(s256) = G_CONSTANT i256 -123456789 + ; RV64I-NEXT: G_STORE [[C]](s256), [[COPY]](p0) :: (store (s256), align 16) + ; RV64I-NEXT: PseudoRET + ret i256 -123456789 +} + +define void @caller_large_scalar_ret() nounwind { + ; LP64-LABEL: name: caller_large_scalar_ret + ; LP64: bb.1 (%ir-block.0): + ; LP64-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; LP64-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; LP64-NEXT: $x10 = COPY [[FRAME_INDEX]](p0) + ; LP64-NEXT: PseudoCALL target-flags(riscv-call) @callee_large_scalar_ret, csr_ilp32_lp64, implicit-def $x1, implicit $x10 + ; LP64-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; LP64-NEXT: 
[[LOAD:%[0-9]+]]:_(s256) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s256) from %stack.0, align 16) + ; LP64-NEXT: PseudoRET + ; + ; LP64F-LABEL: name: caller_large_scalar_ret + ; LP64F: bb.1 (%ir-block.0): + ; LP64F-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; LP64F-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; LP64F-NEXT: $x10 = COPY [[FRAME_INDEX]](p0) + ; LP64F-NEXT: PseudoCALL target-flags(riscv-call) @callee_large_scalar_ret, csr_ilp32f_lp64f, implicit-def $x1, implicit $x10 + ; LP64F-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; LP64F-NEXT: [[LOAD:%[0-9]+]]:_(s256) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s256) from %stack.0, align 16) + ; LP64F-NEXT: PseudoRET + ; + ; LP64D-LABEL: name: caller_large_scalar_ret + ; LP64D: bb.1 (%ir-block.0): + ; LP64D-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; LP64D-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; LP64D-NEXT: $x10 = COPY [[FRAME_INDEX]](p0) + ; LP64D-NEXT: PseudoCALL target-flags(riscv-call) @callee_large_scalar_ret, csr_ilp32d_lp64d, implicit-def $x1, implicit $x10 + ; LP64D-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; LP64D-NEXT: [[LOAD:%[0-9]+]]:_(s256) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s256) from %stack.0, align 16) + ; LP64D-NEXT: PseudoRET + %1 = call i256 @callee_large_scalar_ret() + ret void +} + ; Check return of >2x xlen structs %struct.large = type { i64, i64, i64, i64 } @@ -688,3 +736,106 @@ define i64 @caller_large_struct_ret() nounwind { %5 = add i64 %2, %4 ret i64 %5 } + +%struct.large2 = type { i64, i128, double, i64 } + +define %struct.large2 @callee_large_struct_ret2() nounwind { + ; RV64I-LABEL: name: callee_large_struct_ret2 + ; RV64I: bb.1 (%ir-block.0): + ; RV64I-NEXT: liveins: $x10 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; RV64I-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:_(s128) = G_IMPLICIT_DEF + ; RV64I-NEXT: 
[[DEF2:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s128) = G_CONSTANT i128 2 + ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s64) = G_FCONSTANT double 3.000000e+00 + ; RV64I-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; RV64I-NEXT: G_STORE [[C]](s64), [[COPY]](p0) :: (store (s64), align 16) + ; RV64I-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; RV64I-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) + ; RV64I-NEXT: G_STORE [[C1]](s128), [[PTR_ADD]](p0) :: (store (s128)) + ; RV64I-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; RV64I-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) + ; RV64I-NEXT: G_STORE [[C2]](s64), [[PTR_ADD1]](p0) :: (store (s64), align 16) + ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 40 + ; RV64I-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) + ; RV64I-NEXT: G_STORE [[C3]](s64), [[PTR_ADD2]](p0) :: (store (s64)) + ; RV64I-NEXT: PseudoRET + %a = insertvalue %struct.large2 poison, i64 1, 0 + %b = insertvalue %struct.large2 %a, i128 2, 1 + %c = insertvalue %struct.large2 %b, double 3.0, 2 + %d = insertvalue %struct.large2 %c, i64 4, 3 + ret %struct.large2 %d +} + +define i64 @caller_large_struct_ret2() nounwind { + ; LP64-LABEL: name: caller_large_struct_ret2 + ; LP64: bb.1 (%ir-block.0): + ; LP64-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; LP64-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; LP64-NEXT: $x10 = COPY [[FRAME_INDEX]](p0) + ; LP64-NEXT: PseudoCALL target-flags(riscv-call) @callee_large_struct_ret, csr_ilp32_lp64, implicit-def $x1, implicit $x10 + ; LP64-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; LP64-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s64) from %stack.0, align 16) + ; LP64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; LP64-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C]](s64) + ; 
LP64-NEXT: [[LOAD1:%[0-9]+]]:_(s128) = G_LOAD [[PTR_ADD]](p0) :: (load (s128) from %stack.0) + ; LP64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; LP64-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s64) + ; LP64-NEXT: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p0) :: (load (s64) from %stack.0, align 16) + ; LP64-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 40 + ; LP64-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C2]](s64) + ; LP64-NEXT: [[LOAD3:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD2]](p0) :: (load (s64) from %stack.0) + ; LP64-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LOAD]], [[LOAD3]] + ; LP64-NEXT: $x10 = COPY [[ADD]](s64) + ; LP64-NEXT: PseudoRET implicit $x10 + ; + ; LP64F-LABEL: name: caller_large_struct_ret2 + ; LP64F: bb.1 (%ir-block.0): + ; LP64F-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; LP64F-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; LP64F-NEXT: $x10 = COPY [[FRAME_INDEX]](p0) + ; LP64F-NEXT: PseudoCALL target-flags(riscv-call) @callee_large_struct_ret, csr_ilp32f_lp64f, implicit-def $x1, implicit $x10 + ; LP64F-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; LP64F-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s64) from %stack.0, align 16) + ; LP64F-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; LP64F-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C]](s64) + ; LP64F-NEXT: [[LOAD1:%[0-9]+]]:_(s128) = G_LOAD [[PTR_ADD]](p0) :: (load (s128) from %stack.0) + ; LP64F-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; LP64F-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s64) + ; LP64F-NEXT: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p0) :: (load (s64) from %stack.0, align 16) + ; LP64F-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 40 + ; LP64F-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C2]](s64) + ; LP64F-NEXT: [[LOAD3:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD2]](p0) :: 
(load (s64) from %stack.0) + ; LP64F-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LOAD]], [[LOAD3]] + ; LP64F-NEXT: $x10 = COPY [[ADD]](s64) + ; LP64F-NEXT: PseudoRET implicit $x10 + ; + ; LP64D-LABEL: name: caller_large_struct_ret2 + ; LP64D: bb.1 (%ir-block.0): + ; LP64D-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; LP64D-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2 + ; LP64D-NEXT: $x10 = COPY [[FRAME_INDEX]](p0) + ; LP64D-NEXT: PseudoCALL target-flags(riscv-call) @callee_large_struct_ret, csr_ilp32d_lp64d, implicit-def $x1, implicit $x10 + ; LP64D-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2 + ; LP64D-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s64) from %stack.0, align 16) + ; LP64D-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; LP64D-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C]](s64) + ; LP64D-NEXT: [[LOAD1:%[0-9]+]]:_(s128) = G_LOAD [[PTR_ADD]](p0) :: (load (s128) from %stack.0) + ; LP64D-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; LP64D-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s64) + ; LP64D-NEXT: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p0) :: (load (s64) from %stack.0, align 16) + ; LP64D-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 40 + ; LP64D-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C2]](s64) + ; LP64D-NEXT: [[LOAD3:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD2]](p0) :: (load (s64) from %stack.0) + ; LP64D-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LOAD]], [[LOAD3]] + ; LP64D-NEXT: $x10 = COPY [[ADD]](s64) + ; LP64D-NEXT: PseudoRET implicit $x10 + %1 = call %struct.large2 @callee_large_struct_ret() + %2 = extractvalue %struct.large2 %1, 0 + %3 = extractvalue %struct.large2 %1, 3 + %4 = add i64 %2, %3 + ret i64 %4 +} From e76db25832d6ac2d3a36769b26f982d9dee4b346 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Thu, 22 Aug 2024 08:13:20 -0700 Subject: [PATCH 213/426] [DwarfEhPrepare] Assign dummy debug location for 
more inserted _Unwind_Resume calls (#105513) Similar to the fix for #57469, ensure that the other `_Unwind_Resume` call emitted by DwarfEHPrepare has a debug location if needed. This fixes https://github.com/nbdd0121/unwinding/issues/34. --- llvm/lib/CodeGen/DwarfEHPrepare.cpp | 7 + .../CodeGen/AArch64/dwarf-eh-prepare-dbg.ll | 1175 +++++++++++++++++ 2 files changed, 1182 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/dwarf-eh-prepare-dbg.ll diff --git a/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/llvm/lib/CodeGen/DwarfEHPrepare.cpp index 324329ce989e71..f4324fffc4ed45 100644 --- a/llvm/lib/CodeGen/DwarfEHPrepare.cpp +++ b/llvm/lib/CodeGen/DwarfEHPrepare.cpp @@ -293,6 +293,13 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls() { // Call the function. CallInst *CI = CallInst::Create(RewindFunction, RewindFunctionArgs, "", UnwindBB); + // The verifier requires that all calls of debug-info-bearing functions + // from debug-info-bearing functions have a debug location (for inlining + // purposes). Assign a dummy location to satisfy the constraint. + Function *RewindFn = dyn_cast(RewindFunction.getCallee()); + if (RewindFn && RewindFn->getSubprogram()) + if (DISubprogram *SP = F.getSubprogram()) + CI->setDebugLoc(DILocation::get(SP->getContext(), 0, 0, SP)); CI->setCallingConv(RewindFunctionCallingConv); // We never expect _Unwind_Resume to return. 
diff --git a/llvm/test/CodeGen/AArch64/dwarf-eh-prepare-dbg.ll b/llvm/test/CodeGen/AArch64/dwarf-eh-prepare-dbg.ll new file mode 100644 index 00000000000000..020a10f278ed68 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/dwarf-eh-prepare-dbg.ll @@ -0,0 +1,1175 @@ +; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -dwarf-eh-prepare < %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -passes=dwarf-eh-prepare < %s | FileCheck %s + +; If _Unwind_Resume is defined in the same module and we have debug +; info, then the inserted _Unwind_Resume calls also need to have a dummy debug +; location to satisfy inlining requirements. + +; CHECK-LABEL: @_ZN9unwinding8unwinder5frame5Frame19evaluate_expression17h2bd8716b79f71675E( +; CHECK: %exn.obj = phi ptr [ [[A:%.*]], %cleanup.i ], [ [[B:%.*]], %bb44 ] +; CHECK: call void @_Unwind_Resume(ptr %exn.obj) #2, !dbg !1039 + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64-unknown-linux-gnu" + +; Function Attrs: uwtable +declare void @"17h74cc711a87d52d83E"() unnamed_addr #0 + +; Function Attrs: uwtable +define void @_ZN9unwinding8unwinder5frame5Frame19evaluate_expression17h2bd8716b79f71675E() unnamed_addr #0 personality ptr @rust_eh_personality !dbg !2 { +start: + invoke void @_ZN4core6result13unwrap_failed17h043998f7f81c2189E() #2 + to label %unreachable.i unwind label %cleanup.i, !dbg !999 + +cleanup.i: ; preds = %start + %i9 = landingpad { ptr, i32 } + cleanup + resume { ptr, i32 } undef, !dbg !999 + +unreachable.i: ; preds = %start + unreachable + +bb43: ; preds = %cleanup.loopexit.split-lp + invoke void @"17h74cc711a87d52d83E"() + to label %bb44 unwind label %cleanup.loopexit.split-lp, !dbg !999 + +cleanup.loopexit.split-lp: ; preds = %bb43 + %lpad.loopexit.split-lp = landingpad { ptr, i32 } + cleanup + br label %bb43 + +bb44: ; preds = %bb43 + resume { ptr, i32 } undef, !dbg !999 +} + +; Function Attrs: noreturn uwtable +define void @_Unwind_Resume(ptr %arg) 
unnamed_addr #1 !dbg !1039 { +start: + unreachable +} + +declare i32 @rust_eh_personality(...) unnamed_addr + +; Function Attrs: noreturn uwtable +declare void @_ZN4core6result13unwrap_failed17h043998f7f81c2189E() unnamed_addr #1 + +attributes #0 = { uwtable } +attributes #1 = { noreturn uwtable } +attributes #2 = { noreturn } + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 2, !"Dwarf Version", i32 4} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = distinct !DISubprogram(name: "evaluate_expression", linkageName: "_ZN9unwinding8unwinder5frame5Frame19evaluate_expression17h2bd8716b79f71675E", scope: !4, file: !3, line: 79, type: !302, scopeLine: 79, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !564, templateParams: !46, declaration: !607, retainedNodes: !608) +!3 = !DIFile(filename: "src/unwinder/frame.rs", directory: "/home/dev/ecosystem/unwinding", checksumkind: CSK_MD5, checksum: "8e7ed70cea65000339db1f4ec1025545") +!4 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Frame", scope: !6, file: !5, size: 35520, align: 64, flags: DIFlagPublic, elements: !9, templateParams: !46, identifier: "668e0516028efb27d51536b6c511f9") +!5 = !DIFile(filename: "", directory: "") +!6 = !DINamespace(name: "frame", scope: !7) +!7 = !DINamespace(name: "unwinder", scope: !8) +!8 = !DINamespace(name: "unwinding", scope: null) +!9 = !{!10, !205} +!10 = !DIDerivedType(tag: DW_TAG_member, name: "fde_result", scope: !4, file: !5, baseType: !11, size: 2304, align: 64, flags: DIFlagPrivate) +!11 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "FDESearchResult", scope: !12, file: !5, size: 2304, align: 64, flags: DIFlagPublic, elements: !13, templateParams: !46, identifier: "83083fb6983108ea9bd5c8494868595d") +!12 = !DINamespace(name: "find_fde", scope: !7) +!13 = !{!14, !171, !194} +!14 = !DIDerivedType(tag: DW_TAG_member, name: "fde", scope: !11, file: !5, baseType: !15, size: 1344, align: 64, offset: 768, flags: DIFlagPublic) 
+!15 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "FrameDescriptionEntry, usize>", scope: !16, file: !5, size: 1344, align: 64, flags: DIFlagPublic, elements: !19, templateParams: !134, identifier: "5d3a70f21598ef08f33176ed3c9f48e9") +!16 = !DINamespace(name: "cfi", scope: !17) +!17 = !DINamespace(name: "read", scope: !18) +!18 = !DINamespace(name: "gimli", scope: null) +!19 = !{!20, !22, !23, !30, !137, !138, !139, !140, !170} +!20 = !DIDerivedType(tag: DW_TAG_member, name: "offset", scope: !15, file: !5, baseType: !21, size: 64, align: 64, offset: 960, flags: DIFlagPrivate) +!21 = !DIBasicType(name: "usize", size: 64, encoding: DW_ATE_unsigned) +!22 = !DIDerivedType(tag: DW_TAG_member, name: "length", scope: !15, file: !5, baseType: !21, size: 64, align: 64, offset: 1024, flags: DIFlagPrivate) +!23 = !DIDerivedType(tag: DW_TAG_member, name: "format", scope: !15, file: !5, baseType: !24, size: 8, align: 8, offset: 1280, flags: DIFlagPrivate) +!24 = !DICompositeType(tag: DW_TAG_enumeration_type, name: "Format", scope: !25, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagEnumClass, elements: !27) +!25 = !DINamespace(name: "common", scope: !18) +!26 = !DIBasicType(name: "u8", size: 8, encoding: DW_ATE_unsigned) +!27 = !{!28, !29} +!28 = !DIEnumerator(name: "Dwarf64", value: 8, isUnsigned: true) +!29 = !DIEnumerator(name: "Dwarf32", value: 4, isUnsigned: true) +!30 = !DIDerivedType(tag: DW_TAG_member, name: "cie", scope: !15, file: !5, baseType: !31, size: 704, align: 64, offset: 128, flags: DIFlagPrivate) +!31 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CommonInformationEntry, usize>", scope: !16, file: !5, size: 704, align: 64, flags: DIFlagPublic, elements: !32, templateParams: !134, identifier: "d702b04fb343c0c9e9d3001992e9a1") +!32 = !{!33, !34, !35, !36, !37, !109, !110, !111, !112, !114, !119} +!33 = !DIDerivedType(tag: DW_TAG_member, name: "offset", scope: !31, file: !5, baseType: !21, size: 64, align: 64, offset: 
384, flags: DIFlagPrivate) +!34 = !DIDerivedType(tag: DW_TAG_member, name: "length", scope: !31, file: !5, baseType: !21, size: 64, align: 64, offset: 448, flags: DIFlagPrivate) +!35 = !DIDerivedType(tag: DW_TAG_member, name: "format", scope: !31, file: !5, baseType: !24, size: 8, align: 8, offset: 656, flags: DIFlagPrivate) +!36 = !DIDerivedType(tag: DW_TAG_member, name: "version", scope: !31, file: !5, baseType: !26, size: 8, align: 8, offset: 664, flags: DIFlagPrivate) +!37 = !DIDerivedType(tag: DW_TAG_member, name: "augmentation", scope: !31, file: !5, baseType: !38, size: 256, align: 64, flags: DIFlagPrivate) +!38 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Option", scope: !39, file: !5, size: 256, align: 64, flags: DIFlagPublic, elements: !41, templateParams: !46, identifier: "3a0af3bf6a8f5409cf76f6c3324a8471") +!39 = !DINamespace(name: "option", scope: !40) +!40 = !DINamespace(name: "core", scope: null) +!41 = !{!42} +!42 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !38, file: !5, size: 256, align: 64, elements: !43, templateParams: !46, identifier: "38f36f4fe7ce04182001fea3f0ce78b9", discriminator: !108) +!43 = !{!44, !104} +!44 = !DIDerivedType(tag: DW_TAG_member, name: "None", scope: !42, file: !5, baseType: !45, size: 256, align: 64, extraData: i128 3) +!45 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "None", scope: !38, file: !5, size: 256, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !47, identifier: "fc965678566be8f379abea64e9b3abac") +!46 = !{} +!47 = !{!48} +!48 = !DITemplateTypeParameter(name: "T", type: !49) +!49 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Augmentation", scope: !16, file: !5, size: 256, align: 64, flags: DIFlagPublic, elements: !50, templateParams: !46, identifier: "437b83c5d52a974b59a7fdcd9a6e5529") +!50 = !{!51, !69, !101, !102} +!51 = !DIDerivedType(tag: DW_TAG_member, name: "lsda", scope: !49, file: !5, baseType: !52, size: 16, align: 8, 
offset: 192, flags: DIFlagPrivate) +!52 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Option", scope: !39, file: !5, size: 16, align: 8, flags: DIFlagPublic, elements: !53, templateParams: !46, identifier: "269c974aec4b862f607e3d0f37bf4289") +!53 = !{!54} +!54 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !52, file: !5, size: 16, align: 8, elements: !55, templateParams: !46, identifier: "a41237f2e73a6dfd1e15b8d422b0caf", discriminator: !68) +!55 = !{!56, !64} +!56 = !DIDerivedType(tag: DW_TAG_member, name: "None", scope: !54, file: !5, baseType: !57, size: 16, align: 8, extraData: i128 0) +!57 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "None", scope: !52, file: !5, size: 16, align: 8, flags: DIFlagPublic, elements: !46, templateParams: !58, identifier: "f43d443124b25d2cf6c3daa0ca6dbe8e") +!58 = !{!59} +!59 = !DITemplateTypeParameter(name: "T", type: !60) +!60 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DwEhPe", scope: !61, file: !5, size: 8, align: 8, flags: DIFlagPublic, elements: !62, templateParams: !46, identifier: "be9eed4e424cae07de87786ea65cc31a") +!61 = !DINamespace(name: "constants", scope: !18) +!62 = !{!63} +!63 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !60, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagPublic) +!64 = !DIDerivedType(tag: DW_TAG_member, name: "Some", scope: !54, file: !5, baseType: !65, size: 16, align: 8, extraData: i128 1) +!65 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Some", scope: !52, file: !5, size: 16, align: 8, flags: DIFlagPublic, elements: !66, templateParams: !58, identifier: "2c48764c698ea6f9c8867fbde87bbb17") +!66 = !{!67} +!67 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !65, file: !5, baseType: !60, size: 8, align: 8, offset: 8, flags: DIFlagPublic) +!68 = !DIDerivedType(tag: DW_TAG_member, scope: !52, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagArtificial) +!69 = !DIDerivedType(tag: 
DW_TAG_member, name: "personality", scope: !49, file: !5, baseType: !70, size: 192, align: 64, flags: DIFlagPrivate) +!70 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Option<(gimli::constants::DwEhPe, gimli::read::cfi::Pointer)>", scope: !39, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !71, templateParams: !46, identifier: "1fdf28ed684e117d758984993d4c393") +!71 = !{!72} +!72 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !70, file: !5, size: 192, align: 64, elements: !73, templateParams: !46, identifier: "186ecead5120d6ea5f8043eb398d7171", discriminator: !100) +!73 = !{!74, !96} +!74 = !DIDerivedType(tag: DW_TAG_member, name: "None", scope: !72, file: !5, baseType: !75, size: 192, align: 64, extraData: i128 2) +!75 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "None", scope: !70, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !76, identifier: "8ffc865f6e828740f09ae9b5f0639b8b") +!76 = !{!77} +!77 = !DITemplateTypeParameter(name: "T", type: !78) +!78 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "(gimli::constants::DwEhPe, gimli::read::cfi::Pointer)", file: !5, size: 192, align: 64, elements: !79, templateParams: !46, identifier: "1135188aa1561172ec3fe388e59e5ad3") +!79 = !{!80, !81} +!80 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !78, file: !5, baseType: !60, size: 8, align: 8) +!81 = !DIDerivedType(tag: DW_TAG_member, name: "__1", scope: !78, file: !5, baseType: !82, size: 128, align: 64, offset: 64) +!82 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Pointer", scope: !16, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !83, templateParams: !46, identifier: "54ccac1071baa52ac351c194c6a75888") +!83 = !{!84} +!84 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !82, file: !5, size: 128, align: 64, elements: !85, templateParams: !46, identifier: "28884e6a838202696aebd009c8be1e5d", 
discriminator: !95) +!85 = !{!86, !91} +!86 = !DIDerivedType(tag: DW_TAG_member, name: "Direct", scope: !84, file: !5, baseType: !87, size: 128, align: 64, extraData: i128 0) +!87 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Direct", scope: !82, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !88, templateParams: !46, identifier: "d33e0cdef11588372b91f9bf78c76dea") +!88 = !{!89} +!89 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !87, file: !5, baseType: !90, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!90 = !DIBasicType(name: "u64", size: 64, encoding: DW_ATE_unsigned) +!91 = !DIDerivedType(tag: DW_TAG_member, name: "Indirect", scope: !84, file: !5, baseType: !92, size: 128, align: 64, extraData: i128 1) +!92 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Indirect", scope: !82, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !93, templateParams: !46, identifier: "878c7c7b43c77510f5719ec0e083c0df") +!93 = !{!94} +!94 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !92, file: !5, baseType: !90, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!95 = !DIDerivedType(tag: DW_TAG_member, scope: !82, file: !5, baseType: !90, size: 64, align: 64, flags: DIFlagArtificial) +!96 = !DIDerivedType(tag: DW_TAG_member, name: "Some", scope: !72, file: !5, baseType: !97, size: 192, align: 64) +!97 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Some", scope: !70, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !98, templateParams: !76, identifier: "dbb5400f4ed59423d6d6c8d7daf9d6e2") +!98 = !{!99} +!99 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !97, file: !5, baseType: !78, size: 192, align: 64, flags: DIFlagPublic) +!100 = !DIDerivedType(tag: DW_TAG_member, scope: !70, file: !5, baseType: !90, size: 64, align: 64, offset: 64, flags: DIFlagArtificial) +!101 = !DIDerivedType(tag: DW_TAG_member, name: "fde_address_encoding", scope: !49, file: 
!5, baseType: !52, size: 16, align: 8, offset: 208, flags: DIFlagPrivate) +!102 = !DIDerivedType(tag: DW_TAG_member, name: "is_signal_trampoline", scope: !49, file: !5, baseType: !103, size: 8, align: 8, offset: 224, flags: DIFlagPrivate) +!103 = !DIBasicType(name: "bool", size: 8, encoding: DW_ATE_boolean) +!104 = !DIDerivedType(tag: DW_TAG_member, name: "Some", scope: !42, file: !5, baseType: !105, size: 256, align: 64) +!105 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Some", scope: !38, file: !5, size: 256, align: 64, flags: DIFlagPublic, elements: !106, templateParams: !47, identifier: "7e0027ca0949f7cbda755de183ba6ac7") +!106 = !{!107} +!107 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !105, file: !5, baseType: !49, size: 256, align: 64, flags: DIFlagPublic) +!108 = !DIDerivedType(tag: DW_TAG_member, scope: !38, file: !5, baseType: !90, size: 64, align: 64, offset: 64, flags: DIFlagArtificial) +!109 = !DIDerivedType(tag: DW_TAG_member, name: "address_size", scope: !31, file: !5, baseType: !26, size: 8, align: 8, offset: 672, flags: DIFlagPrivate) +!110 = !DIDerivedType(tag: DW_TAG_member, name: "segment_size", scope: !31, file: !5, baseType: !26, size: 8, align: 8, offset: 680, flags: DIFlagPrivate) +!111 = !DIDerivedType(tag: DW_TAG_member, name: "code_alignment_factor", scope: !31, file: !5, baseType: !90, size: 64, align: 64, offset: 512, flags: DIFlagPrivate) +!112 = !DIDerivedType(tag: DW_TAG_member, name: "data_alignment_factor", scope: !31, file: !5, baseType: !113, size: 64, align: 64, offset: 576, flags: DIFlagPrivate) +!113 = !DIBasicType(name: "i64", size: 64, encoding: DW_ATE_signed) +!114 = !DIDerivedType(tag: DW_TAG_member, name: "return_address_register", scope: !31, file: !5, baseType: !115, size: 16, align: 16, offset: 640, flags: DIFlagPrivate) +!115 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Register", scope: !25, file: !5, size: 16, align: 16, flags: DIFlagPublic, elements: !116, 
templateParams: !46, identifier: "ab7721750f04c98f7840c9b9e52b656e") +!116 = !{!117} +!117 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !115, file: !5, baseType: !118, size: 16, align: 16, flags: DIFlagPublic) +!118 = !DIBasicType(name: "u16", size: 16, encoding: DW_ATE_unsigned) +!119 = !DIDerivedType(tag: DW_TAG_member, name: "initial_instructions", scope: !31, file: !5, baseType: !120, size: 128, align: 64, offset: 256, flags: DIFlagPrivate) +!120 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "EndianSlice", scope: !121, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !122, templateParams: !132, identifier: "da07131e2106746f95d2ea314dd1a7d6") +!121 = !DINamespace(name: "endian_slice", scope: !17) +!122 = !{!123, !129} +!123 = !DIDerivedType(tag: DW_TAG_member, name: "slice", scope: !120, file: !5, baseType: !124, size: 128, align: 64, flags: DIFlagPrivate) +!124 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "&[u8]", file: !5, size: 128, align: 64, elements: !125, templateParams: !46, identifier: "31681e0c10b314f1f33e38b2779acbb4") +!125 = !{!126, !128} +!126 = !DIDerivedType(tag: DW_TAG_member, name: "data_ptr", scope: !124, file: !5, baseType: !127, size: 64, align: 64) +!127 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !26, size: 64, align: 64, dwarfAddressSpace: 0) +!128 = !DIDerivedType(tag: DW_TAG_member, name: "length", scope: !124, file: !5, baseType: !21, size: 64, align: 64, offset: 64) +!129 = !DIDerivedType(tag: DW_TAG_member, name: "endian", scope: !120, file: !5, baseType: !130, align: 8, offset: 128, flags: DIFlagPrivate) +!130 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "LittleEndian", scope: !131, file: !5, align: 8, flags: DIFlagPublic, elements: !46, identifier: "3d0f5d089fd1d1e4e850cd8b54585231") +!131 = !DINamespace(name: "endianity", scope: !18) +!132 = !{!133} +!133 = !DITemplateTypeParameter(name: "Endian", type: !130) +!134 = !{!135, !136} +!135 = 
!DITemplateTypeParameter(name: "R", type: !120) +!136 = !DITemplateTypeParameter(name: "Offset", type: !21) +!137 = !DIDerivedType(tag: DW_TAG_member, name: "initial_segment", scope: !15, file: !5, baseType: !90, size: 64, align: 64, offset: 1088, flags: DIFlagPrivate) +!138 = !DIDerivedType(tag: DW_TAG_member, name: "initial_address", scope: !15, file: !5, baseType: !90, size: 64, align: 64, offset: 1152, flags: DIFlagPrivate) +!139 = !DIDerivedType(tag: DW_TAG_member, name: "address_range", scope: !15, file: !5, baseType: !90, size: 64, align: 64, offset: 1216, flags: DIFlagPrivate) +!140 = !DIDerivedType(tag: DW_TAG_member, name: "augmentation", scope: !15, file: !5, baseType: !141, size: 128, align: 64, flags: DIFlagPrivate) +!141 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Option", scope: !39, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !142, templateParams: !46, identifier: "6c86ed6ec859a01352ba34a0d8b67b42") +!142 = !{!143} +!143 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !141, file: !5, size: 128, align: 64, elements: !144, templateParams: !46, identifier: "fba0139e17508a99930ee3f15479fed", discriminator: !169) +!144 = !{!145, !165} +!145 = !DIDerivedType(tag: DW_TAG_member, name: "None", scope: !143, file: !5, baseType: !146, size: 128, align: 64, extraData: i128 3) +!146 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "None", scope: !141, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !147, identifier: "80c4c2d1f17a14408b80f1b79766a748") +!147 = !{!148} +!148 = !DITemplateTypeParameter(name: "T", type: !149) +!149 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "AugmentationData", scope: !16, file: !5, size: 128, align: 64, flags: DIFlagPrivate, elements: !150, templateParams: !46, identifier: "3de3a0bd67f5300b194fe75d6a32ba34") +!150 = !{!151} +!151 = !DIDerivedType(tag: DW_TAG_member, name: "lsda", scope: !149, file: !5, baseType: 
!152, size: 128, align: 64, flags: DIFlagPrivate) +!152 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Option", scope: !39, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !153, templateParams: !46, identifier: "63ae53efcc3b644d5af5be76d59f1b22") +!153 = !{!154} +!154 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !152, file: !5, size: 128, align: 64, elements: !155, templateParams: !46, identifier: "7cc3843db76c59d9856a0146da2deda", discriminator: !164) +!155 = !{!156, !160} +!156 = !DIDerivedType(tag: DW_TAG_member, name: "None", scope: !154, file: !5, baseType: !157, size: 128, align: 64, extraData: i128 2) +!157 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "None", scope: !152, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !158, identifier: "3f7569d9bddac98414af4ae68a670d47") +!158 = !{!159} +!159 = !DITemplateTypeParameter(name: "T", type: !82) +!160 = !DIDerivedType(tag: DW_TAG_member, name: "Some", scope: !154, file: !5, baseType: !161, size: 128, align: 64) +!161 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Some", scope: !152, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !162, templateParams: !158, identifier: "a59a0ac8e43839cb28a35341b044ba6") +!162 = !{!163} +!163 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !161, file: !5, baseType: !82, size: 128, align: 64, flags: DIFlagPublic) +!164 = !DIDerivedType(tag: DW_TAG_member, scope: !152, file: !5, baseType: !90, size: 64, align: 64, flags: DIFlagArtificial) +!165 = !DIDerivedType(tag: DW_TAG_member, name: "Some", scope: !143, file: !5, baseType: !166, size: 128, align: 64) +!166 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Some", scope: !141, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !167, templateParams: !147, identifier: "310743464abe902a5334b2cd55207cfe") +!167 = !{!168} +!168 = !DIDerivedType(tag: DW_TAG_member, name: 
"__0", scope: !166, file: !5, baseType: !149, size: 128, align: 64, flags: DIFlagPublic) +!169 = !DIDerivedType(tag: DW_TAG_member, scope: !141, file: !5, baseType: !90, size: 64, align: 64, flags: DIFlagArtificial) +!170 = !DIDerivedType(tag: DW_TAG_member, name: "instructions", scope: !15, file: !5, baseType: !120, size: 128, align: 64, offset: 832, flags: DIFlagPrivate) +!171 = !DIDerivedType(tag: DW_TAG_member, name: "bases", scope: !11, file: !5, baseType: !172, size: 768, align: 64, flags: DIFlagPublic) +!172 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "BaseAddresses", scope: !16, file: !5, size: 768, align: 64, flags: DIFlagPublic, elements: !173, templateParams: !46, identifier: "4f73e7ef799b29fbae067e0be323dcfe") +!173 = !{!174, !193} +!174 = !DIDerivedType(tag: DW_TAG_member, name: "eh_frame_hdr", scope: !172, file: !5, baseType: !175, size: 384, align: 64, flags: DIFlagPublic) +!175 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "SectionBaseAddresses", scope: !16, file: !5, size: 384, align: 64, flags: DIFlagPublic, elements: !176, templateParams: !46, identifier: "78885c23ca9bb0ec15b9d93531201334") +!176 = !{!177, !191, !192} +!177 = !DIDerivedType(tag: DW_TAG_member, name: "section", scope: !175, file: !5, baseType: !178, size: 128, align: 64, flags: DIFlagPublic) +!178 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Option", scope: !39, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !179, templateParams: !46, identifier: "a764e4be4144b599a440b2d3c234bd8f") +!179 = !{!180} +!180 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !178, file: !5, size: 128, align: 64, elements: !181, templateParams: !46, identifier: "5f71a65a6e7dd57c14e50416050c3a90", discriminator: !190) +!181 = !{!182, !186} +!182 = !DIDerivedType(tag: DW_TAG_member, name: "None", scope: !180, file: !5, baseType: !183, size: 128, align: 64, extraData: i128 0) +!183 = distinct !DICompositeType(tag: 
DW_TAG_structure_type, name: "None", scope: !178, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !184, identifier: "fc451b096a948ef431e3a4971f318c0") +!184 = !{!185} +!185 = !DITemplateTypeParameter(name: "T", type: !90) +!186 = !DIDerivedType(tag: DW_TAG_member, name: "Some", scope: !180, file: !5, baseType: !187, size: 128, align: 64, extraData: i128 1) +!187 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Some", scope: !178, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !188, templateParams: !184, identifier: "321068e05ede571e678dd7e02b883f79") +!188 = !{!189} +!189 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !187, file: !5, baseType: !90, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!190 = !DIDerivedType(tag: DW_TAG_member, scope: !178, file: !5, baseType: !90, size: 64, align: 64, flags: DIFlagArtificial) +!191 = !DIDerivedType(tag: DW_TAG_member, name: "text", scope: !175, file: !5, baseType: !178, size: 128, align: 64, offset: 128, flags: DIFlagPublic) +!192 = !DIDerivedType(tag: DW_TAG_member, name: "data", scope: !175, file: !5, baseType: !178, size: 128, align: 64, offset: 256, flags: DIFlagPublic) +!193 = !DIDerivedType(tag: DW_TAG_member, name: "eh_frame", scope: !172, file: !5, baseType: !175, size: 384, align: 64, offset: 384, flags: DIFlagPublic) +!194 = !DIDerivedType(tag: DW_TAG_member, name: "eh_frame", scope: !11, file: !5, baseType: !195, size: 192, align: 64, offset: 2112, flags: DIFlagPublic) +!195 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "EhFrame>", scope: !16, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !196, templateParams: !204, identifier: "53e7dc3de0299504e773ad37859e191b") +!196 = !{!197, !198, !199} +!197 = !DIDerivedType(tag: DW_TAG_member, name: "section", scope: !195, file: !5, baseType: !120, size: 128, align: 64, flags: DIFlagPrivate) +!198 = !DIDerivedType(tag: DW_TAG_member, name: "address_size", 
scope: !195, file: !5, baseType: !26, size: 8, align: 8, offset: 128, flags: DIFlagPrivate) +!199 = !DIDerivedType(tag: DW_TAG_member, name: "vendor", scope: !195, file: !5, baseType: !200, size: 8, align: 8, offset: 136, flags: DIFlagPrivate) +!200 = !DICompositeType(tag: DW_TAG_enumeration_type, name: "Vendor", scope: !25, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagEnumClass, elements: !201) +!201 = !{!202, !203} +!202 = !DIEnumerator(name: "Default", value: 0, isUnsigned: true) +!203 = !DIEnumerator(name: "AArch64", value: 1, isUnsigned: true) +!204 = !{!135} +!205 = !DIDerivedType(tag: DW_TAG_member, name: "row", scope: !4, file: !5, baseType: !206, size: 33216, align: 64, offset: 2304, flags: DIFlagPrivate) +!206 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnwindTableRow", scope: !16, file: !5, size: 33216, align: 64, flags: DIFlagPublic, elements: !207, templateParams: !299, identifier: "82cd368dc604f667d8116583e5f9088f") +!207 = !{!208, !209, !210, !211, !232} +!208 = !DIDerivedType(tag: DW_TAG_member, name: "start_address", scope: !206, file: !5, baseType: !90, size: 64, align: 64, offset: 192, flags: DIFlagPrivate) +!209 = !DIDerivedType(tag: DW_TAG_member, name: "end_address", scope: !206, file: !5, baseType: !90, size: 64, align: 64, offset: 256, flags: DIFlagPrivate) +!210 = !DIDerivedType(tag: DW_TAG_member, name: "saved_args_size", scope: !206, file: !5, baseType: !90, size: 64, align: 64, offset: 320, flags: DIFlagPrivate) +!211 = !DIDerivedType(tag: DW_TAG_member, name: "cfa", scope: !206, file: !5, baseType: !212, size: 192, align: 64, flags: DIFlagPrivate) +!212 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CfaRule", scope: !16, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !213, templateParams: !46, identifier: "ade6ef3f7e303426a7d52d422a1beefc") +!213 = !{!214} +!214 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !212, file: !5, size: 192, align: 64, elements: !215, 
templateParams: !46, identifier: "122b00cae98c57f43a7358b8913c043", discriminator: !231) +!215 = !{!216, !223} +!216 = !DIDerivedType(tag: DW_TAG_member, name: "RegisterAndOffset", scope: !214, file: !5, baseType: !217, size: 192, align: 64, extraData: i128 0) +!217 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "RegisterAndOffset", scope: !212, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !218, templateParams: !221, identifier: "119bb0cef257ae2dfac36839c7598241") +!218 = !{!219, !220} +!219 = !DIDerivedType(tag: DW_TAG_member, name: "register", scope: !217, file: !5, baseType: !115, size: 16, align: 16, offset: 16, flags: DIFlagPublic) +!220 = !DIDerivedType(tag: DW_TAG_member, name: "offset", scope: !217, file: !5, baseType: !113, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!221 = !{!222} +!222 = !DITemplateTypeParameter(name: "T", type: !21) +!223 = !DIDerivedType(tag: DW_TAG_member, name: "Expression", scope: !214, file: !5, baseType: !224, size: 192, align: 64, extraData: i128 1) +!224 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Expression", scope: !212, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !225, templateParams: !221, identifier: "aaead2b15d3fe770a8c30bba7fd8ad61") +!225 = !{!226} +!226 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !224, file: !5, baseType: !227, size: 128, align: 64, offset: 64, flags: DIFlagPublic) +!227 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnwindExpression", scope: !16, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !228, templateParams: !221, identifier: "83534a13fed1c49912ea150ec14a104b") +!228 = !{!229, !230} +!229 = !DIDerivedType(tag: DW_TAG_member, name: "offset", scope: !227, file: !5, baseType: !21, size: 64, align: 64, flags: DIFlagPublic) +!230 = !DIDerivedType(tag: DW_TAG_member, name: "length", scope: !227, file: !5, baseType: !21, size: 64, align: 64, offset: 64, flags: DIFlagPublic) 
+!231 = !DIDerivedType(tag: DW_TAG_member, scope: !212, file: !5, baseType: !118, size: 16, align: 16, flags: DIFlagArtificial) +!232 = !DIDerivedType(tag: DW_TAG_member, name: "registers", scope: !206, file: !5, baseType: !233, size: 32832, align: 64, offset: 384, flags: DIFlagPrivate) +!233 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "RegisterRuleMap", scope: !16, file: !5, size: 32832, align: 64, flags: DIFlagPrivate, elements: !234, templateParams: !299, identifier: "cc473d27e5413589c350ce3df23f43da") +!234 = !{!235} +!235 = !DIDerivedType(tag: DW_TAG_member, name: "rules", scope: !233, file: !5, baseType: !236, size: 32832, align: 64, flags: DIFlagPrivate) +!236 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ArrayVec<[(gimli::common::Register, gimli::read::cfi::RegisterRule); 128]>", scope: !237, file: !5, size: 32832, align: 64, flags: DIFlagProtected, elements: !238, templateParams: !296, identifier: "d22acd2dadf9e1b9a77b71369aacf142") +!237 = !DINamespace(name: "util", scope: !17) +!238 = !{!239, !295} +!239 = !DIDerivedType(tag: DW_TAG_member, name: "storage", scope: !236, file: !5, baseType: !240, size: 32768, align: 64, flags: DIFlagPrivate) +!240 = !DICompositeType(tag: DW_TAG_array_type, baseType: !241, size: 32768, align: 64, elements: !293) +!241 = distinct !DICompositeType(tag: DW_TAG_union_type, name: "MaybeUninit<(gimli::common::Register, gimli::read::cfi::RegisterRule)>", scope: !242, file: !5, size: 256, align: 64, elements: !244, templateParams: !291, identifier: "5969f24240e56f6ba292746e32127a72") +!242 = !DINamespace(name: "maybe_uninit", scope: !243) +!243 = !DINamespace(name: "mem", scope: !40) +!244 = !{!245, !247} +!245 = !DIDerivedType(tag: DW_TAG_member, name: "uninit", scope: !241, file: !5, baseType: !246, align: 8) +!246 = !DIBasicType(name: "()", encoding: DW_ATE_unsigned) +!247 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !241, file: !5, baseType: !248, size: 256, align: 64) +!248 = 
distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ManuallyDrop<(gimli::common::Register, gimli::read::cfi::RegisterRule)>", scope: !249, file: !5, size: 256, align: 64, flags: DIFlagPublic, elements: !250, templateParams: !291, identifier: "5694f7bb71f656adeda6ad1c7765e67e") +!249 = !DINamespace(name: "manually_drop", scope: !243) +!250 = !{!251} +!251 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !248, file: !5, baseType: !252, size: 256, align: 64, flags: DIFlagPrivate) +!252 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "(gimli::common::Register, gimli::read::cfi::RegisterRule)", file: !5, size: 256, align: 64, elements: !253, templateParams: !46, identifier: "d5b89cd0eff48be0d57ac79c0fc2d497") +!253 = !{!254, !255} +!254 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !252, file: !5, baseType: !115, size: 16, align: 16) +!255 = !DIDerivedType(tag: DW_TAG_member, name: "__1", scope: !252, file: !5, baseType: !256, size: 192, align: 64, offset: 64) +!256 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "RegisterRule", scope: !16, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !257, templateParams: !46, identifier: "279b181cb161e87c468503bd8374c58f") +!257 = !{!258} +!258 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !256, file: !5, size: 192, align: 64, elements: !259, templateParams: !46, identifier: "6368babf0c076b47c26cd741e43d73b1", discriminator: !290) +!259 = !{!260, !262, !264, !268, !272, !276, !280, !284, !286} +!260 = !DIDerivedType(tag: DW_TAG_member, name: "Undefined", scope: !258, file: !5, baseType: !261, size: 192, align: 64, extraData: i128 0) +!261 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Undefined", scope: !256, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !221, identifier: "b6231c923d5de1d6ea3dc67ae4d2b744") +!262 = !DIDerivedType(tag: DW_TAG_member, name: "SameValue", scope: !258, file: !5, 
baseType: !263, size: 192, align: 64, extraData: i128 1) +!263 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "SameValue", scope: !256, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !221, identifier: "366c56686b859fc982e55b9b99b114b7") +!264 = !DIDerivedType(tag: DW_TAG_member, name: "Offset", scope: !258, file: !5, baseType: !265, size: 192, align: 64, extraData: i128 2) +!265 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Offset", scope: !256, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !266, templateParams: !221, identifier: "cbb6f8da06b5c0ef681c5662cd576261") +!266 = !{!267} +!267 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !265, file: !5, baseType: !113, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!268 = !DIDerivedType(tag: DW_TAG_member, name: "ValOffset", scope: !258, file: !5, baseType: !269, size: 192, align: 64, extraData: i128 3) +!269 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ValOffset", scope: !256, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !270, templateParams: !221, identifier: "9cc23c9ab3b44583f82e31fd655f575d") +!270 = !{!271} +!271 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !269, file: !5, baseType: !113, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!272 = !DIDerivedType(tag: DW_TAG_member, name: "Register", scope: !258, file: !5, baseType: !273, size: 192, align: 64, extraData: i128 4) +!273 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Register", scope: !256, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !274, templateParams: !221, identifier: "801c2836f75f94b250a3dfb037480760") +!274 = !{!275} +!275 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !273, file: !5, baseType: !115, size: 16, align: 16, offset: 16, flags: DIFlagPublic) +!276 = !DIDerivedType(tag: DW_TAG_member, name: "Expression", scope: !258, file: !5, baseType: 
!277, size: 192, align: 64, extraData: i128 5) +!277 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Expression", scope: !256, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !278, templateParams: !221, identifier: "986350edfa7856f0363c61bb25dba41c") +!278 = !{!279} +!279 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !277, file: !5, baseType: !227, size: 128, align: 64, offset: 64, flags: DIFlagPublic) +!280 = !DIDerivedType(tag: DW_TAG_member, name: "ValExpression", scope: !258, file: !5, baseType: !281, size: 192, align: 64, extraData: i128 6) +!281 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ValExpression", scope: !256, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !282, templateParams: !221, identifier: "3d4759b08ddb382cf045cef4d1c0c0d4") +!282 = !{!283} +!283 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !281, file: !5, baseType: !227, size: 128, align: 64, offset: 64, flags: DIFlagPublic) +!284 = !DIDerivedType(tag: DW_TAG_member, name: "Architectural", scope: !258, file: !5, baseType: !285, size: 192, align: 64, extraData: i128 7) +!285 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Architectural", scope: !256, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !221, identifier: "f12af9617d5705e1d07fb2fcddd9c85b") +!286 = !DIDerivedType(tag: DW_TAG_member, name: "Constant", scope: !258, file: !5, baseType: !287, size: 192, align: 64, extraData: i128 8) +!287 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Constant", scope: !256, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !288, templateParams: !221, identifier: "576df75a63490388e372e15bd69f179d") +!288 = !{!289} +!289 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !287, file: !5, baseType: !90, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!290 = !DIDerivedType(tag: DW_TAG_member, scope: !256, file: !5, baseType: !118, 
size: 16, align: 16, flags: DIFlagArtificial) +!291 = !{!292} +!292 = !DITemplateTypeParameter(name: "T", type: !252) +!293 = !{!294} +!294 = !DISubrange(count: 128, lowerBound: 0) +!295 = !DIDerivedType(tag: DW_TAG_member, name: "len", scope: !236, file: !5, baseType: !21, size: 64, align: 64, offset: 32768, flags: DIFlagPrivate) +!296 = !{!297} +!297 = !DITemplateTypeParameter(name: "A", type: !298) +!298 = !DICompositeType(tag: DW_TAG_array_type, baseType: !252, size: 32768, align: 64, elements: !293) +!299 = !{!222, !300} +!300 = !DITemplateTypeParameter(name: "S", type: !301) +!301 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "StoreOnStack", scope: !6, file: !5, align: 8, flags: DIFlagPrivate, elements: !46, identifier: "21dc88df75a1263a13dd24276b92d3e") +!302 = !DISubroutineType(types: !303) +!303 = !{!304, !549, !550, !227} +!304 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Result", scope: !305, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !306, templateParams: !46, identifier: "2930057b85b47f2bef5979de26a87b97") +!305 = !DINamespace(name: "result", scope: !40) +!306 = !{!307} +!307 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !304, file: !5, size: 128, align: 64, elements: !308, templateParams: !46, identifier: "2298ec794572047066c0d72c8d834034", discriminator: !548) +!308 = !{!309, !544} +!309 = !DIDerivedType(tag: DW_TAG_member, name: "Ok", scope: !307, file: !5, baseType: !310, size: 128, align: 64, extraData: i128 77) +!310 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Ok", scope: !304, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !311, templateParams: !313, identifier: "836193d46f427d39e980f444791b92f5") +!311 = !{!312} +!312 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !310, file: !5, baseType: !21, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!313 = !{!222, !314} +!314 = !DITemplateTypeParameter(name: "E", type: !315) +!315 = 
distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Error", scope: !17, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !316, templateParams: !46, identifier: "d77646015d26471497f49470c6fe61cb") +!316 = !{!317} +!317 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !315, file: !5, size: 128, align: 64, elements: !318, templateParams: !46, identifier: "ca3f5d09babb12e628f9b9b97696a8e4", discriminator: !543) +!318 = !{!319, !321, !323, !325, !327, !329, !331, !333, !335, !337, !339, !341, !343, !350, !352, !354, !356, !358, !362, !366, !374, !376, !383, !390, !397, !404, !408, !412, !416, !418, !420, !422, !424, !426, !428, !430, !432, !436, !438, !440, !442, !449, !451, !453, !457, !459, !461, !463, !465, !467, !474, !476, !478, !480, !482, !484, !488, !490, !492, !494, !496, !500, !502, !504, !506, !508, !510, !512, !514, !516, !518, !520, !522, !524, !526, !528, !536} +!319 = !DIDerivedType(tag: DW_TAG_member, name: "Io", scope: !317, file: !5, baseType: !320, size: 128, align: 64, extraData: i128 0) +!320 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Io", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "2320550a9c63e2aae32b0db85e194ad6") +!321 = !DIDerivedType(tag: DW_TAG_member, name: "PcRelativePointerButSectionBaseIsUndefined", scope: !317, file: !5, baseType: !322, size: 128, align: 64, extraData: i128 1) +!322 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "PcRelativePointerButSectionBaseIsUndefined", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "8be741069f75ec37c15f7c3a00f5725f") +!323 = !DIDerivedType(tag: DW_TAG_member, name: "TextRelativePointerButTextBaseIsUndefined", scope: !317, file: !5, baseType: !324, size: 128, align: 64, extraData: i128 2) +!324 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "TextRelativePointerButTextBaseIsUndefined", scope: !315, file: !5, size: 128, 
align: 64, flags: DIFlagPublic, elements: !46, identifier: "378958e13527898498ae80867eb2d180") +!325 = !DIDerivedType(tag: DW_TAG_member, name: "DataRelativePointerButDataBaseIsUndefined", scope: !317, file: !5, baseType: !326, size: 128, align: 64, extraData: i128 3) +!326 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DataRelativePointerButDataBaseIsUndefined", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "f6d90525538ea3822cc7864c7d6e132a") +!327 = !DIDerivedType(tag: DW_TAG_member, name: "FuncRelativePointerInBadContext", scope: !317, file: !5, baseType: !328, size: 128, align: 64, extraData: i128 4) +!328 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "FuncRelativePointerInBadContext", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "b3ef419b95336ce3759c0b0990f112fc") +!329 = !DIDerivedType(tag: DW_TAG_member, name: "CannotParseOmitPointerEncoding", scope: !317, file: !5, baseType: !330, size: 128, align: 64, extraData: i128 5) +!330 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CannotParseOmitPointerEncoding", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "eb130e7d2deb1502c84e84a2fb5387d") +!331 = !DIDerivedType(tag: DW_TAG_member, name: "BadUnsignedLeb128", scope: !317, file: !5, baseType: !332, size: 128, align: 64, extraData: i128 6) +!332 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "BadUnsignedLeb128", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "90e4e9ce1b5f9ac0ab73ec51465666a9") +!333 = !DIDerivedType(tag: DW_TAG_member, name: "BadSignedLeb128", scope: !317, file: !5, baseType: !334, size: 128, align: 64, extraData: i128 7) +!334 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "BadSignedLeb128", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: 
"3c015a8355d523b8e6fb55a5ad8fdb42") +!335 = !DIDerivedType(tag: DW_TAG_member, name: "AbbreviationTagZero", scope: !317, file: !5, baseType: !336, size: 128, align: 64, extraData: i128 8) +!336 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "AbbreviationTagZero", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "21f316a516256bf01c07a0b30b05f15c") +!337 = !DIDerivedType(tag: DW_TAG_member, name: "AttributeFormZero", scope: !317, file: !5, baseType: !338, size: 128, align: 64, extraData: i128 9) +!338 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "AttributeFormZero", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "f33bf8117b76a81a643250c2d9a8b44f") +!339 = !DIDerivedType(tag: DW_TAG_member, name: "BadHasChildren", scope: !317, file: !5, baseType: !340, size: 128, align: 64, extraData: i128 10) +!340 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "BadHasChildren", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "edc800ff621c7b61d39a6e66a7f2e00") +!341 = !DIDerivedType(tag: DW_TAG_member, name: "BadLength", scope: !317, file: !5, baseType: !342, size: 128, align: 64, extraData: i128 11) +!342 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "BadLength", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "abbcc790b2b7b20eb0bda85a8691aacb") +!343 = !DIDerivedType(tag: DW_TAG_member, name: "UnknownForm", scope: !317, file: !5, baseType: !344, size: 128, align: 64, extraData: i128 12) +!344 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnknownForm", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !345, templateParams: !46, identifier: "bf6f1dc4dab6ba136decfeee45bc099b") +!345 = !{!346} +!346 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !344, file: !5, baseType: !347, size: 16, align: 
16, offset: 16, flags: DIFlagPublic) +!347 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DwForm", scope: !61, file: !5, size: 16, align: 16, flags: DIFlagPublic, elements: !348, templateParams: !46, identifier: "48a0ca1754029669f804908ef0889f77") +!348 = !{!349} +!349 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !347, file: !5, baseType: !118, size: 16, align: 16, flags: DIFlagPublic) +!350 = !DIDerivedType(tag: DW_TAG_member, name: "ExpectedZero", scope: !317, file: !5, baseType: !351, size: 128, align: 64, extraData: i128 13) +!351 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ExpectedZero", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "fb26ef4a66c7e07a914b864dd5cbdaf3") +!352 = !DIDerivedType(tag: DW_TAG_member, name: "DuplicateAbbreviationCode", scope: !317, file: !5, baseType: !353, size: 128, align: 64, extraData: i128 14) +!353 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DuplicateAbbreviationCode", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "8fba5777d5e3ba3c664af74dfa9883fa") +!354 = !DIDerivedType(tag: DW_TAG_member, name: "DuplicateArange", scope: !317, file: !5, baseType: !355, size: 128, align: 64, extraData: i128 15) +!355 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DuplicateArange", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "92ad6df4b8eab06a4098fd1ab6fd674a") +!356 = !DIDerivedType(tag: DW_TAG_member, name: "UnknownReservedLength", scope: !317, file: !5, baseType: !357, size: 128, align: 64, extraData: i128 16) +!357 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnknownReservedLength", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "2b13d11aaf1e29e4496a5c786d58e5ad") +!358 = !DIDerivedType(tag: DW_TAG_member, name: "UnknownVersion", scope: !317, file: !5, 
baseType: !359, size: 128, align: 64, extraData: i128 17) +!359 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnknownVersion", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !360, templateParams: !46, identifier: "af46e087b62d3b17df488fbbae5b6e37") +!360 = !{!361} +!361 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !359, file: !5, baseType: !90, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!362 = !DIDerivedType(tag: DW_TAG_member, name: "UnknownAbbreviation", scope: !317, file: !5, baseType: !363, size: 128, align: 64, extraData: i128 18) +!363 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnknownAbbreviation", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !364, templateParams: !46, identifier: "b3570174fc0c73045fd6de2058159273") +!364 = !{!365} +!365 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !363, file: !5, baseType: !90, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!366 = !DIDerivedType(tag: DW_TAG_member, name: "UnexpectedEof", scope: !317, file: !5, baseType: !367, size: 128, align: 64, extraData: i128 19) +!367 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnexpectedEof", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !368, templateParams: !46, identifier: "c321526e0a5c153ffff82fcf12f05ee5") +!368 = !{!369} +!369 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !367, file: !5, baseType: !370, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!370 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ReaderOffsetId", scope: !371, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !372, templateParams: !46, identifier: "10753e39de75edad2b99c864968b4116") +!371 = !DINamespace(name: "reader", scope: !17) +!372 = !{!373} +!373 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !370, file: !5, baseType: !90, size: 64, align: 64, flags: 
DIFlagPublic) +!374 = !DIDerivedType(tag: DW_TAG_member, name: "UnexpectedNull", scope: !317, file: !5, baseType: !375, size: 128, align: 64, extraData: i128 20) +!375 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnexpectedNull", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "a086692abce983283debcb3978720724") +!376 = !DIDerivedType(tag: DW_TAG_member, name: "UnknownStandardOpcode", scope: !317, file: !5, baseType: !377, size: 128, align: 64, extraData: i128 21) +!377 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnknownStandardOpcode", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !378, templateParams: !46, identifier: "d17e9cb4063b64896929e2327bb308ce") +!378 = !{!379} +!379 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !377, file: !5, baseType: !380, size: 8, align: 8, offset: 8, flags: DIFlagPublic) +!380 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DwLns", scope: !61, file: !5, size: 8, align: 8, flags: DIFlagPublic, elements: !381, templateParams: !46, identifier: "62ddfcc3d37948b75bdcdd7f8282c4fd") +!381 = !{!382} +!382 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !380, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagPublic) +!383 = !DIDerivedType(tag: DW_TAG_member, name: "UnknownExtendedOpcode", scope: !317, file: !5, baseType: !384, size: 128, align: 64, extraData: i128 22) +!384 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnknownExtendedOpcode", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !385, templateParams: !46, identifier: "fe4bafba0822d26228794fa2925d8c1") +!385 = !{!386} +!386 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !384, file: !5, baseType: !387, size: 8, align: 8, offset: 8, flags: DIFlagPublic) +!387 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DwLne", scope: !61, file: !5, size: 8, align: 8, flags: 
DIFlagPublic, elements: !388, templateParams: !46, identifier: "d9915819558840f5b2e0f12224d88dd4") +!388 = !{!389} +!389 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !387, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagPublic) +!390 = !DIDerivedType(tag: DW_TAG_member, name: "UnknownLocListsEntry", scope: !317, file: !5, baseType: !391, size: 128, align: 64, extraData: i128 23) +!391 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnknownLocListsEntry", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !392, templateParams: !46, identifier: "980da01d1d36b39f8f45cd86f8be0b88") +!392 = !{!393} +!393 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !391, file: !5, baseType: !394, size: 8, align: 8, offset: 8, flags: DIFlagPublic) +!394 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DwLle", scope: !61, file: !5, size: 8, align: 8, flags: DIFlagPublic, elements: !395, templateParams: !46, identifier: "a0030f5b581a30602dd91af017afec49") +!395 = !{!396} +!396 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !394, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagPublic) +!397 = !DIDerivedType(tag: DW_TAG_member, name: "UnknownRangeListsEntry", scope: !317, file: !5, baseType: !398, size: 128, align: 64, extraData: i128 24) +!398 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnknownRangeListsEntry", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !399, templateParams: !46, identifier: "b1b22f15da3690a528b596a003d5cedb") +!399 = !{!400} +!400 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !398, file: !5, baseType: !401, size: 8, align: 8, offset: 8, flags: DIFlagPublic) +!401 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DwRle", scope: !61, file: !5, size: 8, align: 8, flags: DIFlagPublic, elements: !402, templateParams: !46, identifier: "fb2890de8a376d35eb233acc57542dcd") +!402 = !{!403} +!403 = 
!DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !401, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagPublic) +!404 = !DIDerivedType(tag: DW_TAG_member, name: "UnsupportedAddressSize", scope: !317, file: !5, baseType: !405, size: 128, align: 64, extraData: i128 25) +!405 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnsupportedAddressSize", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !406, templateParams: !46, identifier: "600d8151955ba8481767c9c43c8868b4") +!406 = !{!407} +!407 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !405, file: !5, baseType: !26, size: 8, align: 8, offset: 8, flags: DIFlagPublic) +!408 = !DIDerivedType(tag: DW_TAG_member, name: "UnsupportedOffsetSize", scope: !317, file: !5, baseType: !409, size: 128, align: 64, extraData: i128 26) +!409 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnsupportedOffsetSize", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !410, templateParams: !46, identifier: "34552b1f282c3a7eaf508f4d4757384") +!410 = !{!411} +!411 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !409, file: !5, baseType: !26, size: 8, align: 8, offset: 8, flags: DIFlagPublic) +!412 = !DIDerivedType(tag: DW_TAG_member, name: "UnsupportedFieldSize", scope: !317, file: !5, baseType: !413, size: 128, align: 64, extraData: i128 27) +!413 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnsupportedFieldSize", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !414, templateParams: !46, identifier: "bed14b8322ad5b70d0f0b5e56821cc97") +!414 = !{!415} +!415 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !413, file: !5, baseType: !26, size: 8, align: 8, offset: 8, flags: DIFlagPublic) +!416 = !DIDerivedType(tag: DW_TAG_member, name: "MinimumInstructionLengthZero", scope: !317, file: !5, baseType: !417, size: 128, align: 64, extraData: i128 28) +!417 = distinct 
!DICompositeType(tag: DW_TAG_structure_type, name: "MinimumInstructionLengthZero", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "fa1fd0b565474fe82ccb893a09da66e5") +!418 = !DIDerivedType(tag: DW_TAG_member, name: "MaximumOperationsPerInstructionZero", scope: !317, file: !5, baseType: !419, size: 128, align: 64, extraData: i128 29) +!419 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "MaximumOperationsPerInstructionZero", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "a35b154691400fb6f961d541241b8c31") +!420 = !DIDerivedType(tag: DW_TAG_member, name: "LineRangeZero", scope: !317, file: !5, baseType: !421, size: 128, align: 64, extraData: i128 30) +!421 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "LineRangeZero", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "3c6bfb38e53333f5f80900903619c1b5") +!422 = !DIDerivedType(tag: DW_TAG_member, name: "OpcodeBaseZero", scope: !317, file: !5, baseType: !423, size: 128, align: 64, extraData: i128 31) +!423 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "OpcodeBaseZero", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "8f3fdd0c06ec5fb1fc8679e4671305a5") +!424 = !DIDerivedType(tag: DW_TAG_member, name: "BadUtf8", scope: !317, file: !5, baseType: !425, size: 128, align: 64, extraData: i128 32) +!425 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "BadUtf8", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "9b9aef385a70fc9dcd57b2e8b5ab6b24") +!426 = !DIDerivedType(tag: DW_TAG_member, name: "NotCieId", scope: !317, file: !5, baseType: !427, size: 128, align: 64, extraData: i128 33) +!427 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "NotCieId", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, 
identifier: "ebd29b268a6ba10a2b22260f8252bd99") +!428 = !DIDerivedType(tag: DW_TAG_member, name: "NotCiePointer", scope: !317, file: !5, baseType: !429, size: 128, align: 64, extraData: i128 34) +!429 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "NotCiePointer", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "22e0182bcdf61321493fc4cd39e9e6db") +!430 = !DIDerivedType(tag: DW_TAG_member, name: "NotFdePointer", scope: !317, file: !5, baseType: !431, size: 128, align: 64, extraData: i128 35) +!431 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "NotFdePointer", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "d0b8d1771a76ab5e69aa55743b84d5b8") +!432 = !DIDerivedType(tag: DW_TAG_member, name: "BadBranchTarget", scope: !317, file: !5, baseType: !433, size: 128, align: 64, extraData: i128 36) +!433 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "BadBranchTarget", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !434, templateParams: !46, identifier: "f580dfe27723b7ae4b65f9357871a56e") +!434 = !{!435} +!435 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !433, file: !5, baseType: !90, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!436 = !DIDerivedType(tag: DW_TAG_member, name: "InvalidPushObjectAddress", scope: !317, file: !5, baseType: !437, size: 128, align: 64, extraData: i128 37) +!437 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "InvalidPushObjectAddress", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "269d995e87b5d13c7cb611055fbb1cc") +!438 = !DIDerivedType(tag: DW_TAG_member, name: "NotEnoughStackItems", scope: !317, file: !5, baseType: !439, size: 128, align: 64, extraData: i128 38) +!439 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "NotEnoughStackItems", scope: !315, file: !5, size: 128, align: 64, flags: 
DIFlagPublic, elements: !46, identifier: "20bcdb0491a3e809303636911dbf8cd") +!440 = !DIDerivedType(tag: DW_TAG_member, name: "TooManyIterations", scope: !317, file: !5, baseType: !441, size: 128, align: 64, extraData: i128 39) +!441 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "TooManyIterations", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "ff30adf3a0a0372083d438531902f07c") +!442 = !DIDerivedType(tag: DW_TAG_member, name: "InvalidExpression", scope: !317, file: !5, baseType: !443, size: 128, align: 64, extraData: i128 40) +!443 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "InvalidExpression", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !444, templateParams: !46, identifier: "5651d5c999cb39e0115de3cc6366e2a1") +!444 = !{!445} +!445 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !443, file: !5, baseType: !446, size: 8, align: 8, offset: 8, flags: DIFlagPublic) +!446 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DwOp", scope: !61, file: !5, size: 8, align: 8, flags: DIFlagPublic, elements: !447, templateParams: !46, identifier: "7f53682bc0f25d737c4504ffbb705411") +!447 = !{!448} +!448 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !446, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagPublic) +!449 = !DIDerivedType(tag: DW_TAG_member, name: "UnsupportedEvaluation", scope: !317, file: !5, baseType: !450, size: 128, align: 64, extraData: i128 41) +!450 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnsupportedEvaluation", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "f7f5df83d048fac1b6a625fe43495894") +!451 = !DIDerivedType(tag: DW_TAG_member, name: "InvalidPiece", scope: !317, file: !5, baseType: !452, size: 128, align: 64, extraData: i128 42) +!452 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "InvalidPiece", scope: !315, file: 
!5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "b63df519ec7f41efddc9e93b531a6ad9") +!453 = !DIDerivedType(tag: DW_TAG_member, name: "InvalidExpressionTerminator", scope: !317, file: !5, baseType: !454, size: 128, align: 64, extraData: i128 43) +!454 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "InvalidExpressionTerminator", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !455, templateParams: !46, identifier: "3a81ef5c26543c7edcc61e9fc7944fef") +!455 = !{!456} +!456 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !454, file: !5, baseType: !90, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!457 = !DIDerivedType(tag: DW_TAG_member, name: "DivisionByZero", scope: !317, file: !5, baseType: !458, size: 128, align: 64, extraData: i128 44) +!458 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DivisionByZero", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "b76367d376d20e9fbb0ca9f290f2c235") +!459 = !DIDerivedType(tag: DW_TAG_member, name: "TypeMismatch", scope: !317, file: !5, baseType: !460, size: 128, align: 64, extraData: i128 45) +!460 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "TypeMismatch", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "89d807c67453ebabd4929695d8f66096") +!461 = !DIDerivedType(tag: DW_TAG_member, name: "IntegralTypeRequired", scope: !317, file: !5, baseType: !462, size: 128, align: 64, extraData: i128 46) +!462 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "IntegralTypeRequired", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "7b0620dbd2111d443ce6e80003d23fb5") +!463 = !DIDerivedType(tag: DW_TAG_member, name: "UnsupportedTypeOperation", scope: !317, file: !5, baseType: !464, size: 128, align: 64, extraData: i128 47) +!464 = distinct !DICompositeType(tag: 
DW_TAG_structure_type, name: "UnsupportedTypeOperation", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "d08821c006f73bfca08dc2d771305521") +!465 = !DIDerivedType(tag: DW_TAG_member, name: "InvalidShiftExpression", scope: !317, file: !5, baseType: !466, size: 128, align: 64, extraData: i128 48) +!466 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "InvalidShiftExpression", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "f9d7ea923183442e88981e5113d95cd8") +!467 = !DIDerivedType(tag: DW_TAG_member, name: "UnknownCallFrameInstruction", scope: !317, file: !5, baseType: !468, size: 128, align: 64, extraData: i128 49) +!468 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnknownCallFrameInstruction", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !469, templateParams: !46, identifier: "93dffe49e5f883b92277a0558e3cb1b2") +!469 = !{!470} +!470 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !468, file: !5, baseType: !471, size: 8, align: 8, offset: 8, flags: DIFlagPublic) +!471 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DwCfa", scope: !61, file: !5, size: 8, align: 8, flags: DIFlagPublic, elements: !472, templateParams: !46, identifier: "25e8987d4efbed4c58d052b7ad506631") +!472 = !{!473} +!473 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !471, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagPublic) +!474 = !DIDerivedType(tag: DW_TAG_member, name: "InvalidAddressRange", scope: !317, file: !5, baseType: !475, size: 128, align: 64, extraData: i128 50) +!475 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "InvalidAddressRange", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "472535216c273e01cbb038f4e1a46c30") +!476 = !DIDerivedType(tag: DW_TAG_member, name: "CfiInstructionInInvalidContext", scope: !317, file: !5, 
baseType: !477, size: 128, align: 64, extraData: i128 51) +!477 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CfiInstructionInInvalidContext", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "eddaee247b191a56aa7bc2dc0985c5da") +!478 = !DIDerivedType(tag: DW_TAG_member, name: "PopWithEmptyStack", scope: !317, file: !5, baseType: !479, size: 128, align: 64, extraData: i128 52) +!479 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "PopWithEmptyStack", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "b820ab2462801a0fc0899bf3654c4b8") +!480 = !DIDerivedType(tag: DW_TAG_member, name: "NoUnwindInfoForAddress", scope: !317, file: !5, baseType: !481, size: 128, align: 64, extraData: i128 53) +!481 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "NoUnwindInfoForAddress", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "ae01ce5aaf3d99bcee8d4f5cd8182484") +!482 = !DIDerivedType(tag: DW_TAG_member, name: "UnsupportedOffset", scope: !317, file: !5, baseType: !483, size: 128, align: 64, extraData: i128 54) +!483 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnsupportedOffset", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "2ac7a792140cb6d7bb468e88d6cc2308") +!484 = !DIDerivedType(tag: DW_TAG_member, name: "UnknownPointerEncoding", scope: !317, file: !5, baseType: !485, size: 128, align: 64, extraData: i128 55) +!485 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnknownPointerEncoding", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !486, templateParams: !46, identifier: "81e4dbaa1837b41877cb2650ba0cb059") +!486 = !{!487} +!487 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !485, file: !5, baseType: !60, size: 8, align: 8, offset: 8, flags: DIFlagPublic) +!488 = !DIDerivedType(tag: 
DW_TAG_member, name: "NoEntryAtGivenOffset", scope: !317, file: !5, baseType: !489, size: 128, align: 64, extraData: i128 56) +!489 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "NoEntryAtGivenOffset", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "fe3a27c995940427e718d5e3a088265") +!490 = !DIDerivedType(tag: DW_TAG_member, name: "OffsetOutOfBounds", scope: !317, file: !5, baseType: !491, size: 128, align: 64, extraData: i128 57) +!491 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "OffsetOutOfBounds", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "5d4def3cd252d7be8c56a756b8c7d7b") +!492 = !DIDerivedType(tag: DW_TAG_member, name: "UnknownAugmentation", scope: !317, file: !5, baseType: !493, size: 128, align: 64, extraData: i128 58) +!493 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnknownAugmentation", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "dd1c3a83fa13d7501e7c00dbc7ffd4e6") +!494 = !DIDerivedType(tag: DW_TAG_member, name: "UnsupportedPointerEncoding", scope: !317, file: !5, baseType: !495, size: 128, align: 64, extraData: i128 59) +!495 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnsupportedPointerEncoding", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "f23537dcf6caef7aacff71f94f88ad9a") +!496 = !DIDerivedType(tag: DW_TAG_member, name: "UnsupportedRegister", scope: !317, file: !5, baseType: !497, size: 128, align: 64, extraData: i128 60) +!497 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnsupportedRegister", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !498, templateParams: !46, identifier: "69e04d530c2a072b985811e3c0c90332") +!498 = !{!499} +!499 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !497, file: !5, baseType: !90, size: 64, align: 64, 
offset: 64, flags: DIFlagPublic) +!500 = !DIDerivedType(tag: DW_TAG_member, name: "TooManyRegisterRules", scope: !317, file: !5, baseType: !501, size: 128, align: 64, extraData: i128 61) +!501 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "TooManyRegisterRules", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "f74f825fd17a89366d5fbcd594ad371e") +!502 = !DIDerivedType(tag: DW_TAG_member, name: "StackFull", scope: !317, file: !5, baseType: !503, size: 128, align: 64, extraData: i128 62) +!503 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "StackFull", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "649f9201d23349d0547f2bf0aa454d2c") +!504 = !DIDerivedType(tag: DW_TAG_member, name: "VariableLengthSearchTable", scope: !317, file: !5, baseType: !505, size: 128, align: 64, extraData: i128 63) +!505 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "VariableLengthSearchTable", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "c37c43df5d63ae12b1ee17c8b3bac70") +!506 = !DIDerivedType(tag: DW_TAG_member, name: "UnsupportedUnitType", scope: !317, file: !5, baseType: !507, size: 128, align: 64, extraData: i128 64) +!507 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnsupportedUnitType", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "c522143e95508ac586d52dd2a2094233") +!508 = !DIDerivedType(tag: DW_TAG_member, name: "UnsupportedAddressIndex", scope: !317, file: !5, baseType: !509, size: 128, align: 64, extraData: i128 65) +!509 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnsupportedAddressIndex", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "7f9ad238cf1907731ceaa4464f643d59") +!510 = !DIDerivedType(tag: DW_TAG_member, name: "UnsupportedSegmentSize", scope: !317, file: !5, 
baseType: !511, size: 128, align: 64, extraData: i128 66) +!511 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnsupportedSegmentSize", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "d82bf4bb4842cb1a2d0d0685137bb22") +!512 = !DIDerivedType(tag: DW_TAG_member, name: "MissingUnitDie", scope: !317, file: !5, baseType: !513, size: 128, align: 64, extraData: i128 67) +!513 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "MissingUnitDie", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "bfa91df9d2fb7ed5794259b263898a8") +!514 = !DIDerivedType(tag: DW_TAG_member, name: "UnsupportedAttributeForm", scope: !317, file: !5, baseType: !515, size: 128, align: 64, extraData: i128 68) +!515 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnsupportedAttributeForm", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "b4d57042fdb22adeec228129f57c3bdb") +!516 = !DIDerivedType(tag: DW_TAG_member, name: "MissingFileEntryFormatPath", scope: !317, file: !5, baseType: !517, size: 128, align: 64, extraData: i128 69) +!517 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "MissingFileEntryFormatPath", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "2feb917517f62b9a49da4bec8085de52") +!518 = !DIDerivedType(tag: DW_TAG_member, name: "ExpectedStringAttributeValue", scope: !317, file: !5, baseType: !519, size: 128, align: 64, extraData: i128 70) +!519 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ExpectedStringAttributeValue", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "2e1c89ffcc99bcff26aae384fe90a0b5") +!520 = !DIDerivedType(tag: DW_TAG_member, name: "InvalidImplicitConst", scope: !317, file: !5, baseType: !521, size: 128, align: 64, extraData: i128 71) +!521 = distinct !DICompositeType(tag: 
DW_TAG_structure_type, name: "InvalidImplicitConst", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "aea5e93522c8b1b32186ed286b609796") +!522 = !DIDerivedType(tag: DW_TAG_member, name: "InvalidIndexSectionCount", scope: !317, file: !5, baseType: !523, size: 128, align: 64, extraData: i128 72) +!523 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "InvalidIndexSectionCount", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "588d1fe04fa40bd64d4b9811824daa5a") +!524 = !DIDerivedType(tag: DW_TAG_member, name: "InvalidIndexSlotCount", scope: !317, file: !5, baseType: !525, size: 128, align: 64, extraData: i128 73) +!525 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "InvalidIndexSlotCount", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "6274016f543239a9e591b947785d3205") +!526 = !DIDerivedType(tag: DW_TAG_member, name: "InvalidIndexRow", scope: !317, file: !5, baseType: !527, size: 128, align: 64, extraData: i128 74) +!527 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "InvalidIndexRow", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, identifier: "59ad95d188560365cbe8c5f5359279c9") +!528 = !DIDerivedType(tag: DW_TAG_member, name: "UnknownIndexSection", scope: !317, file: !5, baseType: !529, size: 128, align: 64, extraData: i128 75) +!529 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnknownIndexSection", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !530, templateParams: !46, identifier: "b73e5f323eb887c2778393497e67883f") +!530 = !{!531} +!531 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !529, file: !5, baseType: !532, size: 32, align: 32, offset: 32, flags: DIFlagPublic) +!532 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DwSect", scope: !61, file: !5, size: 32, align: 32, flags: 
DIFlagPublic, elements: !533, templateParams: !46, identifier: "2076c0f96c14e3f2bddcc281c845b22f") +!533 = !{!534} +!534 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !532, file: !5, baseType: !535, size: 32, align: 32, flags: DIFlagPublic) +!535 = !DIBasicType(name: "u32", size: 32, encoding: DW_ATE_unsigned) +!536 = !DIDerivedType(tag: DW_TAG_member, name: "UnknownIndexSectionV2", scope: !317, file: !5, baseType: !537, size: 128, align: 64, extraData: i128 76) +!537 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnknownIndexSectionV2", scope: !315, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !538, templateParams: !46, identifier: "5ef812c992486c75cad093763c73c88f") +!538 = !{!539} +!539 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !537, file: !5, baseType: !540, size: 32, align: 32, offset: 32, flags: DIFlagPublic) +!540 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DwSectV2", scope: !61, file: !5, size: 32, align: 32, flags: DIFlagPublic, elements: !541, templateParams: !46, identifier: "f456df61651542439b4c0d56ddfa3e2b") +!541 = !{!542} +!542 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !540, file: !5, baseType: !535, size: 32, align: 32, flags: DIFlagPublic) +!543 = !DIDerivedType(tag: DW_TAG_member, scope: !315, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagArtificial) +!544 = !DIDerivedType(tag: DW_TAG_member, name: "Err", scope: !307, file: !5, baseType: !545, size: 128, align: 64) +!545 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Err", scope: !304, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !546, templateParams: !313, identifier: "73d51b53019cec2eb7866e0ccdee2b05") +!546 = !{!547} +!547 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !545, file: !5, baseType: !315, size: 128, align: 64, flags: DIFlagPublic) +!548 = !DIDerivedType(tag: DW_TAG_member, scope: !304, file: !5, baseType: !26, size: 8, align: 8, flags: 
DIFlagArtificial) +!549 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "&unwinding::unwinder::frame::Frame", baseType: !4, size: 64, align: 64, dwarfAddressSpace: 0) +!550 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "&unwinding::unwinder::arch::aarch64::Context", baseType: !551, size: 64, align: 64, dwarfAddressSpace: 0) +!551 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Context", scope: !552, file: !5, size: 4096, align: 64, flags: DIFlagPublic, elements: !554, templateParams: !46, identifier: "8e981de74a115bb4264fb06b8de66f0") +!552 = !DINamespace(name: "aarch64", scope: !553) +!553 = !DINamespace(name: "arch", scope: !7) +!554 = !{!555, !559, !560} +!555 = !DIDerivedType(tag: DW_TAG_member, name: "gp", scope: !551, file: !5, baseType: !556, size: 1984, align: 64, flags: DIFlagPublic) +!556 = !DICompositeType(tag: DW_TAG_array_type, baseType: !21, size: 1984, align: 64, elements: !557) +!557 = !{!558} +!558 = !DISubrange(count: 31, lowerBound: 0) +!559 = !DIDerivedType(tag: DW_TAG_member, name: "sp", scope: !551, file: !5, baseType: !21, size: 64, align: 64, offset: 1984, flags: DIFlagPublic) +!560 = !DIDerivedType(tag: DW_TAG_member, name: "fp", scope: !551, file: !5, baseType: !561, size: 2048, align: 64, offset: 2048, flags: DIFlagPublic) +!561 = !DICompositeType(tag: DW_TAG_array_type, baseType: !21, size: 2048, align: 64, elements: !562) +!562 = !{!563} +!563 = !DISubrange(count: 32, lowerBound: 0) +!564 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !565, producer: "clang LLVM (rustc version 1.82.0-nightly (636d7ff91 2024-08-19))", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !566, globals: !597, splitDebugInlining: false, nameTableKind: None) +!565 = !DIFile(filename: "src/lib.rs/@/unwinding.453513c1ca9c7b65-cgu.0", directory: "/home/dev/ecosystem/unwinding") +!566 = !{!200, !567, !24, !571, !579, !586, !591} +!567 = !DICompositeType(tag: DW_TAG_enumeration_type, name: "CieOffsetEncoding", scope: 
!16, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagEnumClass, elements: !568) +!568 = !{!569, !570} +!569 = !DIEnumerator(name: "U32", value: 0, isUnsigned: true) +!570 = !DIEnumerator(name: "U64", value: 1, isUnsigned: true) +!571 = !DICompositeType(tag: DW_TAG_enumeration_type, name: "Alignment", scope: !572, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagEnumClass, elements: !574) +!572 = !DINamespace(name: "rt", scope: !573) +!573 = !DINamespace(name: "fmt", scope: !40) +!574 = !{!575, !576, !577, !578} +!575 = !DIEnumerator(name: "Left", value: 0, isUnsigned: true) +!576 = !DIEnumerator(name: "Right", value: 1, isUnsigned: true) +!577 = !DIEnumerator(name: "Center", value: 2, isUnsigned: true) +!578 = !DIEnumerator(name: "Unknown", value: 3, isUnsigned: true) +!579 = !DICompositeType(tag: DW_TAG_enumeration_type, name: "Ordering", scope: !580, file: !5, baseType: !581, size: 8, align: 8, flags: DIFlagEnumClass, elements: !582) +!580 = !DINamespace(name: "cmp", scope: !40) +!581 = !DIBasicType(name: "i8", size: 8, encoding: DW_ATE_signed) +!582 = !{!583, !584, !585} +!583 = !DIEnumerator(name: "Less", value: -1) +!584 = !DIEnumerator(name: "Equal", value: 0) +!585 = !DIEnumerator(name: "Greater", value: 1) +!586 = !DICompositeType(tag: DW_TAG_enumeration_type, name: "c_void", scope: !587, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagEnumClass, elements: !588) +!587 = !DINamespace(name: "ffi", scope: !40) +!588 = !{!589, !590} +!589 = !DIEnumerator(name: "__variant1", value: 0, isUnsigned: true) +!590 = !DIEnumerator(name: "__variant2", value: 1, isUnsigned: true) +!591 = !DICompositeType(tag: DW_TAG_enumeration_type, name: "AssertKind", scope: !592, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagEnumClass, elements: !593) +!592 = !DINamespace(name: "panicking", scope: !40) +!593 = !{!594, !595, !596} +!594 = !DIEnumerator(name: "Eq", value: 0, isUnsigned: true) +!595 = !DIEnumerator(name: "Ne", value: 1, isUnsigned: 
true) +!596 = !DIEnumerator(name: "Match", value: 2, isUnsigned: true) +!597 = !{!598} +!598 = !DIGlobalVariableExpression(var: !599, expr: !DIExpression()) +!599 = distinct !DIGlobalVariable(name: "::{vtable}", scope: null, file: !5, type: !600, isLocal: true, isDefinition: true) +!600 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "::{vtable_type}", file: !5, size: 256, align: 64, flags: DIFlagArtificial, elements: !601, vtableHolder: !315, templateParams: !46, identifier: "1f97312b991e7e51c27c8ed2941b7252") +!601 = !{!602, !604, !605, !606} +!602 = !DIDerivedType(tag: DW_TAG_member, name: "drop_in_place", scope: !600, file: !5, baseType: !603, size: 64, align: 64) +!603 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*const ()", baseType: !246, size: 64, align: 64, dwarfAddressSpace: 0) +!604 = !DIDerivedType(tag: DW_TAG_member, name: "size", scope: !600, file: !5, baseType: !21, size: 64, align: 64, offset: 64) +!605 = !DIDerivedType(tag: DW_TAG_member, name: "align", scope: !600, file: !5, baseType: !21, size: 64, align: 64, offset: 128) +!606 = !DIDerivedType(tag: DW_TAG_member, name: "__method3", scope: !600, file: !5, baseType: !603, size: 64, align: 64, offset: 192) +!607 = !DISubprogram(name: "evaluate_expression", linkageName: "_ZN9unwinding8unwinder5frame5Frame19evaluate_expression17h2bd8716b79f71675E", scope: !4, file: !3, line: 79, type: !302, scopeLine: 79, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit, templateParams: !46) +!608 = !{!609, !610, !611, !612, !618, !873, !946, !966, !968, !970, !972, !974, !976, !978, !980, !982, !984, !986, !988, !990, !992, !994, !997} +!609 = !DILocalVariable(name: "self", arg: 1, scope: !2, file: !3, line: 80, type: !549) +!610 = !DILocalVariable(name: "ctx", arg: 2, scope: !2, file: !3, line: 81, type: !550) +!611 = !DILocalVariable(name: "expr", arg: 3, scope: !2, file: !3, line: 82, type: !227) +!612 = !DILocalVariable(name: "expr", scope: !613, file: !3, line: 84, type: !614, align: 8) 
+!613 = distinct !DILexicalBlock(scope: !2, file: !3, line: 84, column: 9) +!614 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Expression>", scope: !615, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !616, templateParams: !204, identifier: "c37ff0bc75fe37cf7f7a1245102bd107") +!615 = !DINamespace(name: "op", scope: !17) +!616 = !{!617} +!617 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !614, file: !5, baseType: !120, size: 128, align: 64, flags: DIFlagPublic) +!618 = !DILocalVariable(name: "eval", scope: !619, file: !3, line: 85, type: !620, align: 8) +!619 = distinct !DILexicalBlock(scope: !613, file: !3, line: 85, column: 9) +!620 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Evaluation, unwinding::unwinder::frame::StoreOnStack>", scope: !615, file: !5, size: 9728, align: 64, flags: DIFlagPublic, elements: !621, templateParams: !872, identifier: "96ccd7b3bd9690c1bf418466cde67a") +!621 = !{!622, !623, !629, !630, !644, !645, !704, !705, !779, !780, !804, !816} +!622 = !DIDerivedType(tag: DW_TAG_member, name: "bytecode", scope: !620, file: !5, baseType: !120, size: 128, align: 64, offset: 512, flags: DIFlagPrivate) +!623 = !DIDerivedType(tag: DW_TAG_member, name: "encoding", scope: !620, file: !5, baseType: !624, size: 32, align: 16, offset: 9664, flags: DIFlagPrivate) +!624 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Encoding", scope: !25, file: !5, size: 32, align: 16, flags: DIFlagPublic, elements: !625, templateParams: !46, identifier: "1e5c559cf794bf056cbc617988ad2fe8") +!625 = !{!626, !627, !628} +!626 = !DIDerivedType(tag: DW_TAG_member, name: "address_size", scope: !624, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagPublic) +!627 = !DIDerivedType(tag: DW_TAG_member, name: "format", scope: !624, file: !5, baseType: !24, size: 8, align: 8, offset: 8, flags: DIFlagPublic) +!628 = !DIDerivedType(tag: DW_TAG_member, name: "version", scope: !624, file: !5, baseType: !118, 
size: 16, align: 16, offset: 16, flags: DIFlagPublic) +!629 = !DIDerivedType(tag: DW_TAG_member, name: "object_address", scope: !620, file: !5, baseType: !178, size: 128, align: 64, flags: DIFlagPrivate) +!630 = !DIDerivedType(tag: DW_TAG_member, name: "max_iterations", scope: !620, file: !5, baseType: !631, size: 64, align: 32, offset: 320, flags: DIFlagPrivate) +!631 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Option", scope: !39, file: !5, size: 64, align: 32, flags: DIFlagPublic, elements: !632, templateParams: !46, identifier: "ebe42463e4e7e92377731e8e461eca4b") +!632 = !{!633} +!633 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !631, file: !5, size: 64, align: 32, elements: !634, templateParams: !46, identifier: "4e5479196563409542c164f35683db2c", discriminator: !643) +!634 = !{!635, !639} +!635 = !DIDerivedType(tag: DW_TAG_member, name: "None", scope: !633, file: !5, baseType: !636, size: 64, align: 32, extraData: i128 0) +!636 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "None", scope: !631, file: !5, size: 64, align: 32, flags: DIFlagPublic, elements: !46, templateParams: !637, identifier: "1b68f282e961af132516648785d5c5b") +!637 = !{!638} +!638 = !DITemplateTypeParameter(name: "T", type: !535) +!639 = !DIDerivedType(tag: DW_TAG_member, name: "Some", scope: !633, file: !5, baseType: !640, size: 64, align: 32, extraData: i128 1) +!640 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Some", scope: !631, file: !5, size: 64, align: 32, flags: DIFlagPublic, elements: !641, templateParams: !637, identifier: "1174a86fa23a90bb9338798d86b9144b") +!641 = !{!642} +!642 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !640, file: !5, baseType: !535, size: 32, align: 32, offset: 32, flags: DIFlagPublic) +!643 = !DIDerivedType(tag: DW_TAG_member, scope: !631, file: !5, baseType: !535, size: 32, align: 32, flags: DIFlagArtificial) +!644 = !DIDerivedType(tag: DW_TAG_member, name: "iteration", scope: 
!620, file: !5, baseType: !535, size: 32, align: 32, offset: 9696, flags: DIFlagPrivate) +!645 = !DIDerivedType(tag: DW_TAG_member, name: "state", scope: !620, file: !5, baseType: !646, size: 192, align: 64, offset: 128, flags: DIFlagPrivate) +!646 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "EvaluationState>", scope: !615, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !647, templateParams: !46, identifier: "a87e1097ab6add42183b1a7b5ce2c22a") +!647 = !{!648} +!648 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !646, file: !5, size: 192, align: 64, elements: !649, templateParams: !46, identifier: "964a46dc5f7d0161aef9349b825f4d3c", discriminator: !703) +!649 = !{!650, !654, !656, !660, !662} +!650 = !DIDerivedType(tag: DW_TAG_member, name: "Start", scope: !648, file: !5, baseType: !651, size: 192, align: 64, extraData: i128 13) +!651 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Start", scope: !646, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !652, templateParams: !204, identifier: "676f580c535a7a6db58a5a99c5a2c2d8") +!652 = !{!653} +!653 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !651, file: !5, baseType: !178, size: 128, align: 64, offset: 64, flags: DIFlagPrivate) +!654 = !DIDerivedType(tag: DW_TAG_member, name: "Ready", scope: !648, file: !5, baseType: !655, size: 192, align: 64, extraData: i128 14) +!655 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Ready", scope: !646, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !46, templateParams: !204, identifier: "65ad2c990c0611a1db46a27501104448") +!656 = !DIDerivedType(tag: DW_TAG_member, name: "Error", scope: !648, file: !5, baseType: !657, size: 192, align: 64, extraData: i128 15) +!657 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Error", scope: !646, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !658, templateParams: !204, identifier: 
"dc65c3743b7b42cd171e919a2e16d7b") +!658 = !{!659} +!659 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !657, file: !5, baseType: !315, size: 128, align: 64, offset: 64, flags: DIFlagPrivate) +!660 = !DIDerivedType(tag: DW_TAG_member, name: "Complete", scope: !648, file: !5, baseType: !661, size: 192, align: 64, extraData: i128 16) +!661 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Complete", scope: !646, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !46, templateParams: !204, identifier: "9bbe3d1f49a3210198c8587f4e96fb03") +!662 = !DIDerivedType(tag: DW_TAG_member, name: "Waiting", scope: !648, file: !5, baseType: !663, size: 192, align: 64) +!663 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Waiting", scope: !646, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !664, templateParams: !204, identifier: "584797001e661d96746ac8ed6d5724a6") +!664 = !{!665} +!665 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !663, file: !5, baseType: !666, size: 192, align: 64, flags: DIFlagPrivate) +!666 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "EvaluationWaiting>", scope: !615, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !667, templateParams: !46, identifier: "54f4ef87ee3afe1d13a2fe921d99fd24") +!667 = !{!668} +!668 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !666, file: !5, size: 192, align: 64, elements: !669, templateParams: !46, identifier: "f00c533ddc4d77e115b3ad4cb5c15280", discriminator: !702) +!669 = !{!670, !672, !676, !680, !682, !684, !686, !688, !690, !692, !694, !698, !700} +!670 = !DIDerivedType(tag: DW_TAG_member, name: "Memory", scope: !668, file: !5, baseType: !671, size: 192, align: 64, extraData: i128 0) +!671 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Memory", scope: !666, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !46, templateParams: !204, identifier: 
"60f0e96ff64536f4e22d06b39e5a1767") +!672 = !DIDerivedType(tag: DW_TAG_member, name: "Register", scope: !668, file: !5, baseType: !673, size: 192, align: 64, extraData: i128 1) +!673 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Register", scope: !666, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !674, templateParams: !204, identifier: "f6217dc77671182d67f76e991ad0d34") +!674 = !{!675} +!675 = !DIDerivedType(tag: DW_TAG_member, name: "offset", scope: !673, file: !5, baseType: !113, size: 64, align: 64, offset: 64, flags: DIFlagPrivate) +!676 = !DIDerivedType(tag: DW_TAG_member, name: "FrameBase", scope: !668, file: !5, baseType: !677, size: 192, align: 64, extraData: i128 2) +!677 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "FrameBase", scope: !666, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !678, templateParams: !204, identifier: "204f299ba7b55659430baa961a704123") +!678 = !{!679} +!679 = !DIDerivedType(tag: DW_TAG_member, name: "offset", scope: !677, file: !5, baseType: !113, size: 64, align: 64, offset: 64, flags: DIFlagPrivate) +!680 = !DIDerivedType(tag: DW_TAG_member, name: "Tls", scope: !668, file: !5, baseType: !681, size: 192, align: 64, extraData: i128 3) +!681 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Tls", scope: !666, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !46, templateParams: !204, identifier: "f74f40e6e4e01256a41b6afde7f8c13") +!682 = !DIDerivedType(tag: DW_TAG_member, name: "Cfa", scope: !668, file: !5, baseType: !683, size: 192, align: 64, extraData: i128 4) +!683 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Cfa", scope: !666, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !46, templateParams: !204, identifier: "1cc7f11b0e59e14dc1f82774fcaf9f50") +!684 = !DIDerivedType(tag: DW_TAG_member, name: "AtLocation", scope: !668, file: !5, baseType: !685, size: 192, align: 64, extraData: i128 5) +!685 = 
distinct !DICompositeType(tag: DW_TAG_structure_type, name: "AtLocation", scope: !666, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !46, templateParams: !204, identifier: "38d703a8ecd90730b9598d5d108a58c7") +!686 = !DIDerivedType(tag: DW_TAG_member, name: "EntryValue", scope: !668, file: !5, baseType: !687, size: 192, align: 64, extraData: i128 6) +!687 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "EntryValue", scope: !666, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !46, templateParams: !204, identifier: "18ccb8cef0642e8bb7e43041978cd5a2") +!688 = !DIDerivedType(tag: DW_TAG_member, name: "ParameterRef", scope: !668, file: !5, baseType: !689, size: 192, align: 64, extraData: i128 7) +!689 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ParameterRef", scope: !666, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !46, templateParams: !204, identifier: "98799c88bea54fd8e71d4d352afe0a60") +!690 = !DIDerivedType(tag: DW_TAG_member, name: "RelocatedAddress", scope: !668, file: !5, baseType: !691, size: 192, align: 64, extraData: i128 8) +!691 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "RelocatedAddress", scope: !666, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !46, templateParams: !204, identifier: "681574cd255ae76c95f580792b7e3324") +!692 = !DIDerivedType(tag: DW_TAG_member, name: "IndexedAddress", scope: !668, file: !5, baseType: !693, size: 192, align: 64, extraData: i128 9) +!693 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "IndexedAddress", scope: !666, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !46, templateParams: !204, identifier: "8ef93144f1a4ce7466f498c65e53f78d") +!694 = !DIDerivedType(tag: DW_TAG_member, name: "TypedLiteral", scope: !668, file: !5, baseType: !695, size: 192, align: 64, extraData: i128 10) +!695 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "TypedLiteral", scope: 
!666, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !696, templateParams: !204, identifier: "950b11aa2ef5d8eac4d1b8c044c1c8ff") +!696 = !{!697} +!697 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !695, file: !5, baseType: !120, size: 128, align: 64, offset: 64, flags: DIFlagPrivate) +!698 = !DIDerivedType(tag: DW_TAG_member, name: "Convert", scope: !668, file: !5, baseType: !699, size: 192, align: 64, extraData: i128 11) +!699 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Convert", scope: !666, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !46, templateParams: !204, identifier: "ce96d5537bd6c4694efd5485edbf7a81") +!700 = !DIDerivedType(tag: DW_TAG_member, name: "Reinterpret", scope: !668, file: !5, baseType: !701, size: 192, align: 64, extraData: i128 12) +!701 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Reinterpret", scope: !666, file: !5, size: 192, align: 64, flags: DIFlagPrivate, elements: !46, templateParams: !204, identifier: "4ec3c0dfd90d332fd315c26b2bb8be1") +!702 = !DIDerivedType(tag: DW_TAG_member, scope: !666, file: !5, baseType: !90, size: 64, align: 64, flags: DIFlagArtificial) +!703 = !DIDerivedType(tag: DW_TAG_member, scope: !646, file: !5, baseType: !90, size: 64, align: 64, flags: DIFlagArtificial) +!704 = !DIDerivedType(tag: DW_TAG_member, name: "addr_mask", scope: !620, file: !5, baseType: !90, size: 64, align: 64, offset: 768, flags: DIFlagPrivate) +!705 = !DIDerivedType(tag: DW_TAG_member, name: "stack", scope: !620, file: !5, baseType: !706, size: 8256, align: 64, offset: 832, flags: DIFlagPrivate) +!706 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ArrayVec<[gimli::read::value::Value; 64]>", scope: !237, file: !5, size: 8256, align: 64, flags: DIFlagProtected, elements: !707, templateParams: !776, identifier: "ae126a3705f435911648edbf2c5ecbbc") +!707 = !{!708, !775} +!708 = !DIDerivedType(tag: DW_TAG_member, name: "storage", scope: !706, 
file: !5, baseType: !709, size: 8192, align: 64, flags: DIFlagPrivate) +!709 = !DICompositeType(tag: DW_TAG_array_type, baseType: !710, size: 8192, align: 64, elements: !773) +!710 = distinct !DICompositeType(tag: DW_TAG_union_type, name: "MaybeUninit", scope: !242, file: !5, size: 128, align: 64, elements: !711, templateParams: !771, identifier: "2f3d34f66e72ebe9405ea41c59f10b3e") +!711 = !{!712, !713} +!712 = !DIDerivedType(tag: DW_TAG_member, name: "uninit", scope: !710, file: !5, baseType: !246, align: 8) +!713 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !710, file: !5, baseType: !714, size: 128, align: 64) +!714 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ManuallyDrop", scope: !249, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !715, templateParams: !771, identifier: "fb28eeb8aae47bc0c4a1c0f3b1cb482b") +!715 = !{!716} +!716 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !714, file: !5, baseType: !717, size: 128, align: 64, flags: DIFlagPrivate) +!717 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Value", scope: !718, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !719, templateParams: !46, identifier: "449f2092324ba7422f77464ef34843c0") +!718 = !DINamespace(name: "value", scope: !17) +!719 = !{!720} +!720 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !717, file: !5, size: 128, align: 64, elements: !721, templateParams: !46, identifier: "68c86d4974102053f5e2c86d31812aa6", discriminator: !770) +!721 = !{!722, !726, !730, !734, !739, !743, !748, !752, !756, !760, !765} +!722 = !DIDerivedType(tag: DW_TAG_member, name: "Generic", scope: !720, file: !5, baseType: !723, size: 128, align: 64, extraData: i128 0) +!723 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Generic", scope: !717, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !724, templateParams: !46, identifier: "aded9ad7d102fa3d3f7c711747aa177f") +!724 = !{!725} +!725 = 
!DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !723, file: !5, baseType: !90, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!726 = !DIDerivedType(tag: DW_TAG_member, name: "I8", scope: !720, file: !5, baseType: !727, size: 128, align: 64, extraData: i128 1) +!727 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "I8", scope: !717, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !728, templateParams: !46, identifier: "b7e241735c2437539ef217967df198d5") +!728 = !{!729} +!729 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !727, file: !5, baseType: !581, size: 8, align: 8, offset: 8, flags: DIFlagPublic) +!730 = !DIDerivedType(tag: DW_TAG_member, name: "U8", scope: !720, file: !5, baseType: !731, size: 128, align: 64, extraData: i128 2) +!731 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "U8", scope: !717, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !732, templateParams: !46, identifier: "235cabb3f8c8ea232e140126fb0f84") +!732 = !{!733} +!733 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !731, file: !5, baseType: !26, size: 8, align: 8, offset: 8, flags: DIFlagPublic) +!734 = !DIDerivedType(tag: DW_TAG_member, name: "I16", scope: !720, file: !5, baseType: !735, size: 128, align: 64, extraData: i128 3) +!735 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "I16", scope: !717, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !736, templateParams: !46, identifier: "dcfa782be57e9cc55e549bb113b04a06") +!736 = !{!737} +!737 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !735, file: !5, baseType: !738, size: 16, align: 16, offset: 16, flags: DIFlagPublic) +!738 = !DIBasicType(name: "i16", size: 16, encoding: DW_ATE_signed) +!739 = !DIDerivedType(tag: DW_TAG_member, name: "U16", scope: !720, file: !5, baseType: !740, size: 128, align: 64, extraData: i128 4) +!740 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "U16", scope: 
!717, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !741, templateParams: !46, identifier: "f05631058507a1c5a5bf2d6cc5754e47") +!741 = !{!742} +!742 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !740, file: !5, baseType: !118, size: 16, align: 16, offset: 16, flags: DIFlagPublic) +!743 = !DIDerivedType(tag: DW_TAG_member, name: "I32", scope: !720, file: !5, baseType: !744, size: 128, align: 64, extraData: i128 5) +!744 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "I32", scope: !717, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !745, templateParams: !46, identifier: "d2b5ebdf257a9ced33cecf5822df487f") +!745 = !{!746} +!746 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !744, file: !5, baseType: !747, size: 32, align: 32, offset: 32, flags: DIFlagPublic) +!747 = !DIBasicType(name: "i32", size: 32, encoding: DW_ATE_signed) +!748 = !DIDerivedType(tag: DW_TAG_member, name: "U32", scope: !720, file: !5, baseType: !749, size: 128, align: 64, extraData: i128 6) +!749 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "U32", scope: !717, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !750, templateParams: !46, identifier: "34e90881764c38957273f3906eba341e") +!750 = !{!751} +!751 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !749, file: !5, baseType: !535, size: 32, align: 32, offset: 32, flags: DIFlagPublic) +!752 = !DIDerivedType(tag: DW_TAG_member, name: "I64", scope: !720, file: !5, baseType: !753, size: 128, align: 64, extraData: i128 7) +!753 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "I64", scope: !717, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !754, templateParams: !46, identifier: "30d890c34543c5fdd47c6eb9c24adb3d") +!754 = !{!755} +!755 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !753, file: !5, baseType: !113, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!756 = !DIDerivedType(tag: 
DW_TAG_member, name: "U64", scope: !720, file: !5, baseType: !757, size: 128, align: 64, extraData: i128 8) +!757 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "U64", scope: !717, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !758, templateParams: !46, identifier: "8c8d09ef3a30b56e8997e64f4a4a569d") +!758 = !{!759} +!759 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !757, file: !5, baseType: !90, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!760 = !DIDerivedType(tag: DW_TAG_member, name: "F32", scope: !720, file: !5, baseType: !761, size: 128, align: 64, extraData: i128 9) +!761 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "F32", scope: !717, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !762, templateParams: !46, identifier: "b561c863896d760d8a61fadefdeec708") +!762 = !{!763} +!763 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !761, file: !5, baseType: !764, size: 32, align: 32, offset: 32, flags: DIFlagPublic) +!764 = !DIBasicType(name: "f32", size: 32, encoding: DW_ATE_float) +!765 = !DIDerivedType(tag: DW_TAG_member, name: "F64", scope: !720, file: !5, baseType: !766, size: 128, align: 64, extraData: i128 10) +!766 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "F64", scope: !717, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !767, templateParams: !46, identifier: "833cddc71cbeff4994a97bcd2953d5e9") +!767 = !{!768} +!768 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !766, file: !5, baseType: !769, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!769 = !DIBasicType(name: "f64", size: 64, encoding: DW_ATE_float) +!770 = !DIDerivedType(tag: DW_TAG_member, scope: !717, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagArtificial) +!771 = !{!772} +!772 = !DITemplateTypeParameter(name: "T", type: !717) +!773 = !{!774} +!774 = !DISubrange(count: 64, lowerBound: 0) +!775 = !DIDerivedType(tag: DW_TAG_member, name: 
"len", scope: !706, file: !5, baseType: !21, size: 64, align: 64, offset: 8192, flags: DIFlagPrivate) +!776 = !{!777} +!777 = !DITemplateTypeParameter(name: "A", type: !778) +!778 = !DICompositeType(tag: DW_TAG_array_type, baseType: !717, size: 8192, align: 64, elements: !773) +!779 = !DIDerivedType(tag: DW_TAG_member, name: "pc", scope: !620, file: !5, baseType: !120, size: 128, align: 64, offset: 640, flags: DIFlagPrivate) +!780 = !DIDerivedType(tag: DW_TAG_member, name: "expression_stack", scope: !620, file: !5, baseType: !781, size: 64, align: 64, offset: 9088, flags: DIFlagPrivate) +!781 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ArrayVec<[(gimli::read::endian_slice::EndianSlice, gimli::read::endian_slice::EndianSlice); 0]>", scope: !237, file: !5, size: 64, align: 64, flags: DIFlagProtected, elements: !782, templateParams: !801, identifier: "8d9c2b45a9b65c919587001001717f17") +!782 = !{!783, !800} +!783 = !DIDerivedType(tag: DW_TAG_member, name: "storage", scope: !781, file: !5, baseType: !784, align: 64, flags: DIFlagPrivate) +!784 = !DICompositeType(tag: DW_TAG_array_type, baseType: !785, align: 64, elements: !798) +!785 = distinct !DICompositeType(tag: DW_TAG_union_type, name: "MaybeUninit<(gimli::read::endian_slice::EndianSlice, gimli::read::endian_slice::EndianSlice)>", scope: !242, file: !5, size: 256, align: 64, elements: !786, templateParams: !796, identifier: "b96b05b25060709f1f9ba7dbb105f622") +!786 = !{!787, !788} +!787 = !DIDerivedType(tag: DW_TAG_member, name: "uninit", scope: !785, file: !5, baseType: !246, align: 8) +!788 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !785, file: !5, baseType: !789, size: 256, align: 64) +!789 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ManuallyDrop<(gimli::read::endian_slice::EndianSlice, gimli::read::endian_slice::EndianSlice)>", scope: !249, file: !5, size: 256, align: 64, flags: DIFlagPublic, elements: !790, templateParams: !796, identifier: 
"5ab52ed8045fcd3b3de870a7f5e8be9c") +!790 = !{!791} +!791 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !789, file: !5, baseType: !792, size: 256, align: 64, flags: DIFlagPrivate) +!792 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "(gimli::read::endian_slice::EndianSlice, gimli::read::endian_slice::EndianSlice)", file: !5, size: 256, align: 64, elements: !793, templateParams: !46, identifier: "120acc42d3a3b94d11c3bb50c5e39835") +!793 = !{!794, !795} +!794 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !792, file: !5, baseType: !120, size: 128, align: 64) +!795 = !DIDerivedType(tag: DW_TAG_member, name: "__1", scope: !792, file: !5, baseType: !120, size: 128, align: 64, offset: 128) +!796 = !{!797} +!797 = !DITemplateTypeParameter(name: "T", type: !792) +!798 = !{!799} +!799 = !DISubrange(count: 0, lowerBound: 0) +!800 = !DIDerivedType(tag: DW_TAG_member, name: "len", scope: !781, file: !5, baseType: !21, size: 64, align: 64, flags: DIFlagPrivate) +!801 = !{!802} +!802 = !DITemplateTypeParameter(name: "A", type: !803) +!803 = !DICompositeType(tag: DW_TAG_array_type, baseType: !792, align: 64, elements: !798) +!804 = !DIDerivedType(tag: DW_TAG_member, name: "value_result", scope: !620, file: !5, baseType: !805, size: 128, align: 64, offset: 384, flags: DIFlagPrivate) +!805 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Option", scope: !39, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !806, templateParams: !46, identifier: "e1364e1d42e393154265328f988592b1") +!806 = !{!807} +!807 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !805, file: !5, size: 128, align: 64, elements: !808, templateParams: !46, identifier: "2963a62c96890b42de64d3f49ad31868", discriminator: !815) +!808 = !{!809, !811} +!809 = !DIDerivedType(tag: DW_TAG_member, name: "None", scope: !807, file: !5, baseType: !810, size: 128, align: 64, extraData: i128 11) +!810 = distinct !DICompositeType(tag: 
DW_TAG_structure_type, name: "None", scope: !805, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !771, identifier: "2683f935a05e998d96e137521e8b07c3") +!811 = !DIDerivedType(tag: DW_TAG_member, name: "Some", scope: !807, file: !5, baseType: !812, size: 128, align: 64) +!812 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Some", scope: !805, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !813, templateParams: !771, identifier: "5d048e646ac07fd732c4ffd8213bf634") +!813 = !{!814} +!814 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !812, file: !5, baseType: !717, size: 128, align: 64, flags: DIFlagPublic) +!815 = !DIDerivedType(tag: DW_TAG_member, scope: !805, file: !5, baseType: !26, size: 8, align: 8, flags: DIFlagArtificial) +!816 = !DIDerivedType(tag: DW_TAG_member, name: "result", scope: !620, file: !5, baseType: !817, size: 512, align: 64, offset: 9152, flags: DIFlagPrivate) +!817 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ArrayVec<[gimli::read::op::Piece, usize>; 1]>", scope: !237, file: !5, size: 512, align: 64, flags: DIFlagProtected, elements: !818, templateParams: !869, identifier: "a656b8eefdf176ad8e0ebffbfe315302") +!818 = !{!819, !868} +!819 = !DIDerivedType(tag: DW_TAG_member, name: "storage", scope: !817, file: !5, baseType: !820, size: 448, align: 64, flags: DIFlagPrivate) +!820 = !DICompositeType(tag: DW_TAG_array_type, baseType: !821, size: 448, align: 64, elements: !866) +!821 = distinct !DICompositeType(tag: DW_TAG_union_type, name: "MaybeUninit, usize>>", scope: !242, file: !5, size: 448, align: 64, elements: !822, templateParams: !864, identifier: "2530b61c20edfb23a48f57d629ac63e3") +!822 = !{!823, !824} +!823 = !DIDerivedType(tag: DW_TAG_member, name: "uninit", scope: !821, file: !5, baseType: !246, align: 8) +!824 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !821, file: !5, baseType: !825, size: 448, align: 64) +!825 = distinct 
!DICompositeType(tag: DW_TAG_structure_type, name: "ManuallyDrop, usize>>", scope: !249, file: !5, size: 448, align: 64, flags: DIFlagPublic, elements: !826, templateParams: !864, identifier: "82751d4c9232c7ab6fbf72d12a97f018") +!826 = !{!827} +!827 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !825, file: !5, baseType: !828, size: 448, align: 64, flags: DIFlagPrivate) +!828 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Piece, usize>", scope: !615, file: !5, size: 448, align: 64, flags: DIFlagPublic, elements: !829, templateParams: !134, identifier: "d1f755cca082fc24287af19e88191dcf") +!829 = !{!830, !831, !832} +!830 = !DIDerivedType(tag: DW_TAG_member, name: "size_in_bits", scope: !828, file: !5, baseType: !178, size: 128, align: 64, flags: DIFlagPublic) +!831 = !DIDerivedType(tag: DW_TAG_member, name: "bit_offset", scope: !828, file: !5, baseType: !178, size: 128, align: 64, offset: 128, flags: DIFlagPublic) +!832 = !DIDerivedType(tag: DW_TAG_member, name: "location", scope: !828, file: !5, baseType: !833, size: 192, align: 64, offset: 256, flags: DIFlagPublic) +!833 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Location, usize>", scope: !615, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !834, templateParams: !46, identifier: "d395133be8e29139e6d036a34ad208b9") +!834 = !{!835} +!835 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !833, file: !5, size: 192, align: 64, elements: !836, templateParams: !46, identifier: "55c2436db36ecc04a2c666eb5b859a4c", discriminator: !863) +!836 = !{!837, !839, !843, !847, !851, !855} +!837 = !DIDerivedType(tag: DW_TAG_member, name: "Empty", scope: !835, file: !5, baseType: !838, size: 192, align: 64, extraData: i128 0) +!838 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Empty", scope: !833, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !134, identifier: "b49e8bb1f2f8602041a87566dbfb91d2") +!839 = 
!DIDerivedType(tag: DW_TAG_member, name: "Register", scope: !835, file: !5, baseType: !840, size: 192, align: 64, extraData: i128 1) +!840 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Register", scope: !833, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !841, templateParams: !134, identifier: "7eec3de9d20d4cc0862ee3fa2252804e") +!841 = !{!842} +!842 = !DIDerivedType(tag: DW_TAG_member, name: "register", scope: !840, file: !5, baseType: !115, size: 16, align: 16, offset: 16, flags: DIFlagPublic) +!843 = !DIDerivedType(tag: DW_TAG_member, name: "Address", scope: !835, file: !5, baseType: !844, size: 192, align: 64, extraData: i128 2) +!844 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Address", scope: !833, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !845, templateParams: !134, identifier: "ea4da798d514389ff4fefeb6e23e4cb7") +!845 = !{!846} +!846 = !DIDerivedType(tag: DW_TAG_member, name: "address", scope: !844, file: !5, baseType: !90, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!847 = !DIDerivedType(tag: DW_TAG_member, name: "Value", scope: !835, file: !5, baseType: !848, size: 192, align: 64, extraData: i128 3) +!848 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Value", scope: !833, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !849, templateParams: !134, identifier: "d10a76e8b53088f5d90ae0638223103") +!849 = !{!850} +!850 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !848, file: !5, baseType: !717, size: 128, align: 64, offset: 64, flags: DIFlagPublic) +!851 = !DIDerivedType(tag: DW_TAG_member, name: "Bytes", scope: !835, file: !5, baseType: !852, size: 192, align: 64, extraData: i128 4) +!852 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Bytes", scope: !833, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !853, templateParams: !134, identifier: "b8af572c2d30f325beba0759bb7b15bc") +!853 = !{!854} +!854 = 
!DIDerivedType(tag: DW_TAG_member, name: "value", scope: !852, file: !5, baseType: !120, size: 128, align: 64, offset: 64, flags: DIFlagPublic) +!855 = !DIDerivedType(tag: DW_TAG_member, name: "ImplicitPointer", scope: !835, file: !5, baseType: !856, size: 192, align: 64, extraData: i128 5) +!856 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ImplicitPointer", scope: !833, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !857, templateParams: !134, identifier: "bdbf25dabffeb5d0679ee8e169accba1") +!857 = !{!858, !862} +!858 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !856, file: !5, baseType: !859, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!859 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DebugInfoOffset", scope: !25, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !860, templateParams: !221, identifier: "240ef1e2d1384c4db51fe4b33cb864ae") +!860 = !{!861} +!861 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !859, file: !5, baseType: !21, size: 64, align: 64, flags: DIFlagPublic) +!862 = !DIDerivedType(tag: DW_TAG_member, name: "byte_offset", scope: !856, file: !5, baseType: !113, size: 64, align: 64, offset: 128, flags: DIFlagPublic) +!863 = !DIDerivedType(tag: DW_TAG_member, scope: !833, file: !5, baseType: !118, size: 16, align: 16, flags: DIFlagArtificial) +!864 = !{!865} +!865 = !DITemplateTypeParameter(name: "T", type: !828) +!866 = !{!867} +!867 = !DISubrange(count: 1, lowerBound: 0) +!868 = !DIDerivedType(tag: DW_TAG_member, name: "len", scope: !817, file: !5, baseType: !21, size: 64, align: 64, offset: 448, flags: DIFlagPrivate) +!869 = !{!870} +!870 = !DITemplateTypeParameter(name: "A", type: !871) +!871 = !DICompositeType(tag: DW_TAG_array_type, baseType: !828, size: 448, align: 64, elements: !866) +!872 = !{!135, !300} +!873 = !DILocalVariable(name: "result", scope: !874, file: !3, line: 87, type: !875, align: 8) +!874 = distinct !DILexicalBlock(scope: 
!619, file: !3, line: 87, column: 9) +!875 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "EvaluationResult>", scope: !615, file: !5, size: 320, align: 64, flags: DIFlagPublic, elements: !876, templateParams: !46, identifier: "51d4bb2ed321c46272ccbeec740b49c") +!876 = !{!877} +!877 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !875, file: !5, size: 320, align: 64, elements: !878, templateParams: !46, identifier: "b5b0d5dbd4679161fe022116a9af800", discriminator: !945) +!878 = !{!879, !881, !891, !896, !898, !902, !904, !921, !925, !929, !933, !941} +!879 = !DIDerivedType(tag: DW_TAG_member, name: "Complete", scope: !877, file: !5, baseType: !880, size: 320, align: 64, extraData: i128 2) +!880 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Complete", scope: !875, file: !5, size: 320, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !204, identifier: "98b5d1403036cb41e2f44cc6ea0efccd") +!881 = !DIDerivedType(tag: DW_TAG_member, name: "RequiresMemory", scope: !877, file: !5, baseType: !882, size: 320, align: 64) +!882 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "RequiresMemory", scope: !875, file: !5, size: 320, align: 64, flags: DIFlagPublic, elements: !883, templateParams: !204, identifier: "8003e489afddfa79b3b73f5fc4a14802") +!883 = !{!884, !885, !886, !887} +!884 = !DIDerivedType(tag: DW_TAG_member, name: "address", scope: !882, file: !5, baseType: !90, size: 64, align: 64, offset: 128, flags: DIFlagPublic) +!885 = !DIDerivedType(tag: DW_TAG_member, name: "size", scope: !882, file: !5, baseType: !26, size: 8, align: 8, offset: 256, flags: DIFlagPublic) +!886 = !DIDerivedType(tag: DW_TAG_member, name: "space", scope: !882, file: !5, baseType: !178, size: 128, align: 64, flags: DIFlagPublic) +!887 = !DIDerivedType(tag: DW_TAG_member, name: "base_type", scope: !882, file: !5, baseType: !888, size: 64, align: 64, offset: 192, flags: DIFlagPublic) +!888 = distinct !DICompositeType(tag: 
DW_TAG_structure_type, name: "UnitOffset", scope: !17, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !889, templateParams: !221, identifier: "da1d24a786a32ca5ac7efb4fa178ae2b") +!889 = !{!890} +!890 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !888, file: !5, baseType: !21, size: 64, align: 64, flags: DIFlagPublic) +!891 = !DIDerivedType(tag: DW_TAG_member, name: "RequiresRegister", scope: !877, file: !5, baseType: !892, size: 320, align: 64, extraData: i128 4) +!892 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "RequiresRegister", scope: !875, file: !5, size: 320, align: 64, flags: DIFlagPublic, elements: !893, templateParams: !204, identifier: "1536d1e70f21cd80ea98159552d5b03") +!893 = !{!894, !895} +!894 = !DIDerivedType(tag: DW_TAG_member, name: "register", scope: !892, file: !5, baseType: !115, size: 16, align: 16, offset: 128, flags: DIFlagPublic) +!895 = !DIDerivedType(tag: DW_TAG_member, name: "base_type", scope: !892, file: !5, baseType: !888, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!896 = !DIDerivedType(tag: DW_TAG_member, name: "RequiresFrameBase", scope: !877, file: !5, baseType: !897, size: 320, align: 64, extraData: i128 5) +!897 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "RequiresFrameBase", scope: !875, file: !5, size: 320, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !204, identifier: "aa3ffa474f6042aaf55b84c6e95fc29") +!898 = !DIDerivedType(tag: DW_TAG_member, name: "RequiresTls", scope: !877, file: !5, baseType: !899, size: 320, align: 64, extraData: i128 6) +!899 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "RequiresTls", scope: !875, file: !5, size: 320, align: 64, flags: DIFlagPublic, elements: !900, templateParams: !204, identifier: "b1e8d9ef29a993fcbf2f73f7de6e296b") +!900 = !{!901} +!901 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !899, file: !5, baseType: !90, size: 64, align: 64, offset: 64, flags: DIFlagPublic) 
+!902 = !DIDerivedType(tag: DW_TAG_member, name: "RequiresCallFrameCfa", scope: !877, file: !5, baseType: !903, size: 320, align: 64, extraData: i128 7) +!903 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "RequiresCallFrameCfa", scope: !875, file: !5, size: 320, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !204, identifier: "72816062d26fd7b762fb5e5ed129ff2") +!904 = !DIDerivedType(tag: DW_TAG_member, name: "RequiresAtLocation", scope: !877, file: !5, baseType: !905, size: 320, align: 64, extraData: i128 8) +!905 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "RequiresAtLocation", scope: !875, file: !5, size: 320, align: 64, flags: DIFlagPublic, elements: !906, templateParams: !204, identifier: "2af96c961b57c627b47a5a74aa0f23eb") +!906 = !{!907} +!907 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !905, file: !5, baseType: !908, size: 128, align: 64, offset: 64, flags: DIFlagPublic) +!908 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DieReference", scope: !615, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !909, templateParams: !46, identifier: "634c048e6d97d42d69a1dd4c2a2b8d27") +!909 = !{!910} +!910 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !908, file: !5, size: 128, align: 64, elements: !911, templateParams: !46, identifier: "8970939422a637ddea811e213749faa", discriminator: !920) +!911 = !{!912, !916} +!912 = !DIDerivedType(tag: DW_TAG_member, name: "UnitRef", scope: !910, file: !5, baseType: !913, size: 128, align: 64, extraData: i128 0) +!913 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnitRef", scope: !908, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !914, templateParams: !221, identifier: "a26a62911cf741a4d4a44e27f0a5ed51") +!914 = !{!915} +!915 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !913, file: !5, baseType: !888, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!916 = 
!DIDerivedType(tag: DW_TAG_member, name: "DebugInfoRef", scope: !910, file: !5, baseType: !917, size: 128, align: 64, extraData: i128 1) +!917 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DebugInfoRef", scope: !908, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !918, templateParams: !221, identifier: "301c17503a4b73a819dffa9d3ff4f17") +!918 = !{!919} +!919 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !917, file: !5, baseType: !859, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!920 = !DIDerivedType(tag: DW_TAG_member, scope: !908, file: !5, baseType: !90, size: 64, align: 64, flags: DIFlagArtificial) +!921 = !DIDerivedType(tag: DW_TAG_member, name: "RequiresEntryValue", scope: !877, file: !5, baseType: !922, size: 320, align: 64, extraData: i128 9) +!922 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "RequiresEntryValue", scope: !875, file: !5, size: 320, align: 64, flags: DIFlagPublic, elements: !923, templateParams: !204, identifier: "faa1a56431d594ba83847c61cba8413e") +!923 = !{!924} +!924 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !922, file: !5, baseType: !614, size: 128, align: 64, offset: 64, flags: DIFlagPublic) +!925 = !DIDerivedType(tag: DW_TAG_member, name: "RequiresParameterRef", scope: !877, file: !5, baseType: !926, size: 320, align: 64, extraData: i128 10) +!926 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "RequiresParameterRef", scope: !875, file: !5, size: 320, align: 64, flags: DIFlagPublic, elements: !927, templateParams: !204, identifier: "49d532a26f9036eb3e31efa5492676d5") +!927 = !{!928} +!928 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !926, file: !5, baseType: !888, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!929 = !DIDerivedType(tag: DW_TAG_member, name: "RequiresRelocatedAddress", scope: !877, file: !5, baseType: !930, size: 320, align: 64, extraData: i128 11) +!930 = distinct !DICompositeType(tag: 
DW_TAG_structure_type, name: "RequiresRelocatedAddress", scope: !875, file: !5, size: 320, align: 64, flags: DIFlagPublic, elements: !931, templateParams: !204, identifier: "6113feb90ddaa61b8e4c18f58521fe69") +!931 = !{!932} +!932 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !930, file: !5, baseType: !90, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!933 = !DIDerivedType(tag: DW_TAG_member, name: "RequiresIndexedAddress", scope: !877, file: !5, baseType: !934, size: 320, align: 64, extraData: i128 12) +!934 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "RequiresIndexedAddress", scope: !875, file: !5, size: 320, align: 64, flags: DIFlagPublic, elements: !935, templateParams: !204, identifier: "c88ebe4f9abcc34adc5bed282975fd47") +!935 = !{!936, !940} +!936 = !DIDerivedType(tag: DW_TAG_member, name: "index", scope: !934, file: !5, baseType: !937, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!937 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DebugAddrIndex", scope: !25, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !938, templateParams: !221, identifier: "ab8b077a231ba172fd0f54a5426fad2b") +!938 = !{!939} +!939 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !937, file: !5, baseType: !21, size: 64, align: 64, flags: DIFlagPublic) +!940 = !DIDerivedType(tag: DW_TAG_member, name: "relocate", scope: !934, file: !5, baseType: !103, size: 8, align: 8, offset: 128, flags: DIFlagPublic) +!941 = !DIDerivedType(tag: DW_TAG_member, name: "RequiresBaseType", scope: !877, file: !5, baseType: !942, size: 320, align: 64, extraData: i128 13) +!942 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "RequiresBaseType", scope: !875, file: !5, size: 320, align: 64, flags: DIFlagPublic, elements: !943, templateParams: !204, identifier: "737c9cc6266a1749293672abd5b69040") +!943 = !{!944} +!944 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !942, file: !5, baseType: !888, size: 64, 
align: 64, offset: 64, flags: DIFlagPublic) +!945 = !DIDerivedType(tag: DW_TAG_member, scope: !875, file: !5, baseType: !90, size: 64, align: 64, flags: DIFlagArtificial) +!946 = !DILocalVariable(name: "residual", scope: !947, file: !3, line: 87, type: !948, align: 8) +!947 = distinct !DILexicalBlock(scope: !619, file: !3, line: 87, column: 41) +!948 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Result", scope: !305, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !949, templateParams: !46, identifier: "919ce7a601fb4a13ba9fff7fa5c31214") +!949 = !{!950} +!950 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !948, file: !5, size: 128, align: 64, elements: !951, templateParams: !46, identifier: "4d9c0f1a76ad6640c91b178b0a25cb6d") +!951 = !{!952, !962} +!952 = !DIDerivedType(tag: DW_TAG_member, name: "Ok", scope: !950, file: !5, baseType: !953, size: 128, align: 64) +!953 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Ok", scope: !948, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !954, templateParams: !960, identifier: "80500a1964edcdf9df1c06fd016a020c") +!954 = !{!955} +!955 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !953, file: !5, baseType: !956, align: 8, flags: DIFlagPublic) +!956 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Infallible", scope: !957, file: !5, align: 8, flags: DIFlagPublic, elements: !958, templateParams: !46, identifier: "64765147ab70b22e683668bfcd1e19c6") +!957 = !DINamespace(name: "convert", scope: !40) +!958 = !{!959} +!959 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !956, file: !5, align: 8, elements: !46, identifier: "7ae49c0d49b0241a817f4820926e50a2") +!960 = !{!961, !314} +!961 = !DITemplateTypeParameter(name: "T", type: !956) +!962 = !DIDerivedType(tag: DW_TAG_member, name: "Err", scope: !950, file: !5, baseType: !963, size: 128, align: 64) +!963 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: 
"Err", scope: !948, file: !5, size: 128, align: 64, flags: DIFlagPublic, elements: !964, templateParams: !960, identifier: "b0437e745c7777a31897e0e59bf1640b") +!964 = !{!965} +!965 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !963, file: !5, baseType: !315, size: 128, align: 64, flags: DIFlagPublic) +!966 = !DILocalVariable(name: "val", scope: !967, file: !3, line: 87, type: !875, align: 8) +!967 = distinct !DILexicalBlock(scope: !619, file: !3, line: 87, column: 26) +!968 = !DILocalVariable(name: "address", scope: !969, file: !3, line: 91, type: !90, align: 8) +!969 = distinct !DILexicalBlock(scope: !874, file: !3, line: 91, column: 17) +!970 = !DILocalVariable(name: "value", scope: !971, file: !3, line: 92, type: !21, align: 8) +!971 = distinct !DILexicalBlock(scope: !969, file: !3, line: 92, column: 21) +!972 = !DILocalVariable(name: "residual", scope: !973, file: !3, line: 93, type: !948, align: 8) +!973 = distinct !DILexicalBlock(scope: !971, file: !3, line: 93, column: 81) +!974 = !DILocalVariable(name: "val", scope: !975, file: !3, line: 93, type: !875, align: 8) +!975 = distinct !DILexicalBlock(scope: !971, file: !3, line: 93, column: 30) +!976 = !DILocalVariable(name: "register", scope: !977, file: !3, line: 95, type: !115, align: 2) +!977 = distinct !DILexicalBlock(scope: !874, file: !3, line: 95, column: 17) +!978 = !DILocalVariable(name: "value", scope: !979, file: !3, line: 96, type: !21, align: 8) +!979 = distinct !DILexicalBlock(scope: !977, file: !3, line: 96, column: 21) +!980 = !DILocalVariable(name: "residual", scope: !981, file: !3, line: 97, type: !948, align: 8) +!981 = distinct !DILexicalBlock(scope: !979, file: !3, line: 97, column: 83) +!982 = !DILocalVariable(name: "val", scope: !983, file: !3, line: 97, type: !875, align: 8) +!983 = distinct !DILexicalBlock(scope: !979, file: !3, line: 97, column: 30) +!984 = !DILocalVariable(name: "address", scope: !985, file: !3, line: 99, type: !90, align: 8) +!985 = distinct 
!DILexicalBlock(scope: !874, file: !3, line: 99, column: 17) +!986 = !DILocalVariable(name: "value", scope: !987, file: !3, line: 100, type: !21, align: 8) +!987 = distinct !DILexicalBlock(scope: !985, file: !3, line: 100, column: 21) +!988 = !DILocalVariable(name: "residual", scope: !989, file: !3, line: 101, type: !948, align: 8) +!989 = distinct !DILexicalBlock(scope: !987, file: !3, line: 101, column: 81) +!990 = !DILocalVariable(name: "val", scope: !991, file: !3, line: 101, type: !875, align: 8) +!991 = distinct !DILexicalBlock(scope: !987, file: !3, line: 101, column: 30) +!992 = !DILocalVariable(name: "residual", scope: !993, file: !3, line: 111, type: !948, align: 8) +!993 = distinct !DILexicalBlock(scope: !874, file: !3, line: 111, column: 56) +!994 = !DILocalVariable(name: "val", scope: !995, file: !3, line: 108, type: !996, align: 8) +!995 = distinct !DILexicalBlock(scope: !874, file: !3, line: 108, column: 19) +!996 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "&gimli::read::op::Piece, usize>", baseType: !828, size: 64, align: 64, dwarfAddressSpace: 0) +!997 = !DILocalVariable(name: "address", scope: !998, file: !3, line: 114, type: !90, align: 8) +!998 = distinct !DILexicalBlock(scope: !874, file: !3, line: 114, column: 17) +!999 = !DILocation(line: 1102, column: 23, scope: !1000, inlinedAt: !1038) +!1000 = distinct !DILexicalBlock(scope: !1002, file: !1001, line: 1102, column: 13) +!1001 = !DIFile(filename: "/rustc/636d7ff91b9847d6d43c7bbe023568828f6e3246/library/core/src/result.rs", directory: "", checksumkind: CSK_MD5, checksum: "13dbc19e8bd386b8c9d62247cee85b56") +!1002 = distinct !DISubprogram(name: "unwrap>, gimli::read::Error>", linkageName: "_ZN4core6result19Result$LT$T$C$E$GT$6unwrap17h14fd7c0569eb842aE", scope: !1003, file: !1001, line: 1096, type: !1018, scopeLine: 1096, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !564, templateParams: !1011, declaration: !1032, retainedNodes: !1033) +!1003 = 
distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Result>, gimli::read::Error>", scope: !305, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !1004, templateParams: !46, identifier: "769c6b05a4491edd8f1f7ebaabbcd9ce") +!1004 = !{!1005} +!1005 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !1003, file: !5, size: 192, align: 64, elements: !1006, templateParams: !46, identifier: "f07482fa33f1f68430e7cc41a9027a6b", discriminator: !1017) +!1006 = !{!1007, !1013} +!1007 = !DIDerivedType(tag: DW_TAG_member, name: "Ok", scope: !1005, file: !5, baseType: !1008, size: 192, align: 64, extraData: i128 0) +!1008 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Ok", scope: !1003, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !1009, templateParams: !1011, identifier: "b3503abe43c6c1a92f481dfc52138ec2") +!1009 = !{!1010} +!1010 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !1008, file: !5, baseType: !614, size: 128, align: 64, offset: 64, flags: DIFlagPublic) +!1011 = !{!1012, !314} +!1012 = !DITemplateTypeParameter(name: "T", type: !614) +!1013 = !DIDerivedType(tag: DW_TAG_member, name: "Err", scope: !1005, file: !5, baseType: !1014, size: 192, align: 64, extraData: i128 1) +!1014 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Err", scope: !1003, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !1015, templateParams: !1011, identifier: "4538ee170ef87c5e49ce2f27f79f37a2") +!1015 = !{!1016} +!1016 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !1014, file: !5, baseType: !315, size: 128, align: 64, offset: 64, flags: DIFlagPublic) +!1017 = !DIDerivedType(tag: DW_TAG_member, scope: !1003, file: !5, baseType: !90, size: 64, align: 64, flags: DIFlagArtificial) +!1018 = !DISubroutineType(types: !1019) +!1019 = !{!614, !1003, !1020} +!1020 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "&core::panic::location::Location", baseType: !1021, size: 64, align: 64, 
dwarfAddressSpace: 0) +!1021 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Location", scope: !1022, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !1024, templateParams: !46, identifier: "e063870a552be7101e2bcd793a8716b0") +!1022 = !DINamespace(name: "location", scope: !1023) +!1023 = !DINamespace(name: "panic", scope: !40) +!1024 = !{!1025, !1030, !1031} +!1025 = !DIDerivedType(tag: DW_TAG_member, name: "file", scope: !1021, file: !5, baseType: !1026, size: 128, align: 64, flags: DIFlagPrivate) +!1026 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "&str", file: !5, size: 128, align: 64, elements: !1027, templateParams: !46, identifier: "9277eecd40495f85161460476aacc992") +!1027 = !{!1028, !1029} +!1028 = !DIDerivedType(tag: DW_TAG_member, name: "data_ptr", scope: !1026, file: !5, baseType: !127, size: 64, align: 64) +!1029 = !DIDerivedType(tag: DW_TAG_member, name: "length", scope: !1026, file: !5, baseType: !21, size: 64, align: 64, offset: 64) +!1030 = !DIDerivedType(tag: DW_TAG_member, name: "line", scope: !1021, file: !5, baseType: !535, size: 32, align: 32, offset: 128, flags: DIFlagPrivate) +!1031 = !DIDerivedType(tag: DW_TAG_member, name: "col", scope: !1021, file: !5, baseType: !535, size: 32, align: 32, offset: 160, flags: DIFlagPrivate) +!1032 = !DISubprogram(name: "unwrap>, gimli::read::Error>", linkageName: "_ZN4core6result19Result$LT$T$C$E$GT$6unwrap17h14fd7c0569eb842aE", scope: !1003, file: !1001, line: 1096, type: !1018, scopeLine: 1096, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit, templateParams: !1011) +!1033 = !{!1034, !1035, !1037} +!1034 = !DILocalVariable(name: "self", arg: 1, scope: !1002, file: !1001, line: 1096, type: !1003) +!1035 = !DILocalVariable(name: "t", scope: !1036, file: !1001, line: 1101, type: !614, align: 8) +!1036 = distinct !DILexicalBlock(scope: !1002, file: !1001, line: 1101, column: 13) +!1037 = !DILocalVariable(name: "e", scope: !1000, file: !1001, line: 1102, 
type: !315, align: 8) +!1038 = distinct !DILocation(line: 84, column: 20, scope: !2) +!1039 = distinct !DISubprogram(name: "_Unwind_Resume", scope: !7, file: !1040, line: 346, type: !1041, scopeLine: 346, flags: DIFlagPrototyped | DIFlagNoReturn, spFlags: DISPFlagDefinition, unit: !564, templateParams: !46, retainedNodes: !1112) +!1040 = !DIFile(filename: "src/unwinder/mod.rs", directory: "/home/dev/ecosystem/unwinding", checksumkind: CSK_MD5, checksum: "0b7cd150e86dd087aeaa8e0e18bae6d9") +!1041 = !DISubroutineType(types: !1042) +!1042 = !{null, !1043} +!1043 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*mut unwinding::unwinder::UnwindException", baseType: !1044, size: 64, align: 64, dwarfAddressSpace: 0) +!1044 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnwindException", scope: !7, file: !5, size: 256, align: 64, flags: DIFlagPublic, elements: !1045, templateParams: !46, identifier: "f6e359707e96b28f68e0123bb3490311") +!1045 = !{!1046, !1047, !1068, !1109, !1110} +!1046 = !DIDerivedType(tag: DW_TAG_member, name: "exception_class", scope: !1044, file: !5, baseType: !90, size: 64, align: 64, flags: DIFlagPublic) +!1047 = !DIDerivedType(tag: DW_TAG_member, name: "exception_cleanup", scope: !1044, file: !5, baseType: !1048, size: 64, align: 64, offset: 64, flags: DIFlagPublic) +!1048 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Option", scope: !39, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !1049, templateParams: !46, identifier: "55edbca04b7b79406fe597df5da69fb6") +!1049 = !{!1050} +!1050 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !1048, file: !5, size: 64, align: 64, elements: !1051, templateParams: !46, identifier: "d4ba33946a9e213e48833b2948ffc69a", discriminator: !1067) +!1051 = !{!1052, !1063} +!1052 = !DIDerivedType(tag: DW_TAG_member, name: "None", scope: !1050, file: !5, baseType: !1053, size: 64, align: 64, extraData: i128 0) +!1053 = distinct !DICompositeType(tag: 
DW_TAG_structure_type, name: "None", scope: !1048, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !1054, identifier: "5f49070303e2d908386f0a327220e7") +!1054 = !{!1055} +!1055 = !DITemplateTypeParameter(name: "T", type: !1056) +!1056 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "unsafe extern \22C\22 fn(unwinding::abi::UnwindReasonCode, *mut unwinding::unwinder::UnwindException)", baseType: !1057, size: 64, align: 64, dwarfAddressSpace: 0) +!1057 = !DISubroutineType(types: !1058) +!1058 = !{null, !1059, !1043} +!1059 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnwindReasonCode", scope: !1060, file: !5, size: 32, align: 32, flags: DIFlagPublic, elements: !1061, templateParams: !46, identifier: "78d1c20b6f4c6f13f91e6941a59e3070") +!1060 = !DINamespace(name: "abi", scope: !8) +!1061 = !{!1062} +!1062 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !1059, file: !5, baseType: !747, size: 32, align: 32, flags: DIFlagPublic) +!1063 = !DIDerivedType(tag: DW_TAG_member, name: "Some", scope: !1050, file: !5, baseType: !1064, size: 64, align: 64) +!1064 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Some", scope: !1048, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !1065, templateParams: !1054, identifier: "88c5936a7984265e3c9f2ddf1a30acca") +!1065 = !{!1066} +!1066 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !1064, file: !5, baseType: !1056, size: 64, align: 64, flags: DIFlagPublic) +!1067 = !DIDerivedType(tag: DW_TAG_member, scope: !1048, file: !5, baseType: !90, size: 64, align: 64, flags: DIFlagArtificial) +!1068 = !DIDerivedType(tag: DW_TAG_member, name: "private_1", scope: !1044, file: !5, baseType: !1069, size: 64, align: 64, offset: 128, flags: DIFlagPrivate) +!1069 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Option unwinding::abi::UnwindReasonCode>", scope: !39, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !1070, 
templateParams: !46, identifier: "3fd0f4ff1cf8b26bfa970433d6b9be1f") +!1070 = !{!1071} +!1071 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !1069, file: !5, size: 64, align: 64, elements: !1072, templateParams: !46, identifier: "c06dd7a3f8e0e4b1f3c073ade268504e", discriminator: !1108) +!1072 = !{!1073, !1104} +!1073 = !DIDerivedType(tag: DW_TAG_member, name: "None", scope: !1071, file: !5, baseType: !1074, size: 64, align: 64, extraData: i128 0) +!1074 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "None", scope: !1069, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !1075, identifier: "a7907e0a0f03f43538101bc2ae5b0cc9") +!1075 = !{!1076} +!1076 = !DITemplateTypeParameter(name: "T", type: !1077) +!1077 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "unsafe extern \22C\22 fn(i32, unwinding::abi::UnwindAction, u64, *mut unwinding::unwinder::UnwindException, &mut unwinding::unwinder::UnwindContext, *mut core::ffi::c_void) -> unwinding::abi::UnwindReasonCode", baseType: !1078, size: 64, align: 64, dwarfAddressSpace: 0) +!1078 = !DISubroutineType(types: !1079) +!1079 = !{!1059, !747, !1080, !90, !1043, !1083, !1103} +!1080 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnwindAction", scope: !1060, file: !5, size: 32, align: 32, flags: DIFlagPublic, elements: !1081, templateParams: !46, identifier: "364c99c0f0ff127f318feffefcb3c87") +!1081 = !{!1082} +!1082 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !1080, file: !5, baseType: !747, size: 32, align: 32, flags: DIFlagPublic) +!1083 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "&mut unwinding::unwinder::UnwindContext", baseType: !1084, size: 64, align: 64, dwarfAddressSpace: 0) +!1084 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "UnwindContext", scope: !7, file: !5, size: 192, align: 64, flags: DIFlagPublic, elements: !1085, templateParams: !46, identifier: "911f8c19bc1f5e24ad054a625f8be0d6") +!1085 = 
!{!1086, !1100, !1102} +!1086 = !DIDerivedType(tag: DW_TAG_member, name: "frame", scope: !1084, file: !5, baseType: !1087, size: 64, align: 64, offset: 64, flags: DIFlagPrivate) +!1087 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Option<&unwinding::unwinder::frame::Frame>", scope: !39, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !1088, templateParams: !46, identifier: "74fadfe0892d41cd8e0d03eb53ad3e54") +!1088 = !{!1089} +!1089 = distinct !DICompositeType(tag: DW_TAG_variant_part, scope: !1087, file: !5, size: 64, align: 64, elements: !1090, templateParams: !46, identifier: "3f92d1546b9840fa83783ed5018281cd", discriminator: !1099) +!1090 = !{!1091, !1095} +!1091 = !DIDerivedType(tag: DW_TAG_member, name: "None", scope: !1089, file: !5, baseType: !1092, size: 64, align: 64, extraData: i128 0) +!1092 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "None", scope: !1087, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !46, templateParams: !1093, identifier: "512ffa16cad01e9d1b32a5885a0360bc") +!1093 = !{!1094} +!1094 = !DITemplateTypeParameter(name: "T", type: !549) +!1095 = !DIDerivedType(tag: DW_TAG_member, name: "Some", scope: !1089, file: !5, baseType: !1096, size: 64, align: 64) +!1096 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Some", scope: !1087, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !1097, templateParams: !1093, identifier: "e463cc92afc82dc88438dd3a5d8e906d") +!1097 = !{!1098} +!1098 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !1096, file: !5, baseType: !549, size: 64, align: 64, flags: DIFlagPublic) +!1099 = !DIDerivedType(tag: DW_TAG_member, scope: !1087, file: !5, baseType: !90, size: 64, align: 64, flags: DIFlagArtificial) +!1100 = !DIDerivedType(tag: DW_TAG_member, name: "ctx", scope: !1084, file: !5, baseType: !1101, size: 64, align: 64, flags: DIFlagPrivate) +!1101 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "&mut 
unwinding::unwinder::arch::aarch64::Context", baseType: !551, size: 64, align: 64, dwarfAddressSpace: 0) +!1102 = !DIDerivedType(tag: DW_TAG_member, name: "signal", scope: !1084, file: !5, baseType: !103, size: 8, align: 8, offset: 128, flags: DIFlagPrivate) +!1103 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*mut core::ffi::c_void", baseType: !586, size: 64, align: 64, dwarfAddressSpace: 0) +!1104 = !DIDerivedType(tag: DW_TAG_member, name: "Some", scope: !1071, file: !5, baseType: !1105, size: 64, align: 64) +!1105 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Some", scope: !1069, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !1106, templateParams: !1075, identifier: "757604dfadcc7bc333dd8afe5c3f1b07") +!1106 = !{!1107} +!1107 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !1105, file: !5, baseType: !1077, size: 64, align: 64, flags: DIFlagPublic) +!1108 = !DIDerivedType(tag: DW_TAG_member, scope: !1069, file: !5, baseType: !90, size: 64, align: 64, flags: DIFlagArtificial) +!1109 = !DIDerivedType(tag: DW_TAG_member, name: "private_2", scope: !1044, file: !5, baseType: !21, size: 64, align: 64, offset: 192, flags: DIFlagPrivate) +!1110 = !DIDerivedType(tag: DW_TAG_member, name: "private_unused", scope: !1044, file: !5, baseType: !1111, align: 64, offset: 256, flags: DIFlagPrivate) +!1111 = !DICompositeType(tag: DW_TAG_array_type, baseType: !21, align: 64, elements: !798) +!1112 = !{!1113} +!1113 = !DILocalVariable(name: "exception", arg: 1, scope: !1039, file: !1040, line: 346, type: !1043) From 69332bb8995aef60d830406de12cb79a50390261 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 22 Aug 2024 11:24:08 -0400 Subject: [PATCH 214/426] [SLP]Improve/fix subvectors in gather/buildvector nodes handling SLP vectorizer has an estimation for gather/buildvector nodes, which contain some scalar loads. SLP vectorizer performs pretty similar (but large in SLOCs) estimation, which not always correct. 
Instead, this patch implements clustering analysis and actual node allocation with the full analysis for the vectorized clustered scalars (not only loads, but also some other instructions) with the correct cost estimation and vector insert instructions. Improves overall vectorization quality and simplifies analysis/estimations. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/104144 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 328 +++++++--------- .../PhaseOrdering/AArch64/slpordering.ll | 74 ++-- .../SLPVectorizer/AArch64/getelementptr.ll | 11 +- .../SLPVectorizer/AArch64/loadorder.ll | 192 ++++----- .../AArch64/multiple_reduction.ll | 365 +++++++----------- .../AArch64/scalarization-overhead.ll | 62 ++- .../AArch64/shuffle-vectors-mask-size.ll | 7 +- .../SLPVectorizer/AArch64/tsc-s116.ll | 8 +- .../vectorizable-selects-uniform-cmps.ll | 32 +- .../RISCV/combined-loads-stored.ll | 7 +- .../SLPVectorizer/RISCV/reductions.ll | 48 ++- .../SLPVectorizer/SystemZ/pr34619.ll | 11 +- .../Transforms/SLPVectorizer/X86/addsub.ll | 18 +- .../X86/extract-many-users-buildvector.ll | 43 +-- .../X86/extract-scalar-from-undef.ll | 27 +- .../X86/gather-node-same-as-vect-but-order.ll | 13 +- .../SLPVectorizer/X86/horizontal-minmax.ll | 16 +- .../SLPVectorizer/X86/inst_size_bug.ll | 18 +- .../SLPVectorizer/X86/landing_pad.ll | 19 +- llvm/test/Transforms/SLPVectorizer/X86/phi.ll | 36 +- .../SLPVectorizer/X86/reduction-logical.ll | 17 +- .../X86/remark-partial-loads-vectorize.ll | 16 +- .../X86/scatter-vectorize-reused-pointer.ll | 26 +- .../X86/schedule_budget_debug_info.ll | 40 +- .../SLPVectorizer/X86/split-load8_2-unord.ll | 39 +- .../Transforms/SLPVectorizer/X86/tiny-tree.ll | 5 +- .../X86/vect-gather-same-nodes.ll | 6 +- 27 files changed, 699 insertions(+), 785 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index d7763a022f3b6e..afaef6f9da9872 100644 --- 
a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3094,6 +3094,10 @@ class BoUpSLP { /// The index of this treeEntry in VectorizableTree. int Idx = -1; + /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from + /// other nodes as a series of insertvector instructions. + SmallVector, 0> CombinedEntriesWithIndices; + private: /// The operands of each instruction in each lane Operands[op_index][lane]. /// Note: This helps avoid the replication of the code that performs the @@ -3394,7 +3398,9 @@ class BoUpSLP { if (!isConstant(V)) { auto *I = dyn_cast(V); AllConstsOrCasts &= I && I->getType()->isIntegerTy(); - ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); + if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE || + !UserTreeIdx.UserTE->isGather()) + ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); } if (AllConstsOrCasts) CastMaxMinBWSizes = @@ -8349,8 +8355,49 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef Ptrs, void BoUpSLP::transformNodes() { constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - for (std::unique_ptr &TE : VectorizableTree) { - TreeEntry &E = *TE; + // The tree may grow here, so iterate over nodes, built before. + for (unsigned Idx : seq(VectorizableTree.size())) { + TreeEntry &E = *VectorizableTree[Idx]; + if (E.isGather()) { + ArrayRef VL = E.Scalars; + const unsigned Sz = getVectorElementSize(VL.front()); + unsigned MinVF = getMinVF(2 * Sz); + if (VL.size() <= 2 || + (E.getOpcode() && + (E.isAltShuffle() || E.getOpcode() != Instruction::Load))) + continue; + // Try to find vectorizable sequences and transform them into a series of + // insertvector instructions. 
+ unsigned StartIdx = 0; + unsigned End = VL.size(); + for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) { + for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) { + ArrayRef Slice = VL.slice(Cnt, VF); + InstructionsState S = getSameOpcode(Slice, *TLI); + if (!S.getOpcode() || S.isAltShuffle() || + (S.getOpcode() != Instruction::Load && + any_of(Slice, [&](Value *V) { + return !areAllUsersVectorized(cast(V), + UserIgnoreList); + }))) + continue; + if (!getTreeEntry(Slice.front()) && !getTreeEntry(Slice.back())) { + unsigned PrevSize = VectorizableTree.size(); + buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX)); + if (PrevSize + 1 == VectorizableTree.size() && + VectorizableTree[PrevSize]->isGather()) { + VectorizableTree.pop_back(); + continue; + } + E.CombinedEntriesWithIndices.emplace_back(PrevSize, Cnt); + if (StartIdx == Cnt) + StartIdx = Cnt + VF; + if (End == Cnt + VF) + End = Cnt; + } + } + } + } switch (E.getOpcode()) { case Instruction::Load: { // No need to reorder masked gather loads, just reorder the scalar @@ -8473,175 +8520,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { auto *VecTy = getWidenedType(ScalarTy, VL.size()); InstructionCost GatherCost = 0; SmallVector Gathers(VL); - // Improve gather cost for gather of loads, if we can group some of the - // loads into vector loads. 
- InstructionsState S = getSameOpcode(VL, *R.TLI); - const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy); - unsigned MinVF = R.getMinVF(2 * Sz); - if (VL.size() > 2 && - ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) || - (InVectors.empty() && - any_of(seq(0, VL.size() / MinVF), - [&](unsigned Idx) { - ArrayRef SubVL = VL.slice(Idx * MinVF, MinVF); - InstructionsState S = getSameOpcode(SubVL, *R.TLI); - return S.getOpcode() == Instruction::Load && - !S.isAltShuffle(); - }))) && - !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) && - !isSplat(Gathers)) { - InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy); - SetVector VectorizedLoads; - SmallVector> VectorizedStarts; - SmallVector ScatterVectorized; - unsigned StartIdx = 0; - unsigned VF = VL.size() / 2; - for (; VF >= MinVF; VF /= 2) { - for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; - Cnt += VF) { - ArrayRef Slice = VL.slice(Cnt, VF); - if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) { - InstructionsState SliceS = getSameOpcode(Slice, *R.TLI); - if (SliceS.getOpcode() != Instruction::Load || - SliceS.isAltShuffle()) - continue; - } - if (!VectorizedLoads.count(Slice.front()) && - !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { - SmallVector PointerOps; - OrdersType CurrentOrder; - LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(), - CurrentOrder, PointerOps); - switch (LS) { - case LoadsState::Vectorize: - case LoadsState::ScatterVectorize: - case LoadsState::StridedVectorize: - // Mark the vectorized loads so that we don't vectorize them - // again. - // TODO: better handling of loads with reorders. 
- if (((LS == LoadsState::Vectorize || - LS == LoadsState::StridedVectorize) && - CurrentOrder.empty()) || - (LS == LoadsState::StridedVectorize && - isReverseOrder(CurrentOrder))) - VectorizedStarts.emplace_back(Cnt, LS); - else - ScatterVectorized.push_back(Cnt); - VectorizedLoads.insert(Slice.begin(), Slice.end()); - // If we vectorized initial block, no need to try to vectorize - // it again. - if (Cnt == StartIdx) - StartIdx += VF; - break; - case LoadsState::Gather: - break; - } - } - } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= VL.size()) - break; - // Found vectorizable parts - exit. - if (!VectorizedLoads.empty()) - break; - } - if (!VectorizedLoads.empty()) { - unsigned NumParts = TTI.getNumberOfParts(VecTy); - bool NeedInsertSubvectorAnalysis = - !NumParts || (VL.size() / VF) > NumParts; - // Get the cost for gathered loads. - for (unsigned I = 0, End = VL.size(); I < End; I += VF) { - if (VectorizedLoads.contains(VL[I])) - continue; - GatherCost += - getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root); - } - // Exclude potentially vectorized loads from list of gathered - // scalars. - Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType())); - // The cost for vectorized loads. - InstructionCost ScalarsCost = 0; - for (Value *V : VectorizedLoads) { - auto *LI = cast(V); - ScalarsCost += - TTI.getMemoryOpCost(Instruction::Load, LI->getType(), - LI->getAlign(), LI->getPointerAddressSpace(), - CostKind, TTI::OperandValueInfo(), LI); - } - auto *LoadTy = getWidenedType(VL.front()->getType(), VF); - for (const std::pair &P : VectorizedStarts) { - auto *LI = cast(VL[P.first]); - Align Alignment = LI->getAlign(); - GatherCost += - P.second == LoadsState::Vectorize - ? 
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, - LI->getPointerAddressSpace(), CostKind, - TTI::OperandValueInfo(), LI) - : TTI.getStridedMemoryOpCost( - Instruction::Load, LoadTy, LI->getPointerOperand(), - /*VariableMask=*/false, Alignment, CostKind, LI); - // Add external uses costs. - for (auto [Idx, V] : enumerate(VL.slice( - P.first, std::min(VL.size() - P.first, VF)))) - if (!R.areAllUsersVectorized(cast(V))) - GatherCost += TTI.getVectorInstrCost(Instruction::ExtractElement, - LoadTy, CostKind, Idx); - // Estimate GEP cost. - SmallVector PointerOps(VF); - for (auto [I, V] : enumerate(VL.slice(P.first, VF))) - PointerOps[I] = cast(V)->getPointerOperand(); - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, PointerOps, LI->getPointerOperand(), - Instruction::Load, CostKind, LI->getType(), LoadTy); - GatherCost += VectorGEPCost - ScalarGEPCost; - } - for (unsigned P : ScatterVectorized) { - auto *LI0 = cast(VL[P]); - ArrayRef Slice = VL.slice(P, VF); - Align CommonAlignment = computeCommonAlignment(Slice); - GatherCost += TTI.getGatherScatterOpCost( - Instruction::Load, LoadTy, LI0->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, CostKind, LI0); - // Estimate GEP cost. - SmallVector PointerOps(VF); - for (auto [I, V] : enumerate(Slice)) - PointerOps[I] = cast(V)->getPointerOperand(); - OrdersType Order; - if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE, - Order)) { - // TODO: improve checks if GEPs can be vectorized. 
- Value *Ptr0 = PointerOps.front(); - Type *ScalarTy = Ptr0->getType(); - auto *VecTy = getWidenedType(ScalarTy, VF); - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr, - CostKind, ScalarTy, VecTy); - GatherCost += VectorGEPCost - ScalarGEPCost; - if (!Order.empty()) { - SmallVector Mask; - inversePermutation(Order, Mask); - GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, - VecTy, Mask, CostKind); - } - } else { - GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true, - PointerOps.front()->getType()); - } - } - if (NeedInsertSubvectorAnalysis) { - // Add the cost for the subvectors insert. - SmallVector ShuffleMask(VL.size()); - for (unsigned I = VF, E = VL.size(); I < E; I += VF) { - for (unsigned Idx : seq(0, E)) - ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx; - GatherCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, - ShuffleMask, CostKind, I, LoadTy); - } - } - GatherCost -= ScalarsCost; - } - GatherCost = std::min(BaseCost, GatherCost); - } else if (!Root && isSplat(VL)) { + if (!Root && isSplat(VL)) { // Found the broadcasting of the single scalar, calculate the cost as // the broadcast. const auto *It = find_if_not(VL, IsaPred); @@ -9389,7 +9268,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { InstructionCost createFreeze(InstructionCost Cost) { return Cost; } /// Finalize emission of the shuffles. 
InstructionCost - finalize(ArrayRef ExtMask, unsigned VF = 0, + finalize(ArrayRef ExtMask, + ArrayRef> SubVectors, + unsigned VF = 0, function_ref &)> Action = {}) { IsFinalized = true; if (Action) { @@ -9407,6 +9288,29 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { Action(V, CommonMask); InVectors.front() = V; } + if (!SubVectors.empty()) { + const PointerUnion &Vec = InVectors.front(); + if (InVectors.size() == 2) + Cost += createShuffle(Vec, InVectors.back(), CommonMask); + else + Cost += createShuffle(Vec, nullptr, CommonMask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != PoisonMaskElem) + CommonMask[Idx] = Idx; + for (const auto [E, Idx] : SubVectors) { + Cost += ::getShuffleCost( + TTI, TTI::SK_InsertSubvector, + FixedVectorType::get(ScalarTy, CommonMask.size()), std::nullopt, + CostKind, Idx, + FixedVectorType::get(ScalarTy, E->getVectorFactor())); + if (!CommonMask.empty()) { + std::iota(std::next(CommonMask.begin(), Idx), + std::next(CommonMask.begin(), Idx + E->getVectorFactor()), + Idx); + } + } + } + ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true); if (CommonMask.empty()) { assert(InVectors.size() == 1 && "Expected only one vector with no mask"); @@ -12504,7 +12408,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { /// \param Action the action (if any) to be performed before final applying of /// the \p ExtMask mask. 
Value * - finalize(ArrayRef ExtMask, unsigned VF = 0, + finalize(ArrayRef ExtMask, + ArrayRef> SubVectors, + unsigned VF = 0, function_ref &)> Action = {}) { IsFinalized = true; SmallVector NewExtMask(ExtMask); @@ -12538,6 +12444,29 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { Action(Vec, CommonMask); InVectors.front() = Vec; } + if (!SubVectors.empty()) { + Value *Vec = InVectors.front(); + if (InVectors.size() == 2) { + Vec = createShuffle(Vec, InVectors.back(), CommonMask); + InVectors.pop_back(); + } else { + Vec = createShuffle(Vec, nullptr, CommonMask); + } + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != PoisonMaskElem) + CommonMask[Idx] = Idx; + for (const auto [E, Idx] : SubVectors) { + Vec = Builder.CreateInsertVector( + Vec->getType(), Vec, E->VectorizedValue, Builder.getInt64(Idx)); + if (!CommonMask.empty()) { + std::iota(std::next(CommonMask.begin(), Idx), + std::next(CommonMask.begin(), Idx + E->getVectorFactor()), + Idx); + } + } + InVectors.front() = Vec; + } + if (!ExtMask.empty()) { if (CommonMask.empty()) { CommonMask.assign(ExtMask.begin(), ExtMask.end()); @@ -12616,7 +12545,14 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, : ScalarTy, Builder, *this); ShuffleBuilder.add(V, Mask); - return ShuffleBuilder.finalize(std::nullopt); + SmallVector> SubVectors( + E->CombinedEntriesWithIndices.size()); + transform(E->CombinedEntriesWithIndices, SubVectors.begin(), + [&](const auto &P) { + return std::make_pair(VectorizableTree[P.first].get(), + P.second); + }); + return ShuffleBuilder.finalize(std::nullopt, SubVectors); }; Value *V = vectorizeTree(VE, PostponedPHIs); if (VF * getNumElements(VL[0]->getType()) != @@ -12699,6 +12635,17 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, SmallVector ReuseShuffleIndices(E->ReuseShuffleIndices.begin(), E->ReuseShuffleIndices.end()); SmallVector GatheredScalars(E->Scalars.begin(), 
E->Scalars.end()); + // Clear values, to be replaced by insertvector instructions. + for (const auto [EIdx, Idx] : E->CombinedEntriesWithIndices) + for_each(MutableArrayRef(GatheredScalars) + .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()), + [&](Value *&V) { V = PoisonValue::get(V->getType()); }); + SmallVector> SubVectors( + E->CombinedEntriesWithIndices.size()); + transform(E->CombinedEntriesWithIndices, SubVectors.begin(), + [&](const auto &P) { + return std::make_pair(VectorizableTree[P.first].get(), P.second); + }); // Build a mask out of the reorder indices and reorder scalars per this // mask. SmallVector ReorderMask; @@ -12836,7 +12783,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } } ShuffleBuilder.add(*FrontTE, Mask); - Res = ShuffleBuilder.finalize(E->getCommonMask()); + Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors); return Res; } if (!Resized) { @@ -13093,10 +13040,10 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, (IsSingleShuffle && ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) && isa(V)); })) - Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); else Res = ShuffleBuilder.finalize( - E->ReuseShuffleIndices, E->Scalars.size(), + E->ReuseShuffleIndices, SubVectors, E->Scalars.size(), [&](Value *&Vec, SmallVectorImpl &Mask) { TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false); Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec); @@ -13107,7 +13054,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true); Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size()); ShuffleBuilder.add(BV, ReuseMask); - Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); } else { // Gather all constants. 
SmallVector Mask(GatheredScalars.size(), PoisonMaskElem); @@ -13117,7 +13064,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } Value *BV = ShuffleBuilder.gather(GatheredScalars); ShuffleBuilder.add(BV, Mask); - Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); } if (NeedFreeze) @@ -13126,6 +13073,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) { + for (const auto [EIdx, _] : E->CombinedEntriesWithIndices) + (void)vectorizeTree(VectorizableTree[EIdx].get(), /*PostponedPHIs=*/false); return processBuildVector(E, ScalarTy, Builder, *this); } @@ -13177,7 +13126,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } else { ShuffleBuilder.addOrdered(V, E->ReorderIndices); } - return ShuffleBuilder.finalize(E->ReuseShuffleIndices); + SmallVector> SubVectors( + E->CombinedEntriesWithIndices.size()); + transform( + E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) { + return std::make_pair(VectorizableTree[P.first].get(), P.second); + }); + return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); }; assert(!E->isGather() && "Unhandled state"); @@ -14580,7 +14535,7 @@ Value *BoUpSLP::vectorizeTree( ShuffleBuilder.add(V1, CombinedMask1); if (V2) ShuffleBuilder.add(V2, CombinedMask2); - return ShuffleBuilder.finalize(std::nullopt); + return ShuffleBuilder.finalize(std::nullopt, std::nullopt); }; auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef Mask, @@ -14718,7 +14673,14 @@ Value *BoUpSLP::vectorizeTree( // Clear up reduction references, if any. 
if (UserIgnoreList) { for (Instruction *I : RemovedInsts) { - if (getTreeEntry(I)->Idx != 0) + const TreeEntry *IE = getTreeEntry(I); + if (IE->Idx != 0 && + !(VectorizableTree.front()->isGather() && isa(I) && + !IE->UserTreeIndices.empty() && + any_of(IE->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.UserTE == VectorizableTree.front().get() && + EI.EdgeIdx == UINT_MAX; + }))) continue; SmallVector LogicalOpSelects; I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) { diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll index 22511c018dca2d..2121775224098e 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll @@ -18,62 +18,62 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[IP2]] to i64 ; CHECK-NEXT: [[RRRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[RDD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[RDD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[RRRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[TBAA0]] -; 
CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[RDD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[RDD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[RRRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[RDD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[RDD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[RRRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_2]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_2]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[TBAA0]] 
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> -; CHECK-NEXT: 
[[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = 
shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], -; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]] +; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]] ; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll index c1cef6ff3d10b4..91c8db14a45aa1 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ 
-169,11 +169,12 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[T11]] to i64 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]] ; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T10]], i64 2 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[T12]], i64 3 -; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP13]], [[SUM_032]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i64 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T12]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP14]], [[SUM_032]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll index d79aed89b0be73..5b878108af59af 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -340,12 +340,12 @@ entry: define i16 @reduce_blockstrided4(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, i32 noundef %stride) { ; CHECK-LABEL: @reduce_blockstrided4( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = 
load <4 x i16>, ptr [[X:%.*]], align 2 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y:%.*]], align 2 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]] @@ -416,31 +416,31 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load 
<4 x i8>, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP12]], <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = 
shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = mul nuw nsw <16 x i32> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = mul nuw nsw <16 x i32> [[TMP11]], [[TMP19]] ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP20]]) ; CHECK-NEXT: ret i32 [[TMP21]] ; @@ -677,63 +677,63 @@ entry: define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, ptr nocapture noundef writeonly %z, i32 noundef %stride) { ; CHECK-LABEL: @store_blockstrided3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 8 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i64 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[STRIDE:%.*]], 1 ; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1 ; CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[MUL]], 1 -; CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64 -; CHECK-NEXT: 
[[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM15]] +; CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL]], 2 +; CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[ADD18]] to i64 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM19]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 ; CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3 ; CHECK-NEXT: [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64 ; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM23]] -; CHECK-NEXT: [[ADD30:%.*]] = add nsw i32 [[MUL21]], 2 -; CHECK-NEXT: [[IDXPROM31:%.*]] = sext i32 [[ADD30]] to i64 -; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM31]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX32]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[Y:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 8 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1 +; CHECK-NEXT: [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64 +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4 -; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]] +; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM19]] 
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX56]], align 4 ; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]] -; CHECK-NEXT: [[ARRAYIDX68:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM31]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX68]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4 +; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]] ; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i8, ptr [[Z:%.*]], i64 4 -; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP6]], [[TMP1]] +; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]] ; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 24 -; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP5]], [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP7]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[X]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[Y]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]] ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> -; CHECK-NEXT: [[MUL81:%.*]] = mul nsw i32 [[TMP8]], [[TMP3]] -; CHECK-NEXT: [[ARRAYIDX82:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 32 -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX16]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4 +; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 28 +; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX48]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], 
[[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[ARRAYIDX90:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 40 -; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX24]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX60]], align 4 +; CHECK-NEXT: [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]] +; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 44 +; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 36 +; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4 ; CHECK-NEXT: store i32 [[MUL73]], ptr [[Z]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4 -; CHECK-NEXT: store i32 [[MUL81]], ptr [[ARRAYIDX82]], align 4 -; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX76]], align 4 +; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX84]], align 4 +; CHECK-NEXT: store i32 [[MUL85]], ptr [[ARRAYIDX76]], align 4 +; CHECK-NEXT: store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4 ; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[ARRAYIDX90]], align 4 -; CHECK-NEXT: [[MUL91:%.*]] = mul nsw i32 [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 36 -; CHECK-NEXT: store i32 [[MUL91]], ptr [[ARRAYIDX92]], align 4 +; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[ARRAYIDX92]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -833,12 +833,12 @@ entry: define void @store_blockstrided4(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, i32 noundef %stride, ptr %dst0) { ; CHECK-LABEL: @store_blockstrided4( ; CHECK-NEXT: entry: -; 
CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X:%.*]], align 2 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y:%.*]], align 2 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]] @@ -1203,62 +1203,62 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 -; 
CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] 
= load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = 
shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> +; 
CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], -; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]] +; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]] ; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]] ; CHECK-NEXT: [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll index d89d6286703605..07411cacb36268 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll @@ -14,232 +14,161 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) { ; CHECK-LABEL: @straight( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST:%.*]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IDX_EXT]] -; CHECK-NEXT: 
[[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2 ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2 ; CHECK-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2 ; CHECK-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2 ; CHECK-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2 ; CHECK-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[P]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2 ; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 -; CHECK-NEXT: [[CONV_7_7:%.*]] = zext i16 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: 
[[TMP11:%.*]] = shufflevector <64 x i16> [[TMP9]], <64 x i16> [[TMP10]], <64 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <64 x i16> [[TMP11]], <64 x i16> [[TMP12]], <64 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <64 x i16> [[TMP13]], <64 x i16> [[TMP14]], <64 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <64 x i16> [[TMP15]], <64 x i16> [[TMP16]], <64 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <64 x i16> [[TMP17]], <64 x i16> [[TMP18]], <64 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <64 x i16> [[TMP19]], <64 x i16> [[TMP20]], <64 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <64 x i16> [[TMP21]], <64 x i16> [[TMP22]], <64 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = zext <64 x i16> [[TMP23]] to <64 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 -; CHECK-NEXT: [[CONV_6_7:%.*]] = zext i16 [[TMP25]] to i32 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 -; CHECK-NEXT: [[CONV_5_7:%.*]] = zext i16 [[TMP26]] to i32 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 -; CHECK-NEXT: [[CONV_4_7:%.*]] = zext i16 [[TMP27]] to i32 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 -; CHECK-NEXT: [[CONV_3_7:%.*]] = zext i16 [[TMP28]] to i32 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 -; CHECK-NEXT: [[CONV_2_7:%.*]] = zext i16 [[TMP29]] to i32 -; CHECK-NEXT: 
[[TMP30:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 -; CHECK-NEXT: [[CONV_1_7:%.*]] = zext i16 [[TMP30]] to i32 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 -; CHECK-NEXT: [[CONV_764:%.*]] = zext i16 [[TMP31]] to i32 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 -; CHECK-NEXT: [[CONV_7_6:%.*]] = zext i16 [[TMP32]] to i32 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 -; CHECK-NEXT: [[CONV_6_6:%.*]] = zext i16 [[TMP33]] to i32 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 -; CHECK-NEXT: [[CONV_5_6:%.*]] = zext i16 [[TMP34]] to i32 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 -; CHECK-NEXT: [[CONV_4_6:%.*]] = zext i16 [[TMP35]] to i32 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 -; CHECK-NEXT: [[CONV_3_6:%.*]] = zext i16 [[TMP36]] to i32 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 -; CHECK-NEXT: [[CONV_2_6:%.*]] = zext i16 [[TMP37]] to i32 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 -; CHECK-NEXT: [[CONV_1_6:%.*]] = zext i16 [[TMP38]] to i32 -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: [[CONV_660:%.*]] = zext i16 [[TMP39]] to i32 -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 -; CHECK-NEXT: [[CONV_7_5:%.*]] = zext i16 [[TMP40]] to i32 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 -; CHECK-NEXT: [[CONV_6_5:%.*]] = zext i16 [[TMP41]] to i32 -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 -; CHECK-NEXT: [[CONV_5_5:%.*]] = zext i16 [[TMP42]] to i32 -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 -; CHECK-NEXT: [[CONV_4_5:%.*]] = zext i16 [[TMP43]] to i32 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 -; CHECK-NEXT: [[CONV_3_5:%.*]] = zext i16 [[TMP44]] to i32 -; CHECK-NEXT: [[TMP45:%.*]] = 
extractelement <8 x i16> [[TMP5]], i32 2 -; CHECK-NEXT: [[CONV_2_5:%.*]] = zext i16 [[TMP45]] to i32 -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 -; CHECK-NEXT: [[CONV_1_5:%.*]] = zext i16 [[TMP46]] to i32 -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 -; CHECK-NEXT: [[CONV_556:%.*]] = zext i16 [[TMP47]] to i32 -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 -; CHECK-NEXT: [[CONV_7_4:%.*]] = zext i16 [[TMP48]] to i32 -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 -; CHECK-NEXT: [[CONV_6_4:%.*]] = zext i16 [[TMP49]] to i32 -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 -; CHECK-NEXT: [[CONV_5_4:%.*]] = zext i16 [[TMP50]] to i32 -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 -; CHECK-NEXT: [[CONV_4_4:%.*]] = zext i16 [[TMP51]] to i32 -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: [[CONV_3_4:%.*]] = zext i16 [[TMP52]] to i32 -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 -; CHECK-NEXT: [[CONV_2_4:%.*]] = zext i16 [[TMP53]] to i32 -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 -; CHECK-NEXT: [[CONV_1_4:%.*]] = zext i16 [[TMP54]] to i32 -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 -; CHECK-NEXT: [[CONV_452:%.*]] = zext i16 [[TMP55]] to i32 -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 -; CHECK-NEXT: [[CONV_7_3:%.*]] = zext i16 [[TMP56]] to i32 -; CHECK-NEXT: [[TMP57:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 -; CHECK-NEXT: [[CONV_6_3:%.*]] = zext i16 [[TMP57]] to i32 -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 -; CHECK-NEXT: [[CONV_5_3:%.*]] = zext i16 [[TMP58]] to i32 -; CHECK-NEXT: [[TMP59:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 -; CHECK-NEXT: [[CONV_4_3:%.*]] = zext i16 [[TMP59]] to i32 -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <8 x 
i16> [[TMP3]], i32 3 -; CHECK-NEXT: [[CONV_3_3:%.*]] = zext i16 [[TMP60]] to i32 -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 -; CHECK-NEXT: [[CONV_2_3:%.*]] = zext i16 [[TMP61]] to i32 -; CHECK-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 -; CHECK-NEXT: [[CONV_1_3:%.*]] = zext i16 [[TMP62]] to i32 -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 -; CHECK-NEXT: [[CONV_348:%.*]] = zext i16 [[TMP63]] to i32 -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 -; CHECK-NEXT: [[CONV_7_2:%.*]] = zext i16 [[TMP64]] to i32 -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 -; CHECK-NEXT: [[CONV_6_2:%.*]] = zext i16 [[TMP65]] to i32 -; CHECK-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 -; CHECK-NEXT: [[CONV_5_2:%.*]] = zext i16 [[TMP66]] to i32 -; CHECK-NEXT: [[TMP67:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 -; CHECK-NEXT: [[CONV_4_2:%.*]] = zext i16 [[TMP67]] to i32 -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 -; CHECK-NEXT: [[CONV_3_2:%.*]] = zext i16 [[TMP68]] to i32 -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 -; CHECK-NEXT: [[CONV_2_2:%.*]] = zext i16 [[TMP69]] to i32 -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 -; CHECK-NEXT: [[CONV_1_2:%.*]] = zext i16 [[TMP70]] to i32 -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 -; CHECK-NEXT: [[CONV_244:%.*]] = zext i16 [[TMP71]] to i32 -; CHECK-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 -; CHECK-NEXT: [[CONV_7_1:%.*]] = zext i16 [[TMP72]] to i32 -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 -; CHECK-NEXT: [[CONV_6_1:%.*]] = zext i16 [[TMP73]] to i32 -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 -; CHECK-NEXT: [[CONV_5_1:%.*]] = zext i16 [[TMP74]] to i32 -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <8 x i16> [[TMP1]], i32 
4 -; CHECK-NEXT: [[CONV_4_1:%.*]] = zext i16 [[TMP75]] to i32 -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 -; CHECK-NEXT: [[CONV_3_1:%.*]] = zext i16 [[TMP76]] to i32 -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 -; CHECK-NEXT: [[CONV_2_1:%.*]] = zext i16 [[TMP77]] to i32 -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 -; CHECK-NEXT: [[CONV_1_1:%.*]] = zext i16 [[TMP78]] to i32 -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 -; CHECK-NEXT: [[CONV_140:%.*]] = zext i16 [[TMP79]] to i32 -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7 -; CHECK-NEXT: [[CONV_7:%.*]] = zext i16 [[TMP80]] to i32 -; CHECK-NEXT: [[TMP81:%.*]] = extractelement <8 x i16> [[TMP0]], i32 6 -; CHECK-NEXT: [[CONV_6:%.*]] = zext i16 [[TMP81]] to i32 -; CHECK-NEXT: [[TMP82:%.*]] = extractelement <8 x i16> [[TMP0]], i32 5 -; CHECK-NEXT: [[CONV_5:%.*]] = zext i16 [[TMP82]] to i32 -; CHECK-NEXT: [[TMP83:%.*]] = extractelement <8 x i16> [[TMP0]], i32 4 -; CHECK-NEXT: [[CONV_4:%.*]] = zext i16 [[TMP83]] to i32 -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <8 x i16> [[TMP0]], i32 3 -; CHECK-NEXT: [[CONV_3:%.*]] = zext i16 [[TMP84]] to i32 -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2 -; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP85]] to i32 -; CHECK-NEXT: [[TMP86:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0 -; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP86]] to i32 -; CHECK-NEXT: [[TMP87:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i16 [[TMP87]] to i32 -; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]] -; CHECK-NEXT: [[TMP88:%.*]] = mul nuw nsw <64 x i32> [[TMP24]], [[TMP24]] -; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]] -; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]] -; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV_4]] -; CHECK-NEXT: 
[[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]] -; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]] -; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]] -; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[CONV_140]] -; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]] -; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]] -; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]] -; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]] -; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]] -; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]] -; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]] -; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[CONV_244]] -; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]] -; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]] -; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]] -; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]] -; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]] -; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[CONV_6_2]] -; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]] -; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[CONV_348]] -; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]] -; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]] -; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]] -; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]] -; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]] -; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]] -; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]] -; 
CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[CONV_452]] -; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]] -; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]] -; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]] -; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]] -; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]] -; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]] -; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]] -; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[CONV_556]] -; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]] -; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[CONV_2_5]] -; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]] -; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[CONV_4_5]] -; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]] -; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]] -; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]] -; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[CONV_660]] -; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[CONV_1_6]] -; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]] -; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]] -; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]] -; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]] -; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]] -; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]] -; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[CONV_764]] -; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[CONV_1_7]] -; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 
[[ADD_1_7]], [[CONV_2_7]] -; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[CONV_3_7]] -; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[CONV_4_7]] -; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[CONV_5_7]] -; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[CONV_6_7]] -; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[CONV_7_7]] -; CHECK-NEXT: [[TMP89:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP88]]) +; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> poison, <8 x i16> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP8]], <8 x i16> [[TMP1]], i64 8) +; CHECK-NEXT: [[TMP10:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP9]], <8 x i16> [[TMP2]], i64 16) +; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP10]], <8 x i16> [[TMP3]], i64 24) +; CHECK-NEXT: [[TMP12:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP11]], <8 x i16> [[TMP4]], i64 32) +; CHECK-NEXT: [[TMP13:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP12]], <8 x i16> [[TMP5]], i64 40) +; CHECK-NEXT: [[TMP14:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP13]], <8 x i16> [[TMP6]], i64 48) +; CHECK-NEXT: [[TMP15:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP14]], <8 x i16> [[TMP7]], i64 56) +; CHECK-NEXT: [[TMP16:%.*]] = zext <64 x i16> [[TMP15]] to <64 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <64 x i32> [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <64 x i32> [[TMP16]], i32 1 +; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = mul nuw nsw <64 x i32> [[TMP16]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <64 x i32> [[TMP16]], i32 2 +; CHECK-NEXT: [[ADD_2:%.*]] = add nuw 
nsw i32 [[ADD_1]], [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <64 x i32> [[TMP16]], i32 3 +; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i32> [[TMP16]], i32 4 +; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP22]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <64 x i32> [[TMP16]], i32 5 +; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP23]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <64 x i32> [[TMP16]], i32 6 +; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP24]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <64 x i32> [[TMP16]], i32 7 +; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP25]] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <64 x i32> [[TMP16]], i32 8 +; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP26]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <64 x i32> [[TMP16]], i32 9 +; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP27]] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <64 x i32> [[TMP16]], i32 10 +; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP28]] +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <64 x i32> [[TMP16]], i32 11 +; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP29]] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <64 x i32> [[TMP16]], i32 12 +; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP30]] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <64 x i32> [[TMP16]], i32 13 +; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP31]] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <64 x i32> [[TMP16]], i32 14 +; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP32]] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <64 x i32> [[TMP16]], i32 15 +; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP33]] +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <64 x i32> [[TMP16]], i32 16 +; 
CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP34]] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <64 x i32> [[TMP16]], i32 17 +; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP35]] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <64 x i32> [[TMP16]], i32 18 +; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP36]] +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <64 x i32> [[TMP16]], i32 19 +; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP37]] +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <64 x i32> [[TMP16]], i32 20 +; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP38]] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <64 x i32> [[TMP16]], i32 21 +; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP39]] +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <64 x i32> [[TMP16]], i32 22 +; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <64 x i32> [[TMP16]], i32 23 +; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP41]] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <64 x i32> [[TMP16]], i32 24 +; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP42]] +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <64 x i32> [[TMP16]], i32 25 +; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP43]] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <64 x i32> [[TMP16]], i32 26 +; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP44]] +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <64 x i32> [[TMP16]], i32 27 +; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP45]] +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <64 x i32> [[TMP16]], i32 28 +; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP46]] +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <64 x i32> [[TMP16]], i32 29 +; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP47]] +; CHECK-NEXT: 
[[TMP48:%.*]] = extractelement <64 x i32> [[TMP16]], i32 30 +; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP48]] +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <64 x i32> [[TMP16]], i32 31 +; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP49]] +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <64 x i32> [[TMP16]], i32 32 +; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP50]] +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <64 x i32> [[TMP16]], i32 33 +; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP51]] +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <64 x i32> [[TMP16]], i32 34 +; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP52]] +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <64 x i32> [[TMP16]], i32 35 +; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP53]] +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <64 x i32> [[TMP16]], i32 36 +; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP54]] +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <64 x i32> [[TMP16]], i32 37 +; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP55]] +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <64 x i32> [[TMP16]], i32 38 +; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP56]] +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <64 x i32> [[TMP16]], i32 39 +; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP57]] +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <64 x i32> [[TMP16]], i32 40 +; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP58]] +; CHECK-NEXT: [[TMP59:%.*]] = extractelement <64 x i32> [[TMP16]], i32 41 +; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP59]] +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <64 x i32> [[TMP16]], i32 42 +; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP60]] +; CHECK-NEXT: [[TMP61:%.*]] = extractelement <64 x i32> [[TMP16]], i32 43 +; CHECK-NEXT: 
[[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP61]] +; CHECK-NEXT: [[TMP62:%.*]] = extractelement <64 x i32> [[TMP16]], i32 44 +; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP62]] +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <64 x i32> [[TMP16]], i32 45 +; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP63]] +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <64 x i32> [[TMP16]], i32 46 +; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP64]] +; CHECK-NEXT: [[TMP65:%.*]] = extractelement <64 x i32> [[TMP16]], i32 47 +; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP65]] +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <64 x i32> [[TMP16]], i32 48 +; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP66]] +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <64 x i32> [[TMP16]], i32 49 +; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP67]] +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <64 x i32> [[TMP16]], i32 50 +; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP68]] +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <64 x i32> [[TMP16]], i32 51 +; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP69]] +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <64 x i32> [[TMP16]], i32 52 +; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP70]] +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <64 x i32> [[TMP16]], i32 53 +; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP71]] +; CHECK-NEXT: [[TMP72:%.*]] = extractelement <64 x i32> [[TMP16]], i32 54 +; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP72]] +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <64 x i32> [[TMP16]], i32 55 +; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP73]] +; CHECK-NEXT: [[TMP74:%.*]] = extractelement <64 x i32> [[TMP16]], i32 56 +; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP74]] +; CHECK-NEXT: [[TMP75:%.*]] = 
extractelement <64 x i32> [[TMP16]], i32 57 +; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP75]] +; CHECK-NEXT: [[TMP76:%.*]] = extractelement <64 x i32> [[TMP16]], i32 58 +; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP76]] +; CHECK-NEXT: [[TMP77:%.*]] = extractelement <64 x i32> [[TMP16]], i32 59 +; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP77]] +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <64 x i32> [[TMP16]], i32 60 +; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP78]] +; CHECK-NEXT: [[TMP79:%.*]] = extractelement <64 x i32> [[TMP16]], i32 61 +; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP79]] +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <64 x i32> [[TMP16]], i32 62 +; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP80]] +; CHECK-NEXT: [[TMP81:%.*]] = extractelement <64 x i32> [[TMP16]], i32 63 +; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP81]] +; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP19]]) ; CHECK-NEXT: [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64 -; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[TMP89]] to i64 +; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[TMP82]] to i64 ; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32 ; CHECK-NEXT: [[ADD17:%.*]] = or i64 [[SHL]], [[CONV15]] ; CHECK-NEXT: ret i64 [[ADD17]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll index 6f6b66255a4340..8093285ad8717c 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll @@ -3,39 +3,63 @@ ; Test case reported on D134605 where the vectorization was causing a slowdown due to an underestimation in the cost of the extractions. +; NOTE: cost of shuffle <4 x float>, <4 x float>, <2 x i32> is 12! 
+ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: +; CHECK-NEXT: [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[ARG]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> , [[TMP2]] +; CHECK-NEXT: [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00 +; CHECK-NEXT: [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00 ; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[VAL12]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float 0.000000e+00, i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[TMP5]], +; CHECK-NEXT: [[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00 +; CHECK-NEXT: [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00 +; CHECK-NEXT: [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00 +; CHECK-NEXT: [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 ; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]] ; CHECK: bb18: -; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP6]], [[BB:%.*]] ] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP6]], i32 2 -; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP8]], 2.000000e+00 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i32 3 -; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP9]], 3.000000e+00 +; CHECK-NEXT: [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ] +; CHECK-NEXT: [[VAL20:%.*]] = 
phi float [ [[VAL15]], [[BB]] ] +; CHECK-NEXT: [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ] +; CHECK-NEXT: [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ] +; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00 +; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00 ; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]] ; CHECK: bb25: -; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x float> [ [[TMP7]], [[BB18]] ] +; CHECK-NEXT: [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ] +; CHECK-NEXT: [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ] +; CHECK-NEXT: [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ] +; CHECK-NEXT: [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ] ; CHECK-NEXT: br label [[BB30:%.*]] ; CHECK: bb30: ; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] ; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = uitofp <4 x i8> [[TMP11]] to <4 x float> -; CHECK-NEXT: [[TMP13:%.*]] = fsub fast <4 x float> [[TMP12]], [[TMP3]] -; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x float> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP14]]) +; CHECK-NEXT: [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1 +; CHECK-NEXT: [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float +; CHECK-NEXT: [[VAL35:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 1 +; CHECK-NEXT: [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1 +; CHECK-NEXT: [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float +; CHECK-NEXT: [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2 +; CHECK-NEXT: [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1 +; CHECK-NEXT: [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float +; CHECK-NEXT: [[VAL41:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 3 +; CHECK-NEXT: [[VAL42:%.*]] 
= load i8, ptr [[VAL41]], align 1 +; CHECK-NEXT: [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float +; CHECK-NEXT: [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]] +; CHECK-NEXT: [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]] +; CHECK-NEXT: [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]] +; CHECK-NEXT: [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]] +; CHECK-NEXT: [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]] +; CHECK-NEXT: [[VAL49:%.*]] = fmul fast float [[VAL45]], [[VAL27]] +; CHECK-NEXT: [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]] +; CHECK-NEXT: [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]] +; CHECK-NEXT: [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]] +; CHECK-NEXT: [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]] +; CHECK-NEXT: [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]] ; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]]) -; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP15]]) +; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]]) ; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]]) ; CHECK-NEXT: br i1 [[ARG8:%.*]], label [[BB30]], label [[BB57]] ; CHECK: bb57: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll index e39cd8aaa111b1..4f881823746228 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll @@ -7,16 +7,13 @@ define void @p(double %0) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> , double [[TMP0]], i32 3 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[MUL16_150_1_I:%.*]] = fmul double 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> 
zeroinitializer, [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP14]], double [[MUL16_150_1_I]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> , <2 x double> [[TMP7]], i64 2) ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[TMP8]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x double> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fptosi <4 x double> [[TMP12]] to <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index 95aa40f664c0ce..ff1d6253bec928 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -25,11 +25,11 @@ define void @s116_modified(ptr %a) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = 
shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP7]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP9]] ; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[A]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index b59659ca75eb24..f04c359b432b5e 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -241,12 +241,9 @@ entry: define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-LABEL: @select_uniform_ugt_16xi8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR:%.*]], align 1 -; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 8 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i8 8 ; CHECK-NEXT: [[L_8:%.*]] = load i8, ptr [[GEP_8]], align 1 ; CHECK-NEXT: [[CMP_8:%.*]] = icmp ugt i8 [[L_8]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 -; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 9 ; CHECK-NEXT: [[L_9:%.*]] = load i8, ptr [[GEP_9]], align 1 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 10 @@ -254,19 +251,28 @@ define void 
@select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 11 ; CHECK-NEXT: [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1 ; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 +; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[L_9]], i32 9 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L_10]], i32 10 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L_11]], i32 11 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt <16 x i8> [[TMP9]], -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP10]], <16 x i8> [[TMP9]], <16 x i8> [[TMP12]] -; CHECK-NEXT: store <16 x i8> [[TMP13]], ptr [[PTR]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP2]], i64 12) +; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = 
shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[L_9]], i32 9 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[L_10]], i32 10 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[L_11]], i32 11 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP16]], <16 x i8> [[TMP18]] +; CHECK-NEXT: store <16 x i8> [[TMP19]], ptr [[PTR]], align 2 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll index 94a55c435c8c39..cd79250e8fb6be 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll @@ -4,12 +4,11 @@ define void @test(ptr noalias %p, ptr %p1) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ptr noalias [[P:%.*]], ptr [[P1:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[P]], align 2 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[P]], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[GEP2]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> 
[[TMP3]], <4 x i16> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> poison, <2 x i16> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> [[TMP3]], <2 x i16> [[TMP2]], i64 2) ; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[P1]], align 2 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll index ff3d2c4c59394c..151b91184bf428 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll @@ -1013,22 +1013,20 @@ declare i32 @llvm.abs.i32(i32, i1) define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) { ; CHECK-LABEL: @stride_sum_abs_diff( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q:%.*]], align 4 -; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[STRIDE:%.*]] -; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[STRIDE]] +; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[STRIDE:%.*]] +; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 [[STRIDE]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> 
[[TMP4]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP11]], i1 true) -; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) -; CHECK-NEXT: ret i32 [[TMP13]] +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP5]], <2 x i32> [[TMP3]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP7]], <2 x i32> [[TMP4]], i64 2) +; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) +; CHECK-NEXT: ret i32 [[TMP11]] ; %x.0 = load i32, ptr %p %y.0 = load i32, ptr %q @@ -1068,12 +1066,11 @@ define i32 @reduce_sum_2arrays_a(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Q:%.*]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) -; CHECK-NEXT: ret i32 [[TMP6]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> 
@llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) +; CHECK-NEXT: ret i32 [[TMP5]] ; entry: %x.0 = load i8, ptr %p, align 1 @@ -1117,12 +1114,11 @@ define i32 @reduce_sum_2arrays_b(ptr noalias noundef %x, ptr noalias %y) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[X:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) -; CHECK-NEXT: ret i32 [[TMP6]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) +; CHECK-NEXT: ret i32 [[TMP5]] ; entry: %0 = load i8, ptr %x, align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll index 0fcbead65d0d66..413aedefe9b6ad 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll @@ -13,12 +13,11 @@ define void @foo() local_unnamed_addr { ; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr 
inbounds [4 x [4 x i32]], ptr @dct_luma, i64 0, i64 3, i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 2), align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[ADD277]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = ashr <4 x i32> [[TMP6]], -; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[ARRAYIDX372]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP2]], <2 x i32> [[TMP1]], i64 2) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ADD277]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = ashr <4 x i32> [[TMP5]], +; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[ARRAYIDX372]], align 4 ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll index f7bd2431a76054..96b498ced7d0f8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -392,16 +392,14 @@ define void @vec_shuff_reorder() #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fb, align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: 
[[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP7]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fsub <4 x float> [[TMP7]], [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP13]], ptr @fc, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP7]], <2 x float> [[TMP4]], i64 2) +; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP11]], ptr @fc, align 4 ; CHECK-NEXT: ret void ; %1 = load float, ptr @fb, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll index 3b03ca13ea65d0..87b1302e4cecf4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll @@ -6,30 +6,25 @@ define i1 @test(float %0, double %1) { ; CHECK-SAME: (float [[TMP0:%.*]], 
double [[TMP1:%.*]]) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> -; CHECK-NEXT: [[TMP5:%.*]] = fpext float 0.000000e+00 to double -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> zeroinitializer, [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x double> , double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP5]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = fmul <4 x double> [[TMP9]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP14]], <8 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP16]], <8 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = fsub <8 x double> [[TMP15]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = fmul <8 x double> [[TMP15]], [[TMP19]] -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x double> [[TMP20]], <8 x double> [[TMP21]], <8 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = fptrunc <8 x double> [[TMP22]] to <8 x float> -; CHECK-NEXT: [[TMP24:%.*]] = fmul <8 x float> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = fcmp oeq <8 x float> [[TMP24]], zeroinitializer -; CHECK-NEXT: 
[[TMP26:%.*]] = freeze <8 x i1> [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP26]]) -; CHECK-NEXT: ret i1 [[TMP27]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> , <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP11]], i64 0) +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4) +; CHECK-NEXT: [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float> +; CHECK-NEXT: [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]]) +; CHECK-NEXT: ret i1 [[TMP22]] ; %3 = fpext float %0 to double %4 = fpext float 0.000000e+00 to double diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll index d326c855a10912..6ff03acf85cdfd 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll @@ -4,20 +4,19 @@ define i64 @foo(i32 %tmp7) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[TMP7:%.*]], i32 2 -; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = sub i32 undef, 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[TMP24]], i32 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 0, i32 5 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP24]], i32 6 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP77:%.*]] = sub nsw <8 x i32> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP77]], <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP11]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7:%.*]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 undef, i32 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP3]], <2 x i32> zeroinitializer, i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x 
i32> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP10]], 0 ; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64 ; CHECK-NEXT: ret i64 [[TMP64]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll index d80d7b5ecd4e76..757d0b1708b6fb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll @@ -8,19 +8,18 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) { ; CHECK-SAME: ptr [[I7:%.*]], i32 [[TMP0:%.*]], i1 [[TOBOOL62_NOT:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[RC21:%.*]] = alloca [0 x [0 x %struct.rect]], i32 0, align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[RC21]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[X1:%.*]] = getelementptr i8, ptr [[RC21]], i64 4 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr [[X1]], align 4 +; CHECK-NEXT: [[Y0:%.*]] = getelementptr i8, ptr [[RC21]], i64 8 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[Y0]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[I7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = 
shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[RC21]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 2 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index fa022ad69af791..b0d9fea43a0e6c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -1016,15 +1016,13 @@ define i32 @maxi8_wrong_parent(i32) { ; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16 ; THRESH-NEXT: br label [[PP:%.*]] ; THRESH: pp: -; THRESH-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 -; THRESH-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 -; THRESH-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; THRESH-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; THRESH-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x 
i32> -; THRESH-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> -; THRESH-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> -; THRESH-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP9]]) -; THRESH-NEXT: ret i32 [[TMP10]] +; THRESH-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 +; THRESH-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16 +; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP4]], i64 4) +; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 0) +; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 2) +; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP7]]) +; THRESH-NEXT: ret i32 [[TMP8]] ; %2 = load i32, ptr @arr, align 16 %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll index 6c4572593027d6..54c950a0785020 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll @@ -4,14 +4,20 @@ define void @inst_size(ptr %a, <2 x i64> %b) { ; CHECK-LABEL: @inst_size( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[VAL:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[T41:%.*]] = icmp sgt i64 0, [[VAL]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMPL1:%.*]] = load i64, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 1 +; 
CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[PTR2]], align 4 +; CHECK-NEXT: [[PTR4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 3 +; CHECK-NEXT: [[TMPL4:%.*]] = load i64, ptr [[PTR4]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[B:%.*]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMPL1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP2]], <2 x i64> [[TMP0]], i64 2) +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[T45:%.*]] = icmp sgt i64 0, [[TMPL4]] ; CHECK-NEXT: br label [[BLOCK:%.*]] ; CHECK: block: -; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[T41]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ] +; CHECK-NEXT: [[PHI5:%.*]] = phi i1 [ [[T45]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x i1> [ [[TMP4]], [[ENTRY]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll index 47b42bc8f32a7d..813c5e7418b30e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll @@ -10,10 +10,10 @@ define void @foo() personality ptr @bar { ; CHECK: bb2.loopexit: ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[TMP7:%.*]], [[BB9:%.*]] ], [ poison, [[BB2_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[TMP8:%.*]], [[BB9:%.*]] ], [ poison, [[BB2_LOOPEXIT:%.*]] ] ; CHECK-NEXT: ret void ; CHECK: bb3: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], [[BB6:%.*]] ], [ poison, [[BB1:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP4:%.*]], [[BB6:%.*]] ], [ poison, [[BB1:%.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 0, i32 0, i32 poison) [ "deopt"() ] ; CHECK-NEXT: to label 
[[BB4:%.*]] unwind label [[BB10:%.*]] ; CHECK: bb4: @@ -21,29 +21,30 @@ define void @foo() personality ptr @bar { ; CHECK: bb5: ; CHECK-NEXT: br label [[BB7:%.*]] ; CHECK: bb6: -; CHECK-NEXT: [[TMP3]] = phi <2 x i32> [ , [[BB8:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ , [[BB8:%.*]] ] +; CHECK-NEXT: [[TMP4]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb7: ; CHECK-NEXT: [[LOCAL_5_84111:%.*]] = phi i32 [ poison, [[BB8]] ], [ poison, [[BB5]] ] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_5_84111]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 poison, i32 poison, i32 poison) [ "deopt"() ] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_5_84111]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 poison, i32 poison, i32 poison) [ "deopt"() ] ; CHECK-NEXT: to label [[BB8]] unwind label [[BB12:%.*]] ; CHECK: bb8: ; CHECK-NEXT: br i1 poison, label [[BB7]], label [[BB6]] ; CHECK: bb9: ; CHECK-NEXT: [[INDVARS_IV528799:%.*]] = phi i64 [ poison, [[BB10]] ], [ poison, [[BB12]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP8:%.*]], [[BB10]] ], [ [[TMP9:%.*]], [[BB12]] ] -; CHECK-NEXT: [[TMP7]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ [[TMP9:%.*]], [[BB10]] ], [ [[TMP10:%.*]], [[BB12]] ] +; CHECK-NEXT: [[TMP8]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP7]], i64 2) ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb10: -; CHECK-NEXT: [[TMP8]] = phi <2 x i32> [ [[TMP1]], [[BB3]] ] +; CHECK-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP1]], [[BB3]] ] ; CHECK-NEXT: [[LANDING_PAD68:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup ; CHECK-NEXT: br label [[BB9]] ; CHECK: bb11: ; CHECK-NEXT: ret void ; CHECK: bb12: -; CHECK-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP4]], [[BB7]] ] 
+; CHECK-NEXT: [[TMP10]] = phi <2 x i32> [ [[TMP5]], [[BB7]] ] ; CHECK-NEXT: [[LANDING_PAD149:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup ; CHECK-NEXT: br label [[BB9]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll index 96151e0bd6c418..7201583f3450e0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -144,8 +144,8 @@ define float @foo3(ptr nocapture readonly %A) #0 { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP14:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00 ; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] @@ -154,23 +154,25 @@ define float @foo3(ptr nocapture readonly %A) #0 { ; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP8]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP10]], -; CHECK-NEXT: [[TMP12]] = fadd <4 x float> 
[[TMP3]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP13]], 121 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP9]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 2) +; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x float> [[TMP12]], +; CHECK-NEXT: [[TMP14]] = fadd <4 x float> [[TMP3]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP15]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP12]], i32 0 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP12]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP12]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP12]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP17]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP14]], i32 0 +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP14]], i32 1 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP14]], i32 2 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP14]], i32 3 +; 
CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP19]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index 865d8178667167..12389f4a3dbf4a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -390,14 +390,15 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) { define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]]) -; CHECK-NEXT: ret i1 [[TMP8]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP2]], <4 x i32> [[X]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <8 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ult <8 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = freeze <8 x i1> [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP8]]) +; CHECK-NEXT: ret i1 [[TMP9]] ; %x0 = extractelement <4 x i32> %x, 
i32 0 %x1 = extractelement <4 x i32> %x, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll index 7de2cde45525ae..8aaa71ef47a8c9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll @@ -10,16 +10,7 @@ ; YAML-NEXT: - String: 'SLP vectorized with cost ' ; YAML-NEXT: - Cost: '-4' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '4' -; YAML-LABEL: --- !Passed -; YAML-NEXT: Pass: slp-vectorizer -; YAML-NEXT: Name: VectorizedList -; YAML-NEXT: Function: test -; YAML-NEXT: Args: -; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-2' -; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '2' +; YAML-NEXT: - TreeSize: '5' define <4 x float> @test(ptr %x, float %v, float %a) { ; CHECK-LABEL: define <4 x float> @test( @@ -28,9 +19,8 @@ define <4 x float> @test(ptr %x, float %v, float %a) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[A]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[V]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP1]], i64 2) ; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP3]], [[TMP7]] ; CHECK-NEXT: ret <4 x float> [[TMP8]] ; diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll index dadf5992ba288d..c01c44ff03c153 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -5,23 +5,25 @@ define void @test(i1 %c, ptr %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG:%.*]], align 8 -; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP5]], <2 x i64> [[TMP2]], i64 2) ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 ; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x 
i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP11]], <2 x i64> [[TMP8]], i64 2) ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: -; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i64> [ [[TMP5]], [[IF]] ], [ [[TMP10]], [[ELSE]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ] ; CHECK-NEXT: ret void ; br i1 %c, label %if, label %else diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll index d45054b6bebce7..207b2d45c335e0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll @@ -14,7 +14,21 @@ declare void @unknown() define void @test(ptr %a, ptr %b, ptr %c, ptr %d) { ; VECTOR_DBG-LABEL: @test( ; VECTOR_DBG-NEXT: entry: -; VECTOR_DBG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 +; VECTOR_DBG-NEXT: [[L0:%.*]] = load float, ptr [[A:%.*]], align 4 +; VECTOR_DBG-NEXT: [[A1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; VECTOR_DBG-NEXT: [[L1:%.*]] = load float, ptr [[A1]], align 4 +; VECTOR_DBG-NEXT: [[A2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3:![0-9]+]], 
!DIExpression(), [[META5:![0-9]+]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: [[B1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 +; VECTOR_DBG-NEXT: [[B2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; VECTOR_DBG-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A2]], align 4 ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() @@ -43,22 +57,22 @@ define void @test(ptr %a, ptr %b, ptr %c, ptr %d) { ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3:![0-9]+]], !DIExpression(), [[META5:![0-9]+]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: store <4 x float> [[TMP0]], ptr [[B:%.*]], align 4 +; VECTOR_DBG-NEXT: store float [[L0]], ptr [[B]], align 4 +; VECTOR_DBG-NEXT: store float [[L1]], ptr [[B1]], align 4 +; VECTOR_DBG-NEXT: store <2 x float> [[TMP0]], ptr 
[[B2]], align 4 ; VECTOR_DBG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 ; VECTOR_DBG-NEXT: store <4 x float> [[TMP1]], ptr [[D:%.*]], align 4 ; VECTOR_DBG-NEXT: ret void ; ; VECTOR_NODBG-LABEL: @test( ; VECTOR_NODBG-NEXT: entry: -; VECTOR_NODBG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 +; VECTOR_NODBG-NEXT: [[L0:%.*]] = load float, ptr [[A:%.*]], align 4 +; VECTOR_NODBG-NEXT: [[A1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; VECTOR_NODBG-NEXT: [[L1:%.*]] = load float, ptr [[A1]], align 4 +; VECTOR_NODBG-NEXT: [[A2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 +; VECTOR_NODBG-NEXT: [[B1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 +; VECTOR_NODBG-NEXT: [[B2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; VECTOR_NODBG-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A2]], align 4 ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() @@ -87,7 +101,9 @@ define void @test(ptr %a, ptr %b, ptr %c, ptr %d) { ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() -; VECTOR_NODBG-NEXT: store <4 x float> [[TMP0]], ptr [[B:%.*]], align 4 +; VECTOR_NODBG-NEXT: store float [[L0]], ptr [[B]], align 4 +; VECTOR_NODBG-NEXT: store float [[L1]], ptr [[B1]], align 4 +; VECTOR_NODBG-NEXT: store <2 x float> [[TMP0]], ptr [[B2]], align 4 ; VECTOR_NODBG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 ; VECTOR_NODBG-NEXT: store <4 x float> [[TMP1]], ptr [[D:%.*]], align 4 ; VECTOR_NODBG-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 6825f43b5a9eb4..6ca1f8119c1cf0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -14,22 +14,21 @@ define dso_local void 
@_Z4testP1S(ptr %p) local_unnamed_addr { ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4 ; CHECK-NEXT: [[I7:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12 -; CHECK-NEXT: [[I9:%.*]] = load i32, ptr [[ARRAYIDX27]], align 4 -; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 13 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX34]], align 4 +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14 +; CHECK-NEXT: [[I13:%.*]] = load i32, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 5 ; CHECK-NEXT: [[I15:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX27]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[I1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[I7]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[I9]], i32 4 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[I15]], i32 7 -; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[TMP10]], [[TMP2]] -; CHECK-NEXT: store <8 x i32> [[TMP11]], ptr [[P]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[I13]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 
x i32> [[TMP7]], i32 [[I15]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP8]], <2 x i32> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP2]] +; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[P]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -106,11 +105,10 @@ define dso_local void @test_unordered_splits(ptr nocapture %p) local_unnamed_add ; CHECK-NEXT: [[G20:%.*]] = getelementptr inbounds [16 x i32], ptr [[P2]], i32 0, i64 12 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[G10]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[G20]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP2]], <4 x i32> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -165,14 +163,11 @@ define dso_local void @test_cost_splits(ptr nocapture %p) local_unnamed_addr { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[G12]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[G20]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[G22]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; 
CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> -; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> poison, <2 x i32> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP4]], <2 x i32> [[TMP1]], i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 6) +; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll index eb3d395f4c6a6f..3eabed5882e58b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -160,9 +160,8 @@ define void @tiny_tree_not_fully_vectorizable2(ptr noalias nocapture %dst, ptr n ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP6]], ptr 
[[DST_ADDR_022]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP4]], <2 x float> [[TMP2]], i64 2) +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[DST_ADDR_022]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, ptr [[SRC_ADDR_021]], i64 [[I_023]] ; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, ptr [[DST_ADDR_022]], i64 [[I_023]] ; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll index 6ac6884ca5377f..e1b091cc6fcda7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll @@ -8,14 +8,14 @@ define void @test(ptr %a, ptr %b) { ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr [4 x float], ptr [[B:%.*]], i64 0, i64 3 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX120]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 3 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP6]], <2 x float> [[TMP1]], i64 0) ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]] ; 
CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP5]], zeroinitializer From 3c54aa14aa5f92ea2c96b85efbc7945ed55451e4 Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Thu, 22 Aug 2024 10:27:21 -0500 Subject: [PATCH 215/426] [Verifier] Make lrint and lround intrinsic cases concise. NFC (#105676) --- llvm/lib/IR/Verifier.cpp | 39 +++++++++------------------------------ 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index c095e47996ba17..ac754b5d9638d9 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5952,44 +5952,23 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { break; } case Intrinsic::lrint: - case Intrinsic::llrint: { - Type *ValTy = Call.getArgOperand(0)->getType(); - Type *ResultTy = Call.getType(); - Check( - ValTy->isFPOrFPVectorTy() && ResultTy->isIntOrIntVectorTy(), - "llvm.lrint, llvm.llrint: argument must be floating-point or vector " - "of floating-points, and result must be integer or vector of integers", - &Call); - Check(ValTy->isVectorTy() == ResultTy->isVectorTy(), - "llvm.lrint, llvm.llrint: argument and result disagree on vector use", - &Call); - if (ValTy->isVectorTy()) { - Check(cast(ValTy)->getElementCount() == - cast(ResultTy)->getElementCount(), - "llvm.lrint, llvm.llrint: argument must be same length as result", - &Call); - } - break; - } + case Intrinsic::llrint: case Intrinsic::lround: case Intrinsic::llround: { Type *ValTy = Call.getArgOperand(0)->getType(); Type *ResultTy = Call.getType(); auto *VTy = dyn_cast(ValTy); auto *RTy = dyn_cast(ResultTy); - Check( - ValTy->isFPOrFPVectorTy() && ResultTy->isIntOrIntVectorTy(), - "llvm.lround, llvm.llround: argument must be floating-point or vector " - "of floating-points, and result must be integer or vector of integers", - &Call); - Check( - ValTy->isVectorTy() == 
ResultTy->isVectorTy(), - "llvm.lround, llvm.llround: argument and result disagree on vector use", - &Call); + Check(ValTy->isFPOrFPVectorTy() && ResultTy->isIntOrIntVectorTy(), + ExpectedName + ": argument must be floating-point or vector " + "of floating-points, and result must be integer or " + "vector of integers", + &Call); + Check(ValTy->isVectorTy() == ResultTy->isVectorTy(), + ExpectedName + ": argument and result disagree on vector use", &Call); if (VTy) { Check(VTy->getElementCount() == RTy->getElementCount(), - "llvm.lround, llvm.llround: argument must be same length as result", - &Call); + ExpectedName + ": argument must be same length as result", &Call); } break; } From 9f418057dc73e4e5cb94a7cd671097275ffc29fc Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 22 Aug 2024 08:29:48 -0700 Subject: [PATCH 216/426] [lldb] Pick the correct architecutre when target and core file disagree (#105576) In f9f3316, Adrian fixed an issue where LLDB wouldn't update the target's architecture when the process reported a different triple that only differed in its sub-architecture. This unintentionally regressed core file debugging when the core file reports the base architecture (e.g. armv7) while the main binary knows the correct CPU subtype (e.g. armv7em). After the aforementioned change, we update the target architecture from armv7em to armv7. Fix the issue by trusting the target architecture over the ProcessMachCore process. 
rdar://133834304 --- .../Process/mach-core/ProcessMachCore.cpp | 15 +++-- lldb/test/Shell/Process/Inputs/a.out.yaml | 62 +++++++++++++++++++ lldb/test/Shell/Process/Inputs/corefile.yaml | 22 +++++++ .../Shell/Process/ProcessMachCoreArch.test | 8 +++ 4 files changed, 102 insertions(+), 5 deletions(-) create mode 100644 lldb/test/Shell/Process/Inputs/a.out.yaml create mode 100644 lldb/test/Shell/Process/Inputs/corefile.yaml create mode 100644 lldb/test/Shell/Process/ProcessMachCoreArch.test diff --git a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp index 930c707604bb38..348b18e38560a6 100644 --- a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp +++ b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp @@ -562,17 +562,22 @@ Status ProcessMachCore::DoLoadCore() { SetCanJIT(false); - // The corefile's architecture is our best starting point. - ArchSpec arch(m_core_module_sp->GetArchitecture()); - if (arch.IsValid()) - GetTarget().SetArchitecture(arch); - CreateMemoryRegions(); LoadBinariesAndSetDYLD(); CleanupMemoryRegionPermissions(); + ModuleSP exe_module_sp = GetTarget().GetExecutableModule(); + if (exe_module_sp && exe_module_sp->GetArchitecture().IsValid()) { + GetTarget().SetArchitecture(exe_module_sp->GetArchitecture()); + } else { + // The corefile's architecture is our best starting point. 
+ ArchSpec arch(m_core_module_sp->GetArchitecture()); + if (arch.IsValid()) + GetTarget().SetArchitecture(arch); + } + AddressableBits addressable_bits = core_objfile->GetAddressableBits(); SetAddressableBitMasks(addressable_bits); diff --git a/lldb/test/Shell/Process/Inputs/a.out.yaml b/lldb/test/Shell/Process/Inputs/a.out.yaml new file mode 100644 index 00000000000000..f63457d39824c6 --- /dev/null +++ b/lldb/test/Shell/Process/Inputs/a.out.yaml @@ -0,0 +1,62 @@ +--- !mach-o +FileHeader: + magic: 0xFEEDFACE + cputype: 0xC + cpusubtype: 0x10 + filetype: 0x2 + ncmds: 3 + sizeofcmds: 272 + flags: 0x200085 +LoadCommands: + - cmd: LC_SEGMENT + cmdsize: 56 + segname: __PAGEZERO + vmaddr: 0 + vmsize: 16384 + fileoff: 0 + filesize: 0 + maxprot: 0 + initprot: 0 + nsects: 0 + flags: 0 + - cmd: LC_SEGMENT + cmdsize: 192 + segname: __TEXT + vmaddr: 16384 + vmsize: 32768 + fileoff: 0 + filesize: 32768 + maxprot: 5 + initprot: 5 + nsects: 2 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0xBFB4 + size: 4 + offset: 0x7FB4 + align: 1 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '00207047' + - sectname: __unwind_info + segname: __TEXT + addr: 0xBFB8 + size: 72 + offset: 0x7FB8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 010000001C000000000000001C000000000000001C00000002000000B57F00003400000034000000BA7F00000000000034000000030000000C000100100001000000000000000000 + - cmd: LC_UUID + cmdsize: 24 + uuid: C2065535-C63D-3C6A-BF79-19CF960DEF2E diff --git a/lldb/test/Shell/Process/Inputs/corefile.yaml b/lldb/test/Shell/Process/Inputs/corefile.yaml new file mode 100644 index 00000000000000..537da8e85cba35 --- /dev/null +++ b/lldb/test/Shell/Process/Inputs/corefile.yaml @@ -0,0 +1,22 @@ +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0xC + cpusubtype: 0x9 + filetype: 0x4 + ncmds: 1 + sizeofcmds: 84 + flags: 0x0 + reserved: 0x0 
+LoadCommands: + - cmd: LC_THREAD + cmdsize: 84 + PayloadBytes: [ 0x1, 0x0, 0x0, 0x0, 0x11, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x0, 0x0, 0x0, 0x2, 0x0, 0x0, 0x0, 0x3, 0x0, + 0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x5, 0x0, 0x0, 0x0, + 0x6, 0x0, 0x0, 0x0, 0x7, 0x0, 0x0, 0x0, 0x8, 0x0, + 0x0, 0x0, 0x9, 0x0, 0x0, 0x0, 0xA, 0x0, 0x0, 0x0, + 0xB, 0x0, 0x0, 0x0, 0xC, 0x0, 0x0, 0x0, 0xD, 0x0, + 0x0, 0x0, 0xE, 0x0, 0x0, 0x0, 0xF, 0x0, 0x0, 0x0, + 0x10, 0x0, 0x0, 0x0, 0x11, 0x0 ] +... diff --git a/lldb/test/Shell/Process/ProcessMachCoreArch.test b/lldb/test/Shell/Process/ProcessMachCoreArch.test new file mode 100644 index 00000000000000..5d3bbd4ab6950e --- /dev/null +++ b/lldb/test/Shell/Process/ProcessMachCoreArch.test @@ -0,0 +1,8 @@ +# RUN: yaml2obj %S/Inputs/corefile.yaml -o %t.corefile +# RUN: yaml2obj %S/Inputs/a.out.yaml -o %t.out + +# RUN: %lldb -b -c %t.corefile %t.out -o 'target list ' | FileCheck %s --check-prefix BINARY +# BINARY: target {{.*}} arch=armv7em-apple + +# RUN: %lldb -b %t.corefile -o 'target list' | FileCheck %s --check-prefix CORE +# CORE: target {{.*}} arch=armv7-apple From fe5d1f901a709bc6a2180b7a77b9d5948c6c3482 Mon Sep 17 00:00:00 2001 From: Rodolfo Wottrich Date: Thu, 22 Aug 2024 16:30:27 +0100 Subject: [PATCH 217/426] [ARM] Fix missing ELF FPU attributes for fp-armv8-fullfp16-d16 (#105677) An assembly input with > .fpu fp-armv8-fullfp16-d16 crashes the compiler because the ELF FPU attribute emitter misses the respective entry. This patch fixes this. Interestingly, compiling with -mfpu=fp-armv8-fullfp16-d16 does not cause the crash because FPv5_D16 is an alias in the compiler and > .fpu fpv5-d16 is emitted instead, which does not crash. The existing .fpu directive test with multiple FPUs serves the purpose of verifying that each possible FPU option is defined, but does not trigger the crash because only the last .fpu directive goes effectively down the code path. Therefore one test for each FPU is required. Fixes #105674. 
--- .../Target/ARM/MCTargetDesc/ARMELFStreamer.cpp | 4 ++++ llvm/test/MC/ARM/directive-fpu-multiple.s | 2 ++ .../directive-fpu-single-crypto-neon-fp-armv8.s | 15 +++++++++++++++ .../directive-fpu-single-fp-armv8-fullfp16-d16.s | 15 +++++++++++++++ ...irective-fpu-single-fp-armv8-fullfp16-sp-d16.s | 15 +++++++++++++++ llvm/test/MC/ARM/directive-fpu-single-fp-armv8.s | 15 +++++++++++++++ .../MC/ARM/directive-fpu-single-fpv4-sp-d16.s | 15 +++++++++++++++ llvm/test/MC/ARM/directive-fpu-single-fpv5-d16.s | 15 +++++++++++++++ .../MC/ARM/directive-fpu-single-fpv5-sp-d16.s | 15 +++++++++++++++ .../MC/ARM/directive-fpu-single-neon-fp-armv8.s | 15 +++++++++++++++ llvm/test/MC/ARM/directive-fpu-single-neon-fp16.s | 15 +++++++++++++++ .../test/MC/ARM/directive-fpu-single-neon-vfpv4.s | 15 +++++++++++++++ llvm/test/MC/ARM/directive-fpu-single-neon.s | 15 +++++++++++++++ llvm/test/MC/ARM/directive-fpu-single-none.s | 10 ++++++++++ llvm/test/MC/ARM/directive-fpu-single-vfp.s | 15 +++++++++++++++ llvm/test/MC/ARM/directive-fpu-single-vfpv2.s | 15 +++++++++++++++ .../MC/ARM/directive-fpu-single-vfpv3-d16-fp16.s | 15 +++++++++++++++ llvm/test/MC/ARM/directive-fpu-single-vfpv3-d16.s | 15 +++++++++++++++ .../test/MC/ARM/directive-fpu-single-vfpv3-fp16.s | 15 +++++++++++++++ llvm/test/MC/ARM/directive-fpu-single-vfpv3.s | 15 +++++++++++++++ .../MC/ARM/directive-fpu-single-vfpv3xd-fp16.s | 15 +++++++++++++++ llvm/test/MC/ARM/directive-fpu-single-vfpv3xd.s | 15 +++++++++++++++ llvm/test/MC/ARM/directive-fpu-single-vfpv4-d16.s | 15 +++++++++++++++ llvm/test/MC/ARM/directive-fpu-single-vfpv4.s | 15 +++++++++++++++ 24 files changed, 331 insertions(+) create mode 100644 llvm/test/MC/ARM/directive-fpu-single-crypto-neon-fp-armv8.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-fp-armv8-fullfp16-d16.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-fp-armv8-fullfp16-sp-d16.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-fp-armv8.s create mode 100644 
llvm/test/MC/ARM/directive-fpu-single-fpv4-sp-d16.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-fpv5-d16.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-fpv5-sp-d16.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-neon-fp-armv8.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-neon-fp16.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-neon-vfpv4.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-neon.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-none.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-vfp.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-vfpv2.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-vfpv3-d16-fp16.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-vfpv3-d16.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-vfpv3-fp16.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-vfpv3.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-vfpv3xd-fp16.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-vfpv3xd.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-vfpv4-d16.s create mode 100644 llvm/test/MC/ARM/directive-fpu-single-vfpv4.s diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 59f29660a77770..c9631bd7c7aac5 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -992,6 +992,10 @@ void ARMTargetELFStreamer::emitFPUDefaultAttributes() { // uses the FP_ARMV8_D16 build attribute. case ARM::FK_FPV5_SP_D16: case ARM::FK_FPV5_D16: + // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one + // FPU, but there are two different names for it depending on the CPU. 
+ case ARM::FK_FP_ARMV8_FULLFP16_SP_D16: + case ARM::FK_FP_ARMV8_FULLFP16_D16: S.setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPARMv8B, /* OverwriteExisting= */ false); break; diff --git a/llvm/test/MC/ARM/directive-fpu-multiple.s b/llvm/test/MC/ARM/directive-fpu-multiple.s index ba407654854cd7..b129cbdf1db230 100644 --- a/llvm/test/MC/ARM/directive-fpu-multiple.s +++ b/llvm/test/MC/ARM/directive-fpu-multiple.s @@ -22,6 +22,8 @@ .fpu fpv5-d16 .fpu fpv5-sp-d16 .fpu fp-armv8 + .fpu fp-armv8-fullfp16-d16 + .fpu fp-armv8-fullfp16-sp-d16 .fpu neon .fpu neon-fp16 .fpu neon-vfpv4 diff --git a/llvm/test/MC/ARM/directive-fpu-single-crypto-neon-fp-armv8.s b/llvm/test/MC/ARM/directive-fpu-single-crypto-neon-fp-armv8.s new file mode 100644 index 00000000000000..58af1cbf41513d --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-crypto-neon-fp-armv8.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. + +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu crypto-neon-fp-armv8 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: ARMv8-a FP +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-fp-armv8-fullfp16-d16.s b/llvm/test/MC/ARM/directive-fpu-single-fp-armv8-fullfp16-d16.s new file mode 100644 index 00000000000000..11e1f6b51d9aa8 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-fp-armv8-fullfp16-d16.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. 
+ +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu fp-armv8-fullfp16-d16 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: ARMv8-a FP-D16 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-fp-armv8-fullfp16-sp-d16.s b/llvm/test/MC/ARM/directive-fpu-single-fp-armv8-fullfp16-sp-d16.s new file mode 100644 index 00000000000000..6307deb0de400d --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-fp-armv8-fullfp16-sp-d16.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. + +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu fp-armv8-fullfp16-sp-d16 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: ARMv8-a FP-D16 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-fp-armv8.s b/llvm/test/MC/ARM/directive-fpu-single-fp-armv8.s new file mode 100644 index 00000000000000..48eb342849c933 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-fp-armv8.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. + +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu fp-armv8 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: ARMv8-a FP +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-fpv4-sp-d16.s b/llvm/test/MC/ARM/directive-fpu-single-fpv4-sp-d16.s new file mode 100644 index 00000000000000..e1e64c10cc3f79 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-fpv4-sp-d16.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. 
+ +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu fpv4-sp-d16 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: VFPv4-D16 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-fpv5-d16.s b/llvm/test/MC/ARM/directive-fpu-single-fpv5-d16.s new file mode 100644 index 00000000000000..dc03f7a709c9b5 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-fpv5-d16.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. + +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu fpv5-d16 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: ARMv8-a FP-D16 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-fpv5-sp-d16.s b/llvm/test/MC/ARM/directive-fpu-single-fpv5-sp-d16.s new file mode 100644 index 00000000000000..850db4da120e26 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-fpv5-sp-d16.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. + +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu fpv5-sp-d16 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: ARMv8-a FP-D16 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-neon-fp-armv8.s b/llvm/test/MC/ARM/directive-fpu-single-neon-fp-armv8.s new file mode 100644 index 00000000000000..f84d6cf5cc1132 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-neon-fp-armv8.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. 
+ +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu neon-fp-armv8 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: ARMv8-a FP +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-neon-fp16.s b/llvm/test/MC/ARM/directive-fpu-single-neon-fp16.s new file mode 100644 index 00000000000000..fc7520ce8bb75f --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-neon-fp16.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. + +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu neon-fp16 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: VFPv3 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-neon-vfpv4.s b/llvm/test/MC/ARM/directive-fpu-single-neon-vfpv4.s new file mode 100644 index 00000000000000..5c56022f66ac9a --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-neon-vfpv4.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. + +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu neon-vfpv4 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: VFPv4 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-neon.s b/llvm/test/MC/ARM/directive-fpu-single-neon.s new file mode 100644 index 00000000000000..676ed11a14bd03 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-neon.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. 
+ +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu neon + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: VFPv3 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-none.s b/llvm/test/MC/ARM/directive-fpu-single-none.s new file mode 100644 index 00000000000000..aa8df756e3a080 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-none.s @@ -0,0 +1,10 @@ +@ Check a single .fpu directive. + +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu none + +@ CHECK-ATTR-NOT: TagName: FP_arch + diff --git a/llvm/test/MC/ARM/directive-fpu-single-vfp.s b/llvm/test/MC/ARM/directive-fpu-single-vfp.s new file mode 100644 index 00000000000000..2023236ecf61d5 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-vfp.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. + +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu vfp + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: VFPv2 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-vfpv2.s b/llvm/test/MC/ARM/directive-fpu-single-vfpv2.s new file mode 100644 index 00000000000000..7c4d6b37e2b612 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-vfpv2.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. 
+ +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu vfp2 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: VFPv2 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-vfpv3-d16-fp16.s b/llvm/test/MC/ARM/directive-fpu-single-vfpv3-d16-fp16.s new file mode 100644 index 00000000000000..adc8cb276190a0 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-vfpv3-d16-fp16.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. + +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu vfpv3-d16-fp16 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: VFPv3-D16 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-vfpv3-d16.s b/llvm/test/MC/ARM/directive-fpu-single-vfpv3-d16.s new file mode 100644 index 00000000000000..a33e1df3f48427 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-vfpv3-d16.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. + +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu vfp3-d16 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: VFPv3-D16 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-vfpv3-fp16.s b/llvm/test/MC/ARM/directive-fpu-single-vfpv3-fp16.s new file mode 100644 index 00000000000000..5238e56bcf1f3a --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-vfpv3-fp16.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. 
+ +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu vfpv3-fp16 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: VFPv3 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-vfpv3.s b/llvm/test/MC/ARM/directive-fpu-single-vfpv3.s new file mode 100644 index 00000000000000..6182b88ba3f9d9 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-vfpv3.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. + +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu vfp3 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: VFPv3 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-vfpv3xd-fp16.s b/llvm/test/MC/ARM/directive-fpu-single-vfpv3xd-fp16.s new file mode 100644 index 00000000000000..6e91c565199db4 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-vfpv3xd-fp16.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. + +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu vfpv3xd-fp16 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: VFPv3-D16 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-vfpv3xd.s b/llvm/test/MC/ARM/directive-fpu-single-vfpv3xd.s new file mode 100644 index 00000000000000..57e9b5379d9664 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-vfpv3xd.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. 
+ +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu vfpv3xd + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: VFPv3-D16 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-vfpv4-d16.s b/llvm/test/MC/ARM/directive-fpu-single-vfpv4-d16.s new file mode 100644 index 00000000000000..604c4c2e941895 --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-vfpv4-d16.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. + +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu vfpv4-d16 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: VFPv4-D16 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + diff --git a/llvm/test/MC/ARM/directive-fpu-single-vfpv4.s b/llvm/test/MC/ARM/directive-fpu-single-vfpv4.s new file mode 100644 index 00000000000000..41c043a66b474c --- /dev/null +++ b/llvm/test/MC/ARM/directive-fpu-single-vfpv4.s @@ -0,0 +1,15 @@ +@ Check a single .fpu directive. 
+ +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj --arch-specific - \ +@ RUN: | FileCheck %s -check-prefix CHECK-ATTR + + .fpu vfpv4 + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: FP_arch +@ CHECK-ATTR: Description: VFPv4 +@ CHECK-ATTR: } +@ CHECK-ATTR: } + From b21756f9f1038acd889dd3a12fd16f843d07c4a8 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 22 Aug 2024 16:34:04 +0100 Subject: [PATCH 218/426] [lldb][test] Add a unit-test for importRecordLayoutFromOrigin --- .../unittests/Symbol/TestClangASTImporter.cpp | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/lldb/unittests/Symbol/TestClangASTImporter.cpp b/lldb/unittests/Symbol/TestClangASTImporter.cpp index 41c7ed75155f35..f1b3d7911c4bd5 100644 --- a/lldb/unittests/Symbol/TestClangASTImporter.cpp +++ b/lldb/unittests/Symbol/TestClangASTImporter.cpp @@ -11,6 +11,7 @@ #include "Plugins/ExpressionParser/Clang/ClangASTImporter.h" #include "Plugins/ExpressionParser/Clang/ClangASTMetadata.h" #include "Plugins/ExpressionParser/Clang/ClangUtil.h" +#include "Plugins/SymbolFile/DWARF/DWARFASTParserClang.h" #include "Plugins/TypeSystem/Clang/TypeSystemClang.h" #include "TestingSupport/SubsystemRAII.h" #include "TestingSupport/Symbol/ClangTestUtils.h" @@ -276,3 +277,56 @@ TEST_F(TestClangASTImporter, RecordLayout) { EXPECT_EQ(0U, base_offsets.size()); EXPECT_EQ(0U, vbase_offsets.size()); } + +TEST_F(TestClangASTImporter, RecordLayoutFromOrigin) { + // Tests that we can retrieve the layout of a record that has + // an origin with an already existing LayoutInfo. We expect + // the layout to be retrieved from the ClangASTImporter of the + // origin decl. + + clang_utils::SourceASTWithRecord source; + + auto *dwarf_parser = + static_cast(source.ast->GetDWARFParser()); + auto &importer = dwarf_parser->GetClangASTImporter(); + + // Set the layout for the origin decl in the origin ClangASTImporter. 
+ ClangASTImporter::LayoutInfo layout_info; + layout_info.bit_size = 32; + layout_info.alignment = 16; + layout_info.field_offsets[source.field_decl] = 1; + importer.SetRecordLayout(source.record_decl, layout_info); + + auto holder = + std::make_unique("target ast"); + auto *target_ast = holder->GetAST(); + + // Import the decl into a new TypeSystemClang. + CompilerType imported = importer.CopyType(*target_ast, source.record_type); + ASSERT_TRUE(imported.IsValid()); + + auto *imported_decl = cast(ClangUtil::GetAsTagDecl(imported)); + ClangASTImporter::DeclOrigin origin = importer.GetDeclOrigin(imported_decl); + ASSERT_TRUE(origin.Valid()); + ASSERT_EQ(origin.decl, source.record_decl); + + uint64_t bit_size; + uint64_t alignment; + llvm::DenseMap field_offsets; + llvm::DenseMap base_offsets; + llvm::DenseMap vbase_offsets; + + // Make sure we correctly read out the layout (despite not Having + // called SetRecordLayout on the new TypeSystem's ClangASTImporter). + auto success = + importer.LayoutRecordType(imported_decl, bit_size, alignment, + field_offsets, base_offsets, vbase_offsets); + EXPECT_TRUE(success); + + EXPECT_EQ(32U, bit_size); + EXPECT_EQ(16U, alignment); + EXPECT_EQ(1U, field_offsets.size()); + EXPECT_EQ(1U, field_offsets[*imported_decl->field_begin()]); + EXPECT_EQ(0U, base_offsets.size()); + EXPECT_EQ(0U, vbase_offsets.size()); +} From 8ab61404e866539f5e28e0f72ba7a510fa51dd3a Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 22 Aug 2024 16:43:24 +0100 Subject: [PATCH 219/426] [AArch64] Lower aarch64_neon_saddlv via SADDLV nodes. (#103307) This mirrors what GISel already does, extending the existing lowering of aarch64_neon_saddlv/aarch64_neon_uaddlv to SADDLV/UADDLV. This allows us to remove some tablegen patterns, and provides a little nicer codegen in places as the nodes represent the result being in a vector register correctly. 
--- .../Target/AArch64/AArch64ISelLowering.cpp | 26 +++--- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 93 ++----------------- .../aarch64-neon-vector-insert-uaddlv.ll | 26 ++---- .../test/CodeGen/AArch64/arm64-neon-across.ll | 32 ++----- 4 files changed, 41 insertions(+), 136 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e98b430e62389b..33fc6e56a24793 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6097,20 +6097,24 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt, DAG.getVectorIdxConstant(0, dl)); } + case Intrinsic::aarch64_neon_saddlv: case Intrinsic::aarch64_neon_uaddlv: { EVT OpVT = Op.getOperand(1).getValueType(); EVT ResVT = Op.getValueType(); - if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 || - OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) { - // In order to avoid insert_subvector, used v4i32 than v2i32. - SDValue UADDLV = - DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1)); - SDValue EXTRACT_VEC_ELT = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV, - DAG.getConstant(0, dl, MVT::i64)); - return EXTRACT_VEC_ELT; - } - return SDValue(); + assert( + ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 || + OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) || + (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) && + "Unexpected aarch64_neon_u/saddlv type"); + // In order to avoid insert_subvector, use v4i32 rather than v2i32. + SDValue ADDLV = DAG.getNode( + IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV + : AArch64ISD::SADDLV, + dl, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1)); + SDValue EXTRACT_VEC_ELT = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, ResVT == MVT::i32 ? 
MVT::i32 : MVT::i64, + ADDLV, DAG.getConstant(0, dl, MVT::i64)); + return EXTRACT_VEC_ELT; } case Intrinsic::experimental_cttz_elts: { SDValue CttzOp = Op.getOperand(1); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index ec225a5b234a26..2fff6fffcd7c6d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -7196,17 +7196,6 @@ multiclass SIMDAcrossLaneLongPairIntrinsicGISel; defm : SIMDAcrossLaneLongPairIntrinsicGISel<"SADDLV", AArch64saddlp>; -// Patterns for uaddlv(uaddlp(x)) ==> uaddlv -def : Pat<(i64 (int_aarch64_neon_uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))), - (i64 (EXTRACT_SUBREG - (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub)), - dsub))>; - -def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))), - (i32 (EXTRACT_SUBREG - (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)), - ssub))>; - def : Pat<(v2i64 (AArch64uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))), (v2i64 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub))>; @@ -7427,82 +7416,12 @@ defm : SIMDAcrossLanesVecReductionIntrinsic<"SMAXV", vecreduce_smax>; def : Pat<(i32 (vecreduce_smax (v2i32 V64:$Rn))), (i32 (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub))>; -multiclass SIMDAcrossLanesSignedLongIntrinsic { - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), - (i64 0)))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), - (i64 0)))>; - -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), - ssub))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 
(IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), - ssub))>; - -def : Pat<(i64 (intOp (v4i32 V128:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), - dsub))>; -} - -multiclass SIMDAcrossLanesUnsignedLongIntrinsic { - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), - ssub))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), - ssub))>; - -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), - ssub))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), - ssub))>; - -def : Pat<(i64 (intOp (v4i32 V128:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), - dsub))>; -} - -defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>; -defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>; - -// The vaddlv_s32 intrinsic gets mapped to SADDLP. -def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (SADDLPv2i32_v1i64 V64:$Rn), dsub), - dsub))>; -// The vaddlv_u32 intrinsic gets mapped to UADDLP. -def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (UADDLPv2i32_v1i64 V64:$Rn), dsub), - dsub))>; +// The SADDLV v2i32 gets mapped to SADDLP. 
+def : Pat<(v2i64 (AArch64saddlv (v2i32 V64:$Rn))), + (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (SADDLPv2i32_v1i64 V64:$Rn), dsub))>; +// The UADDLV v2i32 gets mapped to UADDLP. +def : Pat<(v2i64 (AArch64uaddlv (v2i32 V64:$Rn))), + (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (UADDLPv2i32_v1i64 V64:$Rn), dsub))>; //------------------------------------------------------------------------------ // AdvSIMD modified immediate instructions diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll index 75a549e348d472..2e165179381820 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll @@ -146,11 +146,11 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) { ; CHECK-LABEL: insert_vec_v6i64_uaddlv_from_v4i32: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.2d v0, #0000000000000000 -; CHECK-NEXT: movi.2d v2, #0000000000000000 ; CHECK-NEXT: uaddlv.4s d1, v0 -; CHECK-NEXT: str d2, [x0, #16] ; CHECK-NEXT: mov.d v0[0], v1[0] +; CHECK-NEXT: movi.2d v1, #0000000000000000 ; CHECK-NEXT: ucvtf.2d v0, v0 +; CHECK-NEXT: str d1, [x0, #16] ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret @@ -491,9 +491,8 @@ define void @store_saddlv_v8i8(ptr %H, <8 x i8> %sum_h, i32 %idx) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: saddlv.8b h0, v0 ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: sbfiz x9, x1, #3, #32 -; CHECK-NEXT: smov.h w8, v0[0] -; CHECK-NEXT: str w8, [x0, x9] +; CHECK-NEXT: sbfiz x8, x1, #3, #32 +; CHECK-NEXT: str s0, [x0, x8] ; CHECK-NEXT: ret entry: %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %sum_h) @@ -508,9 +507,8 @@ define void @store_saddlv_v16i8(ptr %H, <16 x i8> %sum_h, i32 %idx) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: saddlv.16b h0, v0 ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: sbfiz x9, x1, 
#3, #32 -; CHECK-NEXT: smov.h w8, v0[0] -; CHECK-NEXT: str w8, [x0, x9] +; CHECK-NEXT: sbfiz x8, x1, #3, #32 +; CHECK-NEXT: str s0, [x0, x8] ; CHECK-NEXT: ret entry: %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %sum_h) @@ -526,8 +524,7 @@ define void @store_saddlv_v4i16(ptr %H, <4 x i16> %sum_h, i32 %idx) { ; CHECK-NEXT: saddlv.4h s0, v0 ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sbfiz x8, x1, #3, #32 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: str w9, [x0, x8] +; CHECK-NEXT: str s0, [x0, x8] ; CHECK-NEXT: ret entry: %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> %sum_h) @@ -543,8 +540,7 @@ define void @store_saddlv_v8i16(ptr %H, <8 x i16> %sum_h, i32 %idx) { ; CHECK-NEXT: saddlv.8h s0, v0 ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sbfiz x8, x1, #3, #32 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: str w9, [x0, x8] +; CHECK-NEXT: str s0, [x0, x8] ; CHECK-NEXT: ret entry: %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> %sum_h) @@ -558,8 +554,7 @@ define void @store_saddlv_v2i32(ptr %H, <2 x i32> %sum_h, i32 %idx) { ; CHECK-LABEL: store_saddlv_v2i32: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: saddlp.1d v0, v0 -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: str x8, [x0, w1, sxtw #3] +; CHECK-NEXT: str d0, [x0, w1, sxtw #3] ; CHECK-NEXT: ret entry: %vaddlvq_s32.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %sum_h) @@ -573,8 +568,7 @@ define void @store_saddlv_v4i32(ptr %H, <4 x i32> %sum_h, i32 %idx) { ; CHECK-LABEL: store_saddlv_v4i32: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: saddlv.4s d0, v0 -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: str x8, [x0, w1, sxtw #3] +; CHECK-NEXT: str d0, [x0, w1, sxtw #3] ; CHECK-NEXT: ret entry: %vaddlvq_s32.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> %sum_h) diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-across.ll b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll index 
2899197abb2f44..84d009565ecb59 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-across.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll @@ -43,17 +43,11 @@ declare i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16>) declare i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8>) define i16 @test_vaddlv_s8(<8 x i8> %a) { -; CHECK-SD-LABEL: test_vaddlv_s8: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: saddlv h0, v0.8b -; CHECK-SD-NEXT: smov w0, v0.h[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vaddlv_s8: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: saddlv h0, v0.8b -; CHECK-GI-NEXT: fmov w0, s0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vaddlv_s8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: saddlv h0, v0.8b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret entry: %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a) %0 = trunc i32 %saddlvv.i to i16 @@ -95,17 +89,11 @@ entry: } define i16 @test_vaddlvq_s8(<16 x i8> %a) { -; CHECK-SD-LABEL: test_vaddlvq_s8: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: saddlv h0, v0.16b -; CHECK-SD-NEXT: smov w0, v0.h[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vaddlvq_s8: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: saddlv h0, v0.16b -; CHECK-GI-NEXT: fmov w0, s0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vaddlvq_s8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: saddlv h0, v0.16b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret entry: %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a) %0 = trunc i32 %saddlvv.i to i16 From 24740ecfd100907150c5aa2d1c53bf17fb73966c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 22 Aug 2024 14:57:23 +0100 Subject: [PATCH 220/426] [X86] Add BSR/BSF tests to check for implicit zero extension --- llvm/test/CodeGen/X86/ctlz.ll | 126 ++++++++++++++++++++++++++++++++++ llvm/test/CodeGen/X86/cttz.ll | 115 +++++++++++++++++++++++++++++++ 2 files changed, 241 insertions(+) diff --git a/llvm/test/CodeGen/X86/ctlz.ll 
b/llvm/test/CodeGen/X86/ctlz.ll index 6635be18b0f7a7..68defaff78d37d 100644 --- a/llvm/test/CodeGen/X86/ctlz.ll +++ b/llvm/test/CodeGen/X86/ctlz.ll @@ -1211,3 +1211,129 @@ define i64 @ctlz_xor63_i64_true(i64 %x) { %res = xor i64 %clz, 63 ret i64 %res } + +define i64 @ctlz_i32_sext(i32 %x) { +; X86-NOCMOV-LABEL: ctlz_i32_sext: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testl %eax, %eax +; X86-NOCMOV-NEXT: je .LBB20_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: jmp .LBB20_3 +; X86-NOCMOV-NEXT: .LBB20_1: +; X86-NOCMOV-NEXT: movl $32, %eax +; X86-NOCMOV-NEXT: .LBB20_3: # %cond.end +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: xorl %edx, %edx +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlz_i32_sext: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: bsrl {{[0-9]+}}(%esp), %ecx +; X86-CMOV-NEXT: movl $63, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: xorl %edx, %edx +; X86-CMOV-NEXT: retl +; +; X64-LABEL: ctlz_i32_sext: +; X64: # %bb.0: +; X64-NEXT: bsrl %edi, %ecx +; X64-NEXT: movl $63, %eax +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X86-CLZ-LABEL: ctlz_i32_sext: +; X86-CLZ: # %bb.0: +; X86-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax +; X86-CLZ-NEXT: xorl $31, %eax +; X86-CLZ-NEXT: xorl %edx, %edx +; X86-CLZ-NEXT: retl +; +; X64-CLZ-LABEL: ctlz_i32_sext: +; X64-CLZ: # %bb.0: +; X64-CLZ-NEXT: lzcntl %edi, %eax +; X64-CLZ-NEXT: xorl $31, %eax +; X64-CLZ-NEXT: retq +; +; X64-FASTLZCNT-LABEL: ctlz_i32_sext: +; X64-FASTLZCNT: # %bb.0: +; X64-FASTLZCNT-NEXT: lzcntl %edi, %eax +; X64-FASTLZCNT-NEXT: xorl $31, %eax +; X64-FASTLZCNT-NEXT: retq +; +; X86-FASTLZCNT-LABEL: ctlz_i32_sext: +; X86-FASTLZCNT: # %bb.0: +; X86-FASTLZCNT-NEXT: lzcntl {{[0-9]+}}(%esp), %eax +; X86-FASTLZCNT-NEXT: xorl $31, %eax +; X86-FASTLZCNT-NEXT: xorl %edx, %edx +; X86-FASTLZCNT-NEXT: retl + %tmp = call i32 @llvm.ctlz.i32( i32 %x, i1 false) 
+ %xor = xor i32 %tmp, 31 + %ext = sext i32 %xor to i64 + ret i64 %ext +} + +define i64 @ctlz_i32_zext(i32 %x) { +; X86-NOCMOV-LABEL: ctlz_i32_zext: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testl %eax, %eax +; X86-NOCMOV-NEXT: je .LBB21_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: bsrl %eax, %eax +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: jmp .LBB21_3 +; X86-NOCMOV-NEXT: .LBB21_1: +; X86-NOCMOV-NEXT: movl $32, %eax +; X86-NOCMOV-NEXT: .LBB21_3: # %cond.end +; X86-NOCMOV-NEXT: xorl $31, %eax +; X86-NOCMOV-NEXT: xorl %edx, %edx +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: ctlz_i32_zext: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: bsrl {{[0-9]+}}(%esp), %ecx +; X86-CMOV-NEXT: movl $63, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: xorl %edx, %edx +; X86-CMOV-NEXT: retl +; +; X64-LABEL: ctlz_i32_zext: +; X64: # %bb.0: +; X64-NEXT: bsrl %edi, %ecx +; X64-NEXT: movl $63, %eax +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X86-CLZ-LABEL: ctlz_i32_zext: +; X86-CLZ: # %bb.0: +; X86-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax +; X86-CLZ-NEXT: xorl $31, %eax +; X86-CLZ-NEXT: xorl %edx, %edx +; X86-CLZ-NEXT: retl +; +; X64-CLZ-LABEL: ctlz_i32_zext: +; X64-CLZ: # %bb.0: +; X64-CLZ-NEXT: lzcntl %edi, %eax +; X64-CLZ-NEXT: xorl $31, %eax +; X64-CLZ-NEXT: retq +; +; X64-FASTLZCNT-LABEL: ctlz_i32_zext: +; X64-FASTLZCNT: # %bb.0: +; X64-FASTLZCNT-NEXT: lzcntl %edi, %eax +; X64-FASTLZCNT-NEXT: xorl $31, %eax +; X64-FASTLZCNT-NEXT: retq +; +; X86-FASTLZCNT-LABEL: ctlz_i32_zext: +; X86-FASTLZCNT: # %bb.0: +; X86-FASTLZCNT-NEXT: lzcntl {{[0-9]+}}(%esp), %eax +; X86-FASTLZCNT-NEXT: xorl $31, %eax +; X86-FASTLZCNT-NEXT: xorl %edx, %edx +; X86-FASTLZCNT-NEXT: retl + %tmp = call i32 @llvm.ctlz.i32( i32 %x, i1 false) + %xor = xor i32 %tmp, 31 + %ext = zext i32 %xor to i64 + ret i64 %ext +} diff --git a/llvm/test/CodeGen/X86/cttz.ll b/llvm/test/CodeGen/X86/cttz.ll index 
27f229b18bf057..30e5cccfb21982 100644 --- a/llvm/test/CodeGen/X86/cttz.ll +++ b/llvm/test/CodeGen/X86/cttz.ll @@ -661,3 +661,118 @@ define i32 @cttz_i32_msize(i32 %x) minsize { %tmp = call i32 @llvm.cttz.i32( i32 %x, i1 true) ret i32 %tmp } + +define i64 @cttz_i32_sext(i32 %x) { +; X86-NOCMOV-LABEL: cttz_i32_sext: +; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testl %eax, %eax +; X86-NOCMOV-NEXT: je .LBB12_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: rep bsfl %eax, %eax +; X86-NOCMOV-NEXT: xorl %edx, %edx +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB12_1: +; X86-NOCMOV-NEXT: movl $32, %eax +; X86-NOCMOV-NEXT: xorl %edx, %edx +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: cttz_i32_sext: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: bsfl {{[0-9]+}}(%esp), %ecx +; X86-CMOV-NEXT: movl $32, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: xorl %edx, %edx +; X86-CMOV-NEXT: retl +; +; X64-LABEL: cttz_i32_sext: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 +; X64-NEXT: orq %rdi, %rax +; X64-NEXT: rep bsfq %rax, %rax +; X64-NEXT: retq +; +; X86-CLZ-LABEL: cttz_i32_sext: +; X86-CLZ: # %bb.0: +; X86-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax +; X86-CLZ-NEXT: xorl %edx, %edx +; X86-CLZ-NEXT: retl +; +; X64-CLZ-LABEL: cttz_i32_sext: +; X64-CLZ: # %bb.0: +; X64-CLZ-NEXT: tzcntl %edi, %eax +; X64-CLZ-NEXT: retq +; +; X64-FASTLZCNT-LABEL: cttz_i32_sext: +; X64-FASTLZCNT: # %bb.0: +; X64-FASTLZCNT-NEXT: tzcntl %edi, %eax +; X64-FASTLZCNT-NEXT: retq +; +; X86-FASTLZCNT-LABEL: cttz_i32_sext: +; X86-FASTLZCNT: # %bb.0: +; X86-FASTLZCNT-NEXT: tzcntl {{[0-9]+}}(%esp), %eax +; X86-FASTLZCNT-NEXT: xorl %edx, %edx +; X86-FASTLZCNT-NEXT: retl + %tmp = call i32 @llvm.cttz.i32( i32 %x, i1 false) + %ext = sext i32 %tmp to i64 + ret i64 %ext +} + +define i64 @cttz_i32_zext(i32 %x) { +; X86-NOCMOV-LABEL: cttz_i32_zext: +; X86-NOCMOV: # %bb.0: +; 
X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: testl %eax, %eax +; X86-NOCMOV-NEXT: je .LBB13_1 +; X86-NOCMOV-NEXT: # %bb.2: # %cond.false +; X86-NOCMOV-NEXT: rep bsfl %eax, %eax +; X86-NOCMOV-NEXT: xorl %edx, %edx +; X86-NOCMOV-NEXT: retl +; X86-NOCMOV-NEXT: .LBB13_1: +; X86-NOCMOV-NEXT: movl $32, %eax +; X86-NOCMOV-NEXT: xorl %edx, %edx +; X86-NOCMOV-NEXT: retl +; +; X86-CMOV-LABEL: cttz_i32_zext: +; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: bsfl {{[0-9]+}}(%esp), %ecx +; X86-CMOV-NEXT: movl $32, %eax +; X86-CMOV-NEXT: cmovnel %ecx, %eax +; X86-CMOV-NEXT: xorl %edx, %edx +; X86-CMOV-NEXT: retl +; +; X64-LABEL: cttz_i32_zext: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000 +; X64-NEXT: orq %rdi, %rax +; X64-NEXT: rep bsfq %rax, %rax +; X64-NEXT: retq +; +; X86-CLZ-LABEL: cttz_i32_zext: +; X86-CLZ: # %bb.0: +; X86-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax +; X86-CLZ-NEXT: xorl %edx, %edx +; X86-CLZ-NEXT: retl +; +; X64-CLZ-LABEL: cttz_i32_zext: +; X64-CLZ: # %bb.0: +; X64-CLZ-NEXT: tzcntl %edi, %eax +; X64-CLZ-NEXT: retq +; +; X64-FASTLZCNT-LABEL: cttz_i32_zext: +; X64-FASTLZCNT: # %bb.0: +; X64-FASTLZCNT-NEXT: tzcntl %edi, %eax +; X64-FASTLZCNT-NEXT: retq +; +; X86-FASTLZCNT-LABEL: cttz_i32_zext: +; X86-FASTLZCNT: # %bb.0: +; X86-FASTLZCNT-NEXT: tzcntl {{[0-9]+}}(%esp), %eax +; X86-FASTLZCNT-NEXT: xorl %edx, %edx +; X86-FASTLZCNT-NEXT: retl + %tmp = call i32 @llvm.cttz.i32( i32 %x, i1 false) + %ext = zext i32 %tmp to i64 + ret i64 %ext +} + From 8c6f8c29e90666b747fc4b4612647554206a2be5 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 22 Aug 2024 08:51:47 -0700 Subject: [PATCH 221/426] Reland "[asan] Remove debug tracing from `report_globals` (#104404)" (#105601) This reverts commit 2704b804bec50c2b016bf678bd534c330ec655b6 and relands #104404. The Darwin should not fail after #105599. 
--- compiler-rt/lib/asan/asan_flags.inc | 7 ++----- compiler-rt/lib/asan/asan_globals.cpp | 19 ++++++++----------- .../Linux/initialization-nobug-lld.cpp | 2 +- .../Linux/odr_indicator_unregister.cpp | 2 +- .../asan/TestCases/Linux/odr_indicators.cpp | 4 ++-- .../TestCases/Windows/dll_global_dead_strip.c | 4 ++-- ...eport_globals_symbolization_at_startup.cpp | 2 +- .../TestCases/Windows/global_dead_strip.c | 4 ++-- .../Windows/report_globals_vs_freelibrary.cpp | 2 +- .../asan/TestCases/initialization-nobug.cpp | 8 ++++---- 10 files changed, 24 insertions(+), 30 deletions(-) diff --git a/compiler-rt/lib/asan/asan_flags.inc b/compiler-rt/lib/asan/asan_flags.inc index fad1577d912a5e..5e0ced9706e664 100644 --- a/compiler-rt/lib/asan/asan_flags.inc +++ b/compiler-rt/lib/asan/asan_flags.inc @@ -36,11 +36,8 @@ ASAN_FLAG(int, max_redzone, 2048, ASAN_FLAG( bool, debug, false, "If set, prints some debugging information and does additional checks.") -ASAN_FLAG( - int, report_globals, 1, - "Controls the way to handle globals (0 - don't detect buffer overflow on " - "globals, 1 - detect buffer overflow, 2 - print data about registered " - "globals).") +ASAN_FLAG(bool, report_globals, true, + "If set, detect and report errors on globals .") ASAN_FLAG(bool, check_initialization_order, false, "If set, attempts to catch initialization order issues.") ASAN_FLAG( diff --git a/compiler-rt/lib/asan/asan_globals.cpp b/compiler-rt/lib/asan/asan_globals.cpp index c83b782cb85f89..a1211430b1268a 100644 --- a/compiler-rt/lib/asan/asan_globals.cpp +++ b/compiler-rt/lib/asan/asan_globals.cpp @@ -22,6 +22,7 @@ #include "asan_thread.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_dense_map.h" +#include "sanitizer_common/sanitizer_internal_defs.h" #include "sanitizer_common/sanitizer_list.h" #include "sanitizer_common/sanitizer_mutex.h" #include "sanitizer_common/sanitizer_placement_new.h" @@ -179,7 +180,7 @@ int GetGlobalsForAddress(uptr addr, Global 
*globals, u32 *reg_sites, int res = 0; for (const auto &l : list_of_all_globals) { const Global &g = *l.g; - if (flags()->report_globals >= 2) + if (UNLIKELY(common_flags()->verbosity >= 3)) ReportGlobal(g, "Search"); if (IsAddressNearGlobal(addr, g)) { internal_memcpy(&globals[res], &g, sizeof(g)); @@ -270,7 +271,7 @@ static inline bool UseODRIndicator(const Global *g) { // so we store the globals in a map. static void RegisterGlobal(const Global *g) SANITIZER_REQUIRES(mu_for_globals) { CHECK(AsanInited()); - if (flags()->report_globals >= 2) + if (UNLIKELY(common_flags()->verbosity >= 3)) ReportGlobal(*g, "Added"); CHECK(flags()->report_globals); CHECK(AddrIsInMem(g->beg)); @@ -307,7 +308,7 @@ static void RegisterGlobal(const Global *g) SANITIZER_REQUIRES(mu_for_globals) { static void UnregisterGlobal(const Global *g) SANITIZER_REQUIRES(mu_for_globals) { CHECK(AsanInited()); - if (flags()->report_globals >= 2) + if (UNLIKELY(common_flags()->verbosity >= 3)) ReportGlobal(*g, "Removed"); CHECK(flags()->report_globals); CHECK(AddrIsInMem(g->beg)); @@ -438,7 +439,7 @@ void __asan_register_globals(__asan_global *globals, uptr n) { } GlobalRegistrationSite site = {stack_id, &globals[0], &globals[n - 1]}; global_registration_site_vector->push_back(site); - if (flags()->report_globals >= 2) { + if (UNLIKELY(common_flags()->verbosity >= 3)) { PRINT_CURRENT_STACK(); Printf("=== ID %d; %p %p\n", stack_id, (void *)&globals[0], (void *)&globals[n - 1]); @@ -497,9 +498,7 @@ void __asan_before_dynamic_init(const char *module_name) { Lock lock(&mu_for_globals); if (current_dynamic_init_module_name == module_name) return; - if (flags()->report_globals >= 3) - Printf("DynInitPoison module: %s\n", module_name); - + VPrintf(2, "DynInitPoison module: %s\n", module_name); if (current_dynamic_init_module_name == nullptr) { // First call, poison all globals from other modules. 
DynInitGlobals().forEach([&](auto &kv) { @@ -545,8 +544,7 @@ static void UnpoisonBeforeMain(void) { return; allow_after_dynamic_init = true; } - if (flags()->report_globals >= 3) - Printf("UnpoisonBeforeMain\n"); + VPrintf(2, "UnpoisonBeforeMain\n"); __asan_after_dynamic_init(); } @@ -570,8 +568,7 @@ void __asan_after_dynamic_init() { if (!current_dynamic_init_module_name) return; - if (flags()->report_globals >= 3) - Printf("DynInitUnpoison\n"); + VPrintf(2, "DynInitUnpoison\n"); DynInitGlobals().forEach([&](auto &kv) { UnpoisonDynamicGlobals(kv.second, /*mark_initialized=*/false); diff --git a/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp b/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp index 5cec029811cbc8..ef82c7a29575eb 100644 --- a/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx_asan -O3 %S/../initialization-nobug.cpp %S/../Helpers/initialization-nobug-extra.cpp -fuse-ld=lld -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInit" +// RUN: %clangxx_asan -O3 %S/../initialization-nobug.cpp %S/../Helpers/initialization-nobug-extra.cpp -fuse-ld=lld -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInit" // Same as initialization-nobug.cpp, but with lld we expect just one // `DynInitUnpoison` executed after `AfterDynamicInit` at the end. 
diff --git a/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp b/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp index 0f2ed6597154bb..b75f5be101ef8a 100644 --- a/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp @@ -4,7 +4,7 @@ // RUN: %clangxx_asan -g -O0 -DSHARED_LIB -DSIZE=1 %s -fPIC -shared -o %t-so-1.so // RUN: %clangxx_asan -g -O0 -DSHARED_LIB -DSIZE=2 %s -fPIC -shared -o %t-so-2.so // RUN: %clangxx_asan -g -O0 %s %libdl -Wl,--export-dynamic -o %t -// RUN: %env_asan_opts=report_globals=2:detect_odr_violation=1 %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=report_globals=1:detect_odr_violation=1:verbosity=3 %run %t 2>&1 | FileCheck %s // FIXME: Checks do not match on Android. // UNSUPPORTED: android diff --git a/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp b/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp index 8af3ec09be78c4..f28a9f6d07386d 100644 --- a/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp @@ -1,8 +1,8 @@ // RUN: %clangxx_asan -fno-sanitize-address-use-odr-indicator -fPIC %s -o %t -// RUN: %env_asan_opts=report_globals=2 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR0 +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR0 // RUN: %clangxx_asan -fsanitize-address-use-odr-indicator -fPIC %s -o %t -// RUN: %env_asan_opts=report_globals=2 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR1 +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR1 #include diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c b/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c index a0c96622efeea4..e5bd27bdf65fdf 100644 --- 
a/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c +++ b/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c @@ -1,11 +1,11 @@ // RUN: %clang_cl_asan %Od %p/dll_host.cpp %Fe%t // // RUN: %clang_cl_nocxx_asan %Gw %LD %Od %s %Fe%t.dll -// RUN: %env_asan_opts=report_globals=2 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=NOSTRIP +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=NOSTRIP // RUN: %clang_cl_nocxx_asan %Gw %LD -O2 %s %Fe%t.dll \ // RUN: %if target={{.*-windows-gnu}} %{ -Wl,--gc-sections %} \ // RUN: %else %{ -link -opt:ref %} -// RUN: %env_asan_opts=report_globals=2 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=STRIP +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=STRIP #include diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp index 06a632e6708b1e..c74b66f2b43b3e 100644 --- a/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp +++ b/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cl_asan %LD %Od -DDLL %s %Fe%t.dll \ // RUN: %if target={{.*-windows-gnu}} %{ -Wl,--out-implib,%t.lib %} // RUN: %clang_cl_asan %Od -DEXE %s %t.lib %Fe%te.exe -// RUN: %env_asan_opts=report_globals=2 %run %te.exe 2>&1 | FileCheck %s +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %te.exe 2>&1 | FileCheck %s // FIXME: Currently, the MT runtime build crashes on startup due to dbghelp.dll // initialization failure. 
diff --git a/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c b/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c index 0e15120a46f776..7f2405fdfc8364 100644 --- a/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c +++ b/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c @@ -1,9 +1,9 @@ // RUN: %clang_cl_nocxx_asan %Gw %Od %s %Fe%t.exe -// RUN: %env_asan_opts=report_globals=2 %t.exe 2>&1 | FileCheck %s --check-prefix=NOSTRIP +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %t.exe 2>&1 | FileCheck %s --check-prefix=NOSTRIP // RUN: %clang_cl_nocxx_asan %Gw -O2 %s %Fe%t.exe \ // RUN: %if target={{.*-windows-gnu}} %{ -Wl,--gc-sections %} \ // RUN: %else %{ -link -opt:ref %} -// RUN: %env_asan_opts=report_globals=2 %t.exe 2>&1 | FileCheck %s --check-prefix=STRIP +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %t.exe 2>&1 | FileCheck %s --check-prefix=STRIP #include int dead_global = 42; diff --git a/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp b/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp index 7cad3f39be1ec2..34ce18e146d677 100644 --- a/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp +++ b/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp @@ -1,6 +1,6 @@ // RUN: %clang_cl_asan %LD %Od -DDLL %s %Fe%t.dll // RUN: %clang_cl_asan %Od -DEXE %s %Fe%te.exe -// RUN: %env_asan_opts=report_globals=2 %run %te.exe %t.dll 2>&1 | FileCheck %s +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %te.exe %t.dll 2>&1 | FileCheck %s #include #include diff --git a/compiler-rt/test/asan/TestCases/initialization-nobug.cpp b/compiler-rt/test/asan/TestCases/initialization-nobug.cpp index f66d501124bc48..61328b9de28ae6 100644 --- a/compiler-rt/test/asan/TestCases/initialization-nobug.cpp +++ b/compiler-rt/test/asan/TestCases/initialization-nobug.cpp @@ -1,10 +1,10 @@ // A collection of various initializers which shouldn't trip up 
initialization // order checking. If successful, this will just return 0. -// RUN: %clangxx_asan -O0 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" -// RUN: %clangxx_asan -O1 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" -// RUN: %clangxx_asan -O2 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" -// RUN: %clangxx_asan -O3 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O0 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O1 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O2 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O3 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" // Simple access: // Make sure that accessing a global in the same TU is safe From a625435d3ef4c7bbfceb44498b9b5a2cbbed838b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 22 Aug 2024 
08:52:01 -0700 Subject: [PATCH 222/426] [Vectorize] Fix warnings This patch fixes warnings of the form: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:9300:23: error: loop variable '[E, Idx]' creates a copy from type 'const value_type' (aka 'const std::pair') [-Werror,-Wrange-loop-construct] --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index afaef6f9da9872..e8ab6839d9fa87 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9297,7 +9297,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) if (CommonMask[Idx] != PoisonMaskElem) CommonMask[Idx] = Idx; - for (const auto [E, Idx] : SubVectors) { + for (const auto &[E, Idx] : SubVectors) { Cost += ::getShuffleCost( TTI, TTI::SK_InsertSubvector, FixedVectorType::get(ScalarTy, CommonMask.size()), std::nullopt, @@ -12455,7 +12455,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) if (CommonMask[Idx] != PoisonMaskElem) CommonMask[Idx] = Idx; - for (const auto [E, Idx] : SubVectors) { + for (const auto &[E, Idx] : SubVectors) { Vec = Builder.CreateInsertVector( Vec->getType(), Vec, E->VectorizedValue, Builder.getInt64(Idx)); if (!CommonMask.empty()) { @@ -12636,7 +12636,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, E->ReuseShuffleIndices.end()); SmallVector GatheredScalars(E->Scalars.begin(), E->Scalars.end()); // Clear values, to be replaced by insertvector instructions. 
- for (const auto [EIdx, Idx] : E->CombinedEntriesWithIndices) + for (const auto &[EIdx, Idx] : E->CombinedEntriesWithIndices) for_each(MutableArrayRef(GatheredScalars) .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()), [&](Value *&V) { V = PoisonValue::get(V->getType()); }); @@ -13073,7 +13073,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) { - for (const auto [EIdx, _] : E->CombinedEntriesWithIndices) + for (const auto &[EIdx, _] : E->CombinedEntriesWithIndices) (void)vectorizeTree(VectorizableTree[EIdx].get(), /*PostponedPHIs=*/false); return processBuildVector(E, ScalarTy, Builder, *this); From 0bd90ec421da16df6d020d5a21b642a489491c1e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 22 Aug 2024 09:09:08 -0700 Subject: [PATCH 223/426] [AArch64] Fix a warning This patch fixes: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp:6102:9: error: unused variable 'OpVT' [-Werror,-Wunused-variable] --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 33fc6e56a24793..8c2f85657ff87e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6106,6 +6106,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) || (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) && "Unexpected aarch64_neon_u/saddlv type"); + (void)OpVT; // In order to avoid insert_subvector, use v4i32 rather than v2i32. SDValue ADDLV = DAG.getNode( IntNo == Intrinsic::aarch64_neon_uaddlv ? 
AArch64ISD::UADDLV From 46707b0a83b7769965f9b1b3d08b2cc6bd26c469 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 22 Aug 2024 09:12:11 -0700 Subject: [PATCH 224/426] [AArch64,ELF] Allow implicit $d/$x at section beginning The start state of a new section is `EMS_None`, often leading to a $d/$x at offset 0. Introduce a MCTargetOption/cl::opt "implicit-mapsyms" to allow an alternative behavior (https://github.com/ARM-software/abi-aa/issues/274): * Set the start state to `EMS_Data` or `EMS_A64`. * For text sections, add an ending $x only if the final data is not instructions. * For non-text sections, add an ending $d only if the final data is not data commands. ``` .section .text.1,"ax" nop // emit $d .long 42 // emit $x .section .text.2,"ax" nop ``` This new behavior decreases the .symtab size significantly: ``` % bloaty a64-2/bin/clang -- a64-0/bin/clang FILE SIZE VM SIZE -------------- -------------- -5.4% -1.13Mi [ = ] 0 .strtab -50.9% -4.09Mi [ = ] 0 .symtab -4.0% -5.22Mi [ = ] 0 TOTAL ``` --- This scheme works as long as the user can rule out some error scenarios: * .text.1 assembled using the traditional behavior is combined with .text.2 using the new behavior * A linker script combining non-text sections and text sections. The lack of mapping symbols in the non-text sections could make them treated as code, unless the linker inserts extra mapping symbols. The above mix-and-match scenarios aren't an issue at all for a significant portion of users. A text section may start with data commands in rare cases (e.g. -fsanitize=function) that many users don't care about. When combing `(.text.0; .word 0)` and `(.text.1; .word 0)`, the ending $x of .text.0 and the initial $d of .text.1 may have the same address. If both sections reside in the same file, ensure the ending symbol comes before the initial $d of .text.1, so that a dumb linker respecting the symbol order will place the ending $x before the initial $d. 
Disassemblers using stable sort will see both symbols at the same address, and the second will win. When section ordering mechanisms (e.g. --symbol-ordering-file, --call-graph-profile-sort, `.text : { second.o(.text) first.o(.text) }`) are involved, the initial data in a text section following a text section with trailing data could be misidentified as code, but the issue is local and the risk could be acceptable. Pull Request: https://github.com/llvm/llvm-project/pull/99718 --- lld/test/ELF/aarch64-mapsyms-implicit.s | 45 +++++++++++ llvm/include/llvm/MC/MCAssembler.h | 1 + llvm/include/llvm/MC/MCTargetOptions.h | 2 + .../llvm/MC/MCTargetOptionsCommandFlags.h | 2 + llvm/lib/MC/MCTargetOptionsCommandFlags.cpp | 10 +++ .../MCTargetDesc/AArch64ELFStreamer.cpp | 79 +++++++++++++++++-- .../test/MC/AArch64/mapping-across-sections.s | 57 ++++++++++--- 7 files changed, 179 insertions(+), 17 deletions(-) create mode 100644 lld/test/ELF/aarch64-mapsyms-implicit.s diff --git a/lld/test/ELF/aarch64-mapsyms-implicit.s b/lld/test/ELF/aarch64-mapsyms-implicit.s new file mode 100644 index 00000000000000..42f24ff8a830eb --- /dev/null +++ b/lld/test/ELF/aarch64-mapsyms-implicit.s @@ -0,0 +1,45 @@ +# REQUIRES: aarch64 +# RUN: llvm-mc -filetype=obj -triple=aarch64 -implicit-mapsyms %s -o %t.o +# RUN: ld.lld %t.o -z keep-text-section-prefix -o %t +# RUN: llvm-objdump -d --no-print-imm-hex --show-all-symbols %t | FileCheck %s + +# CHECK: <_start>: +# CHECK-NEXT: nop +# CHECK-EMPTY: +# CHECK-NEXT: <$d>: +# CHECK-NEXT: .word 0x0000002a +# CHECK-EMPTY: +# CHECK-NEXT: <$x>: +# CHECK-NEXT: nop +# CHECK-EMPTY: +# CHECK-NEXT: Disassembly of section .text.hot: +# CHECK-EMPTY: +# CHECK-NEXT: <.text.hot>: +# CHECK-NEXT: nop +# CHECK-EMPTY: +# CHECK-NEXT: <$d>: +# CHECK-NEXT: .word 0x0000002a +# CHECK-EMPTY: +# CHECK-NEXT: <$d>: +# CHECK-NEXT: <$x>: +# CHECK-NEXT: udf #42 +# CHECK-EMPTY: +# CHECK-NEXT: <$x>: +# CHECK-NEXT: nop + +## Trailing data followed by a section starting with an 
instruction. +.section .text.1,"ax" +.globl _start +_start: + nop + .long 42 +.section .text.2,"ax" + nop + +## Trailing data followed by a section starting with a data directive. +.section .text.hot.1,"ax" + nop + .long 42 +.section .text.hot.2,"ax" + .long 42 + nop diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h index c6fa48128d1891..a68eb49fda2825 100644 --- a/llvm/include/llvm/MC/MCAssembler.h +++ b/llvm/include/llvm/MC/MCAssembler.h @@ -218,6 +218,7 @@ class MCAssembler { const_iterator begin() const { return Sections.begin(); } const_iterator end() const { return Sections.end(); } + SmallVectorImpl &getSymbols() { return Symbols; } iterator_range::const_iterator>> symbols() const { diff --git a/llvm/include/llvm/MC/MCTargetOptions.h b/llvm/include/llvm/MC/MCTargetOptions.h index 899299fd15246a..a5371b3387a13d 100644 --- a/llvm/include/llvm/MC/MCTargetOptions.h +++ b/llvm/include/llvm/MC/MCTargetOptions.h @@ -64,6 +64,8 @@ class MCTargetOptions { // Use CREL relocation format for ELF. bool Crel = false; + bool ImplicitMapSyms = false; + // If true, prefer R_X86_64_[REX_]GOTPCRELX to R_X86_64_GOTPCREL on x86-64 // ELF. 
bool X86RelaxRelocations = true; diff --git a/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h b/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h index 9d592446f3ba77..5e82bc53f3b5ed 100644 --- a/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h +++ b/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h @@ -53,6 +53,8 @@ bool getSaveTempLabels(); bool getCrel(); +bool getImplicitMapSyms(); + bool getX86RelaxRelocations(); bool getX86Sse2Avx(); diff --git a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp index 813b1194b47cbf..1a4f7e93eeb74a 100644 --- a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp +++ b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp @@ -48,6 +48,7 @@ MCOPT(bool, NoDeprecatedWarn) MCOPT(bool, NoTypeCheck) MCOPT(bool, SaveTempLabels) MCOPT(bool, Crel) +MCOPT(bool, ImplicitMapSyms) MCOPT(bool, X86RelaxRelocations) MCOPT(bool, X86Sse2Avx) MCOPT(std::string, ABIName) @@ -134,6 +135,14 @@ llvm::mc::RegisterMCTargetOptionsFlags::RegisterMCTargetOptionsFlags() { cl::desc("Use CREL relocation format for ELF")); MCBINDOPT(Crel); + static cl::opt ImplicitMapSyms( + "implicit-mapsyms", + cl::desc("Allow mapping symbol at section beginning to be implicit, " + "lowering number of mapping symbols at the expense of some " + "portability. 
Recommended for projects that can build all their " + "object files using this option")); + MCBINDOPT(ImplicitMapSyms); + static cl::opt X86RelaxRelocations( "x86-relax-relocations", cl::desc( @@ -174,6 +183,7 @@ MCTargetOptions llvm::mc::InitMCTargetOptionsFromFlags() { Options.MCNoTypeCheck = getNoTypeCheck(); Options.MCSaveTempLabels = getSaveTempLabels(); Options.Crel = getCrel(); + Options.ImplicitMapSyms = getImplicitMapSyms(); Options.X86RelaxRelocations = getX86RelaxRelocations(); Options.X86Sse2Avx = getX86Sse2Avx(); Options.EmitDwarfUnwind = getEmitDwarfUnwind(); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index c69c87c685303c..490efb650d5038 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -24,14 +24,15 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCWinCOFFStreamer.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormattedStream.h" @@ -176,19 +177,29 @@ void AArch64TargetAsmStreamer::emitInst(uint32_t Inst) { /// by MachO. Beware! 
class AArch64ELFStreamer : public MCELFStreamer { public: + friend AArch64TargetELFStreamer; AArch64ELFStreamer(MCContext &Context, std::unique_ptr TAB, std::unique_ptr OW, std::unique_ptr Emitter) : MCELFStreamer(Context, std::move(TAB), std::move(OW), std::move(Emitter)), - LastEMS(EMS_None) {} + LastEMS(EMS_None) { + auto *TO = getContext().getTargetOptions(); + ImplicitMapSyms = TO && TO->ImplicitMapSyms; + } void changeSection(MCSection *Section, uint32_t Subsection = 0) override { - // We have to keep track of the mapping symbol state of any sections we - // use. Each one should start off as EMS_None, which is provided as the - // default constructor by DenseMap::lookup. + // Save the mapping symbol state for potential reuse when revisiting the + // section. When ImplicitMapSyms is true, the initial state is + // EMS_A64 for text sections and EMS_Data for the others. LastMappingSymbols[getCurrentSection().first] = LastEMS; - LastEMS = LastMappingSymbols.lookup(Section); + auto It = LastMappingSymbols.find(Section); + if (It != LastMappingSymbols.end()) + LastEMS = It->second; + else if (ImplicitMapSyms) + LastEMS = Section->isText() ? EMS_A64 : EMS_Data; + else + LastEMS = EMS_None; MCELFStreamer::changeSection(Section, Subsection); } @@ -269,13 +280,15 @@ class AArch64ELFStreamer : public MCELFStreamer { LastEMS = EMS_A64; } - void emitMappingSymbol(StringRef Name) { + MCSymbol *emitMappingSymbol(StringRef Name) { auto *Symbol = cast(getContext().createLocalSymbol(Name)); emitLabel(Symbol); + return Symbol; } DenseMap LastMappingSymbols; ElfMappingSymbol LastEMS; + bool ImplicitMapSyms; }; } // end anonymous namespace @@ -297,6 +310,58 @@ void AArch64TargetELFStreamer::finish() { AArch64ELFStreamer &S = getStreamer(); MCContext &Ctx = S.getContext(); auto &Asm = S.getAssembler(); + + // If ImplicitMapSyms is specified, ensure that text sections end with + // the A64 state while non-text sections end with the data state. 
When + // sections are combined by the linker, the subsequent section will start with + // the right state. The ending mapping symbol is added right after the last + // symbol relative to the section. When a dumb linker combines (.text.0; .word + // 0) and (.text.1; .word 0), the ending $x of .text.0 precedes the $d of + // .text.1, even if they have the same address. + if (S.ImplicitMapSyms) { + auto &Syms = Asm.getSymbols(); + const size_t NumSyms = Syms.size(); + DenseMap> EndMapSym; + for (MCSection &Sec : Asm) { + S.switchSection(&Sec); + if (S.LastEMS == (Sec.isText() ? AArch64ELFStreamer::EMS_Data + : AArch64ELFStreamer::EMS_A64)) + EndMapSym.insert( + {&Sec, {NumSyms, S.emitMappingSymbol(Sec.isText() ? "$x" : "$d")}}); + } + if (Syms.size() != NumSyms) { + SmallVector NewSyms; + DenseMap Cnt; + Syms.truncate(NumSyms); + // Find the last symbol index for each candidate section. + for (auto [I, Sym] : llvm::enumerate(Syms)) { + if (!Sym->isInSection()) + continue; + auto It = EndMapSym.find(&Sym->getSection()); + if (It != EndMapSym.end()) + It->second.first = I; + } + SmallVector Idx; + for (auto [I, Sym] : llvm::enumerate(Syms)) { + NewSyms.push_back(Sym); + if (!Sym->isInSection()) + continue; + auto It = EndMapSym.find(&Sym->getSection()); + // If `Sym` is the last symbol relative to the section, add the ending + // mapping symbol after `Sym`. + if (It != EndMapSym.end() && I == It->second.first) { + NewSyms.push_back(It->second.second); + Idx.push_back(I); + } + } + Syms = std::move(NewSyms); + // F.second holds the number of symbols added before the FILE symbol. + // Take into account the inserted mapping symbols. 
+ for (auto &F : S.getWriter().getFileNames()) + F.second += llvm::lower_bound(Idx, F.second) - Idx.begin(); + } + } + MCSectionELF *MemtagSec = nullptr; for (const MCSymbol &Symbol : Asm.symbols()) { const auto &Sym = cast(Symbol); diff --git a/llvm/test/MC/AArch64/mapping-across-sections.s b/llvm/test/MC/AArch64/mapping-across-sections.s index f453c86d45fb62..e688c770cc960d 100644 --- a/llvm/test/MC/AArch64/mapping-across-sections.s +++ b/llvm/test/MC/AArch64/mapping-across-sections.s @@ -1,5 +1,10 @@ // RUN: llvm-mc -triple=aarch64 -filetype=obj %s | llvm-objdump -t - | FileCheck %s --match-full-lines +// RUN: llvm-mc -triple=aarch64 -filetype=obj -implicit-mapsyms %s | llvm-objdump -t - | FileCheck %s --check-prefix=CHECK1 --match-full-lines +/// The test covers many state transitions. Let's use the first state and the last state to describe a section. +/// .text goes through cd -> dd -> cc -> dd. +/// .data goes through dd -> dc -> cd. +.file "0.s" .section .text1,"ax" add w0, w0, w0 @@ -12,29 +17,61 @@ add w0, w0, w0 .popsection .text -add w1, w1, w1 +.word 42 .section .text1,"ax" add w1, w1, w1 +.text +add w1, w1, w1 + +.section .data,"aw" +.word 42 +add w0, w0, w0 + .text .word 42 +## .rodata and subsequent symbols should be after the FILE symbol of "1.s". 
+.file "1.s" .section .rodata,"a" .word 42 add w0, w0, w0 +.section .data,"aw" +add w0, w0, w0 +.word 42 + +.text + .ident "clang" .section ".note.GNU-stack","",@progbits // CHECK: SYMBOL TABLE: -// CHECK-NEXT: 0000000000000000 l .text1 0000000000000000 $x -// CHECK-NEXT: 0000000000000000 l .text 0000000000000000 $x -// CHECK-NEXT: 0000000000000004 l .text 0000000000000000 $d -// CHECK-NEXT: 0000000000000000 l .data 0000000000000000 $d -// CHECK-NEXT: 0000000000000008 l .text 0000000000000000 $x -// CHECK-NEXT: 000000000000000c l .text 0000000000000000 $d -// CHECK-NEXT: 0000000000000000 l .rodata 0000000000000000 $d -// CHECK-NEXT: 0000000000000004 l .rodata 0000000000000000 $x -// CHECK-NEXT: 0000000000000000 l .comment 0000000000000000 $d +// CHECK-NEXT: 0000000000000000 l df *ABS* 0000000000000000 0.s +// CHECK-NEXT: 0000000000000000 l .text1 0000000000000000 $x +// CHECK-NEXT: 0000000000000000 l .text 0000000000000000 $x +// CHECK-NEXT: 0000000000000004 l .text 0000000000000000 $d +// CHECK-NEXT: 0000000000000000 l .data 0000000000000000 $d +// CHECK-NEXT: 000000000000000c l .text 0000000000000000 $x +// CHECK-NEXT: 0000000000000008 l .data 0000000000000000 $x +// CHECK-NEXT: 0000000000000010 l .text 0000000000000000 $d +// CHECK-NEXT: 0000000000000000 l df *ABS* 0000000000000000 1.s +// CHECK-NEXT: 0000000000000000 l .rodata 0000000000000000 $d +// CHECK-NEXT: 0000000000000004 l .rodata 0000000000000000 $x +// CHECK-NEXT: 0000000000000010 l .data 0000000000000000 $d +// CHECK-NEXT: 0000000000000000 l .comment 0000000000000000 $d // CHECK-NOT: {{.}} + +// CHECK1: SYMBOL TABLE: +// CHECK1-NEXT: 0000000000000000 l df *ABS* 0000000000000000 0.s +// CHECK1-NEXT: 0000000000000004 l .text 0000000000000000 $d +// CHECK1-NEXT: 000000000000000c l .text 0000000000000000 $x +// CHECK1-NEXT: 0000000000000008 l .data 0000000000000000 $x +// CHECK1-NEXT: 0000000000000010 l .text 0000000000000000 $d +// CHECK1-NEXT: 0000000000000014 l .text 0000000000000000 $x +// 
CHECK1-NEXT: 0000000000000000 l df *ABS* 0000000000000000 1.s +// CHECK1-NEXT: 0000000000000004 l .rodata 0000000000000000 $x +// CHECK1-NEXT: 0000000000000008 l .rodata 0000000000000000 $d +// CHECK1-NEXT: 0000000000000010 l .data 0000000000000000 $d +// CHECK1-NOT: {{.}} From 2012b25420160c3d4e595b29910afffa6c5f3fc2 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 22 Aug 2024 17:14:53 +0100 Subject: [PATCH 225/426] [AMDGPU][GlobalISel] Disable fixed-point iteration in all Combiners (#105517) Disable fixed-point iteration in all AMDGPU Combiners after #102163. This saves around 2% compile time in ad hoc testing on some large graphics shaders. I did not notice any regressions in the generated code, just a bunch of harmless differences in instruction selection and register allocation. --- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 6 +++++- llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 6 ++++++ llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 6 ++++++ .../AMDGPU/GlobalISel/postlegalizercombiner-and.mir | 8 ++++++++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index cfe9f33efc91b8..54d927c33fc553 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -499,7 +499,11 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, LI, EnableOpt, F.hasOptSize(), F.hasMinSize()); - + // Disable fixed-point iteration to reduce compile-time + CInfo.MaxIterations = 1; + CInfo.ObserverLvl = CombinerInfo::ObserverLevel::SinglePass; + // Legalizer performs DCE, so a full DCE pass is unnecessary. 
+ CInfo.EnableFullDCE = false; AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr, RuleConfig, ST, MDT, LI); return Impl.combineMachineInstrs(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index 4d0cb467ba374d..ff8189ce31f7f7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -276,6 +276,12 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { : &getAnalysis().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize()); + // Disable fixed-point iteration to reduce compile-time + CInfo.MaxIterations = 1; + CInfo.ObserverLvl = CombinerInfo::ObserverLevel::SinglePass; + // This is the first Combiner, so the input IR might contain dead + // instructions. + CInfo.EnableFullDCE = true; AMDGPUPreLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, RuleConfig, STI, MDT, STI.getLegalizerInfo()); return Impl.combineMachineInstrs(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 74f0540239c939..e236a5d7522e02 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -454,6 +454,12 @@ bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) { CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, LI, EnableOpt, F.hasOptSize(), F.hasMinSize()); + // Disable fixed-point iteration to reduce compile-time + CInfo.MaxIterations = 1; + CInfo.ObserverLvl = CombinerInfo::ObserverLevel::SinglePass; + // RegBankSelect seems not to leave dead instructions, so a full DCE pass is + // unnecessary. 
+ CInfo.EnableFullDCE = false; AMDGPURegBankCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr, RuleConfig, ST, MDT, LI); return Impl.combineMachineInstrs(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir index 67e6de1ce76449..fdc22a23f74163 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir @@ -203,6 +203,7 @@ body: | ; CHECK-LABEL: name: remove_and_65535_groupstaticsize ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: %lds_size:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize) ; CHECK-NEXT: %mask:_(s32) = G_CONSTANT i32 65535 ; CHECK-NEXT: %and:_(s32) = G_AND %lds_size, %mask @@ -225,6 +226,7 @@ body: | ; CHECK-LABEL: name: remove_and_131071_groupstaticsize ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: %lds_size:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize) ; CHECK-NEXT: $vgpr0 = COPY %lds_size(s32) %ptr:_(p1) = COPY $vgpr0_vgpr1 @@ -245,6 +247,7 @@ body: | ; CHECK-LABEL: name: no_remove_and_65536_groupstaticsize ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: %lds_size:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize) ; CHECK-NEXT: %mask:_(s32) = G_CONSTANT i32 65536 ; CHECK-NEXT: %and:_(s32) = G_AND %lds_size, %mask @@ -267,6 +270,7 @@ body: | ; CHECK-LABEL: name: no_remove_and_32767_groupstaticsize ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: %lds_size:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize) ; CHECK-NEXT: %mask:_(s32) = G_CONSTANT i32 32767 ; CHECK-NEXT: %and:_(s32) = G_AND %lds_size, %mask @@ -291,6 +295,8 @@ body: | ; CHECK-LABEL: name: 
remove_and_umin_lhs_only ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4 ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %ptr0:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: %ptr1:_(p1) = COPY $vgpr2_vgpr3 ; CHECK-NEXT: %val:_(s32) = COPY $vgpr4 ; CHECK-NEXT: %k255:_(s32) = G_CONSTANT i32 255 ; CHECK-NEXT: %umin0:_(s32) = G_UMIN %val, %k255 @@ -316,6 +322,8 @@ body: | ; CHECK-LABEL: name: remove_and_umin_rhs_only ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4 ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %ptr0:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: %ptr1:_(p1) = COPY $vgpr2_vgpr3 ; CHECK-NEXT: %val:_(s32) = COPY $vgpr4 ; CHECK-NEXT: %k255:_(s32) = G_CONSTANT i32 255 ; CHECK-NEXT: %umin0:_(s32) = G_UMIN %val, %k255 From 09262553fa1874bec04aebb1ecd3fd3386d316d5 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 22 Aug 2024 16:14:05 +0000 Subject: [PATCH 226/426] [lldb] Fix typos in ScriptedInterface.h --- lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h index 3ce47d0584a8a7..3850edf879ac45 100644 --- a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h +++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h @@ -36,8 +36,8 @@ class ScriptedInterface { template static Ret ErrorWithMessage(llvm::StringRef caller_name, llvm::StringRef error_msg, Status &error, - LLDBLog log_caterogy = LLDBLog::Process) { - LLDB_LOGF(GetLog(log_caterogy), "%s ERROR = %s", caller_name.data(), + LLDBLog log_category = LLDBLog::Process) { + LLDB_LOGF(GetLog(log_category), "%s ERROR = %s", caller_name.data(), error_msg.data()); std::string full_error_message = llvm::Twine(caller_name + llvm::Twine(" ERROR = ") + From 83fc989a227a0cafb945307d4f0d68a4df864dc1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 22 Aug 2024 09:15:47 -0700 Subject: [PATCH 227/426] [CodeGen] 
Construct SmallVector with iterator ranges (NFC) (#105622) --- llvm/lib/CodeGen/MachineSink.cpp | 3 +-- llvm/lib/CodeGen/ShrinkWrap.cpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index f10b98cebd133f..fe515ef5be541f 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -766,8 +766,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { } if (SinkInstsIntoCycle) { - SmallVector Cycles(CI->toplevel_begin(), - CI->toplevel_end()); + SmallVector Cycles(CI->toplevel_cycles()); for (auto *Cycle : Cycles) { MachineBasicBlock *Preheader = Cycle->getCyclePreheader(); if (!Preheader) { diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp index b9f376a5af794d..600127f17110db 100644 --- a/llvm/lib/CodeGen/ShrinkWrap.cpp +++ b/llvm/lib/CodeGen/ShrinkWrap.cpp @@ -411,8 +411,7 @@ hasDirtyPred(const DenseSet &ReachableByDirty, /// Derives the list of all the basic blocks reachable from MBB. static void markAllReachable(DenseSet &Visited, const MachineBasicBlock &MBB) { - SmallVector Worklist(MBB.succ_begin(), - MBB.succ_end()); + SmallVector Worklist(MBB.successors()); Visited.insert(&MBB); while (!Worklist.empty()) { MachineBasicBlock *SuccMBB = Worklist.pop_back_val(); From eb549da9e5c1e626edb14ba9ce43e46ad3d088af Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 22 Aug 2024 09:20:53 -0700 Subject: [PATCH 228/426] [Driver] Add -Wa, options -mmapsyms={default,implicit} -Wa,-mmapsyms=implicit enables the alternative mapping symbol scheme discussed at #99718. While not conforming to the current aaelf64 ABI, the option is invaluable for those with full control over their toolchain, no reliance on weird relocatable files, and a strong focus on minimizing both relocatable and executable sizes. The option is discouraged when portability of the relocatable objects is a concern. 
https://maskray.me/blog/2024-07-21-mapping-symbols-rethinking-for-efficiency elaborates the risk. Pull Request: https://github.com/llvm/llvm-project/pull/104542 --- clang/include/clang/Basic/CodeGenOptions.def | 1 + clang/include/clang/Driver/Options.td | 6 +++++ clang/lib/CodeGen/BackendUtil.cpp | 1 + clang/lib/Driver/ToolChains/Clang.cpp | 12 +++++++++ clang/lib/Driver/ToolChains/CommonArgs.cpp | 24 +++++++++++++++-- clang/test/Driver/mmapsyms.c | 28 ++++++++++++++++++++ clang/test/Misc/cc1as-mmapsyms.c | 9 +++++++ clang/tools/driver/cc1as_main.cpp | 5 ++++ 8 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 clang/test/Driver/mmapsyms.c create mode 100644 clang/test/Misc/cc1as-mmapsyms.c diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index ecea476abe3232..b600198998d85b 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -37,6 +37,7 @@ VALUE_CODEGENOPT(Name, Bits, Default) CODEGENOPT(DisableIntegratedAS, 1, 0) ///< -no-integrated-as CODEGENOPT(Crel, 1, 0) ///< -Wa,--crel +CODEGENOPT(ImplicitMapSyms, 1, 0) ///< -Wa,-mmapsyms=implicit CODEGENOPT(AsmVerbose , 1, 0) ///< -dA, -fverbose-asm. CODEGENOPT(PreserveAsmComments, 1, 1) ///< -dA, -fno-preserve-as-comments. 
CODEGENOPT(AssumeSaneOperatorNew , 1, 1) ///< implicit __attribute__((malloc)) operator new diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 5d8791727d2109..7a3c699a6a8e88 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -7142,6 +7142,12 @@ def massembler_fatal_warnings : Flag<["-"], "massembler-fatal-warnings">, def crel : Flag<["--"], "crel">, HelpText<"Enable CREL relocation format (ELF only)">, MarshallingInfoFlag>; +def mmapsyms_implicit : Flag<["-"], "mmapsyms=implicit">, + HelpText<"Allow mapping symbol at section beginning to be implicit, " + "lowering number of mapping symbols at the expense of some " + "portability. Recommended for projects that can build all their " + "object files using this option">, + MarshallingInfoFlag>; def mrelax_relocations_no : Flag<["-"], "mrelax-relocations=no">, HelpText<"Disable x86 relax relocations">, MarshallingInfoNegativeFlag>; diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 34c08818dbb9ad..fdd89edd72e109 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -471,6 +471,7 @@ static bool initTargetOptions(DiagnosticsEngine &Diags, Options.MCOptions.Dwarf64 = CodeGenOpts.Dwarf64; Options.MCOptions.PreserveAsmComments = CodeGenOpts.PreserveAsmComments; Options.MCOptions.Crel = CodeGenOpts.Crel; + Options.MCOptions.ImplicitMapSyms = CodeGenOpts.ImplicitMapSyms; Options.MCOptions.X86RelaxRelocations = CodeGenOpts.X86RelaxRelocations; Options.MCOptions.CompressDebugSections = CodeGenOpts.getCompressDebugSections(); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 53fdc29948508e..9f1d57f43b6565 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2554,6 +2554,7 @@ static void CollectArgsForIntegratedAssembler(Compilation &C, const llvm::Triple &Triple = 
C.getDefaultToolChain().getTriple(); bool IsELF = Triple.isOSBinFormatELF(); bool Crel = false, ExperimentalCrel = false; + bool ImplicitMapSyms = false; bool UseRelaxRelocations = C.getDefaultToolChain().useRelaxRelocations(); bool UseNoExecStack = false; bool Msa = false; @@ -2642,6 +2643,15 @@ static void CollectArgsForIntegratedAssembler(Compilation &C, // recognize but skip over here. continue; break; + case llvm::Triple::aarch64: + case llvm::Triple::aarch64_be: + case llvm::Triple::aarch64_32: + if (Equal.first == "-mmapsyms") { + ImplicitMapSyms = Equal.second == "implicit"; + checkArg(IsELF, {"default", "implicit"}); + continue; + } + break; case llvm::Triple::mips: case llvm::Triple::mipsel: case llvm::Triple::mips64: @@ -2786,6 +2796,8 @@ static void CollectArgsForIntegratedAssembler(Compilation &C, << "-Wa,--crel" << D.getTargetTriple(); } } + if (ImplicitMapSyms) + CmdArgs.push_back("-mmapsyms=implicit"); if (Msa) CmdArgs.push_back("-mmsa"); if (!UseRelaxRelocations) diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 320d2901da06ed..0738ed18f54078 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1143,10 +1143,27 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, addMachineOutlinerArgs(D, Args, CmdArgs, ToolChain.getEffectiveTriple(), /*IsLTO=*/true, PluginOptPrefix); + bool IsELF = Triple.isOSBinFormatELF(); bool Crel = false; + bool ImplicitMapSyms = false; for (const Arg *A : Args.filtered(options::OPT_Wa_COMMA)) { for (StringRef V : A->getValues()) { - if (V == "--crel") + auto Equal = V.split('='); + auto checkArg = [&](bool ValidTarget, + std::initializer_list Set) { + if (!ValidTarget) { + D.Diag(diag::err_drv_unsupported_opt_for_target) + << (Twine("-Wa,") + Equal.first + "=").str() + << Triple.getTriple(); + } else if (!llvm::is_contained(Set, Equal.second)) { + 
D.Diag(diag::err_drv_unsupported_option_argument) + << (Twine("-Wa,") + Equal.first + "=").str() << Equal.second; + } + }; + if (Equal.first == "-mmapsyms") { + ImplicitMapSyms = Equal.second == "implicit"; + checkArg(IsELF && Triple.isAArch64(), {"default", "implicit"}); + } else if (V == "--crel") Crel = true; else if (V == "--no-crel") Crel = false; @@ -1156,13 +1173,16 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, } } if (Crel) { - if (Triple.isOSBinFormatELF() && !Triple.isMIPS()) { + if (IsELF && !Triple.isMIPS()) { CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + "-crel")); } else { D.Diag(diag::err_drv_unsupported_opt_for_target) << "-Wa,--crel" << D.getTargetTriple(); } } + if (ImplicitMapSyms) + CmdArgs.push_back( + Args.MakeArgString(Twine(PluginOptPrefix) + "-implicit-mapsyms")); } void tools::addOpenMPRuntimeLibraryPath(const ToolChain &TC, diff --git a/clang/test/Driver/mmapsyms.c b/clang/test/Driver/mmapsyms.c new file mode 100644 index 00000000000000..3535af3cf1cdda --- /dev/null +++ b/clang/test/Driver/mmapsyms.c @@ -0,0 +1,28 @@ +/// Alternative mapping symbol scheme for AArch64. 
+// RUN: %clang -### -c --target=aarch64 -Wa,-mmapsyms=implicit %s -Werror 2>&1 | FileCheck %s +// RUN: %clang -### -c --target=aarch64_be -Wa,-mmapsyms=implicit %s -Werror 2>&1 | FileCheck %s +// RUN: %clang -### -c --target=aarch64 -Wa,-mmapsyms=implicit,-mmapsyms=default %s -Werror 2>&1 | FileCheck %s --check-prefix=NO +// RUN: not %clang -### -c --target=arm64-apple-darwin -Wa,-mmapsyms=implicit %s 2>&1 | FileCheck %s --check-prefix=ERR +// RUN: not %clang -### -c --target=x86_64 -Wa,-mmapsyms=implicit %s 2>&1 | FileCheck %s --check-prefix=ERR2 + +// RUN: %clang -### -c --target=aarch64 -Werror -Wa,-mmapsyms=implicit -x assembler %s -Werror 2>&1 | FileCheck %s --check-prefix=ASM +// RUN: not %clang -### -c --target=x86_64 -Wa,-mmapsyms=implicit -x assembler %s 2>&1 | FileCheck %s --check-prefix=ERR2 + +// CHECK: "-cc1" {{.*}}"-mmapsyms=implicit" +// NO: "-cc1" +// NO-NOT: "-mmapsyms=implicit" +// ASM: "-cc1as" {{.*}}"-mmapsyms=implicit" +// ERR: error: unsupported option '-Wa,-mmapsyms=' for target 'arm64-apple-darwin' +// ERR2: error: unsupported argument '-mmapsyms=implicit' to option '-Wa,' + +/// Check LTO. 
+// RUN: %clang -### --target=aarch64-linux -Werror -flto -Wa,-mmapsyms=implicit %s 2>&1 | FileCheck %s --check-prefix=LTO +// RUN: %clang -### --target=aarch64-linux -Werror -flto -Wa,-mmapsyms=implicit -Wa,-mmapsyms=default %s 2>&1 | FileCheck %s --check-prefix=LTO-NO + +// LTO: "-plugin-opt=-implicit-mapsyms" +// LTO-NO-NOT: "-plugin-opt=-implicit-mapsyms" + +// RUN: touch %t.o +// RUN: not %clang -### --target=x86_64-unknown-linux -flto -Wa,-mmapsyms=implicit %t.o 2>&1 | FileCheck %s --check-prefix=LTO-ERR + +// LTO-ERR: error: unsupported option '-Wa,-mmapsyms=' for target 'x86_64-unknown-linux' diff --git a/clang/test/Misc/cc1as-mmapsyms.c b/clang/test/Misc/cc1as-mmapsyms.c new file mode 100644 index 00000000000000..550281903c216e --- /dev/null +++ b/clang/test/Misc/cc1as-mmapsyms.c @@ -0,0 +1,9 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang -cc1as -triple aarch64 %s -filetype obj -mmapsyms=implicit -o %t.o +// RUN: llvm-readelf -s %t.o | FileCheck %s + +// CHECK: Symbol table '.symtab' contains 1 entries: +nop + +.data +.quad 0 diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp index 070cf8b44e8eb6..7fe97cc6e6ace1 100644 --- a/clang/tools/driver/cc1as_main.cpp +++ b/clang/tools/driver/cc1as_main.cpp @@ -164,6 +164,8 @@ struct AssemblerInvocation { LLVM_PREFERRED_TYPE(bool) unsigned Crel : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned ImplicitMapsyms : 1; LLVM_PREFERRED_TYPE(bool) unsigned X86RelaxRelocations : 1; @@ -211,6 +213,7 @@ struct AssemblerInvocation { EmitDwarfUnwind = EmitDwarfUnwindType::Default; EmitCompactUnwindNonCanonical = false; Crel = false; + ImplicitMapsyms = 0; X86RelaxRelocations = 0; X86Sse2Avx = 0; } @@ -382,6 +385,7 @@ bool AssemblerInvocation::CreateFromArgs(AssemblerInvocation &Opts, Opts.EmitCompactUnwindNonCanonical = Args.hasArg(OPT_femit_compact_unwind_non_canonical); Opts.Crel = Args.hasArg(OPT_crel); + Opts.ImplicitMapsyms = Args.hasArg(OPT_mmapsyms_implicit); Opts.X86RelaxRelocations 
= !Args.hasArg(OPT_mrelax_relocations_no); Opts.X86Sse2Avx = Args.hasArg(OPT_msse2avx); @@ -442,6 +446,7 @@ static bool ExecuteAssemblerImpl(AssemblerInvocation &Opts, MCOptions.EmitCompactUnwindNonCanonical = Opts.EmitCompactUnwindNonCanonical; MCOptions.MCSaveTempLabels = Opts.SaveTemporaryLabels; MCOptions.Crel = Opts.Crel; + MCOptions.ImplicitMapSyms = Opts.ImplicitMapsyms; MCOptions.X86RelaxRelocations = Opts.X86RelaxRelocations; MCOptions.X86Sse2Avx = Opts.X86Sse2Avx; MCOptions.CompressDebugSections = Opts.CompressDebugSections; From 6ec4c9c3eb4a556f848dac37a2d6f0d46ecc6f02 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 22 Aug 2024 17:22:21 +0100 Subject: [PATCH 229/426] [MCA][X86] Add scatter instruction test coverage for #105675 --- .../llvm-mca/X86/Generic/resources-avx512.s | 28 +++++++++- .../llvm-mca/X86/Generic/resources-avx512vl.s | 54 ++++++++++++++++++- .../X86/SapphireRapids/resources-avx512.s | 28 +++++++++- .../X86/SapphireRapids/resources-avx512vl.s | 54 ++++++++++++++++++- .../X86/SkylakeServer/resources-avx512.s | 28 +++++++++- .../X86/SkylakeServer/resources-avx512vl.s | 54 ++++++++++++++++++- .../llvm-mca/X86/Znver4/resources-avx512.s | 28 +++++++++- .../llvm-mca/X86/Znver4/resources-avx512vl.s | 54 ++++++++++++++++++- 8 files changed, 320 insertions(+), 8 deletions(-) diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512.s index 1df586faa543d1..c3453d890d76d5 100644 --- a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512.s @@ -814,6 +814,11 @@ vpermq %zmm16, %zmm17, %zmm19 {z}{k1} vpermq (%rax), %zmm17, %zmm19 {z}{k1} vpermq (%rax){1to8}, %zmm17, %zmm19 {z}{k1} +vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} + vpshufd $0, %zmm16, %zmm19 vpshufd $0, (%rax), %zmm19 vpshufd $0, 
(%rax){1to16}, %zmm19 @@ -884,6 +889,11 @@ vpunpcklqdq %zmm16, %zmm17, %zmm19 {z}{k1} vpunpcklqdq (%rax), %zmm17, %zmm19 {z}{k1} vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {z}{k1} +vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} + vshuff32x4 $0, %zmm16, %zmm17, %zmm19 vshuff32x4 $0, (%rax), %zmm17, %zmm19 vshuff32x4 $0, (%rax){1to16}, %zmm17, %zmm19 @@ -1792,6 +1802,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 1 1 1.00 vpermq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpermq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 * vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 1 1.00 vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: 2 8 1.00 * vpshufd $0, (%rax), %zmm19 # CHECK-NEXT: 2 8 1.00 * vpshufd $0, (%rax){1to16}, %zmm19 @@ -1855,6 +1869,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 1 1 1.00 vpunpcklqdq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpunpcklqdq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 * vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 1 1.00 vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 2 8 1.00 * vshuff32x4 $0, (%rax), %zmm17, %zmm19 # CHECK-NEXT: 2 8 1.00 * vshuff32x4 $0, (%rax){1to16}, %zmm17, %zmm19 @@ -2032,7 +2050,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK: Resource pressure 
per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] -# CHECK-NEXT: - 1506.00 198.00 335.00 17.00 523.00 300.00 300.00 +# CHECK-NEXT: - 1506.00 198.00 335.00 25.00 523.00 304.00 304.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: @@ -2750,6 +2768,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - - 1.00 - - vpermq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vpermq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - 1.00 - - vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vpshufd $0, (%rax), %zmm19 # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vpshufd $0, (%rax){1to16}, %zmm19 @@ -2813,6 +2835,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - - 1.00 - - vpunpcklqdq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vpunpcklqdq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - 1.00 - - vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vshuff32x4 $0, (%rax), %zmm17, %zmm19 # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vshuff32x4 $0, (%rax){1to16}, %zmm17, 
%zmm19 diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512vl.s index e8e7a80f690bfa..4a4f77826437bd 100644 --- a/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512vl.s +++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-avx512vl.s @@ -1344,6 +1344,16 @@ vpmulld %ymm16, %ymm17, %ymm19 {z}{k1} vpmulld (%rax), %ymm17, %ymm19 {z}{k1} vpmulld (%rax){1to8}, %ymm17, %ymm19 {z}{k1} +vpscatterdd %xmm1, (%rdx,%xmm0,4) {%k1} +vpscatterdq %xmm1, (%rdx,%xmm0,4) {%k1} +vpscatterqd %xmm1, (%rdx,%xmm0,4) {%k1} +vpscatterqq %xmm1, (%rdx,%xmm0,4) {%k1} + +vpscatterdd %ymm1, (%rdx,%ymm0,4) {%k1} +vpscatterdq %ymm1, (%rdx,%xmm0,4) {%k1} +vpscatterqd %xmm1, (%rdx,%ymm0,4) {%k1} +vpscatterqq %ymm1, (%rdx,%ymm0,4) {%k1} + vpshufd $0, %xmm16, %xmm19 vpshufd $0, (%rax), %xmm19 vpshufd $0, (%rax){1to4}, %xmm19 @@ -1500,6 +1510,16 @@ vpunpckldq %ymm16, %ymm17, %ymm19 {z}{k1} vpunpckldq (%rax), %ymm17, %ymm19 {z}{k1} vpunpckldq (%rax){1to8}, %ymm17, %ymm19 {z}{k1} +vscatterdps %xmm1, (%rdx,%xmm0,4) {%k1} +vscatterdpd %xmm1, (%rdx,%xmm0,4) {%k1} +vscatterqps %xmm1, (%rdx,%xmm0,4) {%k1} +vscatterqpd %xmm1, (%rdx,%xmm0,4) {%k1} + +vscatterdps %ymm1, (%rdx,%ymm0,4) {%k1} +vscatterdpd %ymm1, (%rdx,%xmm0,4) {%k1} +vscatterqps %xmm1, (%rdx,%ymm0,4) {%k1} +vscatterqpd %ymm1, (%rdx,%ymm0,4) {%k1} + vshuff32x4 $0, %ymm16, %ymm17, %ymm19 vshuff32x4 $0, (%rax), %ymm17, %ymm19 vshuff32x4 $0, (%rax){1to8}, %ymm17, %ymm19 @@ -2897,6 +2917,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 5 1.00 vpmulld %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 12 1.00 * vpmulld (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 12 1.00 * vpmulld (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 * vpscatterdd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterdq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterqd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterqq %xmm1, 
(%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterdd %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterdq %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterqd %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterqq %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: 1 1 0.50 vpshufd $0, %xmm16, %xmm19 # CHECK-NEXT: 2 7 0.50 * vpshufd $0, (%rax), %xmm19 # CHECK-NEXT: 2 7 0.50 * vpshufd $0, (%rax){1to4}, %xmm19 @@ -3035,6 +3063,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 1 1.00 vpunpckldq %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpunpckldq (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpunpckldq (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 * vscatterdps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterdpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterqps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterqpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterdps %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterdpd %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterqps %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterqpd %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: 1 1 1.00 vshuff32x4 $0, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: 2 8 1.00 * vshuff32x4 $0, (%rax), %ymm17, %ymm19 # CHECK-NEXT: 2 8 1.00 * vshuff32x4 $0, (%rax){1to8}, %ymm17, %ymm19 @@ -3228,7 +3264,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] -# CHECK-NEXT: - 1935.00 278.00 579.50 32.00 738.50 486.50 486.50 +# CHECK-NEXT: - 1935.00 278.00 579.50 48.00 738.50 494.50 494.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: @@ -4420,6 +4456,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: - - 1.00 - - - - - vpmulld %ymm16, %ymm17, %ymm19 {%k1} {z} # 
CHECK-NEXT: - - 1.00 - - - 0.50 0.50 vpmulld (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - 1.00 - - - 0.50 0.50 vpmulld (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vpscatterdd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vpscatterdq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vpscatterqd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vpscatterqq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vpscatterdd %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vpscatterdq %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vpscatterqd %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vpscatterqq %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: - - - 0.50 - 0.50 - - vpshufd $0, %xmm16, %xmm19 # CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vpshufd $0, (%rax), %xmm19 # CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vpshufd $0, (%rax){1to4}, %xmm19 @@ -4558,6 +4602,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: - - - - - 1.00 - - vpunpckldq %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vpunpckldq (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vpunpckldq (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vscatterdps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vscatterdpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vscatterqps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vscatterqpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vscatterdps %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vscatterdpd %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vscatterqps %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - - - 1.00 - 0.50 0.50 vscatterqpd %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: - - - - - 1.00 
- - vshuff32x4 $0, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vshuff32x4 $0, (%rax), %ymm17, %ymm19 # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vshuff32x4 $0, (%rax){1to8}, %ymm17, %ymm19 diff --git a/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512.s index 88f3313c70fde2..b2fde3929106a5 100644 --- a/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512.s @@ -814,6 +814,11 @@ vpermq %zmm16, %zmm17, %zmm19 {z}{k1} vpermq (%rax), %zmm17, %zmm19 {z}{k1} vpermq (%rax){1to8}, %zmm17, %zmm19 {z}{k1} +vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} + vpshufd $0, %zmm16, %zmm19 vpshufd $0, (%rax), %zmm19 vpshufd $0, (%rax){1to16}, %zmm19 @@ -884,6 +889,11 @@ vpunpcklqdq %zmm16, %zmm17, %zmm19 {z}{k1} vpunpcklqdq (%rax), %zmm17, %zmm19 {z}{k1} vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {z}{k1} +vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} + vshuff32x4 $0, %zmm16, %zmm17, %zmm19 vshuff32x4 $0, (%rax), %zmm17, %zmm19 vshuff32x4 $0, (%rax){1to16}, %zmm17, %zmm19 @@ -1792,6 +1802,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 1 3 1.00 vpermq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 11 1.00 * vpermq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 11 1.00 * vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 35 19 8.00 * vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 12 4.00 * vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 19 12 4.00 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 12 4.00 * vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 1 1.00 vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: 2 9 1.00 * vpshufd $0, 
(%rax), %zmm19 # CHECK-NEXT: 2 9 1.00 * vpshufd $0, (%rax){1to16}, %zmm19 @@ -1855,6 +1869,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 1 1 1.00 vpunpcklqdq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 9 1.00 * vpunpcklqdq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 9 1.00 * vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 35 19 8.00 * vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 12 4.00 * vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 19 12 4.00 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 12 4.00 * vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 3 1.00 vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 2 11 1.00 * vshuff32x4 $0, (%rax), %zmm17, %zmm19 # CHECK-NEXT: 2 11 1.00 * vshuff32x4 $0, (%rax){1to16}, %zmm17, %zmm19 @@ -2037,7 +2055,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] -# CHECK-NEXT: 491.00 12.00 218.33 218.33 8.50 577.00 - 8.50 8.50 8.50 - 218.33 - +# CHECK-NEXT: 508.60 13.60 218.33 218.33 48.50 578.60 1.60 48.50 48.50 48.50 1.60 218.33 - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: @@ -2755,6 +2773,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - - 1.00 - - - - - - - vpermq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 - 1.00 - - - - - 0.33 - vpermq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 - 1.00 - - - - - 0.33 - vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 2.20 0.20 - - 8.00 0.20 0.20 8.00 8.00 8.00 0.20 - - vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 2.20 0.20 - - 4.00 0.20 0.20 4.00 4.00 4.00 0.20 - - vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 2.20 0.20 - - 4.00 0.20 0.20 4.00 4.00 4.00 0.20 - - vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 2.20 
0.20 - - 4.00 0.20 0.20 4.00 4.00 4.00 0.20 - - vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - 1.00 - - - - - - - vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: - - 0.33 0.33 - 1.00 - - - - - 0.33 - vpshufd $0, (%rax), %zmm19 # CHECK-NEXT: - - 0.33 0.33 - 1.00 - - - - - 0.33 - vpshufd $0, (%rax){1to16}, %zmm19 @@ -2818,6 +2840,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - - 1.00 - - - - - - - vpunpcklqdq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 - 1.00 - - - - - 0.33 - vpunpcklqdq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 0.33 0.33 - 1.00 - - - - - 0.33 - vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 2.20 0.20 - - 8.00 0.20 0.20 8.00 8.00 8.00 0.20 - - vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 2.20 0.20 - - 4.00 0.20 0.20 4.00 4.00 4.00 0.20 - - vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 2.20 0.20 - - 4.00 0.20 0.20 4.00 4.00 4.00 0.20 - - vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 2.20 0.20 - - 4.00 0.20 0.20 4.00 4.00 4.00 0.20 - - vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - 1.00 - - - - - - - vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: - - 0.33 0.33 - 1.00 - - - - - 0.33 - vshuff32x4 $0, (%rax), %zmm17, %zmm19 # CHECK-NEXT: - - 0.33 0.33 - 1.00 - - - - - 0.33 - vshuff32x4 $0, (%rax){1to16}, %zmm17, %zmm19 diff --git a/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512vl.s index 3ad66f1c3d7128..d8c76832d38d3e 100644 --- a/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512vl.s +++ b/llvm/test/tools/llvm-mca/X86/SapphireRapids/resources-avx512vl.s @@ -1344,6 +1344,16 @@ vpmulld %ymm16, %ymm17, %ymm19 {z}{k1} vpmulld (%rax), %ymm17, %ymm19 {z}{k1} vpmulld (%rax){1to8}, %ymm17, %ymm19 {z}{k1} +vpscatterdd %xmm1, (%rdx,%xmm0,4) {%k1} +vpscatterdq %xmm1, (%rdx,%xmm0,4) {%k1} +vpscatterqd %xmm1, (%rdx,%xmm0,4) {%k1} +vpscatterqq 
%xmm1, (%rdx,%xmm0,4) {%k1} + +vpscatterdd %ymm1, (%rdx,%ymm0,4) {%k1} +vpscatterdq %ymm1, (%rdx,%xmm0,4) {%k1} +vpscatterqd %xmm1, (%rdx,%ymm0,4) {%k1} +vpscatterqq %ymm1, (%rdx,%ymm0,4) {%k1} + vpshufd $0, %xmm16, %xmm19 vpshufd $0, (%rax), %xmm19 vpshufd $0, (%rax){1to4}, %xmm19 @@ -1500,6 +1510,16 @@ vpunpckldq %ymm16, %ymm17, %ymm19 {z}{k1} vpunpckldq (%rax), %ymm17, %ymm19 {z}{k1} vpunpckldq (%rax){1to8}, %ymm17, %ymm19 {z}{k1} +vscatterdps %xmm1, (%rdx,%xmm0,4) {%k1} +vscatterdpd %xmm1, (%rdx,%xmm0,4) {%k1} +vscatterqps %xmm1, (%rdx,%xmm0,4) {%k1} +vscatterqpd %xmm1, (%rdx,%xmm0,4) {%k1} + +vscatterdps %ymm1, (%rdx,%ymm0,4) {%k1} +vscatterdpd %ymm1, (%rdx,%xmm0,4) {%k1} +vscatterqps %xmm1, (%rdx,%ymm0,4) {%k1} +vscatterqpd %ymm1, (%rdx,%ymm0,4) {%k1} + vshuff32x4 $0, %ymm16, %ymm17, %ymm19 vshuff32x4 $0, (%rax), %ymm17, %ymm19 vshuff32x4 $0, (%rax){1to8}, %ymm17, %ymm19 @@ -2897,6 +2917,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 2 10 1.00 vpmulld %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 3 18 1.00 * vpmulld (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 3 18 1.00 * vpmulld (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 11 12 2.00 * vpscatterdd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 7 12 1.00 * vpscatterdq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 7 12 1.00 * vpscatterqd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 7 12 1.00 * vpscatterqq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 19 12 4.00 * vpscatterdd %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 11 12 2.00 * vpscatterdq %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 11 12 2.00 * vpscatterqd %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 11 12 2.00 * vpscatterqq %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: 1 1 0.50 vpshufd $0, %xmm16, %xmm19 # CHECK-NEXT: 2 8 0.50 * vpshufd $0, (%rax), %xmm19 # CHECK-NEXT: 2 8 0.50 * vpshufd $0, (%rax){1to4}, %xmm19 @@ -3035,6 +3063,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 1 0.50 vpunpckldq %ymm16, %ymm17, %ymm19 {%k1} 
{z} # CHECK-NEXT: 2 9 0.50 * vpunpckldq (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 9 0.50 * vpunpckldq (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 11 12 2.00 * vscatterdps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 7 12 1.00 * vscatterdpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 7 12 1.00 * vscatterqps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 7 12 1.00 * vscatterqpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 19 12 4.00 * vscatterdps %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 11 12 2.00 * vscatterdpd %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 11 12 2.00 * vscatterqps %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 11 12 2.00 * vscatterqpd %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: 1 3 1.00 vshuff32x4 $0, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: 2 11 1.00 * vshuff32x4 $0, (%rax), %ymm17, %ymm19 # CHECK-NEXT: 2 11 1.00 * vshuff32x4 $0, (%rax){1to8}, %ymm17, %ymm19 @@ -3233,7 +3269,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] -# CHECK-NEXT: 377.33 401.33 328.33 328.33 16.00 794.33 - 16.00 16.00 16.00 - 328.33 - +# CHECK-NEXT: 404.53 412.53 328.33 328.33 46.00 797.53 3.20 46.00 46.00 46.00 3.20 328.33 - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: @@ -4425,6 +4461,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1.00 1.00 - - - - - - - - - - - vpmulld %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1.00 1.00 0.33 0.33 - - - - - - - 0.33 - vpmulld (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1.00 1.00 0.33 0.33 - - - - - - - 0.33 - vpmulld (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 1.70 0.70 - - 2.00 0.20 0.20 2.00 2.00 2.00 0.20 - - vpscatterdd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1.70 0.70 - - 1.00 0.20 0.20 1.00 1.00 1.00 0.20 - - vpscatterdq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1.70 0.70 - - 1.00 0.20 0.20 1.00 1.00 1.00 
0.20 - - vpscatterqd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1.70 0.70 - - 1.00 0.20 0.20 1.00 1.00 1.00 0.20 - - vpscatterqq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1.70 0.70 - - 4.00 0.20 0.20 4.00 4.00 4.00 0.20 - - vpscatterdd %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1.70 0.70 - - 2.00 0.20 0.20 2.00 2.00 2.00 0.20 - - vpscatterdq %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1.70 0.70 - - 2.00 0.20 0.20 2.00 2.00 2.00 0.20 - - vpscatterqd %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1.70 0.70 - - 2.00 0.20 0.20 2.00 2.00 2.00 0.20 - - vpscatterqq %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: - 0.50 - - - 0.50 - - - - - - - vpshufd $0, %xmm16, %xmm19 # CHECK-NEXT: - 0.50 0.33 0.33 - 0.50 - - - - - 0.33 - vpshufd $0, (%rax), %xmm19 # CHECK-NEXT: - 0.50 0.33 0.33 - 0.50 - - - - - 0.33 - vpshufd $0, (%rax){1to4}, %xmm19 @@ -4563,6 +4607,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: - 0.50 - - - 0.50 - - - - - - - vpunpckldq %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - 0.50 0.33 0.33 - 0.50 - - - - - 0.33 - vpunpckldq (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - 0.50 0.33 0.33 - 0.50 - - - - - 0.33 - vpunpckldq (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 1.70 0.70 - - 2.00 0.20 0.20 2.00 2.00 2.00 0.20 - - vscatterdps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1.70 0.70 - - 1.00 0.20 0.20 1.00 1.00 1.00 0.20 - - vscatterdpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1.70 0.70 - - 1.00 0.20 0.20 1.00 1.00 1.00 0.20 - - vscatterqps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1.70 0.70 - - 1.00 0.20 0.20 1.00 1.00 1.00 0.20 - - vscatterqpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1.70 0.70 - - 4.00 0.20 0.20 4.00 4.00 4.00 0.20 - - vscatterdps %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1.70 0.70 - - 2.00 0.20 0.20 2.00 2.00 2.00 0.20 - - vscatterdpd %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1.70 0.70 - - 2.00 0.20 0.20 2.00 2.00 2.00 0.20 - - vscatterqps %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1.70 0.70 - - 2.00 0.20 0.20 
2.00 2.00 2.00 0.20 - - vscatterqpd %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: - - - - - 1.00 - - - - - - - vshuff32x4 $0, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: - - 0.33 0.33 - 1.00 - - - - - 0.33 - vshuff32x4 $0, (%rax), %ymm17, %ymm19 # CHECK-NEXT: - - 0.33 0.33 - 1.00 - - - - - 0.33 - vshuff32x4 $0, (%rax){1to8}, %ymm17, %ymm19 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s index 108ef75b0ac417..5eaa0f91fdaaba 100644 --- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s @@ -814,6 +814,11 @@ vpermq %zmm16, %zmm17, %zmm19 {z}{k1} vpermq (%rax), %zmm17, %zmm19 {z}{k1} vpermq (%rax){1to8}, %zmm17, %zmm19 {z}{k1} +vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} + vpshufd $0, %zmm16, %zmm19 vpshufd $0, (%rax), %zmm19 vpshufd $0, (%rax){1to16}, %zmm19 @@ -884,6 +889,11 @@ vpunpcklqdq %zmm16, %zmm17, %zmm19 {z}{k1} vpunpcklqdq (%rax), %zmm17, %zmm19 {z}{k1} vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {z}{k1} +vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} + vshuff32x4 $0, %zmm16, %zmm17, %zmm19 vshuff32x4 $0, (%rax), %zmm17, %zmm19 vshuff32x4 $0, (%rax){1to16}, %zmm17, %zmm19 @@ -1792,6 +1802,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 1 3 1.00 vpermq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 10 1.00 * vpermq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 10 1.00 * vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 36 8 16.00 * vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 8.00 * vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 8.00 * vpscatterqq 
%zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 1 1.00 vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: 2 8 1.00 * vpshufd $0, (%rax), %zmm19 # CHECK-NEXT: 2 8 1.00 * vpshufd $0, (%rax){1to16}, %zmm19 @@ -1855,6 +1869,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 1 1 1.00 vpunpcklqdq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpunpcklqdq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 36 7 16.00 * vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 8.00 * vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 8.00 * vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 3 1.00 vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 2 10 1.00 * vshuff32x4 $0, (%rax), %zmm17, %zmm19 # CHECK-NEXT: 2 10 1.00 * vshuff32x4 $0, (%rax){1to16}, %zmm17, %zmm19 @@ -2034,7 +2052,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - 612.00 340.67 99.67 333.17 333.17 17.00 645.67 2.00 5.67 +# CHECK-NEXT: - 612.00 349.67 102.67 355.17 355.17 83.00 650.67 5.00 27.67 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -2752,6 +2770,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - - - - 1.00 - - vpermq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpermq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 1.50 0.50 5.33 5.33 16.00 1.50 0.50 5.33 vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 
2.67 2.67 8.00 0.50 0.50 2.67 vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpshufd $0, (%rax), %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpshufd $0, (%rax){1to16}, %zmm19 @@ -2815,6 +2837,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - - - - 1.00 - - vpunpcklqdq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpunpcklqdq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 1.50 0.50 5.33 5.33 16.00 1.50 0.50 5.33 vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vshuff32x4 $0, (%rax), %zmm17, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vshuff32x4 $0, (%rax){1to16}, %zmm17, %zmm19 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512vl.s index 2ad91ea514aa20..b4b18101a67b80 100644 --- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512vl.s +++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512vl.s @@ -1344,6 +1344,16 @@ vpmulld %ymm16, %ymm17, %ymm19 {z}{k1} vpmulld (%rax), %ymm17, %ymm19 {z}{k1} vpmulld (%rax){1to8}, %ymm17, %ymm19 {z}{k1} +vpscatterdd %xmm1, (%rdx,%xmm0,4) {%k1} +vpscatterdq %xmm1, (%rdx,%xmm0,4) {%k1} +vpscatterqd %xmm1, (%rdx,%xmm0,4) {%k1} +vpscatterqq %xmm1, (%rdx,%xmm0,4) {%k1} + +vpscatterdd %ymm1, (%rdx,%ymm0,4) {%k1} +vpscatterdq %ymm1, (%rdx,%xmm0,4) {%k1} +vpscatterqd %xmm1, 
(%rdx,%ymm0,4) {%k1} +vpscatterqq %ymm1, (%rdx,%ymm0,4) {%k1} + vpshufd $0, %xmm16, %xmm19 vpshufd $0, (%rax), %xmm19 vpshufd $0, (%rax){1to4}, %xmm19 @@ -1500,6 +1510,16 @@ vpunpckldq %ymm16, %ymm17, %ymm19 {z}{k1} vpunpckldq (%rax), %ymm17, %ymm19 {z}{k1} vpunpckldq (%rax){1to8}, %ymm17, %ymm19 {z}{k1} +vscatterdps %xmm1, (%rdx,%xmm0,4) {%k1} +vscatterdpd %xmm1, (%rdx,%xmm0,4) {%k1} +vscatterqps %xmm1, (%rdx,%xmm0,4) {%k1} +vscatterqpd %xmm1, (%rdx,%xmm0,4) {%k1} + +vscatterdps %ymm1, (%rdx,%ymm0,4) {%k1} +vscatterdpd %ymm1, (%rdx,%xmm0,4) {%k1} +vscatterqps %xmm1, (%rdx,%ymm0,4) {%k1} +vscatterqpd %ymm1, (%rdx,%ymm0,4) {%k1} + vshuff32x4 $0, %ymm16, %ymm17, %ymm19 vshuff32x4 $0, (%rax), %ymm17, %ymm19 vshuff32x4 $0, (%rax){1to8}, %ymm17, %ymm19 @@ -2897,6 +2917,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 2 10 1.00 vpmulld %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 3 17 1.00 * vpmulld (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 3 17 1.00 * vpmulld (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 12 8 4.00 * vpscatterdd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 7 7 2.00 * vpscatterdq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 8 8 2.00 * vpscatterqd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 7 7 2.00 * vpscatterqq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 20 8 8.00 * vpscatterdd %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 11 7 4.00 * vpscatterdq %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 8 8 2.00 * vpscatterqd %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 11 7 4.00 * vpscatterqq %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: 1 1 1.00 vpshufd $0, %xmm16, %xmm19 # CHECK-NEXT: 2 7 1.00 * vpshufd $0, (%rax), %xmm19 # CHECK-NEXT: 2 7 1.00 * vpshufd $0, (%rax){1to4}, %xmm19 @@ -3035,6 +3063,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 1 1.00 vpunpckldq %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpunpckldq (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpunpckldq (%rax){1to8}, %ymm17, %ymm19 
{%k1} {z} +# CHECK-NEXT: 12 8 4.00 * vscatterdps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 7 7 2.00 * vscatterdpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 8 8 2.00 * vscatterqps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 7 7 2.00 * vscatterqpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 20 8 8.00 * vscatterdps %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 11 7 4.00 * vscatterdpd %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 8 8 2.00 * vscatterqps %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 11 7 4.00 * vscatterqpd %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: 1 3 1.00 vshuff32x4 $0, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: 2 10 1.00 * vshuff32x4 $0, (%rax), %ymm17, %ymm19 # CHECK-NEXT: 2 10 1.00 * vshuff32x4 $0, (%rax){1to8}, %ymm17, %ymm19 @@ -3230,7 +3266,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - 423.00 438.33 350.33 503.17 503.17 32.00 785.33 4.00 10.67 +# CHECK-NEXT: - 423.00 462.33 358.33 521.83 521.83 88.00 801.33 12.00 29.33 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -4422,6 +4458,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: - - 1.00 1.00 - - - - - - vpmulld %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - - - - vpmulld (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - - - - vpmulld (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 1.50 0.50 1.33 1.33 4.00 1.50 0.50 1.33 vpscatterdd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 0.67 0.67 2.00 0.50 0.50 0.67 vpscatterdq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 0.67 0.67 2.00 1.50 0.50 0.67 vpscatterqd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 0.67 0.67 2.00 0.50 0.50 0.67 vpscatterqq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 1.50 0.50 2.67 vpscatterdd %ymm1, (%rdx,%ymm0,4) {%k1} +# 
CHECK-NEXT: - - 1.50 0.50 1.33 1.33 4.00 0.50 0.50 1.33 vpscatterdq %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 0.67 0.67 2.00 1.50 0.50 0.67 vpscatterqd %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 1.33 1.33 4.00 0.50 0.50 1.33 vpscatterqq %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vpshufd $0, %xmm16, %xmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpshufd $0, (%rax), %xmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpshufd $0, (%rax){1to4}, %xmm19 @@ -4560,6 +4604,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: - - - - - - - 1.00 - - vpunpckldq %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpunpckldq (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpunpckldq (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 1.50 0.50 1.33 1.33 4.00 1.50 0.50 1.33 vscatterdps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 0.67 0.67 2.00 0.50 0.50 0.67 vscatterdpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 0.67 0.67 2.00 1.50 0.50 0.67 vscatterqps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 0.67 0.67 2.00 0.50 0.50 0.67 vscatterqpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 1.50 0.50 2.67 vscatterdps %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 1.33 1.33 4.00 0.50 0.50 1.33 vscatterdpd %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 0.67 0.67 2.00 1.50 0.50 0.67 vscatterqps %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 1.33 1.33 4.00 0.50 0.50 1.33 vscatterqpd %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vshuff32x4 $0, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vshuff32x4 $0, (%rax), %ymm17, %ymm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vshuff32x4 $0, (%rax){1to8}, %ymm17, %ymm19 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s 
index 51caeab1b3b7ca..6e52eddd9a8f5e 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s @@ -814,6 +814,11 @@ vpermq %zmm16, %zmm17, %zmm19 {z}{k1} vpermq (%rax), %zmm17, %zmm19 {z}{k1} vpermq (%rax){1to8}, %zmm17, %zmm19 {z}{k1} +vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} + vpshufd $0, %zmm16, %zmm19 vpshufd $0, (%rax), %zmm19 vpshufd $0, (%rax){1to16}, %zmm19 @@ -884,6 +889,11 @@ vpunpcklqdq %zmm16, %zmm17, %zmm19 {z}{k1} vpunpcklqdq (%rax), %zmm17, %zmm19 {z}{k1} vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {z}{k1} +vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} + vshuff32x4 $0, %zmm16, %zmm17, %zmm19 vshuff32x4 $0, (%rax), %zmm17, %zmm19 vshuff32x4 $0, (%rax){1to16}, %zmm17, %zmm19 @@ -1792,6 +1802,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 2 1 0.50 vpermq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vpermq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 * vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 1 1.00 vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: 1 8 1.00 * vpshufd $0, (%rax), %zmm19 # CHECK-NEXT: 1 8 1.00 * vpshufd $0, (%rax){1to16}, %zmm19 @@ -1855,6 +1869,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 1 1 1.00 vpunpcklqdq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 1 8 1.00 * vpunpcklqdq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 1 8 1.00 * vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 
1 1 1.00 * vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 2 1.00 vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 3 9 1.00 * vshuff32x4 $0, (%rax), %zmm17, %zmm19 # CHECK-NEXT: 3 9 1.00 * vshuff32x4 $0, (%rax){1to16}, %zmm17, %zmm19 @@ -2047,7 +2065,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 2.67 2.67 2.67 - - - - - 221.00 1060.50 618.00 352.50 297.00 297.00 17.00 200.00 200.00 200.00 194.33 194.33 194.33 8.50 8.50 +# CHECK-NEXT: 5.33 5.33 5.33 - - - - - 221.00 1060.50 618.00 352.50 297.00 297.00 17.00 205.33 205.33 205.33 194.33 194.33 194.33 16.50 16.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: @@ -2765,6 +2783,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - vpermq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# 
CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: - - - - - - - - - 1.00 1.00 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpshufd $0, (%rax), %zmm19 # CHECK-NEXT: - - - - - - - - - 1.00 1.00 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpshufd $0, (%rax){1to16}, %zmm19 @@ -2828,6 +2850,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - vpunpcklqdq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 1.00 1.00 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpunpcklqdq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 1.00 1.00 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: - - - - - - - - - 1.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vshuff32x4 $0, (%rax), %zmm17, %zmm19 # CHECK-NEXT: - - - - - - - - - 1.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vshuff32x4 $0, (%rax){1to16}, %zmm17, %zmm19 diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s index 2d26eb50351a08..4636e23d9df3e2 100644 --- 
a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s @@ -1344,6 +1344,16 @@ vpmulld %ymm16, %ymm17, %ymm19 {z}{k1} vpmulld (%rax), %ymm17, %ymm19 {z}{k1} vpmulld (%rax){1to8}, %ymm17, %ymm19 {z}{k1} +vpscatterdd %xmm1, (%rdx,%xmm0,4) {%k1} +vpscatterdq %xmm1, (%rdx,%xmm0,4) {%k1} +vpscatterqd %xmm1, (%rdx,%xmm0,4) {%k1} +vpscatterqq %xmm1, (%rdx,%xmm0,4) {%k1} + +vpscatterdd %ymm1, (%rdx,%ymm0,4) {%k1} +vpscatterdq %ymm1, (%rdx,%xmm0,4) {%k1} +vpscatterqd %xmm1, (%rdx,%ymm0,4) {%k1} +vpscatterqq %ymm1, (%rdx,%ymm0,4) {%k1} + vpshufd $0, %xmm16, %xmm19 vpshufd $0, (%rax), %xmm19 vpshufd $0, (%rax){1to4}, %xmm19 @@ -1500,6 +1510,16 @@ vpunpckldq %ymm16, %ymm17, %ymm19 {z}{k1} vpunpckldq (%rax), %ymm17, %ymm19 {z}{k1} vpunpckldq (%rax){1to8}, %ymm17, %ymm19 {z}{k1} +vscatterdps %xmm1, (%rdx,%xmm0,4) {%k1} +vscatterdpd %xmm1, (%rdx,%xmm0,4) {%k1} +vscatterqps %xmm1, (%rdx,%xmm0,4) {%k1} +vscatterqpd %xmm1, (%rdx,%xmm0,4) {%k1} + +vscatterdps %ymm1, (%rdx,%ymm0,4) {%k1} +vscatterdpd %ymm1, (%rdx,%xmm0,4) {%k1} +vscatterqps %xmm1, (%rdx,%ymm0,4) {%k1} +vscatterqpd %ymm1, (%rdx,%ymm0,4) {%k1} + vshuff32x4 $0, %ymm16, %ymm17, %ymm19 vshuff32x4 $0, (%rax), %ymm17, %ymm19 vshuff32x4 $0, (%rax){1to8}, %ymm17, %ymm19 @@ -2897,6 +2917,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 3 0.50 vpmulld %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 10 0.50 * vpmulld (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 10 0.50 * vpmulld (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 * vpscatterdd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterdq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterqd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterqq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterdd %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterdq %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * 
vpscatterqd %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterqq %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: 1 1 0.50 vpshufd $0, %xmm16, %xmm19 # CHECK-NEXT: 1 8 0.50 * vpshufd $0, (%rax), %xmm19 # CHECK-NEXT: 1 8 0.50 * vpshufd $0, (%rax){1to4}, %xmm19 @@ -3035,6 +3063,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 1 0.50 vpunpckldq %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * vpunpckldq (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 1 8 0.50 * vpunpckldq (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 1 1 1.00 * vscatterdps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterdpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterqps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterqpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterdps %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterdpd %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterqps %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterqpd %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: 1 2 1.00 vshuff32x4 $0, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: 3 9 1.00 * vshuff32x4 $0, (%rax), %ymm17, %ymm19 # CHECK-NEXT: 3 9 1.00 * vshuff32x4 $0, (%rax){1to8}, %ymm17, %ymm19 @@ -3243,7 +3279,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] -# CHECK-NEXT: 5.33 5.33 5.33 - - - - - 208.00 948.00 501.50 261.50 478.50 478.50 32.00 324.33 324.33 324.33 313.67 313.67 313.67 16.00 16.00 +# CHECK-NEXT: 10.67 10.67 10.67 - - - - - 208.00 948.00 501.50 261.50 478.50 478.50 32.00 335.00 335.00 335.00 313.67 313.67 313.67 32.00 32.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] 
Instructions: @@ -4435,6 +4471,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: - - - - - - - - 0.50 - - 0.50 - - - - - - - - - - - vpmulld %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - 0.50 - - 0.50 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpmulld (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - 0.50 - - 0.50 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpmulld (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vpscatterdd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vpscatterdq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vpscatterqd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vpscatterqq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vpscatterdd %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vpscatterdq %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vpscatterqd %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vpscatterqq %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - vpshufd $0, %xmm16, %xmm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpshufd $0, (%rax), %xmm19 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpshufd $0, (%rax){1to4}, %xmm19 @@ -4573,6 +4617,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - vpunpckldq %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - - - - - - 
- - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpunpckldq (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpunpckldq (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vscatterdps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vscatterdpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vscatterqps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vscatterqpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vscatterdps %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vscatterdpd %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vscatterqps %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.67 0.67 0.67 - - - 1.00 1.00 vscatterqpd %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vshuff32x4 $0, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 1.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vshuff32x4 $0, (%rax), %ymm17, %ymm19 # CHECK-NEXT: - - - - - - - - - 1.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vshuff32x4 $0, (%rax){1to8}, %ymm17, %ymm19 From 933f72217e4584db03f945a3b30e8c04537f4dab Mon Sep 17 00:00:00 2001 From: anjenner <161845516+anjenner@users.noreply.github.com> Date: Thu, 22 Aug 2024 17:24:49 +0100 Subject: [PATCH 230/426] [bindings][ocaml] Add missing AtomicRMW operations (#105673) --- llvm/bindings/ocaml/llvm/llvm.ml | 4 ++++ llvm/bindings/ocaml/llvm/llvm.mli | 4 ++++ 2 files changed, 8 insertions(+) diff --git 
a/llvm/bindings/ocaml/llvm/llvm.ml b/llvm/bindings/ocaml/llvm/llvm.ml index 908e6658a89f73..8e059ae71613dd 100644 --- a/llvm/bindings/ocaml/llvm/llvm.ml +++ b/llvm/bindings/ocaml/llvm/llvm.ml @@ -296,6 +296,10 @@ module AtomicRMWBinOp = struct | UMin | FAdd | FSub + | FMax + | FMin + | UInc_Wrap + | UDec_Wrap end module ValueKind = struct diff --git a/llvm/bindings/ocaml/llvm/llvm.mli b/llvm/bindings/ocaml/llvm/llvm.mli index b8a430adf6cf2d..b8fdac7e38c6a7 100644 --- a/llvm/bindings/ocaml/llvm/llvm.mli +++ b/llvm/bindings/ocaml/llvm/llvm.mli @@ -331,6 +331,10 @@ module AtomicRMWBinOp : sig | UMin | FAdd | FSub + | FMax + | FMin + | UInc_Wrap + | UDec_Wrap end (** The kind of an [llvalue], the result of [classify_value v]. From 27727d85a95dd501ce6d6660900b656622de9ae0 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Thu, 22 Aug 2024 12:36:37 -0400 Subject: [PATCH 231/426] [C23] Remove WG14 N2517 from the status page This paper proposes no normative changes, just updates an example in the standard. It was incorrect for us to have marked it as No in the first place. --- clang/www/c_status.html | 5 ----- 1 file changed, 5 deletions(-) diff --git a/clang/www/c_status.html b/clang/www/c_status.html index 99c14aaf506e51..6555b8e5e3da39 100644 --- a/clang/www/c_status.html +++ b/clang/www/c_status.html @@ -330,11 +330,6 @@

C23 implementation status

N2508 Clang 18 - - Clarification request for C17 example of undefined behavior - N2517 - No - Querying attribute support N2553 From 7d373cef4941e9be1c2c86375ba9a8943c55e9cd Mon Sep 17 00:00:00 2001 From: Brendan Dahl Date: Thu, 22 Aug 2024 09:44:33 -0700 Subject: [PATCH 232/426] [WebAssembly] Change half-precision feature name to fp16. (#105434) This better aligns with how the feature is being referred to and what runtimes (V8) are calling it. --- .../clang/Basic/BuiltinsWebAssembly.def | 22 +++++++++---------- clang/include/clang/Driver/Options.td | 4 ++-- clang/lib/Basic/Targets/WebAssembly.cpp | 16 +++++++------- clang/lib/Basic/Targets/WebAssembly.h | 6 ++--- clang/test/CodeGen/builtins-wasm.c | 4 ++-- clang/test/Driver/wasm-features.c | 8 +++---- .../test/Preprocessor/wasm-target-features.c | 16 +++++++------- llvm/lib/Target/WebAssembly/WebAssembly.td | 8 +++---- .../WebAssembly/WebAssemblyISelLowering.cpp | 4 ++-- .../WebAssembly/WebAssemblyInstrInfo.td | 6 ++--- .../WebAssembly/WebAssemblyInstrMemory.td | 4 ++-- .../WebAssembly/WebAssemblyInstrSIMD.td | 12 +++++----- .../Target/WebAssembly/WebAssemblySubtarget.h | 4 ++-- .../CodeGen/WebAssembly/half-precision.ll | 4 ++-- llvm/test/CodeGen/WebAssembly/offset.ll | 2 +- .../WebAssembly/target-features-cpus.ll | 6 ++--- llvm/test/MC/WebAssembly/simd-encodings.s | 2 +- 17 files changed, 63 insertions(+), 65 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def index df304a71e475ec..034d32c6291b3d 100644 --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -135,10 +135,10 @@ TARGET_BUILTIN(__builtin_wasm_min_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_max_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_pmin_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_pmax_f64x2, "V2dV2dV2d", "nc", "simd128") 
-TARGET_BUILTIN(__builtin_wasm_min_f16x8, "V8hV8hV8h", "nc", "half-precision") -TARGET_BUILTIN(__builtin_wasm_max_f16x8, "V8hV8hV8h", "nc", "half-precision") -TARGET_BUILTIN(__builtin_wasm_pmin_f16x8, "V8hV8hV8h", "nc", "half-precision") -TARGET_BUILTIN(__builtin_wasm_pmax_f16x8, "V8hV8hV8h", "nc", "half-precision") +TARGET_BUILTIN(__builtin_wasm_min_f16x8, "V8hV8hV8h", "nc", "fp16") +TARGET_BUILTIN(__builtin_wasm_max_f16x8, "V8hV8hV8h", "nc", "fp16") +TARGET_BUILTIN(__builtin_wasm_pmin_f16x8, "V8hV8hV8h", "nc", "fp16") +TARGET_BUILTIN(__builtin_wasm_pmax_f16x8, "V8hV8hV8h", "nc", "fp16") TARGET_BUILTIN(__builtin_wasm_ceil_f32x4, "V4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_floor_f32x4, "V4fV4f", "nc", "simd128") @@ -170,8 +170,8 @@ TARGET_BUILTIN(__builtin_wasm_relaxed_madd_f32x4, "V4fV4fV4fV4f", "nc", "relaxed TARGET_BUILTIN(__builtin_wasm_relaxed_nmadd_f32x4, "V4fV4fV4fV4f", "nc", "relaxed-simd") TARGET_BUILTIN(__builtin_wasm_relaxed_madd_f64x2, "V2dV2dV2dV2d", "nc", "relaxed-simd") TARGET_BUILTIN(__builtin_wasm_relaxed_nmadd_f64x2, "V2dV2dV2dV2d", "nc", "relaxed-simd") -TARGET_BUILTIN(__builtin_wasm_relaxed_madd_f16x8, "V8hV8hV8hV8h", "nc", "half-precision") -TARGET_BUILTIN(__builtin_wasm_relaxed_nmadd_f16x8, "V8hV8hV8hV8h", "nc", "half-precision") +TARGET_BUILTIN(__builtin_wasm_relaxed_madd_f16x8, "V8hV8hV8hV8h", "nc", "fp16") +TARGET_BUILTIN(__builtin_wasm_relaxed_nmadd_f16x8, "V8hV8hV8hV8h", "nc", "fp16") TARGET_BUILTIN(__builtin_wasm_relaxed_laneselect_i8x16, "V16ScV16ScV16ScV16Sc", "nc", "relaxed-simd") TARGET_BUILTIN(__builtin_wasm_relaxed_laneselect_i16x8, "V8sV8sV8sV8s", "nc", "relaxed-simd") @@ -197,11 +197,11 @@ TARGET_BUILTIN(__builtin_wasm_relaxed_dot_i8x16_i7x16_add_s_i32x4, "V4iV16ScV16S TARGET_BUILTIN(__builtin_wasm_relaxed_dot_bf16x8_add_f32_f32x4, "V4fV8UsV8UsV4f", "nc", "relaxed-simd") // Half-Precision (fp16) -TARGET_BUILTIN(__builtin_wasm_loadf16_f32, "fh*", "nU", "half-precision") -TARGET_BUILTIN(__builtin_wasm_storef16_f32, 
"vfh*", "n", "half-precision") -TARGET_BUILTIN(__builtin_wasm_splat_f16x8, "V8hf", "nc", "half-precision") -TARGET_BUILTIN(__builtin_wasm_extract_lane_f16x8, "fV8hi", "nc", "half-precision") -TARGET_BUILTIN(__builtin_wasm_replace_lane_f16x8, "V8hV8hif", "nc", "half-precision") +TARGET_BUILTIN(__builtin_wasm_loadf16_f32, "fh*", "nU", "fp16") +TARGET_BUILTIN(__builtin_wasm_storef16_f32, "vfh*", "n", "fp16") +TARGET_BUILTIN(__builtin_wasm_splat_f16x8, "V8hf", "nc", "fp16") +TARGET_BUILTIN(__builtin_wasm_extract_lane_f16x8, "fV8hi", "nc", "fp16") +TARGET_BUILTIN(__builtin_wasm_replace_lane_f16x8, "V8hV8hif", "nc", "fp16") // Reference Types builtins // Some builtins are custom type-checked - see 't' as part of the third argument, diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 7a3c699a6a8e88..111608d30ff827 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5033,8 +5033,8 @@ def mexception_handing : Flag<["-"], "mexception-handling">, Group, Group; def mextended_const : Flag<["-"], "mextended-const">, Group; def mno_extended_const : Flag<["-"], "mno-extended-const">, Group; -def mhalf_precision : Flag<["-"], "mhalf-precision">, Group; -def mno_half_precision : Flag<["-"], "mno-half-precision">, Group; +def mfp16 : Flag<["-"], "mfp16">, Group; +def mno_fp16 : Flag<["-"], "mno-fp16">, Group; def mmultimemory : Flag<["-"], "mmultimemory">, Group; def mno_multimemory : Flag<["-"], "mno-multimemory">, Group; def mmultivalue : Flag<["-"], "mmultivalue">, Group; diff --git a/clang/lib/Basic/Targets/WebAssembly.cpp b/clang/lib/Basic/Targets/WebAssembly.cpp index 1e565f0a5319f2..5ac9421663adea 100644 --- a/clang/lib/Basic/Targets/WebAssembly.cpp +++ b/clang/lib/Basic/Targets/WebAssembly.cpp @@ -49,7 +49,7 @@ bool WebAssemblyTargetInfo::hasFeature(StringRef Feature) const { .Case("bulk-memory", HasBulkMemory) .Case("exception-handling", HasExceptionHandling) .Case("extended-const", 
HasExtendedConst) - .Case("half-precision", HasHalfPrecision) + .Case("fp16", HasFP16) .Case("multimemory", HasMultiMemory) .Case("multivalue", HasMultivalue) .Case("mutable-globals", HasMutableGlobals) @@ -84,8 +84,8 @@ void WebAssemblyTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__wasm_extended_const__"); if (HasMultiMemory) Builder.defineMacro("__wasm_multimemory__"); - if (HasHalfPrecision) - Builder.defineMacro("__wasm_half_precision__"); + if (HasFP16) + Builder.defineMacro("__wasm_fp16__"); if (HasMultivalue) Builder.defineMacro("__wasm_multivalue__"); if (HasMutableGlobals) @@ -162,7 +162,7 @@ bool WebAssemblyTargetInfo::initFeatureMap( Features["bulk-memory"] = true; Features["exception-handling"] = true; Features["extended-const"] = true; - Features["half-precision"] = true; + Features["fp16"] = true; Features["multimemory"] = true; Features["nontrapping-fptoint"] = true; Features["tail-call"] = true; @@ -212,13 +212,13 @@ bool WebAssemblyTargetInfo::handleTargetFeatures( HasExtendedConst = false; continue; } - if (Feature == "+half-precision") { + if (Feature == "+fp16") { SIMDLevel = std::max(SIMDLevel, SIMD128); - HasHalfPrecision = true; + HasFP16 = true; continue; } - if (Feature == "-half-precision") { - HasHalfPrecision = false; + if (Feature == "-fp16") { + HasFP16 = false; continue; } if (Feature == "+multimemory") { diff --git a/clang/lib/Basic/Targets/WebAssembly.h b/clang/lib/Basic/Targets/WebAssembly.h index e4a449d1ff3041..213ec42ca84bb7 100644 --- a/clang/lib/Basic/Targets/WebAssembly.h +++ b/clang/lib/Basic/Targets/WebAssembly.h @@ -57,7 +57,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyTargetInfo : public TargetInfo { bool HasBulkMemory = false; bool HasExceptionHandling = false; bool HasExtendedConst = false; - bool HasHalfPrecision = false; + bool HasFP16 = false; bool HasMultiMemory = false; bool HasMultivalue = false; bool HasMutableGlobals = false; @@ -90,9 +90,7 @@ class LLVM_LIBRARY_VISIBILITY 
WebAssemblyTargetInfo : public TargetInfo { StringRef getABI() const override; bool setABI(const std::string &Name) override; - bool useFP16ConversionIntrinsics() const override { - return !HasHalfPrecision; - } + bool useFP16ConversionIntrinsics() const override { return !HasFP16; } protected: void getTargetDefines(const LangOptions &Opts, diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index f494aeada01579..3010b8954f1c2e 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple wasm32-unknown-unknown -target-feature +reference-types -target-feature +simd128 -target-feature +relaxed-simd -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -target-feature +half-precision -flax-vector-conversions=none -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY32 -// RUN: %clang_cc1 -triple wasm64-unknown-unknown -target-feature +reference-types -target-feature +simd128 -target-feature +relaxed-simd -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -target-feature +half-precision -flax-vector-conversions=none -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY64 +// RUN: %clang_cc1 -triple wasm32-unknown-unknown -target-feature +reference-types -target-feature +simd128 -target-feature +relaxed-simd -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -target-feature +fp16 -flax-vector-conversions=none -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY32 +// RUN: %clang_cc1 -triple wasm64-unknown-unknown -target-feature +reference-types -target-feature +simd128 -target-feature +relaxed-simd -target-feature +nontrapping-fptoint -target-feature +exception-handling 
-target-feature +bulk-memory -target-feature +atomics -target-feature +fp16 -flax-vector-conversions=none -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY64 // RUN: not %clang_cc1 -triple wasm64-unknown-unknown -target-feature +reference-types -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -flax-vector-conversions=none -O3 -emit-llvm -o - %s 2>&1 | FileCheck %s -check-prefixes MISSING-SIMD // SIMD convenience types diff --git a/clang/test/Driver/wasm-features.c b/clang/test/Driver/wasm-features.c index b77cb5ea9b4958..57f0fc4ef36b6b 100644 --- a/clang/test/Driver/wasm-features.c +++ b/clang/test/Driver/wasm-features.c @@ -35,11 +35,11 @@ // EXTENDED-CONST: "-target-feature" "+extended-const" // NO-EXTENDED-CONST: "-target-feature" "-extended-const" -// RUN: %clang --target=wasm32-unknown-unknown -### %s -mhalf-precision 2>&1 | FileCheck %s -check-prefix=HALF-PRECISION -// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-half-precision 2>&1 | FileCheck %s -check-prefix=NO-HALF-PRECISION +// RUN: %clang --target=wasm32-unknown-unknown -### %s -mfp16 2>&1 | FileCheck %s -check-prefix=HALF-PRECISION +// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-fp16 2>&1 | FileCheck %s -check-prefix=NO-HALF-PRECISION -// HALF-PRECISION: "-target-feature" "+half-precision" -// NO-HALF-PRECISION: "-target-feature" "-half-precision" +// HALF-PRECISION: "-target-feature" "+fp16" +// NO-HALF-PRECISION: "-target-feature" "-fp16" // RUN: %clang --target=wasm32-unknown-unknown -### %s -mmultimemory 2>&1 | FileCheck %s -check-prefix=MULTIMEMORY // RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-multimemory 2>&1 | FileCheck %s -check-prefix=NO-MULTIMEMORY diff --git a/clang/test/Preprocessor/wasm-target-features.c b/clang/test/Preprocessor/wasm-target-features.c index d5539163b3bf5a..c64d3a0aa22825 100644 --- a/clang/test/Preprocessor/wasm-target-features.c 
+++ b/clang/test/Preprocessor/wasm-target-features.c @@ -44,13 +44,13 @@ // EXTENDED-CONST: #define __wasm_extended_const__ 1{{$}} // RUN: %clang -E -dM %s -o - 2>&1 \ -// RUN: -target wasm32-unknown-unknown -mhalf-precision \ -// RUN: | FileCheck %s -check-prefix=HALF-PRECISION +// RUN: -target wasm32-unknown-unknown -mfp16 \ +// RUN: | FileCheck %s -check-prefix=FP16 // RUN: %clang -E -dM %s -o - 2>&1 \ -// RUN: -target wasm64-unknown-unknown -mhalf-precision \ -// RUN: | FileCheck %s -check-prefix=HALF-PRECISION +// RUN: -target wasm64-unknown-unknown -mfp16 \ +// RUN: | FileCheck %s -check-prefix=FP16 // -// HALF-PRECISION: #define __wasm_half_precision__ 1{{$}} +// FP16: #define __wasm_fp16__ 1{{$}} // RUN: %clang -E -dM %s -o - 2>&1 \ // RUN: -target wasm32-unknown-unknown -mmultimemory \ @@ -144,7 +144,7 @@ // MVP-NOT: #define __wasm_bulk_memory__ 1{{$}} // MVP-NOT: #define __wasm_exception_handling__ 1{{$}} // MVP-NOT: #define __wasm_extended_const__ 1{{$}} -// MVP-NOT: #define __wasm_half_precision__ 1{{$}} +// MVP-NOT: #define __wasm_fp16__ 1{{$}} // MVP-NOT: #define __wasm_multimemory__ 1{{$}} // MVP-NOT: #define __wasm_multivalue__ 1{{$}} // MVP-NOT: #define __wasm_mutable_globals__ 1{{$}} @@ -178,7 +178,7 @@ // GENERIC-NOT: #define __wasm_bulk_memory__ 1{{$}} // GENERIC-NOT: #define __wasm_exception_handling__ 1{{$}} // GENERIC-NOT: #define __wasm_extended_const__ 1{{$}} -// GENERIC-NOT: #define __wasm_half_precision__ 1{{$}} +// GENERIC-NOT: #define __wasm__fp16__ 1{{$}} // GENERIC-NOT: #define __wasm_multimemory__ 1{{$}} // GENERIC-NOT: #define __wasm_nontrapping_fptoint__ 1{{$}} // GENERIC-NOT: #define __wasm_relaxed_simd__ 1{{$}} @@ -196,7 +196,7 @@ // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_bulk_memory__ 1{{$}} // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_exception_handling__ 1{{$}} // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_extended_const__ 1{{$}} -// BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_half_precision__ 1{{$}} +// 
BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_fp16__ 1{{$}} // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_multimemory__ 1{{$}} // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_multivalue__ 1{{$}} // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_mutable_globals__ 1{{$}} diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td index 97618617ff82f7..c632d4a74355d8 100644 --- a/llvm/lib/Target/WebAssembly/WebAssembly.td +++ b/llvm/lib/Target/WebAssembly/WebAssembly.td @@ -37,9 +37,9 @@ def FeatureExtendedConst : SubtargetFeature<"extended-const", "HasExtendedConst", "true", "Enable extended const expressions">; -def FeatureHalfPrecision : - SubtargetFeature<"half-precision", "HasHalfPrecision", "true", - "Enable half precision instructions">; +def FeatureFP16 : + SubtargetFeature<"fp16", "HasFP16", "true", + "Enable FP16 instructions">; def FeatureMultiMemory : SubtargetFeature<"multimemory", "HasMultiMemory", "true", @@ -117,7 +117,7 @@ def : ProcessorModel<"generic", NoSchedModel, def : ProcessorModel<"bleeding-edge", NoSchedModel, [FeatureAtomics, FeatureBulkMemory, FeatureExceptionHandling, FeatureExtendedConst, - FeatureHalfPrecision, FeatureMultiMemory, + FeatureFP16, FeatureMultiMemory, FeatureMultivalue, FeatureMutableGlobals, FeatureNontrappingFPToInt, FeatureRelaxedSIMD, FeatureReferenceTypes, FeatureSIMD128, FeatureSignExt, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 563601b722c803..13d3e3e31dd45d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -70,7 +70,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( addRegisterClass(MVT::v2i64, &WebAssembly::V128RegClass); addRegisterClass(MVT::v2f64, &WebAssembly::V128RegClass); } - if (Subtarget->hasHalfPrecision()) { + if (Subtarget->hasFP16()) { addRegisterClass(MVT::v8f16, &WebAssembly::V128RegClass); } 
if (Subtarget->hasReferenceTypes()) { @@ -146,7 +146,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setTruncStoreAction(T, MVT::f16, Expand); } - if (Subtarget->hasHalfPrecision()) { + if (Subtarget->hasFP16()) { setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal); setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index bb36ce7650183f..767ac86f1351b5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -38,9 +38,9 @@ def HasExtendedConst : Predicate<"Subtarget->hasExtendedConst()">, AssemblerPredicate<(all_of FeatureExtendedConst), "extended-const">; -def HasHalfPrecision : - Predicate<"Subtarget->hasHalfPrecision()">, - AssemblerPredicate<(all_of FeatureHalfPrecision), "half-precision">; +def HasFP16 : + Predicate<"Subtarget->hasFP16()">, + AssemblerPredicate<(all_of FeatureFP16), "fp16">; def HasMultiMemory : Predicate<"Subtarget->hasMultiMemory()">, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index 9d452879bbf80b..0cbe9d0c6a6a40 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -74,7 +74,7 @@ defm LOAD32_U_I64 : WebAssemblyLoad; // Half-precision load. defm LOAD_F16_F32 : - WebAssemblyLoad; + WebAssemblyLoad; // Pattern matching @@ -174,7 +174,7 @@ defm STORE32_I64 : WebAssemblyStore; // Half-precision store. 
defm STORE_F16_F32 : - WebAssemblyStore; + WebAssemblyStore; defm : StorePat; defm : StorePat; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index a1697299ee424c..887278e9c12ef3 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -43,7 +43,7 @@ multiclass HALF_PRECISION_I pattern_r, string asmstr_r = "", string asmstr_s = "", bits<32> simdop = -1> { defm "" : ABSTRACT_SIMD_I; + asmstr_s, simdop, [HasFP16]>; } @@ -750,7 +750,7 @@ multiclass SIMDCondition simdop, multiclass HalfPrecisionCondition simdop> { - defm "" : SIMDCondition; + defm "" : SIMDCondition; } multiclass SIMDConditionInt baseInst> { @@ -832,7 +832,7 @@ multiclass SIMDBinary simdop> { - defm "" : SIMDBinary; + defm "" : SIMDBinary; } multiclass SIMDBitwise simdop, @@ -857,7 +857,7 @@ multiclass SIMDUnary simdop> { - defm "" : SIMDUnary; + defm "" : SIMDUnary; } // Bitwise logic: v128.not @@ -1355,7 +1355,7 @@ multiclass SIMDConvert simdop> { - defm "" : SIMDConvert; + defm "" : SIMDConvert; } // Floating point to integer with saturation: trunc_sat @@ -1532,7 +1532,7 @@ multiclass SIMDMADD simdopA, bits<32> simdopS, list defm "" : SIMDMADD; defm "" : SIMDMADD; -defm "" : SIMDMADD; +defm "" : SIMDMADD; //===----------------------------------------------------------------------===// // Laneselect diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h index 540da4b51ccaa9..f990120775d155 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h @@ -43,7 +43,7 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo { bool HasBulkMemory = false; bool HasExceptionHandling = false; bool HasExtendedConst = false; - bool HasHalfPrecision = false; + bool HasFP16 = false; bool HasMultiMemory = false; bool HasMultivalue = false; 
bool HasMutableGlobals = false; @@ -96,7 +96,7 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo { bool hasBulkMemory() const { return HasBulkMemory; } bool hasExceptionHandling() const { return HasExceptionHandling; } bool hasExtendedConst() const { return HasExtendedConst; } - bool hasHalfPrecision() const { return HasHalfPrecision; } + bool hasFP16() const { return HasFP16; } bool hasMultiMemory() const { return HasMultiMemory; } bool hasMultivalue() const { return HasMultivalue; } bool hasMutableGlobals() const { return HasMutableGlobals; } diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll index dba4138ad59cce..adba502335f86c 100644 --- a/llvm/test/CodeGen/WebAssembly/half-precision.ll +++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s --mtriple=wasm32-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+half-precision,+simd128 | FileCheck %s -; RUN: llc < %s --mtriple=wasm64-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+half-precision,+simd128 | FileCheck %s +; RUN: llc < %s --mtriple=wasm32-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128 | FileCheck %s +; RUN: llc < %s --mtriple=wasm64-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128 | FileCheck %s declare float @llvm.wasm.loadf32.f16(ptr) declare void @llvm.wasm.storef16.f32(float, ptr) diff --git a/llvm/test/CodeGen/WebAssembly/offset.ll b/llvm/test/CodeGen/WebAssembly/offset.ll index 763c60cef8183f..130508424f6304 100644 --- a/llvm/test/CodeGen/WebAssembly/offset.ll +++ b/llvm/test/CodeGen/WebAssembly/offset.ll @@ -1,4 +1,4 @@ -; RUN: llc 
< %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers -disable-wasm-fallthrough-return-opt -mattr=+half-precision | FileCheck %s +; RUN: llc < %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers -disable-wasm-fallthrough-return-opt -mattr=+fp16 | FileCheck %s ; Test constant load and store address offsets. diff --git a/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll b/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll index d93147505c1b01..77d1564409f78c 100644 --- a/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll +++ b/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll @@ -28,7 +28,7 @@ target triple = "wasm32-unknown-unknown" ; GENERIC-NEXT: .ascii "sign-ext" ; bleeding-edge: +atomics, +bulk-memory, +exception-handling, +extended-const, -; +half-precision, +multimemory, +multivalue, +mutable-globals, +; +fp16, +multimemory, +multivalue, +mutable-globals, ; +nontrapping-fptoint, +relaxed-simd, +reference-types, ; +simd128, +sign-ext, +tail-call ; BLEEDING-EDGE-LABEL: .section .custom_section.target_features,"",@ @@ -46,8 +46,8 @@ target triple = "wasm32-unknown-unknown" ; BLEEDING-EDGE-NEXT: .int8 14 ; BLEEDING-EDGE-NEXT: .ascii "extended-const" ; BLEEDING-EDGE-NEXT: .int8 43 -; BLEEDING-EDGE-NEXT: .int8 14 -; BLEEDING-EDGE-NEXT: .ascii "half-precision" +; BLEEDING-EDGE-NEXT: .int8 4 +; BLEEDING-EDGE-NEXT: .ascii "fp16" ; BLEEDING-EDGE-NEXT: .int8 43 ; BLEEDING-EDGE-NEXT: .int8 11 ; BLEEDING-EDGE-NEXT: .ascii "multimemory" diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s index 7ae4d47d888cf8..45335b348b7e8f 100644 --- a/llvm/test/MC/WebAssembly/simd-encodings.s +++ b/llvm/test/MC/WebAssembly/simd-encodings.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc -no-type-check -show-encoding -triple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd,+half-precision < %s | FileCheck %s +# RUN: llvm-mc -no-type-check -show-encoding -triple=wasm32-unknown-unknown 
-mattr=+simd128,+relaxed-simd,+fp16 < %s | FileCheck %s main: .functype main () -> () From bc860b49a86089bf9bb7ada9927f2027e6ad9096 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Thu, 22 Aug 2024 09:55:24 -0700 Subject: [PATCH 233/426] [NFC] [SCCP] remove unused functions (#105603) --- llvm/include/llvm/Transforms/Utils/SCCPSolver.h | 3 --- llvm/lib/Transforms/Utils/SCCPSolver.cpp | 8 -------- 2 files changed, 11 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h index 9f7ccd4a8a32cf..1f959311295258 100644 --- a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h +++ b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h @@ -167,9 +167,6 @@ class SCCPSolver { /// Return either a Constant or nullptr for a given Value. Constant *getConstantOrNull(Value *V) const; - /// Return a reference to the set of argument tracked functions. - SmallPtrSetImpl &getArgumentTrackedFunctions(); - /// Set the Lattice Value for the arguments of a specialization \p F. /// If an argument is Constant then its lattice value is marked with the /// corresponding actual argument in \p Args. 
Otherwise, its lattice value diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index c944859cc69b89..40f0f04c323ddc 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -829,10 +829,6 @@ class SCCPInstVisitor : public InstVisitor { Constant *getConstantOrNull(Value *V) const; - SmallPtrSetImpl &getArgumentTrackedFunctions() { - return TrackingIncomingArguments; - } - void setLatticeValueForSpecializationArguments(Function *F, const SmallVectorImpl &Args); @@ -2157,10 +2153,6 @@ Constant *SCCPSolver::getConstantOrNull(Value *V) const { return Visitor->getConstantOrNull(V); } -SmallPtrSetImpl &SCCPSolver::getArgumentTrackedFunctions() { - return Visitor->getArgumentTrackedFunctions(); -} - void SCCPSolver::setLatticeValueForSpecializationArguments(Function *F, const SmallVectorImpl &Args) { Visitor->setLatticeValueForSpecializationArguments(F, Args); From b9c4c4ccf921c0481d51d4e0c9e862aa9ea3fcf3 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Thu, 22 Aug 2024 19:06:09 +0200 Subject: [PATCH 234/426] [clang][bytecode] Fix 'if consteval' in non-constant contexts (#104707) The previous code made this a compile-time decision but it's not. --- clang/lib/AST/ByteCode/Compiler.cpp | 20 +++++++++++++------- clang/lib/AST/ByteCode/Interp.h | 5 +++++ clang/lib/AST/ByteCode/Opcodes.td | 2 ++ clang/test/CodeGenCXX/cxx2b-consteval-if.cpp | 1 + 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 3a3927a9671345..655983a1ca0494 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -4385,11 +4385,6 @@ bool Compiler::visitReturnStmt(const ReturnStmt *RS) { } template bool Compiler::visitIfStmt(const IfStmt *IS) { - if (IS->isNonNegatedConsteval()) - return visitStmt(IS->getThen()); - if (IS->isNegatedConsteval()) - return IS->getElse() ? 
visitStmt(IS->getElse()) : true; - if (auto *CondInit = IS->getInit()) if (!visitStmt(CondInit)) return false; @@ -4398,8 +4393,19 @@ template bool Compiler::visitIfStmt(const IfStmt *IS) { if (!visitDeclStmt(CondDecl)) return false; - if (!this->visitBool(IS->getCond())) - return false; + // Compile condition. + if (IS->isNonNegatedConsteval()) { + if (!this->emitIsConstantContext(IS)) + return false; + } else if (IS->isNegatedConsteval()) { + if (!this->emitIsConstantContext(IS)) + return false; + if (!this->emitInv(IS)) + return false; + } else { + if (!this->visitBool(IS->getCond())) + return false; + } if (const Stmt *Else = IS->getElse()) { LabelTy LabelElse = this->getLabel(); diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index fd4406c0db2b88..7ba51f737db491 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -3076,6 +3076,11 @@ static inline bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm) { BlockDesc, Source); } +static inline bool IsConstantContext(InterpState &S, CodePtr OpPC) { + S.Stk.push(Boolean::from(S.inConstantContext())); + return true; +} + inline bool CheckLiteralType(InterpState &S, CodePtr OpPC, const Type *T) { assert(T); assert(!S.getLangOpts().CPlusPlus23); diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 61319e1633d9ad..7374a441c8bb19 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -780,3 +780,5 @@ def AllocCN : Opcode { def Free : Opcode { let Args = [ArgBool]; } + +def IsConstantContext: Opcode; diff --git a/clang/test/CodeGenCXX/cxx2b-consteval-if.cpp b/clang/test/CodeGenCXX/cxx2b-consteval-if.cpp index 343b6a0bbd8a6b..a6aa862b975ebe 100644 --- a/clang/test/CodeGenCXX/cxx2b-consteval-if.cpp +++ b/clang/test/CodeGenCXX/cxx2b-consteval-if.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -std=c++23 %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -std=c++23 %s -emit-llvm -o - 
-fexperimental-new-constant-interpreter | FileCheck %s void should_be_used_1(); void should_be_used_2(); From 4a2a1b51cb6b88820e28019040fb78d0c82685ab Mon Sep 17 00:00:00 2001 From: Vladimir Vereschaka Date: Thu, 22 Aug 2024 10:09:03 -0700 Subject: [PATCH 235/426] [libc++] Adjust armv7 XFAIL target triple for the setfill_wchar_max test. (#105586) Also allow XFAIL for armv7-*-linux-gnueabihf targets, not only for armv7l-*. --- .../iostream.format/std.manip/setfill_wchar_max.pass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp b/libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp index d220a5c36a23bb..9d4126153cc235 100644 --- a/libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp +++ b/libcxx/test/std/input.output/iostream.format/std.manip/setfill_wchar_max.pass.cpp @@ -15,7 +15,7 @@ // version 2 implementation fixes the problem. // XFAIL: target={{.*}}-windows{{.*}} && libcpp-abi-version=1 -// XFAIL: target=armv{{7|8}}l{{.*}}-linux-gnueabihf && libcpp-abi-version=1 +// XFAIL: target=armv{{7|8}}{{l?}}{{.*}}-linux-gnueabihf && libcpp-abi-version=1 // XFAIL: target=aarch64{{.*}}-linux-gnu && libcpp-abi-version=1 #include From c1e401f3624780f85f4c9a26960752ee3f37fafb Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Thu, 22 Aug 2024 10:10:15 -0700 Subject: [PATCH 236/426] [lldb] Change the two remaining SInt64 settings in Target to uint (#105460) TargetProperties.td had a few settings listed as signed integral values, but the Target.cpp methods reading those values were reading them as unsigned. e.g. target.max-memory-read-size, some accesses of target.max-children-count, still today, previously target.max-string-summary-length. 
After Jonas' change to use templates to read these values in https://reviews.llvm.org/D149774, when the code tried to fetch these values, we'd eventually end up calling OptionValue::GetAsUInt64 which checks that the value is actually a UInt64 before returning it; finding that it was an SInt64, it would drop the user setting and return the default value. This manifested as a bug that target.max-memory-read-size is never used for memory read. target.max-children-count is less straightforward, where one read of that setting was fetching it as an int64_t, the other as a uint64_t. I suspect all of these settings were originally marked as SInt64 so a user could do -1 for "infinite", getting it static_cast to a UINT64_MAX value along the way. I can't find any documentation for this behavior, but it seems like something Greg would have done. We've partially lost that behavior already via https://github.com/llvm/llvm-project/pull/72233 for target.max-string-summary-length, and this further removes it. We're still fetching UInt64's and returning them as uint32_t's but I'm not overly pressed about someone setting a count/size limit over 4GB. I added a simple API test for the memory read setting limit. 
--- lldb/source/Target/Target.cpp | 2 +- lldb/source/Target/TargetProperties.td | 4 +-- .../TestDataFormatterGenericForwardList.py | 5 ++- .../functionalities/memory/big-read/Makefile | 3 ++ .../big-read/TestMemoryReadMaximumSize.py | 31 +++++++++++++++++++ .../functionalities/memory/big-read/main.c | 9 ++++++ 6 files changed, 48 insertions(+), 6 deletions(-) create mode 100644 lldb/test/API/functionalities/memory/big-read/Makefile create mode 100644 lldb/test/API/functionalities/memory/big-read/TestMemoryReadMaximumSize.py create mode 100644 lldb/test/API/functionalities/memory/big-read/main.c diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 5a5d689e03fbc0..260974bddedf3a 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -4609,7 +4609,7 @@ uint32_t TargetProperties::GetMaxZeroPaddingInFloatFormat() const { uint32_t TargetProperties::GetMaximumNumberOfChildrenToDisplay() const { const uint32_t idx = ePropertyMaxChildrenCount; - return GetPropertyAtIndexAs( + return GetPropertyAtIndexAs( idx, g_target_properties[idx].default_uint_value); } diff --git a/lldb/source/Target/TargetProperties.td b/lldb/source/Target/TargetProperties.td index 421252aa4aea26..7bb5bd53688b14 100644 --- a/lldb/source/Target/TargetProperties.td +++ b/lldb/source/Target/TargetProperties.td @@ -92,7 +92,7 @@ let Definition = "target" in { def MaxZeroPaddingInFloatFormat: Property<"max-zero-padding-in-float-format", "UInt64">, DefaultUnsignedValue<6>, Desc<"The maximum number of zeroes to insert when displaying a very small float before falling back to scientific notation.">; - def MaxChildrenCount: Property<"max-children-count", "SInt64">, + def MaxChildrenCount: Property<"max-children-count", "UInt64">, DefaultUnsignedValue<256>, Desc<"Maximum number of children to expand in any level of depth.">; def MaxChildrenDepth: Property<"max-children-depth", "UInt64">, @@ -101,7 +101,7 @@ let Definition = "target" in { def 
MaxSummaryLength: Property<"max-string-summary-length", "UInt64">, DefaultUnsignedValue<1024>, Desc<"Maximum number of characters to show when using %s in summary strings.">; - def MaxMemReadSize: Property<"max-memory-read-size", "SInt64">, + def MaxMemReadSize: Property<"max-memory-read-size", "UInt64">, DefaultUnsignedValue<1024>, Desc<"Maximum number of bytes that 'memory read' will fetch before --force must be specified.">; def BreakpointUseAvoidList: Property<"breakpoints-use-platform-avoid-list", "Boolean">, diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py index 072a580afe24e4..185a24cf6dce30 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py @@ -2,7 +2,6 @@ Test lldb data formatter subsystem. 
""" - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * @@ -51,7 +50,7 @@ def do_test(self, stdlib_type): self.expect( "settings show target.max-children-count", matching=True, - substrs=["target.max-children-count (int) = 256"], + substrs=["target.max-children-count (unsigned) = 256"], ) self.expect( @@ -132,7 +131,7 @@ def do_test_ptr_and_ref(self, stdlib_type): self.expect( "settings show target.max-children-count", matching=True, - substrs=["target.max-children-count (int) = 256"], + substrs=["target.max-children-count (unsigned) = 256"], ) self.expect( diff --git a/lldb/test/API/functionalities/memory/big-read/Makefile b/lldb/test/API/functionalities/memory/big-read/Makefile new file mode 100644 index 00000000000000..10495940055b63 --- /dev/null +++ b/lldb/test/API/functionalities/memory/big-read/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/functionalities/memory/big-read/TestMemoryReadMaximumSize.py b/lldb/test/API/functionalities/memory/big-read/TestMemoryReadMaximumSize.py new file mode 100644 index 00000000000000..259fde71a63626 --- /dev/null +++ b/lldb/test/API/functionalities/memory/big-read/TestMemoryReadMaximumSize.py @@ -0,0 +1,31 @@ +""" +Test the maximum memory read setting. 
+""" + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +import lldbsuite.test.lldbutil as lldbutil + + +class TestMemoryReadMaximumSize(TestBase): + def test_memory_read_max_setting(self): + """Test the target.max-memory-read-size setting.""" + self.build() + ( + self.target, + self.process, + self.thread, + self.bp, + ) = lldbutil.run_to_source_breakpoint( + self, "breakpoint here", lldb.SBFileSpec("main.c") + ) + self.assertTrue(self.bp.IsValid()) + + self.expect( + "mem rea -f x -s 4 -c 2048 `&c`", + error=True, + substrs=["Normally, 'memory read' will not read over 1024 bytes of data"], + ) + self.runCmd("settings set target.max-memory-read-size `2048 * sizeof(int)`") + self.expect("mem rea -f x -s 4 -c 2048 `&c`", substrs=["feed"]) diff --git a/lldb/test/API/functionalities/memory/big-read/main.c b/lldb/test/API/functionalities/memory/big-read/main.c new file mode 100644 index 00000000000000..a9143a50d093b8 --- /dev/null +++ b/lldb/test/API/functionalities/memory/big-read/main.c @@ -0,0 +1,9 @@ +#include +int main() { + int c[2048]; + memset(c, 0, 2048 * sizeof(int)); + + c[2047] = 0xfeed; + + return c[2047]; // breakpoint here +} From 6b11573b8c5e3d36beee099dbe7347c2a007bf53 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Wed, 21 Aug 2024 10:30:05 -0700 Subject: [PATCH 237/426] Recommit "[FunctionAttrs] deduce attr `cold` on functions if all CG paths call a `cold` function" Fixed up the uar test that was failing. It seems with the new `cold` attribute the order of the functions is different. As far as I can tell this is not a concern. 
Closes #105559 --- compiler-rt/test/metadata/uar.cpp | 30 +- llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 69 +++ llvm/test/Transforms/FunctionAttrs/cold.ll | 542 ++++++++++++++------- 3 files changed, 448 insertions(+), 193 deletions(-) diff --git a/compiler-rt/test/metadata/uar.cpp b/compiler-rt/test/metadata/uar.cpp index cbafe462c3643c..2c5537b815a93c 100644 --- a/compiler-rt/test/metadata/uar.cpp +++ b/compiler-rt/test/metadata/uar.cpp @@ -2,7 +2,7 @@ // RUN: %clangxx %s -O1 -o %t -fexperimental-sanitize-metadata=covered,uar -fsanitize=address,signed-integer-overflow,alignment && %t | FileCheck %s // RUN: %clangxx %s -O1 -o %t -mcmodel=large -fexperimental-sanitize-metadata=covered,uar -fsanitize=address,signed-integer-overflow,alignment && %t | FileCheck %s -// CHECK: metadata add version 2 +// CHECK-DAG: metadata add version 2 __attribute__((noinline, not_tail_called)) void escape(const volatile void *p) { [[maybe_unused]] static const volatile void *sink; @@ -14,51 +14,51 @@ __attribute__((noinline, not_tail_called)) void use(int x) { sink += x; } -// CHECK: empty: features=0 stack_args=0 +// CHECK-DAG: empty: features=0 stack_args=0 void empty() {} -// CHECK: simple: features=0 stack_args=0 +// CHECK-DAG: simple: features=0 stack_args=0 int simple(int *data, int index) { return data[index + 1]; } -// CHECK: builtins: features=0 stack_args=0 +// CHECK-DAG: builtins: features=0 stack_args=0 int builtins() { int x = 0; __builtin_prefetch(&x); return x; } -// CHECK: ellipsis: features=0 stack_args=0 +// CHECK-DAG: ellipsis: features=0 stack_args=0 void ellipsis(const char *fmt, ...) 
{ int x; escape(&x); } -// CHECK: non_empty_function: features=2 stack_args=0 +// CHECK-DAG: non_empty_function: features=2 stack_args=0 void non_empty_function() { int x; escape(&x); } -// CHECK: no_stack_args: features=2 stack_args=0 +// CHECK-DAG: no_stack_args: features=2 stack_args=0 void no_stack_args(long a0, long a1, long a2, long a3, long a4, long a5) { int x; escape(&x); } -// CHECK: stack_args: features=6 stack_args=16 +// CHECK-DAG: stack_args: features=6 stack_args=16 void stack_args(long a0, long a1, long a2, long a3, long a4, long a5, long a6) { int x; escape(&x); } -// CHECK: more_stack_args: features=6 stack_args=32 +// CHECK-DAG: more_stack_args: features=6 stack_args=32 void more_stack_args(long a0, long a1, long a2, long a3, long a4, long a5, long a6, long a7, long a8) { int x; escape(&x); } -// CHECK: struct_stack_args: features=6 stack_args=144 +// CHECK-DAG: struct_stack_args: features=6 stack_args=144 struct large { char x[131]; }; @@ -69,28 +69,28 @@ void struct_stack_args(large a) { __attribute__((noinline)) int tail_called(int x) { return x; } -// CHECK: with_tail_call: features=2 +// CHECK-DAG: with_tail_call: features=2 int with_tail_call(int x) { [[clang::musttail]] return tail_called(x); } __attribute__((noinline, noreturn)) int noreturn(int x) { __builtin_trap(); } -// CHECK: with_noreturn_tail_call: features=0 +// CHECK-DAG: with_noreturn_tail_call: features=0 int with_noreturn_tail_call(int x) { return noreturn(x); } -// CHECK: local_array: features=0 +// CHECK-DAG: local_array: features=0 void local_array(int x) { int data[10]; use(data[x]); } -// CHECK: local_alloca: features=0 +// CHECK-DAG: local_alloca: features=0 void local_alloca(int size, int i, int j) { volatile int *p = static_cast(__builtin_alloca(size)); p[i] = 0; use(p[j]); } -// CHECK: escaping_alloca: features=2 +// CHECK-DAG: escaping_alloca: features=2 void escaping_alloca(int size, int i) { volatile int *p = static_cast(__builtin_alloca(size)); escape(&p[i]); diff 
--git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index d50218aaa3b6cc..603a1565e48c45 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -82,6 +82,7 @@ STATISTIC(NumNoUnwind, "Number of functions marked as nounwind"); STATISTIC(NumNoFree, "Number of functions marked as nofree"); STATISTIC(NumWillReturn, "Number of functions marked as willreturn"); STATISTIC(NumNoSync, "Number of functions marked as nosync"); +STATISTIC(NumCold, "Number of functions marked as cold"); STATISTIC(NumThinLinkNoRecurse, "Number of functions marked as norecurse during thinlink"); @@ -1745,6 +1746,7 @@ static bool canReturn(Function &F) { return false; } + // Set the noreturn function attribute if possible. static void addNoReturnAttrs(const SCCNodeSet &SCCNodes, SmallSet &Changed) { @@ -1760,6 +1762,72 @@ static void addNoReturnAttrs(const SCCNodeSet &SCCNodes, } } +static bool +allBBPathsGoThroughCold(BasicBlock *BB, + SmallDenseMap &Visited) { + // If BB contains a cold callsite this path through the CG is cold. + // Ignore whether the instructions actually are guranteed to transfer + // execution. Divergent behavior is considered unlikely. + if (any_of(*BB, [](Instruction &I) { + if (auto *CB = dyn_cast(&I)) + return CB->hasFnAttr(Attribute::Cold); + return false; + })) { + Visited[BB] = true; + return true; + } + + auto Succs = successors(BB); + // We found a path that doesn't go through any cold callsite. + if (Succs.empty()) + return false; + + // We didn't find a cold callsite in this BB, so check that all successors + // contain a cold callsite (or that their successors do). + // Potential TODO: We could use static branch hints to assume certain + // successor paths are inherently cold, irrespective of if they contain a cold + // callsite. + for (auto *Succ : Succs) { + // Start with false, this is necessary to ensure we don't turn loops into + // cold. 
+ auto R = Visited.try_emplace(Succ, false); + if (!R.second) { + if (R.first->second) + continue; + return false; + } + if (!allBBPathsGoThroughCold(Succ, Visited)) + return false; + Visited[Succ] = true; + } + + return true; +} + +static bool allPathsGoThroughCold(Function &F) { + SmallDenseMap Visited; + Visited[&F.front()] = false; + return allBBPathsGoThroughCold(&F.front(), Visited); +} + +// Set the cold function attribute if possible. +static void addColdAttrs(const SCCNodeSet &SCCNodes, + SmallSet &Changed) { + for (Function *F : SCCNodes) { + if (!F || !F->hasExactDefinition() || F->hasFnAttribute(Attribute::Naked) || + F->hasFnAttribute(Attribute::Cold) || F->hasFnAttribute(Attribute::Hot)) + continue; + + // Potential TODO: We could add attribute `cold` on functions with `coldcc`. + if (allPathsGoThroughCold(*F)) { + F->addFnAttr(Attribute::Cold); + ++NumCold; + Changed.insert(F); + continue; + } + } +} + static bool functionWillReturn(const Function &F) { // We can infer and propagate function attributes only when we know that the // definition we'll get at link time is *exactly* the definition we see now. 
@@ -1853,6 +1921,7 @@ deriveAttrsInPostOrder(ArrayRef Functions, AARGetterT &&AARGetter, addArgumentAttrs(Nodes.SCCNodes, Changed); inferConvergent(Nodes.SCCNodes, Changed); addNoReturnAttrs(Nodes.SCCNodes, Changed); + addColdAttrs(Nodes.SCCNodes, Changed); addWillReturn(Nodes.SCCNodes, Changed); addNoUndefAttrs(Nodes.SCCNodes, Changed); diff --git a/llvm/test/Transforms/FunctionAttrs/cold.ll b/llvm/test/Transforms/FunctionAttrs/cold.ll index 1fa8ae06797943..a205fbda062121 100644 --- a/llvm/test/Transforms/FunctionAttrs/cold.ll +++ b/llvm/test/Transforms/FunctionAttrs/cold.ll @@ -54,14 +54,23 @@ while.body2: } define void @test_no_exit() { -; COMMON: Function Attrs: noreturn -; COMMON-LABEL: define void @test_no_exit -; COMMON-SAME: () #[[ATTR2]] { -; COMMON-NEXT: entry: -; COMMON-NEXT: br label [[WHILE_BODY:%.*]] -; COMMON: while.body: -; COMMON-NEXT: call void @cold0() -; COMMON-NEXT: br label [[WHILE_BODY]] +; FNATTRS: Function Attrs: cold noreturn +; FNATTRS-LABEL: define void @test_no_exit +; FNATTRS-SAME: () #[[ATTR3:[0-9]+]] { +; FNATTRS-NEXT: entry: +; FNATTRS-NEXT: br label [[WHILE_BODY:%.*]] +; FNATTRS: while.body: +; FNATTRS-NEXT: call void @cold0() +; FNATTRS-NEXT: br label [[WHILE_BODY]] +; +; ATTRIBUTOR: Function Attrs: noreturn +; ATTRIBUTOR-LABEL: define void @test_no_exit +; ATTRIBUTOR-SAME: () #[[ATTR2]] { +; ATTRIBUTOR-NEXT: entry: +; ATTRIBUTOR-NEXT: br label [[WHILE_BODY:%.*]] +; ATTRIBUTOR: while.body: +; ATTRIBUTOR-NEXT: call void @cold0() +; ATTRIBUTOR-NEXT: br label [[WHILE_BODY]] ; entry: br label %while.body @@ -72,17 +81,29 @@ while.body: } define void @test_no_exit2() { -; COMMON: Function Attrs: noreturn -; COMMON-LABEL: define void @test_no_exit2 -; COMMON-SAME: () #[[ATTR2]] { -; COMMON-NEXT: entry: -; COMMON-NEXT: br label [[WHILE_BODY:%.*]] -; COMMON: while.body: -; COMMON-NEXT: call void @not_cold0() -; COMMON-NEXT: br label [[WHILE_BODY2:%.*]] -; COMMON: while.body2: -; COMMON-NEXT: call void @cold1() -; COMMON-NEXT: br label 
[[WHILE_BODY]] +; FNATTRS: Function Attrs: cold noreturn +; FNATTRS-LABEL: define void @test_no_exit2 +; FNATTRS-SAME: () #[[ATTR3]] { +; FNATTRS-NEXT: entry: +; FNATTRS-NEXT: br label [[WHILE_BODY:%.*]] +; FNATTRS: while.body: +; FNATTRS-NEXT: call void @not_cold0() +; FNATTRS-NEXT: br label [[WHILE_BODY2:%.*]] +; FNATTRS: while.body2: +; FNATTRS-NEXT: call void @cold1() +; FNATTRS-NEXT: br label [[WHILE_BODY]] +; +; ATTRIBUTOR: Function Attrs: noreturn +; ATTRIBUTOR-LABEL: define void @test_no_exit2 +; ATTRIBUTOR-SAME: () #[[ATTR2]] { +; ATTRIBUTOR-NEXT: entry: +; ATTRIBUTOR-NEXT: br label [[WHILE_BODY:%.*]] +; ATTRIBUTOR: while.body: +; ATTRIBUTOR-NEXT: call void @not_cold0() +; ATTRIBUTOR-NEXT: br label [[WHILE_BODY2:%.*]] +; ATTRIBUTOR: while.body2: +; ATTRIBUTOR-NEXT: call void @cold1() +; ATTRIBUTOR-NEXT: br label [[WHILE_BODY]] ; entry: br label %while.body @@ -97,18 +118,32 @@ while.body2: } define dso_local void @test_entry(i32 noundef %x) { -; COMMON-LABEL: define dso_local void @test_entry -; COMMON-SAME: (i32 noundef [[X:%.*]]) { -; COMMON-NEXT: entry: -; COMMON-NEXT: tail call void @cold0() -; COMMON-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 -; COMMON-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -; COMMON: if.then: -; COMMON-NEXT: tail call void @not_cold0() -; COMMON-NEXT: br label [[IF_END]] -; COMMON: if.end: -; COMMON-NEXT: tail call void @not_cold1() -; COMMON-NEXT: ret void +; FNATTRS: Function Attrs: cold +; FNATTRS-LABEL: define dso_local void @test_entry +; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; FNATTRS-NEXT: entry: +; FNATTRS-NEXT: tail call void @cold0() +; FNATTRS-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 +; FNATTRS-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; FNATTRS: if.then: +; FNATTRS-NEXT: tail call void @not_cold0() +; FNATTRS-NEXT: br label [[IF_END]] +; FNATTRS: if.end: +; FNATTRS-NEXT: tail call void @not_cold1() +; FNATTRS-NEXT: ret void +; 
+; ATTRIBUTOR-LABEL: define dso_local void @test_entry +; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) { +; ATTRIBUTOR-NEXT: entry: +; ATTRIBUTOR-NEXT: tail call void @cold0() +; ATTRIBUTOR-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 +; ATTRIBUTOR-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; ATTRIBUTOR: if.then: +; ATTRIBUTOR-NEXT: tail call void @not_cold0() +; ATTRIBUTOR-NEXT: br label [[IF_END]] +; ATTRIBUTOR: if.end: +; ATTRIBUTOR-NEXT: tail call void @not_cold1() +; ATTRIBUTOR-NEXT: ret void ; entry: tail call void @cold0() @@ -125,12 +160,19 @@ if.end: } define dso_local void @test_hot_fail(i32 noundef %x) hot { -; COMMON: Function Attrs: hot -; COMMON-LABEL: define dso_local void @test_hot_fail -; COMMON-SAME: (i32 noundef [[X:%.*]]) #[[ATTR3:[0-9]+]] { -; COMMON-NEXT: entry: -; COMMON-NEXT: tail call void @cold0() -; COMMON-NEXT: ret void +; FNATTRS: Function Attrs: hot +; FNATTRS-LABEL: define dso_local void @test_hot_fail +; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR4:[0-9]+]] { +; FNATTRS-NEXT: entry: +; FNATTRS-NEXT: tail call void @cold0() +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR: Function Attrs: hot +; ATTRIBUTOR-LABEL: define dso_local void @test_hot_fail +; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) #[[ATTR3:[0-9]+]] { +; ATTRIBUTOR-NEXT: entry: +; ATTRIBUTOR-NEXT: tail call void @cold0() +; ATTRIBUTOR-NEXT: ret void ; entry: tail call void @cold0() @@ -138,19 +180,34 @@ entry: } define dso_local void @test_br2(i32 noundef %x) { -; COMMON-LABEL: define dso_local void @test_br2 -; COMMON-SAME: (i32 noundef [[X:%.*]]) { -; COMMON-NEXT: entry: -; COMMON-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 -; COMMON-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] -; COMMON: if.then: -; COMMON-NEXT: tail call void @cold0() -; COMMON-NEXT: br label [[IF_END:%.*]] -; COMMON: if.else: -; COMMON-NEXT: tail call void @cold1() -; COMMON-NEXT: br label [[IF_END]] -; COMMON: if.end: -; COMMON-NEXT: ret void +; 
FNATTRS: Function Attrs: cold +; FNATTRS-LABEL: define dso_local void @test_br2 +; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-NEXT: entry: +; FNATTRS-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 +; FNATTRS-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; FNATTRS: if.then: +; FNATTRS-NEXT: tail call void @cold0() +; FNATTRS-NEXT: br label [[IF_END:%.*]] +; FNATTRS: if.else: +; FNATTRS-NEXT: tail call void @cold1() +; FNATTRS-NEXT: br label [[IF_END]] +; FNATTRS: if.end: +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR-LABEL: define dso_local void @test_br2 +; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) { +; ATTRIBUTOR-NEXT: entry: +; ATTRIBUTOR-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 +; ATTRIBUTOR-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; ATTRIBUTOR: if.then: +; ATTRIBUTOR-NEXT: tail call void @cold0() +; ATTRIBUTOR-NEXT: br label [[IF_END:%.*]] +; ATTRIBUTOR: if.else: +; ATTRIBUTOR-NEXT: tail call void @cold1() +; ATTRIBUTOR-NEXT: br label [[IF_END]] +; ATTRIBUTOR: if.end: +; ATTRIBUTOR-NEXT: ret void ; entry: %tobool.not = icmp eq i32 %x, 0 @@ -169,21 +226,38 @@ if.end: } define dso_local void @test_exit(i32 noundef %x) { -; COMMON-LABEL: define dso_local void @test_exit -; COMMON-SAME: (i32 noundef [[X:%.*]]) { -; COMMON-NEXT: entry: -; COMMON-NEXT: tail call void @not_cold0() -; COMMON-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 -; COMMON-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] -; COMMON: if.then: -; COMMON-NEXT: tail call void @not_cold1() -; COMMON-NEXT: br label [[IF_END:%.*]] -; COMMON: if.else: -; COMMON-NEXT: tail call void @not_cold2() -; COMMON-NEXT: br label [[IF_END]] -; COMMON: if.end: -; COMMON-NEXT: tail call void @cold0() -; COMMON-NEXT: ret void +; FNATTRS: Function Attrs: cold +; FNATTRS-LABEL: define dso_local void @test_exit +; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-NEXT: entry: +; FNATTRS-NEXT: tail call 
void @not_cold0() +; FNATTRS-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 +; FNATTRS-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; FNATTRS: if.then: +; FNATTRS-NEXT: tail call void @not_cold1() +; FNATTRS-NEXT: br label [[IF_END:%.*]] +; FNATTRS: if.else: +; FNATTRS-NEXT: tail call void @not_cold2() +; FNATTRS-NEXT: br label [[IF_END]] +; FNATTRS: if.end: +; FNATTRS-NEXT: tail call void @cold0() +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR-LABEL: define dso_local void @test_exit +; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) { +; ATTRIBUTOR-NEXT: entry: +; ATTRIBUTOR-NEXT: tail call void @not_cold0() +; ATTRIBUTOR-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 +; ATTRIBUTOR-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; ATTRIBUTOR: if.then: +; ATTRIBUTOR-NEXT: tail call void @not_cold1() +; ATTRIBUTOR-NEXT: br label [[IF_END:%.*]] +; ATTRIBUTOR: if.else: +; ATTRIBUTOR-NEXT: tail call void @not_cold2() +; ATTRIBUTOR-NEXT: br label [[IF_END]] +; ATTRIBUTOR: if.end: +; ATTRIBUTOR-NEXT: tail call void @cold0() +; ATTRIBUTOR-NEXT: ret void ; entry: tail call void @not_cold0() @@ -204,54 +278,104 @@ if.end: } define dso_local void @test_complex(i32 noundef %x) { -; COMMON-LABEL: define dso_local void @test_complex -; COMMON-SAME: (i32 noundef [[X:%.*]]) { -; COMMON-NEXT: entry: -; COMMON-NEXT: tail call void @not_cold0() -; COMMON-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 -; COMMON-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE11:%.*]], label [[IF_THEN:%.*]] -; COMMON: if.then: -; COMMON-NEXT: [[CALL:%.*]] = tail call i32 @get_val() -; COMMON-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[CALL]], 0 -; COMMON-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN2:%.*]] -; COMMON: if.then2: -; COMMON-NEXT: tail call void @cold1() -; COMMON-NEXT: br label [[IF_END12:%.*]] -; COMMON: if.else: -; COMMON-NEXT: [[CALL3:%.*]] = tail call i32 @get_val() -; COMMON-NEXT: [[TOBOOL4_NOT:%.*]] = icmp eq i32 
[[CALL3]], 0 -; COMMON-NEXT: br i1 [[TOBOOL4_NOT]], label [[IF_ELSE6:%.*]], label [[IF_THEN5:%.*]] -; COMMON: if.then5: -; COMMON-NEXT: tail call void @cold0() -; COMMON-NEXT: br label [[IF_END12]] -; COMMON: if.else6: -; COMMON-NEXT: tail call void @not_cold0() -; COMMON-NEXT: [[CALL7:%.*]] = tail call i32 @get_val() -; COMMON-NEXT: switch i32 [[CALL7]], label [[SW_DEFAULT:%.*]] [ -; COMMON-NEXT: i32 0, label [[SW_BB:%.*]] -; COMMON-NEXT: i32 1, label [[SW_BB8:%.*]] -; COMMON-NEXT: i32 2, label [[SW_BB9:%.*]] -; COMMON-NEXT: ] -; COMMON: sw.bb: -; COMMON-NEXT: tail call void @not_cold0() -; COMMON-NEXT: br label [[CALL_COLD:%.*]] -; COMMON: sw.bb8: -; COMMON-NEXT: tail call void @not_cold1() -; COMMON-NEXT: br label [[CALL_COLD]] -; COMMON: sw.bb9: -; COMMON-NEXT: tail call void @not_cold2() -; COMMON-NEXT: br label [[CALL_COLD]] -; COMMON: sw.default: -; COMMON-NEXT: tail call void @cold0() -; COMMON-NEXT: br label [[IF_END12]] -; COMMON: call_cold: -; COMMON-NEXT: tail call void @cold_at_cb() #[[ATTR0:[0-9]+]] -; COMMON-NEXT: br label [[IF_END12]] -; COMMON: if.else11: -; COMMON-NEXT: tail call void @cold0() -; COMMON-NEXT: br label [[IF_END12]] -; COMMON: if.end12: -; COMMON-NEXT: ret void +; FNATTRS: Function Attrs: cold +; FNATTRS-LABEL: define dso_local void @test_complex +; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-NEXT: entry: +; FNATTRS-NEXT: tail call void @not_cold0() +; FNATTRS-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 +; FNATTRS-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE11:%.*]], label [[IF_THEN:%.*]] +; FNATTRS: if.then: +; FNATTRS-NEXT: [[CALL:%.*]] = tail call i32 @get_val() +; FNATTRS-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[CALL]], 0 +; FNATTRS-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN2:%.*]] +; FNATTRS: if.then2: +; FNATTRS-NEXT: tail call void @cold1() +; FNATTRS-NEXT: br label [[IF_END12:%.*]] +; FNATTRS: if.else: +; FNATTRS-NEXT: [[CALL3:%.*]] = tail call i32 @get_val() +; FNATTRS-NEXT: 
[[TOBOOL4_NOT:%.*]] = icmp eq i32 [[CALL3]], 0 +; FNATTRS-NEXT: br i1 [[TOBOOL4_NOT]], label [[IF_ELSE6:%.*]], label [[IF_THEN5:%.*]] +; FNATTRS: if.then5: +; FNATTRS-NEXT: tail call void @cold0() +; FNATTRS-NEXT: br label [[IF_END12]] +; FNATTRS: if.else6: +; FNATTRS-NEXT: tail call void @not_cold0() +; FNATTRS-NEXT: [[CALL7:%.*]] = tail call i32 @get_val() +; FNATTRS-NEXT: switch i32 [[CALL7]], label [[SW_DEFAULT:%.*]] [ +; FNATTRS-NEXT: i32 0, label [[SW_BB:%.*]] +; FNATTRS-NEXT: i32 1, label [[SW_BB8:%.*]] +; FNATTRS-NEXT: i32 2, label [[SW_BB9:%.*]] +; FNATTRS-NEXT: ] +; FNATTRS: sw.bb: +; FNATTRS-NEXT: tail call void @not_cold0() +; FNATTRS-NEXT: br label [[CALL_COLD:%.*]] +; FNATTRS: sw.bb8: +; FNATTRS-NEXT: tail call void @not_cold1() +; FNATTRS-NEXT: br label [[CALL_COLD]] +; FNATTRS: sw.bb9: +; FNATTRS-NEXT: tail call void @not_cold2() +; FNATTRS-NEXT: br label [[CALL_COLD]] +; FNATTRS: sw.default: +; FNATTRS-NEXT: tail call void @cold0() +; FNATTRS-NEXT: br label [[IF_END12]] +; FNATTRS: call_cold: +; FNATTRS-NEXT: tail call void @cold_at_cb() #[[ATTR0]] +; FNATTRS-NEXT: br label [[IF_END12]] +; FNATTRS: if.else11: +; FNATTRS-NEXT: tail call void @cold0() +; FNATTRS-NEXT: br label [[IF_END12]] +; FNATTRS: if.end12: +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR-LABEL: define dso_local void @test_complex +; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) { +; ATTRIBUTOR-NEXT: entry: +; ATTRIBUTOR-NEXT: tail call void @not_cold0() +; ATTRIBUTOR-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 +; ATTRIBUTOR-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE11:%.*]], label [[IF_THEN:%.*]] +; ATTRIBUTOR: if.then: +; ATTRIBUTOR-NEXT: [[CALL:%.*]] = tail call i32 @get_val() +; ATTRIBUTOR-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[CALL]], 0 +; ATTRIBUTOR-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN2:%.*]] +; ATTRIBUTOR: if.then2: +; ATTRIBUTOR-NEXT: tail call void @cold1() +; ATTRIBUTOR-NEXT: br label [[IF_END12:%.*]] +; ATTRIBUTOR: if.else: +; 
ATTRIBUTOR-NEXT: [[CALL3:%.*]] = tail call i32 @get_val() +; ATTRIBUTOR-NEXT: [[TOBOOL4_NOT:%.*]] = icmp eq i32 [[CALL3]], 0 +; ATTRIBUTOR-NEXT: br i1 [[TOBOOL4_NOT]], label [[IF_ELSE6:%.*]], label [[IF_THEN5:%.*]] +; ATTRIBUTOR: if.then5: +; ATTRIBUTOR-NEXT: tail call void @cold0() +; ATTRIBUTOR-NEXT: br label [[IF_END12]] +; ATTRIBUTOR: if.else6: +; ATTRIBUTOR-NEXT: tail call void @not_cold0() +; ATTRIBUTOR-NEXT: [[CALL7:%.*]] = tail call i32 @get_val() +; ATTRIBUTOR-NEXT: switch i32 [[CALL7]], label [[SW_DEFAULT:%.*]] [ +; ATTRIBUTOR-NEXT: i32 0, label [[SW_BB:%.*]] +; ATTRIBUTOR-NEXT: i32 1, label [[SW_BB8:%.*]] +; ATTRIBUTOR-NEXT: i32 2, label [[SW_BB9:%.*]] +; ATTRIBUTOR-NEXT: ] +; ATTRIBUTOR: sw.bb: +; ATTRIBUTOR-NEXT: tail call void @not_cold0() +; ATTRIBUTOR-NEXT: br label [[CALL_COLD:%.*]] +; ATTRIBUTOR: sw.bb8: +; ATTRIBUTOR-NEXT: tail call void @not_cold1() +; ATTRIBUTOR-NEXT: br label [[CALL_COLD]] +; ATTRIBUTOR: sw.bb9: +; ATTRIBUTOR-NEXT: tail call void @not_cold2() +; ATTRIBUTOR-NEXT: br label [[CALL_COLD]] +; ATTRIBUTOR: sw.default: +; ATTRIBUTOR-NEXT: tail call void @cold0() +; ATTRIBUTOR-NEXT: br label [[IF_END12]] +; ATTRIBUTOR: call_cold: +; ATTRIBUTOR-NEXT: tail call void @cold_at_cb() #[[ATTR0:[0-9]+]] +; ATTRIBUTOR-NEXT: br label [[IF_END12]] +; ATTRIBUTOR: if.else11: +; ATTRIBUTOR-NEXT: tail call void @cold0() +; ATTRIBUTOR-NEXT: br label [[IF_END12]] +; ATTRIBUTOR: if.end12: +; ATTRIBUTOR-NEXT: ret void ; entry: tail call void @not_cold0() @@ -314,63 +438,122 @@ if.end12: } define dso_local void @test_complex2(i32 noundef %x) { -; COMMON-LABEL: define dso_local void @test_complex2 -; COMMON-SAME: (i32 noundef [[X:%.*]]) { -; COMMON-NEXT: entry: -; COMMON-NEXT: tail call void @not_cold0() -; COMMON-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 -; COMMON-NEXT: [[CALL12:%.*]] = tail call i32 @get_val() -; COMMON-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE11:%.*]], label [[IF_THEN:%.*]] -; COMMON: if.then: -; COMMON-NEXT: [[TOBOOL1_NOT:%.*]] 
= icmp eq i32 [[CALL12]], 0 -; COMMON-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN2:%.*]] -; COMMON: if.then2: -; COMMON-NEXT: tail call void @cold1() -; COMMON-NEXT: br label [[IF_END16:%.*]] -; COMMON: if.else: -; COMMON-NEXT: [[CALL3:%.*]] = tail call i32 @get_val() -; COMMON-NEXT: [[TOBOOL4_NOT:%.*]] = icmp eq i32 [[CALL3]], 0 -; COMMON-NEXT: br i1 [[TOBOOL4_NOT]], label [[IF_ELSE6:%.*]], label [[IF_THEN5:%.*]] -; COMMON: if.then5: -; COMMON-NEXT: tail call void @cold0() -; COMMON-NEXT: br label [[IF_END16]] -; COMMON: if.else6: -; COMMON-NEXT: tail call void @not_cold0() -; COMMON-NEXT: [[CALL7:%.*]] = tail call i32 @get_val() -; COMMON-NEXT: switch i32 [[CALL7]], label [[SW_DEFAULT:%.*]] [ -; COMMON-NEXT: i32 0, label [[SW_BB:%.*]] -; COMMON-NEXT: i32 1, label [[SW_BB8:%.*]] -; COMMON-NEXT: i32 2, label [[SW_BB9:%.*]] -; COMMON-NEXT: ] -; COMMON: sw.bb: -; COMMON-NEXT: tail call void @not_cold0() -; COMMON-NEXT: br label [[CALL_COLD:%.*]] -; COMMON: sw.bb8: -; COMMON-NEXT: tail call void @not_cold1() -; COMMON-NEXT: br label [[CALL_COLD]] -; COMMON: sw.bb9: -; COMMON-NEXT: tail call void @not_cold2() -; COMMON-NEXT: br label [[CALL_COLD]] -; COMMON: sw.default: -; COMMON-NEXT: tail call void @cold0() -; COMMON-NEXT: br label [[IF_END16]] -; COMMON: call_cold: -; COMMON-NEXT: tail call void @cold_at_cb() #[[ATTR0]] -; COMMON-NEXT: br label [[IF_END16]] -; COMMON: if.else11: -; COMMON-NEXT: [[CMP:%.*]] = icmp slt i32 [[CALL12]], 1 -; COMMON-NEXT: br i1 [[CMP]], label [[IF_END14:%.*]], label [[FOR_BODY:%.*]] -; COMMON: if.end14: -; COMMON-NEXT: tail call void @cold1() -; COMMON-NEXT: br label [[IF_END16]] -; COMMON: for.body: -; COMMON-NEXT: [[I_021:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_ELSE11]] ] -; COMMON-NEXT: tail call void @cold0() -; COMMON-NEXT: [[INC]] = add nuw nsw i32 [[I_021]], 1 -; COMMON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[CALL12]] -; COMMON-NEXT: br i1 [[EXITCOND_NOT]], label [[IF_END16]], 
label [[FOR_BODY]] -; COMMON: if.end16: -; COMMON-NEXT: ret void +; FNATTRS: Function Attrs: cold +; FNATTRS-LABEL: define dso_local void @test_complex2 +; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-NEXT: entry: +; FNATTRS-NEXT: tail call void @not_cold0() +; FNATTRS-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 +; FNATTRS-NEXT: [[CALL12:%.*]] = tail call i32 @get_val() +; FNATTRS-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE11:%.*]], label [[IF_THEN:%.*]] +; FNATTRS: if.then: +; FNATTRS-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[CALL12]], 0 +; FNATTRS-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN2:%.*]] +; FNATTRS: if.then2: +; FNATTRS-NEXT: tail call void @cold1() +; FNATTRS-NEXT: br label [[IF_END16:%.*]] +; FNATTRS: if.else: +; FNATTRS-NEXT: [[CALL3:%.*]] = tail call i32 @get_val() +; FNATTRS-NEXT: [[TOBOOL4_NOT:%.*]] = icmp eq i32 [[CALL3]], 0 +; FNATTRS-NEXT: br i1 [[TOBOOL4_NOT]], label [[IF_ELSE6:%.*]], label [[IF_THEN5:%.*]] +; FNATTRS: if.then5: +; FNATTRS-NEXT: tail call void @cold0() +; FNATTRS-NEXT: br label [[IF_END16]] +; FNATTRS: if.else6: +; FNATTRS-NEXT: tail call void @not_cold0() +; FNATTRS-NEXT: [[CALL7:%.*]] = tail call i32 @get_val() +; FNATTRS-NEXT: switch i32 [[CALL7]], label [[SW_DEFAULT:%.*]] [ +; FNATTRS-NEXT: i32 0, label [[SW_BB:%.*]] +; FNATTRS-NEXT: i32 1, label [[SW_BB8:%.*]] +; FNATTRS-NEXT: i32 2, label [[SW_BB9:%.*]] +; FNATTRS-NEXT: ] +; FNATTRS: sw.bb: +; FNATTRS-NEXT: tail call void @not_cold0() +; FNATTRS-NEXT: br label [[CALL_COLD:%.*]] +; FNATTRS: sw.bb8: +; FNATTRS-NEXT: tail call void @not_cold1() +; FNATTRS-NEXT: br label [[CALL_COLD]] +; FNATTRS: sw.bb9: +; FNATTRS-NEXT: tail call void @not_cold2() +; FNATTRS-NEXT: br label [[CALL_COLD]] +; FNATTRS: sw.default: +; FNATTRS-NEXT: tail call void @cold0() +; FNATTRS-NEXT: br label [[IF_END16]] +; FNATTRS: call_cold: +; FNATTRS-NEXT: tail call void @cold_at_cb() #[[ATTR0]] +; FNATTRS-NEXT: br label [[IF_END16]] +; FNATTRS: if.else11: 
+; FNATTRS-NEXT: [[CMP:%.*]] = icmp slt i32 [[CALL12]], 1 +; FNATTRS-NEXT: br i1 [[CMP]], label [[IF_END14:%.*]], label [[FOR_BODY:%.*]] +; FNATTRS: if.end14: +; FNATTRS-NEXT: tail call void @cold1() +; FNATTRS-NEXT: br label [[IF_END16]] +; FNATTRS: for.body: +; FNATTRS-NEXT: [[I_021:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_ELSE11]] ] +; FNATTRS-NEXT: tail call void @cold0() +; FNATTRS-NEXT: [[INC]] = add nuw nsw i32 [[I_021]], 1 +; FNATTRS-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[CALL12]] +; FNATTRS-NEXT: br i1 [[EXITCOND_NOT]], label [[IF_END16]], label [[FOR_BODY]] +; FNATTRS: if.end16: +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR-LABEL: define dso_local void @test_complex2 +; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) { +; ATTRIBUTOR-NEXT: entry: +; ATTRIBUTOR-NEXT: tail call void @not_cold0() +; ATTRIBUTOR-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0 +; ATTRIBUTOR-NEXT: [[CALL12:%.*]] = tail call i32 @get_val() +; ATTRIBUTOR-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE11:%.*]], label [[IF_THEN:%.*]] +; ATTRIBUTOR: if.then: +; ATTRIBUTOR-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[CALL12]], 0 +; ATTRIBUTOR-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN2:%.*]] +; ATTRIBUTOR: if.then2: +; ATTRIBUTOR-NEXT: tail call void @cold1() +; ATTRIBUTOR-NEXT: br label [[IF_END16:%.*]] +; ATTRIBUTOR: if.else: +; ATTRIBUTOR-NEXT: [[CALL3:%.*]] = tail call i32 @get_val() +; ATTRIBUTOR-NEXT: [[TOBOOL4_NOT:%.*]] = icmp eq i32 [[CALL3]], 0 +; ATTRIBUTOR-NEXT: br i1 [[TOBOOL4_NOT]], label [[IF_ELSE6:%.*]], label [[IF_THEN5:%.*]] +; ATTRIBUTOR: if.then5: +; ATTRIBUTOR-NEXT: tail call void @cold0() +; ATTRIBUTOR-NEXT: br label [[IF_END16]] +; ATTRIBUTOR: if.else6: +; ATTRIBUTOR-NEXT: tail call void @not_cold0() +; ATTRIBUTOR-NEXT: [[CALL7:%.*]] = tail call i32 @get_val() +; ATTRIBUTOR-NEXT: switch i32 [[CALL7]], label [[SW_DEFAULT:%.*]] [ +; ATTRIBUTOR-NEXT: i32 0, label [[SW_BB:%.*]] +; ATTRIBUTOR-NEXT: i32 1, label [[SW_BB8:%.*]] +; 
ATTRIBUTOR-NEXT: i32 2, label [[SW_BB9:%.*]] +; ATTRIBUTOR-NEXT: ] +; ATTRIBUTOR: sw.bb: +; ATTRIBUTOR-NEXT: tail call void @not_cold0() +; ATTRIBUTOR-NEXT: br label [[CALL_COLD:%.*]] +; ATTRIBUTOR: sw.bb8: +; ATTRIBUTOR-NEXT: tail call void @not_cold1() +; ATTRIBUTOR-NEXT: br label [[CALL_COLD]] +; ATTRIBUTOR: sw.bb9: +; ATTRIBUTOR-NEXT: tail call void @not_cold2() +; ATTRIBUTOR-NEXT: br label [[CALL_COLD]] +; ATTRIBUTOR: sw.default: +; ATTRIBUTOR-NEXT: tail call void @cold0() +; ATTRIBUTOR-NEXT: br label [[IF_END16]] +; ATTRIBUTOR: call_cold: +; ATTRIBUTOR-NEXT: tail call void @cold_at_cb() #[[ATTR0]] +; ATTRIBUTOR-NEXT: br label [[IF_END16]] +; ATTRIBUTOR: if.else11: +; ATTRIBUTOR-NEXT: [[CMP:%.*]] = icmp slt i32 [[CALL12]], 1 +; ATTRIBUTOR-NEXT: br i1 [[CMP]], label [[IF_END14:%.*]], label [[FOR_BODY:%.*]] +; ATTRIBUTOR: if.end14: +; ATTRIBUTOR-NEXT: tail call void @cold1() +; ATTRIBUTOR-NEXT: br label [[IF_END16]] +; ATTRIBUTOR: for.body: +; ATTRIBUTOR-NEXT: [[I_021:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[IF_ELSE11]] ] +; ATTRIBUTOR-NEXT: tail call void @cold0() +; ATTRIBUTOR-NEXT: [[INC]] = add nuw nsw i32 [[I_021]], 1 +; ATTRIBUTOR-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[CALL12]] +; ATTRIBUTOR-NEXT: br i1 [[EXITCOND_NOT]], label [[IF_END16]], label [[FOR_BODY]] +; ATTRIBUTOR: if.end16: +; ATTRIBUTOR-NEXT: ret void ; entry: tail call void @not_cold0() @@ -485,7 +668,7 @@ define dso_local void @test_complex_fail(i32 noundef %x) { ; COMMON-NEXT: tail call void @cold0() ; COMMON-NEXT: br label [[IF_END12]] ; COMMON: call_cold: -; COMMON-NEXT: tail call void @cold_at_cb() #[[ATTR0]] +; COMMON-NEXT: tail call void @cold_at_cb() #[[ATTR0:[0-9]+]] ; COMMON-NEXT: br label [[IF_END12]] ; COMMON: if.else11: ; COMMON-NEXT: tail call void @cold0() @@ -684,11 +867,14 @@ if.end16: } ;. 
-; COMMON: attributes #[[ATTR0]] = { cold } -; COMMON: attributes #[[ATTR1]] = { nofree norecurse noreturn nosync nounwind memory(none) } -; COMMON: attributes #[[ATTR2]] = { noreturn } -; COMMON: attributes #[[ATTR3]] = { hot } +; FNATTRS: attributes #[[ATTR0]] = { cold } +; FNATTRS: attributes #[[ATTR1]] = { nofree norecurse noreturn nosync nounwind memory(none) } +; FNATTRS: attributes #[[ATTR2]] = { noreturn } +; FNATTRS: attributes #[[ATTR3]] = { cold noreturn } +; FNATTRS: attributes #[[ATTR4]] = { hot } +;. +; ATTRIBUTOR: attributes #[[ATTR0]] = { cold } +; ATTRIBUTOR: attributes #[[ATTR1]] = { nofree norecurse noreturn nosync nounwind memory(none) } +; ATTRIBUTOR: attributes #[[ATTR2]] = { noreturn } +; ATTRIBUTOR: attributes #[[ATTR3]] = { hot } ;. -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; ATTRIBUTOR: {{.*}} -; FNATTRS: {{.*}} From b2cd81c93831fe256bddec5efa5a2765400076de Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 22 Aug 2024 10:19:56 -0700 Subject: [PATCH 238/426] [IR] Simplify comparisons with std::optional (NFC) (#105624) For variable X of type std::optional, X && X.value_or(Y) == Z is equivalent to X == Z when Y != Z. --- llvm/lib/IR/VectorBuilder.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp index 8dbf25277bf5d2..b8f56a7a2e5f9b 100644 --- a/llvm/lib/IR/VectorBuilder.cpp +++ b/llvm/lib/IR/VectorBuilder.cpp @@ -96,8 +96,7 @@ Value *VectorBuilder::createVectorInstructionImpl(Intrinsic::ID VPID, // Insert mask and evl operands in between the instruction operands. 
for (size_t VPParamIdx = 0, ParamIdx = 0; VPParamIdx < NumVPParams; ++VPParamIdx) { - if ((MaskPosOpt && MaskPosOpt.value_or(NumVPParams) == VPParamIdx) || - (VLenPosOpt && VLenPosOpt.value_or(NumVPParams) == VPParamIdx)) + if (MaskPosOpt == VPParamIdx || VLenPosOpt == VPParamIdx) continue; assert(ParamIdx < NumInstParams); IntrinParams[VPParamIdx] = InstOpArray[ParamIdx++]; From 7faf2c95a4f1c3148c891608ed516eda3c9d3eb4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 22 Aug 2024 18:07:44 +0100 Subject: [PATCH 239/426] [MCA][X86] Add scatter instruction test coverage for #105675 Missed IceLakeServer when I updated the other CPUs in 6ec4c9c3eb4a556f848dac37a2d6f0d46ecc6f02 --- .../X86/IceLakeServer/resources-avx512.s | 28 +++++++++- .../X86/IceLakeServer/resources-avx512vl.s | 54 ++++++++++++++++++- 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s index 1ff8eccf290a6f..c4df992f3aebca 100644 --- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s @@ -814,6 +814,11 @@ vpermq %zmm16, %zmm17, %zmm19 {z}{k1} vpermq (%rax), %zmm17, %zmm19 {z}{k1} vpermq (%rax){1to8}, %zmm17, %zmm19 {z}{k1} +vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} + vpshufd $0, %zmm16, %zmm19 vpshufd $0, (%rax), %zmm19 vpshufd $0, (%rax){1to16}, %zmm19 @@ -884,6 +889,11 @@ vpunpcklqdq %zmm16, %zmm17, %zmm19 {z}{k1} vpunpcklqdq (%rax), %zmm17, %zmm19 {z}{k1} vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {z}{k1} +vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} + vshuff32x4 $0, %zmm16, %zmm17, %zmm19 vshuff32x4 $0, (%rax), %zmm17, %zmm19 vshuff32x4 $0, 
(%rax){1to16}, %zmm17, %zmm19 @@ -1792,6 +1802,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 1 3 1.00 vpermq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 10 1.00 * vpermq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 10 1.00 * vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 36 8 8.00 * vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 4.00 * vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 0.50 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 4.00 * vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 1 1.00 vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: 2 8 1.00 * vpshufd $0, (%rax), %zmm19 # CHECK-NEXT: 2 8 1.00 * vpshufd $0, (%rax){1to16}, %zmm19 @@ -1855,6 +1869,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 1 1 1.00 vpunpcklqdq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpunpcklqdq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 2 8 1.00 * vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: 36 7 8.00 * vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 4.00 * vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 1 1 0.50 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 4.00 * vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 3 1.00 vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 2 10 1.00 * vshuff32x4 $0, (%rax), %zmm17, %zmm19 # CHECK-NEXT: 2 10 1.00 * vshuff32x4 $0, (%rax){1to16}, %zmm17, %zmm19 @@ -2036,7 +2054,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: - 612.00 399.17 99.67 327.50 327.50 8.50 587.17 2.00 8.50 8.50 8.50 +# CHECK-NEXT: - 612.00 408.17 102.67 327.50 327.50 41.50 592.17 5.00 41.50 41.50 41.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -2754,6 +2772,10 @@ vunpcklps (%rax){1to16}, 
%zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - - - - 1.00 - - - - vpermq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpermq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 1.50 0.50 - - 8.00 1.50 0.50 8.00 8.00 8.00 vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - - - - - 0.50 - - 0.50 0.50 0.50 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - - - vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpshufd $0, (%rax), %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpshufd $0, (%rax){1to16}, %zmm19 @@ -2817,6 +2839,10 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - - - - 1.00 - - - - vpunpcklqdq %zmm16, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpunpcklqdq (%rax), %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} +# CHECK-NEXT: - - 1.50 0.50 - - 8.00 1.50 0.50 8.00 8.00 8.00 vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - - - - - 0.50 - - 0.50 0.50 0.50 vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - - - vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vshuff32x4 $0, (%rax), %zmm17, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vshuff32x4 $0, (%rax){1to16}, %zmm17, %zmm19 diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512vl.s 
b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512vl.s index 375087ae0cfe4e..00e5c3b03f6f52 100644 --- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512vl.s +++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512vl.s @@ -1344,6 +1344,16 @@ vpmulld %ymm16, %ymm17, %ymm19 {z}{k1} vpmulld (%rax), %ymm17, %ymm19 {z}{k1} vpmulld (%rax){1to8}, %ymm17, %ymm19 {z}{k1} +vpscatterdd %xmm1, (%rdx,%xmm0,4) {%k1} +vpscatterdq %xmm1, (%rdx,%xmm0,4) {%k1} +vpscatterqd %xmm1, (%rdx,%xmm0,4) {%k1} +vpscatterqq %xmm1, (%rdx,%xmm0,4) {%k1} + +vpscatterdd %ymm1, (%rdx,%ymm0,4) {%k1} +vpscatterdq %ymm1, (%rdx,%xmm0,4) {%k1} +vpscatterqd %xmm1, (%rdx,%ymm0,4) {%k1} +vpscatterqq %ymm1, (%rdx,%ymm0,4) {%k1} + vpshufd $0, %xmm16, %xmm19 vpshufd $0, (%rax), %xmm19 vpshufd $0, (%rax){1to4}, %xmm19 @@ -1500,6 +1510,16 @@ vpunpckldq %ymm16, %ymm17, %ymm19 {z}{k1} vpunpckldq (%rax), %ymm17, %ymm19 {z}{k1} vpunpckldq (%rax){1to8}, %ymm17, %ymm19 {z}{k1} +vscatterdps %xmm1, (%rdx,%xmm0,4) {%k1} +vscatterdpd %xmm1, (%rdx,%xmm0,4) {%k1} +vscatterqps %xmm1, (%rdx,%xmm0,4) {%k1} +vscatterqpd %xmm1, (%rdx,%xmm0,4) {%k1} + +vscatterdps %ymm1, (%rdx,%ymm0,4) {%k1} +vscatterdpd %ymm1, (%rdx,%xmm0,4) {%k1} +vscatterqps %xmm1, (%rdx,%ymm0,4) {%k1} +vscatterqpd %ymm1, (%rdx,%ymm0,4) {%k1} + vshuff32x4 $0, %ymm16, %ymm17, %ymm19 vshuff32x4 $0, (%rax), %ymm17, %ymm19 vshuff32x4 $0, (%rax){1to8}, %ymm17, %ymm19 @@ -2897,6 +2917,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 2 10 1.00 vpmulld %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 3 17 1.00 * vpmulld (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 3 17 1.00 * vpmulld (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 12 8 2.00 * vpscatterdd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 7 7 1.00 * vpscatterdq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 8 8 1.00 * vpscatterqd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 7 7 1.00 * vpscatterqq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 20 8 4.00 * 
vpscatterdd %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 11 7 2.00 * vpscatterdq %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 8 8 1.00 * vpscatterqd %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 11 7 2.00 * vpscatterqq %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: 1 1 0.50 vpshufd $0, %xmm16, %xmm19 # CHECK-NEXT: 2 7 0.50 * vpshufd $0, (%rax), %xmm19 # CHECK-NEXT: 2 7 0.50 * vpshufd $0, (%rax){1to4}, %xmm19 @@ -3035,6 +3063,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: 1 1 0.50 vpunpckldq %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vpunpckldq (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: 2 8 0.50 * vpunpckldq (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: 12 8 2.00 * vscatterdps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 7 7 1.00 * vscatterdpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 8 8 1.00 * vscatterqps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 7 7 1.00 * vscatterqpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 20 8 4.00 * vscatterdps %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 11 7 2.00 * vscatterdpd %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: 8 8 1.00 * vscatterqps %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: 11 7 2.00 * vscatterqpd %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: 1 3 1.00 vshuff32x4 $0, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: 2 10 1.00 * vshuff32x4 $0, (%rax), %ymm17, %ymm19 # CHECK-NEXT: 2 10 1.00 * vshuff32x4 $0, (%rax){1to8}, %ymm17, %ymm19 @@ -3232,7 +3268,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: - 423.00 438.33 413.33 492.50 492.50 16.00 722.33 4.00 16.00 16.00 16.00 +# CHECK-NEXT: - 423.00 462.33 421.33 492.50 492.50 44.00 738.33 12.00 44.00 44.00 44.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -4424,6 +4460,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: - - 1.00 1.00 - - - - - - - - 
vpmulld %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - - - - - - vpmulld (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - - - - - - vpmulld (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 1.50 0.50 - - 2.00 1.50 0.50 2.00 2.00 2.00 vpscatterdd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 1.00 0.50 0.50 1.00 1.00 1.00 vpscatterdq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 1.00 1.50 0.50 1.00 1.00 1.00 vpscatterqd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 1.00 0.50 0.50 1.00 1.00 1.00 vpscatterqq %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 4.00 1.50 0.50 4.00 4.00 4.00 vpscatterdd %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 2.00 0.50 0.50 2.00 2.00 2.00 vpscatterdq %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 1.00 1.50 0.50 1.00 1.00 1.00 vpscatterqd %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 2.00 0.50 0.50 2.00 2.00 2.00 vpscatterqq %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: - - - 0.50 - - - 0.50 - - - - vpshufd $0, %xmm16, %xmm19 # CHECK-NEXT: - - - 0.50 0.50 0.50 - 0.50 - - - - vpshufd $0, (%rax), %xmm19 # CHECK-NEXT: - - - 0.50 0.50 0.50 - 0.50 - - - - vpshufd $0, (%rax){1to4}, %xmm19 @@ -4562,6 +4606,14 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1} # CHECK-NEXT: - - - 0.50 - - - 0.50 - - - - vpunpckldq %ymm16, %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - - 0.50 0.50 0.50 - 0.50 - - - - vpunpckldq (%rax), %ymm17, %ymm19 {%k1} {z} # CHECK-NEXT: - - - 0.50 0.50 0.50 - 0.50 - - - - vpunpckldq (%rax){1to8}, %ymm17, %ymm19 {%k1} {z} +# CHECK-NEXT: - - 1.50 0.50 - - 2.00 1.50 0.50 2.00 2.00 2.00 vscatterdps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 1.00 0.50 0.50 1.00 1.00 1.00 vscatterdpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 1.00 1.50 0.50 1.00 1.00 1.00 vscatterqps %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 1.00 0.50 0.50 1.00 
1.00 1.00 vscatterqpd %xmm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 4.00 1.50 0.50 4.00 4.00 4.00 vscatterdps %ymm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 2.00 0.50 0.50 2.00 2.00 2.00 vscatterdpd %ymm1, (%rdx,%xmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 1.00 1.50 0.50 1.00 1.00 1.00 vscatterqps %xmm1, (%rdx,%ymm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 2.00 0.50 0.50 2.00 2.00 2.00 vscatterqpd %ymm1, (%rdx,%ymm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - - - vshuff32x4 $0, %ymm16, %ymm17, %ymm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vshuff32x4 $0, (%rax), %ymm17, %ymm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vshuff32x4 $0, (%rax){1to8}, %ymm17, %ymm19 From 2c1f0642a2647883f35463aebf4f90a6b1f158c1 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 22 Aug 2024 18:10:03 +0100 Subject: [PATCH 240/426] [MCA][X86] Add missing 512-bit vpscatterqd/vscatterqps schedule data This doesn't match uops.info yet - but it matches the existing vpscatterdq/vscatterqpd entries like uops.info says it should Fixes #105675 --- llvm/lib/Target/X86/X86SchedIceLake.td | 2 ++ llvm/lib/Target/X86/X86SchedSkylakeServer.td | 2 ++ .../llvm-mca/X86/IceLakeServer/resources-avx512.s | 10 +++++----- .../llvm-mca/X86/SkylakeServer/resources-avx512.s | 10 +++++----- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td index fd372ba4656eba..29b1464e19a32b 100644 --- a/llvm/lib/Target/X86/X86SchedIceLake.td +++ b/llvm/lib/Target/X86/X86SchedIceLake.td @@ -1524,8 +1524,10 @@ def ICXWriteResGroup113 : SchedWriteRes<[ICXPort0,ICXPort49,ICXPort78,ICXPort015 let ReleaseAtCycles = [1,8,8,2]; } def: InstRW<[ICXWriteResGroup113], (instrs VPSCATTERDQZmr, + VPSCATTERQDZmr, VPSCATTERQQZmr, VSCATTERDPDZmr, + VSCATTERQPSZmr, VSCATTERQPDZmr)>; def ICXWriteResGroup114 : SchedWriteRes<[ICXPort0,ICXPort49,ICXPort5,ICXPort78,ICXPort0156]> { diff --git 
a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 4fded44085e897..2423602d06c470 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -1499,8 +1499,10 @@ def SKXWriteResGroup113 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort015 let ReleaseAtCycles = [1,8,8,2]; } def: InstRW<[SKXWriteResGroup113], (instrs VPSCATTERDQZmr, + VPSCATTERQDZmr, VPSCATTERQQZmr, VSCATTERDPDZmr, + VSCATTERQPSZmr, VSCATTERQPDZmr)>; def SKXWriteResGroup114 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> { diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s index c4df992f3aebca..c509e766540b15 100644 --- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s @@ -1804,7 +1804,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 2 10 1.00 * vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 36 8 8.00 * vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 4.00 * vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: 1 1 0.50 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 4.00 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 4.00 * vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 1 1.00 vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: 2 8 1.00 * vpshufd $0, (%rax), %zmm19 @@ -1871,7 +1871,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 2 8 1.00 * vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 36 7 8.00 * vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 4.00 * vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: 1 1 0.50 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 4.00 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 4.00 * vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # 
CHECK-NEXT: 1 3 1.00 vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 2 10 1.00 * vshuff32x4 $0, (%rax), %zmm17, %zmm19 @@ -2054,7 +2054,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: - 612.00 408.17 102.67 327.50 327.50 41.50 592.17 5.00 41.50 41.50 41.50 +# CHECK-NEXT: - 612.00 411.17 103.67 327.50 327.50 48.50 593.17 6.00 48.50 48.50 48.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -2774,7 +2774,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 1.50 0.50 - - 8.00 1.50 0.50 8.00 8.00 8.00 vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: - - - - - - 0.50 - - 0.50 0.50 0.50 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - - - vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpshufd $0, (%rax), %zmm19 @@ -2841,7 +2841,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 1.50 0.50 - - 8.00 1.50 0.50 8.00 8.00 8.00 vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: - - - - - - 0.50 - - 0.50 0.50 0.50 vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 - - 
4.00 0.50 0.50 4.00 4.00 4.00 vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - - - vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vshuff32x4 $0, (%rax), %zmm17, %zmm19 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s index 5eaa0f91fdaaba..9c006d4ebb077d 100644 --- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s @@ -1804,7 +1804,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 2 10 1.00 * vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 36 8 16.00 * vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 8.00 * vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: 1 1 1.00 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 8.00 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 8.00 * vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 1 1.00 vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: 2 8 1.00 * vpshufd $0, (%rax), %zmm19 @@ -1871,7 +1871,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 2 8 1.00 * vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 36 7 16.00 * vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 8.00 * vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: 1 1 1.00 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 8.00 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 8.00 * vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 3 1.00 vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 2 10 1.00 * vshuff32x4 $0, (%rax), %zmm17, %zmm19 @@ -2052,7 +2052,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - 612.00 349.67 102.67 355.17 355.17 83.00 650.67 5.00 27.67 +# CHECK-NEXT: - 612.00 
352.67 103.67 359.83 359.83 97.00 651.67 6.00 32.33 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -2772,7 +2772,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 1.50 0.50 5.33 5.33 16.00 1.50 0.50 5.33 vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpshufd $0, (%rax), %zmm19 @@ -2839,7 +2839,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 1.50 0.50 5.33 5.33 16.00 1.50 0.50 5.33 vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vshuff32x4 $0, (%rax), %zmm17, %zmm19 From c5a0c37b4279c2925b03993cd86a83bfc053f0cd Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 22 Aug 2024 10:30:31 -0700 Subject: [PATCH 241/426] [Xtensa,test] Fix div.ll after #99981 --- llvm/test/CodeGen/Xtensa/div.ll | 6 ++---- 1 file changed, 2 
insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/Xtensa/div.ll b/llvm/test/CodeGen/Xtensa/div.ll index e10e976fb1b386..8d51c571efb4c7 100644 --- a/llvm/test/CodeGen/Xtensa/div.ll +++ b/llvm/test/CodeGen/Xtensa/div.ll @@ -363,8 +363,7 @@ define i8 @sdiv8_constant(i8 %a) nounwind { define i8 @sdiv8_pow2(i8 %a) nounwind { ; XTENSA-LABEL: sdiv8_pow2: ; XTENSA: slli a8, a2, 24 -; XTENSA-NEXT: srai a8, a8, 24 -; XTENSA-NEXT: srli a8, a8, 12 +; XTENSA-NEXT: srai a8, a8, 31 ; XTENSA-NEXT: movi a9, 7 ; XTENSA-NEXT: and a8, a8, a9 ; XTENSA-NEXT: add a8, a2, a8 @@ -473,8 +472,7 @@ define i32 @sdiv_pow2_2(i32 %a) nounwind { define i16 @sdiv16_pow2(i16 %a) nounwind { ; XTENSA-LABEL: sdiv16_pow2: ; XTENSA: slli a8, a2, 16 -; XTENSA-NEXT: srai a8, a8, 16 -; XTENSA-NEXT: extui a8, a8, 28, 4 +; XTENSA-NEXT: srai a8, a8, 31 ; XTENSA-NEXT: movi a9, 7 ; XTENSA-NEXT: and a8, a8, a9 ; XTENSA-NEXT: add a8, a2, a8 From 1fa6c99a09ccca7558cb3c46fa5d4cbfb4d4bea5 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 22 Aug 2024 18:30:48 +0100 Subject: [PATCH 242/426] [VPlan] Move EVL memory recipes to VPlanRecipes.cpp (NFC) Move VPWiden[Load|Store]EVLRecipe::executeto VPlanRecipes.cpp in line with other ::execute implementations that don't depend on anything defined in LoopVectorization.cpp --- .../Transforms/Vectorize/LoopVectorize.cpp | 96 ----------------- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 100 ++++++++++++++++++ 2 files changed, 100 insertions(+), 96 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2145bb8c9ca872..23d0f39ad93ebe 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9427,102 +9427,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) { State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); } -/// Use all-true mask for reverse rather than actual mask, as it avoids a -/// 
dependence w/o affecting the result. -static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, - Value *EVL, const Twine &Name) { - VectorType *ValTy = cast(Operand->getType()); - Value *AllTrueMask = - Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); - return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, - {Operand, AllTrueMask, EVL}, nullptr, Name); -} - -void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { - assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " - "explicit vector length."); - auto *LI = cast(&Ingredient); - - Type *ScalarDataTy = getLoadStoreType(&Ingredient); - auto *DataTy = VectorType::get(ScalarDataTy, State.VF); - const Align Alignment = getLoadStoreAlignment(&Ingredient); - bool CreateGather = !isConsecutive(); - - auto &Builder = State.Builder; - State.setDebugLocFrom(getDebugLoc()); - CallInst *NewLI; - Value *EVL = State.get(getEVL(), VPIteration(0, 0)); - Value *Addr = State.get(getAddr(), 0, !CreateGather); - Value *Mask = nullptr; - if (VPValue *VPMask = getMask()) { - Mask = State.get(VPMask, 0); - if (isReverse()) - Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); - } else { - Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); - } - - if (CreateGather) { - NewLI = - Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, - nullptr, "wide.masked.gather"); - } else { - VectorBuilder VBuilder(Builder); - VBuilder.setEVL(EVL).setMask(Mask); - NewLI = cast(VBuilder.createVectorInstruction( - Instruction::Load, DataTy, Addr, "vp.op.load")); - } - NewLI->addParamAttr( - 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); - State.addMetadata(NewLI, LI); - Instruction *Res = NewLI; - if (isReverse()) - Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); - State.set(this, Res, 0); -} - -void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { - assert(State.UF == 1 && "Expected only UF 
== 1 when vectorizing with " - "explicit vector length."); - auto *SI = cast(&Ingredient); - - VPValue *StoredValue = getStoredValue(); - bool CreateScatter = !isConsecutive(); - const Align Alignment = getLoadStoreAlignment(&Ingredient); - - auto &Builder = State.Builder; - State.setDebugLocFrom(getDebugLoc()); - - CallInst *NewSI = nullptr; - Value *StoredVal = State.get(StoredValue, 0); - Value *EVL = State.get(getEVL(), VPIteration(0, 0)); - if (isReverse()) - StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); - Value *Mask = nullptr; - if (VPValue *VPMask = getMask()) { - Mask = State.get(VPMask, 0); - if (isReverse()) - Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); - } else { - Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); - } - Value *Addr = State.get(getAddr(), 0, !CreateScatter); - if (CreateScatter) { - NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), - Intrinsic::vp_scatter, - {StoredVal, Addr, Mask, EVL}); - } else { - VectorBuilder VBuilder(Builder); - VBuilder.setEVL(EVL).setMask(Mask); - NewSI = cast(VBuilder.createVectorInstruction( - Instruction::Store, Type::getVoidTy(EVL->getContext()), - {StoredVal, Addr})); - } - NewSI->addParamAttr( - 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment)); - State.addMetadata(NewSI, SI); -} - // Determine how to lower the scalar epilogue, which depends on 1) optimising // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing // predication, and 4) a TTI hook that analyses whether the loop is suitable diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 63e0e8a8981373..fe1325f4163004 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2127,7 +2127,63 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, O << " = load "; printOperands(O, SlotTracker); } +#endif + +/// 
Use all-true mask for reverse rather than actual mask, as it avoids a +/// dependence w/o affecting the result. +static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, + Value *EVL, const Twine &Name) { + VectorType *ValTy = cast(Operand->getType()); + Value *AllTrueMask = + Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); + return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, + {Operand, AllTrueMask, EVL}, nullptr, Name); +} + +void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { + assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " + "explicit vector length."); + auto *LI = cast(&Ingredient); + + Type *ScalarDataTy = getLoadStoreType(&Ingredient); + auto *DataTy = VectorType::get(ScalarDataTy, State.VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + bool CreateGather = !isConsecutive(); + + auto &Builder = State.Builder; + State.setDebugLocFrom(getDebugLoc()); + CallInst *NewLI; + Value *EVL = State.get(getEVL(), VPIteration(0, 0)); + Value *Addr = State.get(getAddr(), 0, !CreateGather); + Value *Mask = nullptr; + if (VPValue *VPMask = getMask()) { + Mask = State.get(VPMask, 0); + if (isReverse()) + Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); + } else { + Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); + } + + if (CreateGather) { + NewLI = + Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, + nullptr, "wide.masked.gather"); + } else { + VectorBuilder VBuilder(Builder); + VBuilder.setEVL(EVL).setMask(Mask); + NewLI = cast(VBuilder.createVectorInstruction( + Instruction::Load, DataTy, Addr, "vp.op.load")); + } + NewLI->addParamAttr( + 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); + State.addMetadata(NewLI, LI); + Instruction *Res = NewLI; + if (isReverse()) + Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); + State.set(this, Res, 0); +} +#if !defined(NDEBUG) || 
defined(LLVM_ENABLE_DUMP) void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN "; @@ -2183,7 +2239,51 @@ void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, O << Indent << "WIDEN store "; printOperands(O, SlotTracker); } +#endif + +void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { + assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " + "explicit vector length."); + auto *SI = cast(&Ingredient); + + VPValue *StoredValue = getStoredValue(); + bool CreateScatter = !isConsecutive(); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + + auto &Builder = State.Builder; + State.setDebugLocFrom(getDebugLoc()); + + CallInst *NewSI = nullptr; + Value *StoredVal = State.get(StoredValue, 0); + Value *EVL = State.get(getEVL(), VPIteration(0, 0)); + if (isReverse()) + StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); + Value *Mask = nullptr; + if (VPValue *VPMask = getMask()) { + Mask = State.get(VPMask, 0); + if (isReverse()) + Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); + } else { + Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); + } + Value *Addr = State.get(getAddr(), 0, !CreateScatter); + if (CreateScatter) { + NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), + Intrinsic::vp_scatter, + {StoredVal, Addr, Mask, EVL}); + } else { + VectorBuilder VBuilder(Builder); + VBuilder.setEVL(EVL).setMask(Mask); + NewSI = cast(VBuilder.createVectorInstruction( + Instruction::Store, Type::getVoidTy(EVL->getContext()), + {StoredVal, Addr})); + } + NewSI->addParamAttr( + 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment)); + State.addMetadata(NewSI, SI); +} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN vp.store "; From 
e31322ba59071b4aa288c9d956f4c9d1ee10b080 Mon Sep 17 00:00:00 2001 From: Haowei Date: Thu, 22 Aug 2024 10:37:59 -0700 Subject: [PATCH 243/426] [libc++] Fix transform_error.mandates.verify.cpp test on msvc (#104635) PR #102851 marks reference types in union as error on msvc by changing the clang, which makes 'transform_error.mandates.verify.cpp' no longer failing on msvc from ToT. However, all libcxx buildbots do not build clang from source, therefore, this test will still fail on these bots, which is incorrect. This patch changed the expected error message of this test so it can pass with both release branch clang and ToT clang. --- .../transform_error.mandates.verify.cpp | 12 +++++++++--- .../transform_error.mandates.verify.cpp | 14 +++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp index 5bf094bf37709d..61374094b7adfb 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp @@ -13,8 +13,13 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // With clang-cl, some warnings have a 'which is a Microsoft extension' suffix -// which break the tests. -// XFAIL: msvc +// which break the tests. But #102851 will turn it into an error, making the test pass. +// However, upstream libcxx buildbots do not build clang from source while testing, so +// this tests still expected to fail on these bots. +// +// TODO(LLVM 22): Remove '0-1' from 'expected-error-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}}}' +// and remove 'expected-warning-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}, which is a Microsoft extension}}' +// once LLVM 22 releases. See https://github.com/llvm/llvm-project/issues/104885. 
// Test the mandates @@ -56,11 +61,12 @@ void test() { e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} // expected-error-re@*:* 0-1 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}[expected.object.general] A program that instantiates the definition of template expected for {{.*}} is ill-formed.}} - // expected-error-re@*:* {{union member {{.*}} has reference type {{.*}}}} + // expected-error-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} // expected-error-re@*:* 0-1 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}[expected.object.general] A program that instantiates the definition of template expected for {{.*}} is ill-formed.}} + // expected-warning-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}, which is a Microsoft extension}} } // Test const& overload diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp index 508b01a7bcea39..16233cd90d2199 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp @@ -13,8 +13,14 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // With clang-cl, some warnings have a 'which is a Microsoft extension' suffix -// which break the tests. -// XFAIL: msvc +// which break the tests. But #102851 will turn it into an error, making the test pass. 
+// However, upstream libcxx buildbots do not build clang from source while testing, so +// this tests still expected to fail on these bots. +// +// TODO(LLVM 22): Remove '0-1' from 'expected-error-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}}}' +// and remove 'expected-warning-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}, which is a Microsoft extension}}' +// and remove 'expected-error-re@*:* 0-1 {{call to deleted constructor of {{.*}}}}' +// once LLVM 22 releases. See See https://github.com/llvm/llvm-project/issues/104885. // Test the mandates @@ -57,11 +63,12 @@ void test() { // expected-error-re@*:* 0-1 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}A program that instantiates expected with a E that is not a valid argument for unexpected is ill-formed}} // expected-error-re@*:* 0-1 {{call to deleted constructor of {{.*}}}} - // expected-error-re@*:* {{union member {{.*}} has reference type {{.*}}}} + // expected-error-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} // expected-error-re@*:* 0-1 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}A program that instantiates expected with a E that is not a valid argument for unexpected is ill-formed}} + // expected-warning-re@*:* 0-1 {{union member {{.*}} has reference type {{.*}}, which is a Microsoft extension}} } // Test const& overload @@ -71,6 +78,7 @@ void test() { // expected-error-re@*:* 0-1 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result 
of {{.*}} must be a valid template argument for unexpected}} // expected-error-re@*:* 0-1 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} + // expected-error-re@*:* 0-1 {{call to deleted constructor of {{.*}}}} } // Test && overload From 8f005f8306dc52577b3b9482d271fb463f0152a5 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Aug 2024 12:41:20 -0500 Subject: [PATCH 244/426] [libc] Add `ctype.h` locale variants (#102711) Summary: This patch adds all the libc ctype variants. These ignore the locale ingormation completely, so they're pretty much just stubs. Because these use locale information, which is system scope, we do not enable building them outisde of full build mode. --- libc/config/gpu/entrypoints.txt | 23 +++ libc/config/gpu/headers.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 25 +++ libc/config/linux/x86_64/headers.txt | 1 + libc/hdr/types/CMakeLists.txt | 9 + libc/hdr/types/locale_t.h | 22 +++ libc/include/CMakeLists.txt | 13 ++ libc/include/llvm-libc-macros/CMakeLists.txt | 6 + libc/include/llvm-libc-macros/locale-macros.h | 32 ++++ libc/include/llvm-libc-types/CMakeLists.txt | 2 + libc/include/llvm-libc-types/locale_t.h | 22 +++ libc/include/llvm-libc-types/struct_lconv.h | 39 +++++ libc/include/locale.h.def | 20 +++ libc/newhdrgen/yaml/ctype.yaml | 102 +++++++++++- libc/newhdrgen/yaml/locale.yaml | 41 +++++ libc/spec/stdc.td | 131 ++++++++++++++- libc/src/CMakeLists.txt | 1 + libc/src/ctype/CMakeLists.txt | 156 ++++++++++++++++++ libc/src/ctype/isalnum.cpp | 2 - libc/src/ctype/isalnum_l.cpp | 21 +++ libc/src/ctype/isalnum_l.h | 21 +++ libc/src/ctype/isalpha.cpp | 2 - libc/src/ctype/isalpha_l.cpp | 21 +++ libc/src/ctype/isalpha_l.h | 21 +++ libc/src/ctype/isblank.cpp | 2 - libc/src/ctype/isblank_l.cpp | 20 +++ libc/src/ctype/isblank_l.h | 21 +++ libc/src/ctype/iscntrl.cpp | 2 - libc/src/ctype/iscntrl_l.cpp | 21 +++ libc/src/ctype/iscntrl_l.h | 21 +++ libc/src/ctype/isdigit.cpp | 2 - 
libc/src/ctype/isdigit_l.cpp | 20 +++ libc/src/ctype/isdigit_l.h | 21 +++ libc/src/ctype/isgraph.cpp | 2 - libc/src/ctype/isgraph_l.cpp | 21 +++ libc/src/ctype/isgraph_l.h | 21 +++ libc/src/ctype/islower.cpp | 2 - libc/src/ctype/islower_l.cpp | 21 +++ libc/src/ctype/islower_l.h | 21 +++ libc/src/ctype/isprint.cpp | 2 - libc/src/ctype/isprint_l.cpp | 21 +++ libc/src/ctype/isprint_l.h | 21 +++ libc/src/ctype/ispunct.cpp | 2 - libc/src/ctype/ispunct_l.cpp | 22 +++ libc/src/ctype/ispunct_l.h | 21 +++ libc/src/ctype/isspace.cpp | 2 - libc/src/ctype/isspace_l.cpp | 21 +++ libc/src/ctype/isspace_l.h | 21 +++ libc/src/ctype/isupper.cpp | 2 - libc/src/ctype/isupper_l.cpp | 21 +++ libc/src/ctype/isupper_l.h | 21 +++ libc/src/ctype/isxdigit.cpp | 2 - libc/src/ctype/isxdigit_l.cpp | 22 +++ libc/src/ctype/isxdigit_l.h | 21 +++ libc/src/ctype/tolower.cpp | 2 - libc/src/ctype/tolower_l.cpp | 21 +++ libc/src/ctype/tolower_l.h | 21 +++ libc/src/ctype/toupper.cpp | 2 - libc/src/ctype/toupper_l.cpp | 23 +++ libc/src/ctype/toupper_l.h | 21 +++ libc/src/locale/CMakeLists.txt | 76 +++++++++ libc/src/locale/duplocale.cpp | 21 +++ libc/src/locale/duplocale.h | 22 +++ libc/src/locale/freelocale.cpp | 21 +++ libc/src/locale/freelocale.h | 22 +++ libc/src/locale/locale.cpp | 21 +++ libc/src/locale/locale.h | 36 ++++ libc/src/locale/localeconv.cpp | 49 ++++++ libc/src/locale/localeconv.h | 22 +++ libc/src/locale/newlocale.cpp | 28 ++++ libc/src/locale/newlocale.h | 22 +++ libc/src/locale/setlocale.cpp | 28 ++++ libc/src/locale/setlocale.h | 22 +++ libc/src/locale/uselocale.cpp | 23 +++ libc/src/locale/uselocale.h | 22 +++ libc/test/src/CMakeLists.txt | 1 + libc/test/src/locale/CMakeLists.txt | 25 +++ libc/test/src/locale/locale_test.cpp | 27 +++ libc/test/src/locale/localeconv_test.cpp | 17 ++ 79 files changed, 1738 insertions(+), 31 deletions(-) create mode 100644 libc/hdr/types/locale_t.h create mode 100644 libc/include/llvm-libc-macros/locale-macros.h create mode 100644 
libc/include/llvm-libc-types/locale_t.h create mode 100644 libc/include/llvm-libc-types/struct_lconv.h create mode 100644 libc/include/locale.h.def create mode 100644 libc/newhdrgen/yaml/locale.yaml create mode 100644 libc/src/ctype/isalnum_l.cpp create mode 100644 libc/src/ctype/isalnum_l.h create mode 100644 libc/src/ctype/isalpha_l.cpp create mode 100644 libc/src/ctype/isalpha_l.h create mode 100644 libc/src/ctype/isblank_l.cpp create mode 100644 libc/src/ctype/isblank_l.h create mode 100644 libc/src/ctype/iscntrl_l.cpp create mode 100644 libc/src/ctype/iscntrl_l.h create mode 100644 libc/src/ctype/isdigit_l.cpp create mode 100644 libc/src/ctype/isdigit_l.h create mode 100644 libc/src/ctype/isgraph_l.cpp create mode 100644 libc/src/ctype/isgraph_l.h create mode 100644 libc/src/ctype/islower_l.cpp create mode 100644 libc/src/ctype/islower_l.h create mode 100644 libc/src/ctype/isprint_l.cpp create mode 100644 libc/src/ctype/isprint_l.h create mode 100644 libc/src/ctype/ispunct_l.cpp create mode 100644 libc/src/ctype/ispunct_l.h create mode 100644 libc/src/ctype/isspace_l.cpp create mode 100644 libc/src/ctype/isspace_l.h create mode 100644 libc/src/ctype/isupper_l.cpp create mode 100644 libc/src/ctype/isupper_l.h create mode 100644 libc/src/ctype/isxdigit_l.cpp create mode 100644 libc/src/ctype/isxdigit_l.h create mode 100644 libc/src/ctype/tolower_l.cpp create mode 100644 libc/src/ctype/tolower_l.h create mode 100644 libc/src/ctype/toupper_l.cpp create mode 100644 libc/src/ctype/toupper_l.h create mode 100644 libc/src/locale/CMakeLists.txt create mode 100644 libc/src/locale/duplocale.cpp create mode 100644 libc/src/locale/duplocale.h create mode 100644 libc/src/locale/freelocale.cpp create mode 100644 libc/src/locale/freelocale.h create mode 100644 libc/src/locale/locale.cpp create mode 100644 libc/src/locale/locale.h create mode 100644 libc/src/locale/localeconv.cpp create mode 100644 libc/src/locale/localeconv.h create mode 100644 libc/src/locale/newlocale.cpp 
create mode 100644 libc/src/locale/newlocale.h create mode 100644 libc/src/locale/setlocale.cpp create mode 100644 libc/src/locale/setlocale.h create mode 100644 libc/src/locale/uselocale.cpp create mode 100644 libc/src/locale/uselocale.h create mode 100644 libc/test/src/locale/CMakeLists.txt create mode 100644 libc/test/src/locale/locale_test.cpp create mode 100644 libc/test/src/locale/localeconv_test.cpp diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index d7f35bc1edf5a0..7b869902074d8e 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -4,21 +4,35 @@ set(TARGET_LIBC_ENTRYPOINTS # ctype.h entrypoints libc.src.ctype.isalnum + libc.src.ctype.isalnum_l libc.src.ctype.isalpha + libc.src.ctype.isalpha_l libc.src.ctype.isascii libc.src.ctype.isblank + libc.src.ctype.isblank_l libc.src.ctype.iscntrl + libc.src.ctype.iscntrl_l libc.src.ctype.isdigit + libc.src.ctype.isdigit_l libc.src.ctype.isgraph + libc.src.ctype.isgraph_l libc.src.ctype.islower + libc.src.ctype.islower_l libc.src.ctype.isprint + libc.src.ctype.isprint_l libc.src.ctype.ispunct + libc.src.ctype.ispunct_l libc.src.ctype.isspace + libc.src.ctype.isspace_l libc.src.ctype.isupper + libc.src.ctype.isupper_l libc.src.ctype.isxdigit + libc.src.ctype.isxdigit_l libc.src.ctype.toascii libc.src.ctype.tolower + libc.src.ctype.tolower_l libc.src.ctype.toupper + libc.src.ctype.toupper_l # string.h entrypoints libc.src.string.bcmp @@ -233,6 +247,15 @@ set(TARGET_LIBC_ENTRYPOINTS # wchar.h entrypoints libc.src.wchar.wctob + # locale.h entrypoints + libc.src.locale.localeconv + libc.src.locale.duplocale + libc.src.locale.freelocale + libc.src.locale.localeconv + libc.src.locale.newlocale + libc.src.locale.setlocale + libc.src.locale.uselocale + # gpu/rpc.h entrypoints libc.src.gpu.rpc_host_call ) diff --git a/libc/config/gpu/headers.txt b/libc/config/gpu/headers.txt index 99280b7563a80f..fc952c40f4daa2 100644 --- a/libc/config/gpu/headers.txt +++ 
b/libc/config/gpu/headers.txt @@ -16,6 +16,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.wchar libc.include.uchar libc.include.features + libc.include.locale # Header for RPC extensions libc.include.gpu_rpc diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 65c5757efe6274..bac1e3cfa85da7 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -782,6 +782,22 @@ endif() if(LLVM_LIBC_FULL_BUILD) list(APPEND TARGET_LIBC_ENTRYPOINTS + # ctype.h entrypoints + libc.src.ctype.isalnum_l + libc.src.ctype.isalpha_l + libc.src.ctype.isblank_l + libc.src.ctype.iscntrl_l + libc.src.ctype.isdigit_l + libc.src.ctype.isgraph_l + libc.src.ctype.islower_l + libc.src.ctype.isprint_l + libc.src.ctype.ispunct_l + libc.src.ctype.isspace_l + libc.src.ctype.isupper_l + libc.src.ctype.isxdigit_l + libc.src.ctype.tolower_l + libc.src.ctype.toupper_l + # assert.h entrypoints libc.src.assert.__assert_fail @@ -982,6 +998,15 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.nanosleep libc.src.time.time + # locale.h entrypoints + libc.src.locale.localeconv + libc.src.locale.duplocale + libc.src.locale.freelocale + libc.src.locale.localeconv + libc.src.locale.newlocale + libc.src.locale.setlocale + libc.src.locale.uselocale + # unistd.h entrypoints libc.src.unistd.__llvm_libc_syscall libc.src.unistd._exit diff --git a/libc/config/linux/x86_64/headers.txt b/libc/config/linux/x86_64/headers.txt index 77e454e64395df..881e149d9c40d3 100644 --- a/libc/config/linux/x86_64/headers.txt +++ b/libc/config/linux/x86_64/headers.txt @@ -33,6 +33,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.unistd libc.include.wchar libc.include.uchar + libc.include.locale libc.include.arpa_inet diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index 4fc28fd82e68db..f41576c07d99be 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -162,3 +162,12 @@ add_proxy_header_library( 
libc.include.llvm-libc-types.cookie_io_functions_t libc.include.stdio ) + +add_proxy_header_library( + locale_t + HDRS + locale_t.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-types.locale_t + libc.include.locale +) diff --git a/libc/hdr/types/locale_t.h b/libc/hdr/types/locale_t.h new file mode 100644 index 00000000000000..485258b4616962 --- /dev/null +++ b/libc/hdr/types/locale_t.h @@ -0,0 +1,22 @@ +//===-- Definition of macros from locale_t.h ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_LOCALE_T_H +#define LLVM_LIBC_HDR_LOCALE_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/locale_t.h" + +#else // overlay mode + +#error "type not available in overlay mode" + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_LOCALE_T_H diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 4e3ae7f801f4a0..910f9eea015f27 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -45,6 +45,7 @@ add_header_macro( ctype.h DEPENDS .llvm_libc_common_h + .llvm-libc-types.locale_t ) add_header_macro( @@ -719,6 +720,18 @@ add_header_macro( .llvm-libc-types.wchar_t ) +add_header_macro( + locale + ../libc/newhdrgen/yaml/locale.yaml + locale.h.def + locale.h + DEPENDS + .llvm_libc_common_h + .llvm-libc-macros.locale_macros + .llvm-libc-types.locale_t + .llvm-libc-types.struct_lconv +) + if(LIBC_TARGET_OS_IS_GPU) file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/gpu) diff --git a/libc/include/llvm-libc-macros/CMakeLists.txt b/libc/include/llvm-libc-macros/CMakeLists.txt index 60a8725f9ef63f..7b980232ba0429 100644 --- a/libc/include/llvm-libc-macros/CMakeLists.txt +++ b/libc/include/llvm-libc-macros/CMakeLists.txt @@ -295,3 
+295,9 @@ add_macro_header( HDR elf-macros.h ) + +add_macro_header( + locale_macros + HDR + locale-macros.h +) diff --git a/libc/include/llvm-libc-macros/locale-macros.h b/libc/include/llvm-libc-macros/locale-macros.h new file mode 100644 index 00000000000000..892f8b69f3a777 --- /dev/null +++ b/libc/include/llvm-libc-macros/locale-macros.h @@ -0,0 +1,32 @@ +//===-- Definition of macros from locale.h --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_MACROS_LOCALE_MACROS_H +#define LLVM_LIBC_MACROS_LOCALE_MACROS_H + +#include "../llvm-libc-types/locale_t.h" + +#define LC_CTYPE 0 +#define LC_NUMERIC 1 +#define LC_TIME 2 +#define LC_COLLATE 3 +#define LC_MONETARY 4 +#define LC_MESSAGES 5 +#define LC_ALL 6 + +#define LC_GLOBAL_LOCALE ((locale_t)(-1)) + +#define LC_CTYPE_MASK (1 << LC_CTYPE) +#define LC_NUMERIC_MASK (1 << LC_NUMERIC) +#define LC_TIME_MASK (1 << LC_TIME) +#define LC_COLLATE_MASK (1 << LC_COLLATE) +#define LC_MONETARY_MASK (1 << LC_MONETARY) +#define LC_MESSAGES_MASK (1 << LC_MESSAGES) +#define LC_ALL_MASK 0x7fffffff + +#endif // LLVM_LIBC_MACROS_LOCALE_MACROS_H diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt index 0fa86e0152f9ba..583b84ccaae67c 100644 --- a/libc/include/llvm-libc-types/CMakeLists.txt +++ b/libc/include/llvm-libc-types/CMakeLists.txt @@ -142,3 +142,5 @@ DEPENDS .fsblkcnt_t .fsfilcnt_t ) +add_header(locale_t HDR locale_t.h) +add_header(struct_lconv HDR struct_lconv.h) diff --git a/libc/include/llvm-libc-types/locale_t.h b/libc/include/llvm-libc-types/locale_t.h new file mode 100644 index 00000000000000..6d783001acf9f2 --- /dev/null +++ b/libc/include/llvm-libc-types/locale_t.h @@ -0,0 
+1,22 @@ +//===-- Definition of type locale_t ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_TYPES_LOCALE_T_H +#define LLVM_LIBC_TYPES_LOCALE_T_H + +#define NUM_LOCALE_CATEGORIES 6 + +struct __locale_data; + +struct __locale_t { + struct __locale_data *data[NUM_LOCALE_CATEGORIES]; +}; + +typedef struct __locale_t *locale_t; + +#endif // LLVM_LIBC_TYPES_LOCALE_T_H diff --git a/libc/include/llvm-libc-types/struct_lconv.h b/libc/include/llvm-libc-types/struct_lconv.h new file mode 100644 index 00000000000000..9d69f055484dad --- /dev/null +++ b/libc/include/llvm-libc-types/struct_lconv.h @@ -0,0 +1,39 @@ +//===-- Definition of type lconv ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_TYPES_LCONV_H +#define LLVM_LIBC_TYPES_LCONV_H + +struct lconv { + char *decimal_point; + char *thousands_sep; + char *grouping; + char *mon_decimal_point; + char *mon_thousands_sep; + char *mon_grouping; + char *positive_sign; + char *negative_sign; + char *currency_symbol; + char frac_digits; + char p_cs_precedes; + char n_cs_precedes; + char p_sep_by_space; + char n_sep_by_space; + char p_sign_posn; + char n_sign_posn; + char *int_curr_symbol; + char int_frac_digits; + char int_p_cs_precedes; + char int_n_cs_precedes; + char int_p_sep_by_space; + char int_n_sep_by_space; + char int_p_sign_posn; + char int_n_sign_posn; +}; + +#endif // LLVM_LIBC_TYPES_LCONV_H diff --git a/libc/include/locale.h.def b/libc/include/locale.h.def new file mode 100644 index 00000000000000..516c6e6275e681 --- /dev/null +++ b/libc/include/locale.h.def @@ -0,0 +1,20 @@ +//===-- C standard library header locale.h --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_LOCALE_H +#define LLVM_LIBC_LOCALE_H + +#include "__llvm-libc-common.h" + +#include "llvm-libc-macros/locale-macros.h" +#include "llvm-libc-types/locale_t.h" +#include "llvm-libc-types/struct_lconv.h" + +%%public_api() + +#endif // LLVM_LIBC_LOCALE_H diff --git a/libc/newhdrgen/yaml/ctype.yaml b/libc/newhdrgen/yaml/ctype.yaml index f3108a34d43377..b4823c3e53234a 100644 --- a/libc/newhdrgen/yaml/ctype.yaml +++ b/libc/newhdrgen/yaml/ctype.yaml @@ -1,6 +1,7 @@ header: ctype.h macros: [] -types: [] +types: + - type_name: locale_t enums: [] objects: [] functions: @@ -100,4 +101,101 @@ functions: return_type: int arguments: - type: int - functions: null + - name: isalnum_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: isalpha_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: isblank_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: iscntrl_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: isdigit_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: isgraph_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: islower_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: isprint_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: ispunct_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: isspace_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: isupper_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t 
+ - name: isxdigit_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: tolower_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: toupper_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t diff --git a/libc/newhdrgen/yaml/locale.yaml b/libc/newhdrgen/yaml/locale.yaml new file mode 100644 index 00000000000000..7da7966ea730f6 --- /dev/null +++ b/libc/newhdrgen/yaml/locale.yaml @@ -0,0 +1,41 @@ +header: locale.h +functions: + - name: localeconv + standards: + - stdc + return_type: struct lconv * + arguments: + - type: void + - name: duplocale + standards: + - stdc + return_type: locale_t + arguments: + - type: locale_t + - name: freelocale + standards: + - stdc + return_type: void + arguments: + - type: locale_t + - name: newlocale + standards: + - stdc + return_type: locale_t + arguments: + - type: int + - type: const char * + - type: locale_t + - name: setlocale + standards: + - stdc + return_type: char * + arguments: + - type: int + - type: const char * + - name: uselocale + standards: + - stdc + return_type: locale_t + arguments: + - type: locale_t diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 118dcce829be23..402d8c335470ad 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -4,6 +4,7 @@ def StdC : StandardSpec<"stdc"> { PtrType StructTmPtr = PtrType; PtrType TimeTTypePtr = PtrType; NamedType ClockT = NamedType<"clock_t">; + NamedType LocaleT = NamedType<"locale_t">; NamedType DivTType = NamedType<"div_t">; NamedType LDivTType = NamedType<"ldiv_t">; @@ -34,7 +35,9 @@ def StdC : StandardSpec<"stdc"> { HeaderSpec CType = HeaderSpec< "ctype.h", [], // Macros - [], // Types + [ + LocaleT + ], // Types [], // Enumerations [ FunctionSpec< @@ -107,6 +110,76 @@ def StdC : StandardSpec<"stdc"> { RetValSpec, [ArgSpec] >, + FunctionSpec< + "isalnum_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "isalpha_l", + 
RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "isblank_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "iscntrl_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "isdigit_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "isgraph_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "islower_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "isprint_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "ispunct_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "isspace_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "isupper_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "isxdigit_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "tolower_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "toupper_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, ] >; @@ -1591,6 +1664,61 @@ def StdC : StandardSpec<"stdc"> { ] >; + + NamedType StructLconv : NamedType<"struct lconv">; + PtrType StructLconvPtr : PtrType; + + HeaderSpec Locale = HeaderSpec< + "locale.h", + [], // Macros + [LocaleT, StructLconv], // Types + [], // Enumerations + [ + FunctionSpec< + "duplocale", + RetValSpec, + [ + ArgSpec + ] + >, + FunctionSpec< + "freelocale", + RetValSpec, + [ + ArgSpec + ] + >, + FunctionSpec< + "localeconv", + RetValSpec, + [] + >, + FunctionSpec< + "newlocale", + RetValSpec, + [ + ArgSpec, + ArgSpec, + ArgSpec + ] + >, + FunctionSpec< + "setlocale", + RetValSpec, + [ + ArgSpec, + ArgSpec + ] + >, + FunctionSpec< + "uselocale", + RetValSpec, + [ + ArgSpec + ] + > + ] // Functions + >; let Headers = [ Assert, @@ -1613,5 +1741,6 @@ def StdC : StandardSpec<"stdc"> { Time, UChar, WChar, + Locale, ]; } diff --git a/libc/src/CMakeLists.txt b/libc/src/CMakeLists.txt index 9597e2380172b5..d554c12fb1ec89 100644 --- a/libc/src/CMakeLists.txt +++ b/libc/src/CMakeLists.txt @@ -40,3 +40,4 @@ add_subdirectory(signal) add_subdirectory(spawn) 
add_subdirectory(threads) add_subdirectory(time) +add_subdirectory(locale) diff --git a/libc/src/ctype/CMakeLists.txt b/libc/src/ctype/CMakeLists.txt index ae4eec9615dc19..8830c1bccf9eaa 100644 --- a/libc/src/ctype/CMakeLists.txt +++ b/libc/src/ctype/CMakeLists.txt @@ -146,3 +146,159 @@ add_entrypoint_object( DEPENDS libc.src.__support.ctype_utils ) + +# Do not build the locale versions in overlay mode. +if(NOT LLVM_LIBC_FULL_BUILD) + return() +endif() + +add_entrypoint_object( + isalnum_l + SRCS + isalnum_l.cpp + HDRS + isalnum_l.h + DEPENDS + libc.include.ctype + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isalpha_l + SRCS + isalpha_l.cpp + HDRS + isalpha_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isblank_l + SRCS + isblank_l.cpp + HDRS + isblank_l.h + DEPENDS + libc.hdr.types.locale_t +) + +add_entrypoint_object( + iscntrl_l + SRCS + iscntrl_l.cpp + HDRS + iscntrl_l.h + DEPENDS + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isdigit_l + SRCS + isdigit_l.cpp + HDRS + isdigit_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isgraph_l + SRCS + isgraph_l.cpp + HDRS + isgraph_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + islower_l + SRCS + islower_l.cpp + HDRS + islower_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isprint_l + SRCS + isprint_l.cpp + HDRS + isprint_l.h + DEPENDS + libc.hdr.types.locale_t +) + +add_entrypoint_object( + ispunct_l + SRCS + ispunct_l.cpp + HDRS + ispunct_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isspace_l + SRCS + isspace_l.cpp + HDRS + isspace_l.h + DEPENDS + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isupper_l + SRCS + isupper_l.cpp + HDRS + isupper_l.h + DEPENDS + 
libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isxdigit_l + SRCS + isxdigit_l.cpp + HDRS + isxdigit_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + tolower_l + SRCS + tolower_l.cpp + HDRS + tolower_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + toupper_l + SRCS + toupper_l.cpp + HDRS + toupper_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) diff --git a/libc/src/ctype/isalnum.cpp b/libc/src/ctype/isalnum.cpp index 382553c23a6bfb..54a3e357488790 100644 --- a/libc/src/ctype/isalnum.cpp +++ b/libc/src/ctype/isalnum.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, isalnum, (int c)) { return static_cast(internal::isalnum(static_cast(c))); } diff --git a/libc/src/ctype/isalnum_l.cpp b/libc/src/ctype/isalnum_l.cpp new file mode 100644 index 00000000000000..671d9b75c4c33a --- /dev/null +++ b/libc/src/ctype/isalnum_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of isalnum -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isalnum_l.h" +#include "src/__support/ctype_utils.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isalnum_l, (int c, locale_t)) { + return static_cast(internal::isalnum(static_cast(c))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isalnum_l.h b/libc/src/ctype/isalnum_l.h new file mode 100644 index 00000000000000..5bc892e6c8747e --- /dev/null +++ b/libc/src/ctype/isalnum_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isalnum_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISALNUM_H +#define LLVM_LIBC_SRC_CTYPE_ISALNUM_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isalnum_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISALNUM_H diff --git a/libc/src/ctype/isalpha.cpp b/libc/src/ctype/isalpha.cpp index 1a63406780b6e0..78b26f6a486eae 100644 --- a/libc/src/ctype/isalpha.cpp +++ b/libc/src/ctype/isalpha.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, isalpha, (int c)) { return static_cast(internal::isalpha(static_cast(c))); } diff --git a/libc/src/ctype/isalpha_l.cpp b/libc/src/ctype/isalpha_l.cpp new file mode 100644 index 00000000000000..0619d979bedf22 --- /dev/null +++ b/libc/src/ctype/isalpha_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of isalpha -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isalpha_l.h" + +#include "src/__support/common.h" +#include "src/__support/ctype_utils.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isalpha_l, (int c, locale_t)) { + return static_cast(internal::isalpha(static_cast(c))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isalpha_l.h b/libc/src/ctype/isalpha_l.h new file mode 100644 index 00000000000000..3591f1175cb9a9 --- /dev/null +++ b/libc/src/ctype/isalpha_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isalpha_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISALPHA_H +#define LLVM_LIBC_SRC_CTYPE_ISALPHA_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isalpha_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISALPHA_H diff --git a/libc/src/ctype/isblank.cpp b/libc/src/ctype/isblank.cpp index a4f33d265bd2dd..e0a20829f86cee 100644 --- a/libc/src/ctype/isblank.cpp +++ b/libc/src/ctype/isblank.cpp @@ -13,8 +13,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, isblank, (int c)) { return static_cast(c == ' ' || c == '\t'); } diff --git a/libc/src/ctype/isblank_l.cpp b/libc/src/ctype/isblank_l.cpp new file mode 100644 index 00000000000000..4f6b0bfac29724 --- /dev/null +++ b/libc/src/ctype/isblank_l.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of isblank -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isblank_l.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isblank_l, (int c, locale_t)) { + return static_cast(c == ' ' || c == '\t'); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isblank_l.h b/libc/src/ctype/isblank_l.h new file mode 100644 index 00000000000000..61ede30ae76775 --- /dev/null +++ b/libc/src/ctype/isblank_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isblank_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISBLANK_H +#define LLVM_LIBC_SRC_CTYPE_ISBLANK_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isblank_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISBLANK_H diff --git a/libc/src/ctype/iscntrl.cpp b/libc/src/ctype/iscntrl.cpp index fb582fd6ef0820..2218adfcc33f3b 100644 --- a/libc/src/ctype/iscntrl.cpp +++ b/libc/src/ctype/iscntrl.cpp @@ -13,8 +13,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, iscntrl, (int c)) { const unsigned ch = static_cast(c); return static_cast(ch < 0x20 || ch == 0x7f); diff --git a/libc/src/ctype/iscntrl_l.cpp b/libc/src/ctype/iscntrl_l.cpp new file mode 100644 index 00000000000000..83aa480299fadc --- /dev/null +++ b/libc/src/ctype/iscntrl_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of iscntrl -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/iscntrl_l.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, iscntrl_l, (int c, locale_t)) { + const unsigned ch = static_cast(c); + return static_cast(ch < 0x20 || ch == 0x7f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/iscntrl_l.h b/libc/src/ctype/iscntrl_l.h new file mode 100644 index 00000000000000..7dee44fcd0bebc --- /dev/null +++ b/libc/src/ctype/iscntrl_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for iscntrl_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISCNTRL_H +#define LLVM_LIBC_SRC_CTYPE_ISCNTRL_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int iscntrl_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISCNTRL_H diff --git a/libc/src/ctype/isdigit.cpp b/libc/src/ctype/isdigit.cpp index 43c5f1940c7f00..1f711943861f8b 100644 --- a/libc/src/ctype/isdigit.cpp +++ b/libc/src/ctype/isdigit.cpp @@ -13,8 +13,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, isdigit, (int c)) { return static_cast(internal::isdigit(static_cast(c))); } diff --git a/libc/src/ctype/isdigit_l.cpp b/libc/src/ctype/isdigit_l.cpp new file mode 100644 index 00000000000000..ca981362bfe839 --- /dev/null +++ b/libc/src/ctype/isdigit_l.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of isdigit -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isdigit_l.h" +#include "src/__support/common.h" +#include "src/__support/ctype_utils.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isdigit_l, (int c, locale_t)) { + return static_cast(internal::isdigit(static_cast(c))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isdigit_l.h b/libc/src/ctype/isdigit_l.h new file mode 100644 index 00000000000000..abeec3464941a0 --- /dev/null +++ b/libc/src/ctype/isdigit_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isdigit_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISDIGIT_H +#define LLVM_LIBC_SRC_CTYPE_ISDIGIT_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isdigit_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISDIGIT_H diff --git a/libc/src/ctype/isgraph.cpp b/libc/src/ctype/isgraph.cpp index a5b6e501b5813f..74bb2e75d138e6 100644 --- a/libc/src/ctype/isgraph.cpp +++ b/libc/src/ctype/isgraph.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, isgraph, (int c)) { return static_cast(internal::isgraph(static_cast(c))); } diff --git a/libc/src/ctype/isgraph_l.cpp b/libc/src/ctype/isgraph_l.cpp new file mode 100644 index 00000000000000..cbef6df148aed6 --- /dev/null +++ b/libc/src/ctype/isgraph_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of isgraph -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isgraph_l.h" + +#include "src/__support/common.h" +#include "src/__support/ctype_utils.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isgraph_l, (int c, locale_t)) { + return static_cast(internal::isgraph(static_cast(c))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isgraph_l.h b/libc/src/ctype/isgraph_l.h new file mode 100644 index 00000000000000..d96a4608655092 --- /dev/null +++ b/libc/src/ctype/isgraph_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isgraph_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISGRAPH_H +#define LLVM_LIBC_SRC_CTYPE_ISGRAPH_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isgraph_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISGRAPH_H diff --git a/libc/src/ctype/islower.cpp b/libc/src/ctype/islower.cpp index 61ccbcc1db413b..831aad32d3a22e 100644 --- a/libc/src/ctype/islower.cpp +++ b/libc/src/ctype/islower.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, islower, (int c)) { return static_cast(internal::islower(static_cast(c))); } diff --git a/libc/src/ctype/islower_l.cpp b/libc/src/ctype/islower_l.cpp new file mode 100644 index 00000000000000..b9be6acc81c992 --- /dev/null +++ b/libc/src/ctype/islower_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of islower -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/islower_l.h" +#include "src/__support/ctype_utils.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, islower_l, (int c, locale_t)) { + return static_cast(internal::islower(static_cast(c))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/islower_l.h b/libc/src/ctype/islower_l.h new file mode 100644 index 00000000000000..7d3e2f139602b9 --- /dev/null +++ b/libc/src/ctype/islower_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for islower_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISLOWER_H +#define LLVM_LIBC_SRC_CTYPE_ISLOWER_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int islower_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISLOWER_H diff --git a/libc/src/ctype/isprint.cpp b/libc/src/ctype/isprint.cpp index 42ab9cc8d238a1..349aefe1c17bbd 100644 --- a/libc/src/ctype/isprint.cpp +++ b/libc/src/ctype/isprint.cpp @@ -13,8 +13,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, isprint, (int c)) { const unsigned ch = static_cast(c); return static_cast((ch - ' ') < 95); diff --git a/libc/src/ctype/isprint_l.cpp b/libc/src/ctype/isprint_l.cpp new file mode 100644 index 00000000000000..8f51f7f0e3e94b --- /dev/null +++ b/libc/src/ctype/isprint_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of isprint -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isprint_l.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isprint_l, (int c, locale_t)) { + const unsigned ch = static_cast(c); + return static_cast((ch - ' ') < 95); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isprint_l.h b/libc/src/ctype/isprint_l.h new file mode 100644 index 00000000000000..bd2ea9354c36a0 --- /dev/null +++ b/libc/src/ctype/isprint_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isprint_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISPRINT_H +#define LLVM_LIBC_SRC_CTYPE_ISPRINT_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isprint_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISPRINT_H diff --git a/libc/src/ctype/ispunct.cpp b/libc/src/ctype/ispunct.cpp index c1906e3acdd80e..0635294220b9c3 100644 --- a/libc/src/ctype/ispunct.cpp +++ b/libc/src/ctype/ispunct.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, ispunct, (int c)) { const unsigned ch = static_cast(c); return static_cast(!internal::isalnum(ch) && internal::isgraph(ch)); diff --git a/libc/src/ctype/ispunct_l.cpp b/libc/src/ctype/ispunct_l.cpp new file mode 100644 index 00000000000000..e825fbe2001b08 --- /dev/null +++ b/libc/src/ctype/ispunct_l.cpp @@ -0,0 +1,22 @@ +//===-- Implementation of ispunct -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/ispunct_l.h" + +#include "src/__support/common.h" +#include "src/__support/ctype_utils.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, ispunct_l, (int c, locale_t)) { + const unsigned ch = static_cast(c); + return static_cast(!internal::isalnum(ch) && internal::isgraph(ch)); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/ispunct_l.h b/libc/src/ctype/ispunct_l.h new file mode 100644 index 00000000000000..862daf4836f788 --- /dev/null +++ b/libc/src/ctype/ispunct_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for ispunct_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISPUNCT_H +#define LLVM_LIBC_SRC_CTYPE_ISPUNCT_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int ispunct_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISPUNCT_H diff --git a/libc/src/ctype/isspace.cpp b/libc/src/ctype/isspace.cpp index f8908493787841..005bf460fc1032 100644 --- a/libc/src/ctype/isspace.cpp +++ b/libc/src/ctype/isspace.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, isspace, (int c)) { return static_cast(internal::isspace(static_cast(c))); } diff --git a/libc/src/ctype/isspace_l.cpp b/libc/src/ctype/isspace_l.cpp new file mode 100644 index 00000000000000..5c46dd68051261 --- /dev/null +++ b/libc/src/ctype/isspace_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of isspace -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isspace_l.h" +#include "src/__support/ctype_utils.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isspace_l, (int c, locale_t)) { + return static_cast(internal::isspace(static_cast(c))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isspace_l.h b/libc/src/ctype/isspace_l.h new file mode 100644 index 00000000000000..61bbf127956da7 --- /dev/null +++ b/libc/src/ctype/isspace_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isspace_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISSPACE_H +#define LLVM_LIBC_SRC_CTYPE_ISSPACE_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isspace_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISSPACE_H diff --git a/libc/src/ctype/isupper.cpp b/libc/src/ctype/isupper.cpp index 8f929ea1a009e4..965fa336b28b4d 100644 --- a/libc/src/ctype/isupper.cpp +++ b/libc/src/ctype/isupper.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, isupper, (int c)) { return static_cast(internal::isupper(static_cast(c))); } diff --git a/libc/src/ctype/isupper_l.cpp b/libc/src/ctype/isupper_l.cpp new file mode 100644 index 00000000000000..358990261d603f --- /dev/null +++ b/libc/src/ctype/isupper_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of isupper -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isupper_l.h" +#include "src/__support/ctype_utils.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isupper_l, (int c, locale_t)) { + return static_cast(internal::isupper(static_cast(c))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isupper_l.h b/libc/src/ctype/isupper_l.h new file mode 100644 index 00000000000000..9bee7ef8c09f59 --- /dev/null +++ b/libc/src/ctype/isupper_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isupper_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISUPPER_H +#define LLVM_LIBC_SRC_CTYPE_ISUPPER_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isupper_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISUPPER_H diff --git a/libc/src/ctype/isxdigit.cpp b/libc/src/ctype/isxdigit.cpp index 391c5c53cee1e1..6b730c354db083 100644 --- a/libc/src/ctype/isxdigit.cpp +++ b/libc/src/ctype/isxdigit.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, isxdigit, (int c)) { const unsigned ch = static_cast(c); return static_cast(internal::isdigit(ch) || (ch | 32) - 'a' < 6); diff --git a/libc/src/ctype/isxdigit_l.cpp b/libc/src/ctype/isxdigit_l.cpp new file mode 100644 index 00000000000000..8a5c7d4d28ab1c --- /dev/null +++ b/libc/src/ctype/isxdigit_l.cpp @@ -0,0 +1,22 @@ +//===-- Implementation of isxdigit ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isxdigit_l.h" +#include "src/__support/ctype_utils.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isxdigit_l, (int c, locale_t)) { + const unsigned ch = static_cast(c); + return static_cast(internal::isdigit(ch) || (ch | 32) - 'a' < 6); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isxdigit_l.h b/libc/src/ctype/isxdigit_l.h new file mode 100644 index 00000000000000..ee847eda4eae9a --- /dev/null +++ b/libc/src/ctype/isxdigit_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isxdigit_l ----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISXDIGIT_H +#define LLVM_LIBC_SRC_CTYPE_ISXDIGIT_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isxdigit_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISXDIGIT_H diff --git a/libc/src/ctype/tolower.cpp b/libc/src/ctype/tolower.cpp index e230428eef2b14..3ecad7bc5d5d54 100644 --- a/libc/src/ctype/tolower.cpp +++ b/libc/src/ctype/tolower.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, tolower, (int c)) { return internal::tolower(c); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/tolower_l.cpp b/libc/src/ctype/tolower_l.cpp new file mode 100644 index 00000000000000..7ccf31617e5925 --- /dev/null +++ b/libc/src/ctype/tolower_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of tolower -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/tolower_l.h" +#include "src/__support/ctype_utils.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, tolower_l, (int c, locale_t)) { + return internal::tolower(c); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/tolower_l.h b/libc/src/ctype/tolower_l.h new file mode 100644 index 00000000000000..6099b8c813469c --- /dev/null +++ b/libc/src/ctype/tolower_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for tolower_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_TOLOWER_H +#define LLVM_LIBC_SRC_CTYPE_TOLOWER_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int tolower_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_TOLOWER_H diff --git a/libc/src/ctype/toupper.cpp b/libc/src/ctype/toupper.cpp index 97c1ac2c02b8c0..b5a23fc7f588bd 100644 --- a/libc/src/ctype/toupper.cpp +++ b/libc/src/ctype/toupper.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, toupper, (int c)) { if (internal::islower(c)) return c - ('a' - 'A'); diff --git a/libc/src/ctype/toupper_l.cpp b/libc/src/ctype/toupper_l.cpp new file mode 100644 index 00000000000000..f536ff36236160 --- /dev/null +++ b/libc/src/ctype/toupper_l.cpp @@ -0,0 +1,23 @@ +//===-- Implementation of toupper_l ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/toupper_l.h" +#include "src/__support/ctype_utils.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, toupper_l, (int c, locale_t)) { + if (internal::islower(c)) + return c - ('a' - 'A'); + return c; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/toupper_l.h b/libc/src/ctype/toupper_l.h new file mode 100644 index 00000000000000..8877c35d492bd8 --- /dev/null +++ b/libc/src/ctype/toupper_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for toupper_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_TOUPPER_H +#define LLVM_LIBC_SRC_CTYPE_TOUPPER_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int toupper_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_TOUPPER_H diff --git a/libc/src/locale/CMakeLists.txt b/libc/src/locale/CMakeLists.txt new file mode 100644 index 00000000000000..6aaeb2ac31488b --- /dev/null +++ b/libc/src/locale/CMakeLists.txt @@ -0,0 +1,76 @@ +add_object_library( + locale + SRCS + locale.cpp + HDRS + locale.h + DEPENDS + libc.include.locale +) + +add_entrypoint_object( + localeconv + SRCS + localeconv.cpp + HDRS + localeconv.h + DEPENDS + libc.include.locale + CXX_STANDARD + 20 # For designated initializers +) + +add_entrypoint_object( + newlocale + SRCS + newlocale.cpp + HDRS + newlocale.h + DEPENDS + libc.include.locale + .locale +) + +add_entrypoint_object( + duplocale + SRCS + duplocale.cpp + HDRS + duplocale.h + DEPENDS + libc.include.locale + .locale +) + +add_entrypoint_object( + setlocale + SRCS + setlocale.cpp + HDRS + setlocale.h + DEPENDS + libc.include.locale + .locale +) + +add_entrypoint_object( + uselocale + SRCS + uselocale.cpp + HDRS + uselocale.h + DEPENDS + libc.include.locale + .locale +) + +add_entrypoint_object( + freelocale + SRCS + freelocale.cpp + HDRS + freelocale.h + DEPENDS + libc.include.locale + .locale +) diff --git a/libc/src/locale/duplocale.cpp b/libc/src/locale/duplocale.cpp new file mode 100644 index 00000000000000..d1bd0835121fcd --- /dev/null +++ b/libc/src/locale/duplocale.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of duplocale ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/duplocale.h" +#include "include/llvm-libc-macros/locale-macros.h" +#include "src/locale/locale.h" + +#include "src/__support/CPP/string_view.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(locale_t, duplocale, (locale_t loc)) { return loc; } + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/duplocale.h b/libc/src/locale/duplocale.h new file mode 100644 index 00000000000000..a745383860d834 --- /dev/null +++ b/libc/src/locale/duplocale.h @@ -0,0 +1,22 @@ +//===-- Implementation header for duplocale ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_LOCALE_DUPLOCALE_H +#define LLVM_LIBC_SRC_LOCALE_DUPLOCALE_H + +#include "src/__support/macros/config.h" + +#include "hdr/types/locale_t.h" + +namespace LIBC_NAMESPACE_DECL { + +locale_t duplocale(locale_t loc); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_LOCALE_DUPLOCALE_H diff --git a/libc/src/locale/freelocale.cpp b/libc/src/locale/freelocale.cpp new file mode 100644 index 00000000000000..2008995f101bf0 --- /dev/null +++ b/libc/src/locale/freelocale.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of freelocale --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/freelocale.h" +#include "include/llvm-libc-macros/locale-macros.h" +#include "src/locale/locale.h" + +#include "src/__support/CPP/string_view.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(void, freelocale, (locale_t)) {} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/freelocale.h b/libc/src/locale/freelocale.h new file mode 100644 index 00000000000000..77ece304307383 --- /dev/null +++ b/libc/src/locale/freelocale.h @@ -0,0 +1,22 @@ +//===-- Implementation header for freelocale --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_LOCALE_FREELOCALE_H +#define LLVM_LIBC_SRC_LOCALE_FREELOCALE_H + +#include "src/__support/macros/config.h" + +#include "hdr/types/locale_t.h" + +namespace LIBC_NAMESPACE_DECL { + +void freelocale(locale_t loc); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_LOCALE_FREELOCALE_H diff --git a/libc/src/locale/locale.cpp b/libc/src/locale/locale.cpp new file mode 100644 index 00000000000000..18ebc33ad58234 --- /dev/null +++ b/libc/src/locale/locale.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of locale ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/locale.h" + +#include "include/llvm-libc-macros/locale-macros.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +__locale_t c_locale = {nullptr}; + +LIBC_THREAD_LOCAL locale_t locale = nullptr; + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/locale.h b/libc/src/locale/locale.h new file mode 100644 index 00000000000000..6d6db2bcacad3f --- /dev/null +++ b/libc/src/locale/locale.h @@ -0,0 +1,36 @@ +//===-- Implementation header for the locale --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_LOCALE_LOCALECONV_H +#define LLVM_LIBC_SRC_LOCALE_LOCALECONV_H + +#include "src/__support/macros/attributes.h" +#include "src/__support/macros/config.h" + +#include "hdr/types/locale_t.h" + +#include + +namespace LIBC_NAMESPACE_DECL { + +// We only support the "C" locale right now. +static constexpr size_t MAX_LOCALE_NAME_SIZE = 2; + +struct __locale_data { + char name[MAX_LOCALE_NAME_SIZE]; +}; + +// The pointer to the default "C" locale. +extern __locale_t c_locale; + +// The global locale instance. 
+LIBC_THREAD_LOCAL extern locale_t locale; + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_LOCALE_LOCALECONV_H diff --git a/libc/src/locale/localeconv.cpp b/libc/src/locale/localeconv.cpp new file mode 100644 index 00000000000000..e4d7536bf1ffb7 --- /dev/null +++ b/libc/src/locale/localeconv.cpp @@ -0,0 +1,49 @@ +//===-- Implementation of localeconv --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/localeconv.h" + +#include "src/__support/CPP/limits.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +static char DOT_STRING[] = "."; +static char EMPTY_STRING[] = ""; + +static struct lconv C_LCONV = { + .decimal_point = DOT_STRING, + .thousands_sep = EMPTY_STRING, + .grouping = EMPTY_STRING, + .mon_decimal_point = EMPTY_STRING, + .mon_thousands_sep = EMPTY_STRING, + .mon_grouping = EMPTY_STRING, + .positive_sign = EMPTY_STRING, + .negative_sign = EMPTY_STRING, + .currency_symbol = EMPTY_STRING, + .frac_digits = CHAR_MAX, + .p_cs_precedes = CHAR_MAX, + .n_cs_precedes = CHAR_MAX, + .p_sep_by_space = CHAR_MAX, + .n_sep_by_space = CHAR_MAX, + .p_sign_posn = CHAR_MAX, + .n_sign_posn = CHAR_MAX, + .int_curr_symbol = EMPTY_STRING, + .int_frac_digits = CHAR_MAX, + .int_p_cs_precedes = CHAR_MAX, + .int_n_cs_precedes = CHAR_MAX, + .int_p_sep_by_space = CHAR_MAX, + .int_n_sep_by_space = CHAR_MAX, + .int_p_sign_posn = CHAR_MAX, + .int_n_sign_posn = CHAR_MAX, +}; + +LLVM_LIBC_FUNCTION(struct lconv *, localeconv, ()) { return &C_LCONV; } + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/localeconv.h b/libc/src/locale/localeconv.h new file mode 100644 index 
00000000000000..a8f7599b572bf8 --- /dev/null +++ b/libc/src/locale/localeconv.h @@ -0,0 +1,22 @@ +//===-- Implementation header for localeconv --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_LOCALE_LOCALECONV_H +#define LLVM_LIBC_SRC_LOCALE_LOCALECONV_H + +#include "src/__support/macros/config.h" + +#include "include/llvm-libc-types/struct_lconv.h" + +namespace LIBC_NAMESPACE_DECL { + +struct lconv *localeconv(); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_LOCALE_LOCALECONV_H diff --git a/libc/src/locale/newlocale.cpp b/libc/src/locale/newlocale.cpp new file mode 100644 index 00000000000000..379e7e6385d09f --- /dev/null +++ b/libc/src/locale/newlocale.cpp @@ -0,0 +1,28 @@ +//===-- Implementation of newlocale ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/newlocale.h" +#include "include/llvm-libc-macros/locale-macros.h" +#include "src/locale/locale.h" + +#include "src/__support/CPP/string_view.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(locale_t, newlocale, + (int category_mask, const char *locale_name, locale_t)) { + cpp::string_view name(locale_name); + if (category_mask > LC_ALL || (!name.empty() && name != "C")) + return nullptr; + + return &c_locale; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/newlocale.h b/libc/src/locale/newlocale.h new file mode 100644 index 00000000000000..08a0071cb7aeaa --- /dev/null +++ b/libc/src/locale/newlocale.h @@ -0,0 +1,22 @@ +//===-- Implementation header for setlocale ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_LOCALE_SETLOCALE_H +#define LLVM_LIBC_SRC_LOCALE_SETLOCALE_H + +#include "src/__support/macros/config.h" + +#include "hdr/types/locale_t.h" + +namespace LIBC_NAMESPACE_DECL { + +locale_t newlocale(int category_mask, const char *locale_name, locale_t base); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_LOCALE_SETLOCALE_H diff --git a/libc/src/locale/setlocale.cpp b/libc/src/locale/setlocale.cpp new file mode 100644 index 00000000000000..0950ad73cbe2cf --- /dev/null +++ b/libc/src/locale/setlocale.cpp @@ -0,0 +1,28 @@ +//===-- Implementation of setlocale ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/setlocale.h" +#include "include/llvm-libc-macros/locale-macros.h" +#include "src/locale/locale.h" + +#include "src/__support/CPP/string_view.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(char *, setlocale, (int category, const char *locale_name)) { + cpp::string_view name(locale_name); + if (category > LC_ALL || (!name.empty() && name != "C")) + return nullptr; + + static char locale_str[] = "C"; + return locale_str; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/setlocale.h b/libc/src/locale/setlocale.h new file mode 100644 index 00000000000000..a9213cf409a7b6 --- /dev/null +++ b/libc/src/locale/setlocale.h @@ -0,0 +1,22 @@ +//===-- Implementation header for setlocale ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License 
v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_LOCALE_SETLOCALE_H +#define LLVM_LIBC_SRC_LOCALE_SETLOCALE_H + +#include "src/__support/macros/config.h" + +#include "hdr/types/locale_t.h" + +namespace LIBC_NAMESPACE_DECL { + +char *setlocale(int category, const char *locale_name); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_LOCALE_SETLOCALE_H diff --git a/libc/src/locale/uselocale.cpp b/libc/src/locale/uselocale.cpp new file mode 100644 index 00000000000000..d6fdad248f12b2 --- /dev/null +++ b/libc/src/locale/uselocale.cpp @@ -0,0 +1,23 @@ +//===-- Implementation of uselocale ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/uselocale.h" +#include "src/locale/locale.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(locale_t, uselocale, (locale_t newloc)) { + if (!newloc) + return locale; + return locale = newloc; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/uselocale.h b/libc/src/locale/uselocale.h new file mode 100644 index 00000000000000..15403490d2f8cc --- /dev/null +++ b/libc/src/locale/uselocale.h @@ -0,0 +1,22 @@ +//===-- Implementation header for uselocale ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_LOCALE_USELOCALE_H +#define LLVM_LIBC_SRC_LOCALE_USELOCALE_H + +#include "src/__support/macros/config.h" + +#include "hdr/types/locale_t.h" + +namespace LIBC_NAMESPACE_DECL { + +locale_t uselocale(locale_t newloc); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_LOCALE_USELOCALE_H diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt index 60ea7e6a90d715..ddc6a5c7f6965f 100644 --- a/libc/test/src/CMakeLists.txt +++ b/libc/test/src/CMakeLists.txt @@ -82,6 +82,7 @@ add_subdirectory(setjmp) add_subdirectory(signal) add_subdirectory(spawn) add_subdirectory(time) +add_subdirectory(locale) if(${LIBC_TARGET_OS} STREQUAL "linux") add_subdirectory(pthread) diff --git a/libc/test/src/locale/CMakeLists.txt b/libc/test/src/locale/CMakeLists.txt new file mode 100644 index 00000000000000..3192004db26dd6 --- /dev/null +++ b/libc/test/src/locale/CMakeLists.txt @@ -0,0 +1,25 @@ +add_custom_target(libc-locale-tests) + +add_libc_test( + locale_test + SUITE + libc-locale-tests + SRCS + locale_test.cpp + DEPENDS + libc.include.locale + libc.src.locale.newlocale + libc.src.locale.uselocale + libc.src.locale.freelocale +) + +add_libc_test( + localeconv_test + SUITE + libc-locale-tests + SRCS + localeconv_test.cpp + DEPENDS + libc.include.locale + libc.src.locale.localeconv +) diff --git a/libc/test/src/locale/locale_test.cpp b/libc/test/src/locale/locale_test.cpp new file mode 100644 index 00000000000000..bc48bb851f4e4c --- /dev/null +++ b/libc/test/src/locale/locale_test.cpp @@ -0,0 +1,27 @@ +//===-- Unittests for locale ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/freelocale.h" +#include "src/locale/newlocale.h" +#include "src/locale/uselocale.h" + +#include "test/UnitTest/Test.h" + +#include "include/llvm-libc-macros/locale-macros.h" + +TEST(LlvmLibcLocale, DefaultLocale) { + locale_t new_locale = LIBC_NAMESPACE::newlocale(LC_ALL, "C", nullptr); + EXPECT_NE(new_locale, static_cast(nullptr)); + + locale_t old_locale = LIBC_NAMESPACE::uselocale(new_locale); + EXPECT_NE(old_locale, static_cast(nullptr)); + + LIBC_NAMESPACE::freelocale(new_locale); + + LIBC_NAMESPACE::uselocale(old_locale); +} diff --git a/libc/test/src/locale/localeconv_test.cpp b/libc/test/src/locale/localeconv_test.cpp new file mode 100644 index 00000000000000..79264276dec354 --- /dev/null +++ b/libc/test/src/locale/localeconv_test.cpp @@ -0,0 +1,17 @@ +//===-- Unittests for localeconv ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "include/llvm-libc-macros/locale-macros.h" +#include "src/locale/localeconv.h" + +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcLocale, DefaultLocale) { + struct lconv *conv = LIBC_NAMESPACE::localeconv(); + EXPECT_STREQ(conv->decimal_point, "."); +} From 2f4232db0b72635b89c2356c8a2c0504b075a0ab Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Aug 2024 12:45:16 -0500 Subject: [PATCH 245/426] Revert " [libc] Add `ctype.h` locale variants (#102711)" This reverts commit 8f005f8306dc52577b3b9482d271fb463f0152a5. 
--- libc/config/gpu/entrypoints.txt | 23 --- libc/config/gpu/headers.txt | 1 - libc/config/linux/x86_64/entrypoints.txt | 25 --- libc/config/linux/x86_64/headers.txt | 1 - libc/hdr/types/CMakeLists.txt | 9 - libc/hdr/types/locale_t.h | 22 --- libc/include/CMakeLists.txt | 13 -- libc/include/llvm-libc-macros/CMakeLists.txt | 6 - libc/include/llvm-libc-macros/locale-macros.h | 32 ---- libc/include/llvm-libc-types/CMakeLists.txt | 2 - libc/include/llvm-libc-types/locale_t.h | 22 --- libc/include/llvm-libc-types/struct_lconv.h | 39 ----- libc/include/locale.h.def | 20 --- libc/newhdrgen/yaml/ctype.yaml | 102 +----------- libc/newhdrgen/yaml/locale.yaml | 41 ----- libc/spec/stdc.td | 131 +-------------- libc/src/CMakeLists.txt | 1 - libc/src/ctype/CMakeLists.txt | 156 ------------------ libc/src/ctype/isalnum.cpp | 2 + libc/src/ctype/isalnum_l.cpp | 21 --- libc/src/ctype/isalnum_l.h | 21 --- libc/src/ctype/isalpha.cpp | 2 + libc/src/ctype/isalpha_l.cpp | 21 --- libc/src/ctype/isalpha_l.h | 21 --- libc/src/ctype/isblank.cpp | 2 + libc/src/ctype/isblank_l.cpp | 20 --- libc/src/ctype/isblank_l.h | 21 --- libc/src/ctype/iscntrl.cpp | 2 + libc/src/ctype/iscntrl_l.cpp | 21 --- libc/src/ctype/iscntrl_l.h | 21 --- libc/src/ctype/isdigit.cpp | 2 + libc/src/ctype/isdigit_l.cpp | 20 --- libc/src/ctype/isdigit_l.h | 21 --- libc/src/ctype/isgraph.cpp | 2 + libc/src/ctype/isgraph_l.cpp | 21 --- libc/src/ctype/isgraph_l.h | 21 --- libc/src/ctype/islower.cpp | 2 + libc/src/ctype/islower_l.cpp | 21 --- libc/src/ctype/islower_l.h | 21 --- libc/src/ctype/isprint.cpp | 2 + libc/src/ctype/isprint_l.cpp | 21 --- libc/src/ctype/isprint_l.h | 21 --- libc/src/ctype/ispunct.cpp | 2 + libc/src/ctype/ispunct_l.cpp | 22 --- libc/src/ctype/ispunct_l.h | 21 --- libc/src/ctype/isspace.cpp | 2 + libc/src/ctype/isspace_l.cpp | 21 --- libc/src/ctype/isspace_l.h | 21 --- libc/src/ctype/isupper.cpp | 2 + libc/src/ctype/isupper_l.cpp | 21 --- libc/src/ctype/isupper_l.h | 21 --- libc/src/ctype/isxdigit.cpp | 
2 + libc/src/ctype/isxdigit_l.cpp | 22 --- libc/src/ctype/isxdigit_l.h | 21 --- libc/src/ctype/tolower.cpp | 2 + libc/src/ctype/tolower_l.cpp | 21 --- libc/src/ctype/tolower_l.h | 21 --- libc/src/ctype/toupper.cpp | 2 + libc/src/ctype/toupper_l.cpp | 23 --- libc/src/ctype/toupper_l.h | 21 --- libc/src/locale/CMakeLists.txt | 76 --------- libc/src/locale/duplocale.cpp | 21 --- libc/src/locale/duplocale.h | 22 --- libc/src/locale/freelocale.cpp | 21 --- libc/src/locale/freelocale.h | 22 --- libc/src/locale/locale.cpp | 21 --- libc/src/locale/locale.h | 36 ---- libc/src/locale/localeconv.cpp | 49 ------ libc/src/locale/localeconv.h | 22 --- libc/src/locale/newlocale.cpp | 28 ---- libc/src/locale/newlocale.h | 22 --- libc/src/locale/setlocale.cpp | 28 ---- libc/src/locale/setlocale.h | 22 --- libc/src/locale/uselocale.cpp | 23 --- libc/src/locale/uselocale.h | 22 --- libc/test/src/CMakeLists.txt | 1 - libc/test/src/locale/CMakeLists.txt | 25 --- libc/test/src/locale/locale_test.cpp | 27 --- libc/test/src/locale/localeconv_test.cpp | 17 -- 79 files changed, 31 insertions(+), 1738 deletions(-) delete mode 100644 libc/hdr/types/locale_t.h delete mode 100644 libc/include/llvm-libc-macros/locale-macros.h delete mode 100644 libc/include/llvm-libc-types/locale_t.h delete mode 100644 libc/include/llvm-libc-types/struct_lconv.h delete mode 100644 libc/include/locale.h.def delete mode 100644 libc/newhdrgen/yaml/locale.yaml delete mode 100644 libc/src/ctype/isalnum_l.cpp delete mode 100644 libc/src/ctype/isalnum_l.h delete mode 100644 libc/src/ctype/isalpha_l.cpp delete mode 100644 libc/src/ctype/isalpha_l.h delete mode 100644 libc/src/ctype/isblank_l.cpp delete mode 100644 libc/src/ctype/isblank_l.h delete mode 100644 libc/src/ctype/iscntrl_l.cpp delete mode 100644 libc/src/ctype/iscntrl_l.h delete mode 100644 libc/src/ctype/isdigit_l.cpp delete mode 100644 libc/src/ctype/isdigit_l.h delete mode 100644 libc/src/ctype/isgraph_l.cpp delete mode 100644 libc/src/ctype/isgraph_l.h 
delete mode 100644 libc/src/ctype/islower_l.cpp delete mode 100644 libc/src/ctype/islower_l.h delete mode 100644 libc/src/ctype/isprint_l.cpp delete mode 100644 libc/src/ctype/isprint_l.h delete mode 100644 libc/src/ctype/ispunct_l.cpp delete mode 100644 libc/src/ctype/ispunct_l.h delete mode 100644 libc/src/ctype/isspace_l.cpp delete mode 100644 libc/src/ctype/isspace_l.h delete mode 100644 libc/src/ctype/isupper_l.cpp delete mode 100644 libc/src/ctype/isupper_l.h delete mode 100644 libc/src/ctype/isxdigit_l.cpp delete mode 100644 libc/src/ctype/isxdigit_l.h delete mode 100644 libc/src/ctype/tolower_l.cpp delete mode 100644 libc/src/ctype/tolower_l.h delete mode 100644 libc/src/ctype/toupper_l.cpp delete mode 100644 libc/src/ctype/toupper_l.h delete mode 100644 libc/src/locale/CMakeLists.txt delete mode 100644 libc/src/locale/duplocale.cpp delete mode 100644 libc/src/locale/duplocale.h delete mode 100644 libc/src/locale/freelocale.cpp delete mode 100644 libc/src/locale/freelocale.h delete mode 100644 libc/src/locale/locale.cpp delete mode 100644 libc/src/locale/locale.h delete mode 100644 libc/src/locale/localeconv.cpp delete mode 100644 libc/src/locale/localeconv.h delete mode 100644 libc/src/locale/newlocale.cpp delete mode 100644 libc/src/locale/newlocale.h delete mode 100644 libc/src/locale/setlocale.cpp delete mode 100644 libc/src/locale/setlocale.h delete mode 100644 libc/src/locale/uselocale.cpp delete mode 100644 libc/src/locale/uselocale.h delete mode 100644 libc/test/src/locale/CMakeLists.txt delete mode 100644 libc/test/src/locale/locale_test.cpp delete mode 100644 libc/test/src/locale/localeconv_test.cpp diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index 7b869902074d8e..d7f35bc1edf5a0 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -4,35 +4,21 @@ set(TARGET_LIBC_ENTRYPOINTS # ctype.h entrypoints libc.src.ctype.isalnum - libc.src.ctype.isalnum_l libc.src.ctype.isalpha - 
libc.src.ctype.isalpha_l libc.src.ctype.isascii libc.src.ctype.isblank - libc.src.ctype.isblank_l libc.src.ctype.iscntrl - libc.src.ctype.iscntrl_l libc.src.ctype.isdigit - libc.src.ctype.isdigit_l libc.src.ctype.isgraph - libc.src.ctype.isgraph_l libc.src.ctype.islower - libc.src.ctype.islower_l libc.src.ctype.isprint - libc.src.ctype.isprint_l libc.src.ctype.ispunct - libc.src.ctype.ispunct_l libc.src.ctype.isspace - libc.src.ctype.isspace_l libc.src.ctype.isupper - libc.src.ctype.isupper_l libc.src.ctype.isxdigit - libc.src.ctype.isxdigit_l libc.src.ctype.toascii libc.src.ctype.tolower - libc.src.ctype.tolower_l libc.src.ctype.toupper - libc.src.ctype.toupper_l # string.h entrypoints libc.src.string.bcmp @@ -247,15 +233,6 @@ set(TARGET_LIBC_ENTRYPOINTS # wchar.h entrypoints libc.src.wchar.wctob - # locale.h entrypoints - libc.src.locale.localeconv - libc.src.locale.duplocale - libc.src.locale.freelocale - libc.src.locale.localeconv - libc.src.locale.newlocale - libc.src.locale.setlocale - libc.src.locale.uselocale - # gpu/rpc.h entrypoints libc.src.gpu.rpc_host_call ) diff --git a/libc/config/gpu/headers.txt b/libc/config/gpu/headers.txt index fc952c40f4daa2..99280b7563a80f 100644 --- a/libc/config/gpu/headers.txt +++ b/libc/config/gpu/headers.txt @@ -16,7 +16,6 @@ set(TARGET_PUBLIC_HEADERS libc.include.wchar libc.include.uchar libc.include.features - libc.include.locale # Header for RPC extensions libc.include.gpu_rpc diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index bac1e3cfa85da7..65c5757efe6274 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -782,22 +782,6 @@ endif() if(LLVM_LIBC_FULL_BUILD) list(APPEND TARGET_LIBC_ENTRYPOINTS - # ctype.h entrypoints - libc.src.ctype.isalnum_l - libc.src.ctype.isalpha_l - libc.src.ctype.isblank_l - libc.src.ctype.iscntrl_l - libc.src.ctype.isdigit_l - libc.src.ctype.isgraph_l - libc.src.ctype.islower_l - 
libc.src.ctype.isprint_l - libc.src.ctype.ispunct_l - libc.src.ctype.isspace_l - libc.src.ctype.isupper_l - libc.src.ctype.isxdigit_l - libc.src.ctype.tolower_l - libc.src.ctype.toupper_l - # assert.h entrypoints libc.src.assert.__assert_fail @@ -998,15 +982,6 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.nanosleep libc.src.time.time - # locale.h entrypoints - libc.src.locale.localeconv - libc.src.locale.duplocale - libc.src.locale.freelocale - libc.src.locale.localeconv - libc.src.locale.newlocale - libc.src.locale.setlocale - libc.src.locale.uselocale - # unistd.h entrypoints libc.src.unistd.__llvm_libc_syscall libc.src.unistd._exit diff --git a/libc/config/linux/x86_64/headers.txt b/libc/config/linux/x86_64/headers.txt index 881e149d9c40d3..77e454e64395df 100644 --- a/libc/config/linux/x86_64/headers.txt +++ b/libc/config/linux/x86_64/headers.txt @@ -33,7 +33,6 @@ set(TARGET_PUBLIC_HEADERS libc.include.unistd libc.include.wchar libc.include.uchar - libc.include.locale libc.include.arpa_inet diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index f41576c07d99be..4fc28fd82e68db 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -162,12 +162,3 @@ add_proxy_header_library( libc.include.llvm-libc-types.cookie_io_functions_t libc.include.stdio ) - -add_proxy_header_library( - locale_t - HDRS - locale_t.h - FULL_BUILD_DEPENDS - libc.include.llvm-libc-types.locale_t - libc.include.locale -) diff --git a/libc/hdr/types/locale_t.h b/libc/hdr/types/locale_t.h deleted file mode 100644 index 485258b4616962..00000000000000 --- a/libc/hdr/types/locale_t.h +++ /dev/null @@ -1,22 +0,0 @@ -//===-- Definition of macros from locale_t.h ------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_HDR_LOCALE_T_H -#define LLVM_LIBC_HDR_LOCALE_T_H - -#ifdef LIBC_FULL_BUILD - -#include "include/llvm-libc-types/locale_t.h" - -#else // overlay mode - -#error "type not available in overlay mode" - -#endif // LLVM_LIBC_FULL_BUILD - -#endif // LLVM_LIBC_HDR_LOCALE_T_H diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 910f9eea015f27..4e3ae7f801f4a0 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -45,7 +45,6 @@ add_header_macro( ctype.h DEPENDS .llvm_libc_common_h - .llvm-libc-types.locale_t ) add_header_macro( @@ -720,18 +719,6 @@ add_header_macro( .llvm-libc-types.wchar_t ) -add_header_macro( - locale - ../libc/newhdrgen/yaml/locale.yaml - locale.h.def - locale.h - DEPENDS - .llvm_libc_common_h - .llvm-libc-macros.locale_macros - .llvm-libc-types.locale_t - .llvm-libc-types.struct_lconv -) - if(LIBC_TARGET_OS_IS_GPU) file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/gpu) diff --git a/libc/include/llvm-libc-macros/CMakeLists.txt b/libc/include/llvm-libc-macros/CMakeLists.txt index 7b980232ba0429..60a8725f9ef63f 100644 --- a/libc/include/llvm-libc-macros/CMakeLists.txt +++ b/libc/include/llvm-libc-macros/CMakeLists.txt @@ -295,9 +295,3 @@ add_macro_header( HDR elf-macros.h ) - -add_macro_header( - locale_macros - HDR - locale-macros.h -) diff --git a/libc/include/llvm-libc-macros/locale-macros.h b/libc/include/llvm-libc-macros/locale-macros.h deleted file mode 100644 index 892f8b69f3a777..00000000000000 --- a/libc/include/llvm-libc-macros/locale-macros.h +++ /dev/null @@ -1,32 +0,0 @@ -//===-- Definition of macros from locale.h --------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_MACROS_LOCALE_MACROS_H -#define LLVM_LIBC_MACROS_LOCALE_MACROS_H - -#include "../llvm-libc-types/locale_t.h" - -#define LC_CTYPE 0 -#define LC_NUMERIC 1 -#define LC_TIME 2 -#define LC_COLLATE 3 -#define LC_MONETARY 4 -#define LC_MESSAGES 5 -#define LC_ALL 6 - -#define LC_GLOBAL_LOCALE ((locale_t)(-1)) - -#define LC_CTYPE_MASK (1 << LC_CTYPE) -#define LC_NUMERIC_MASK (1 << LC_NUMERIC) -#define LC_TIME_MASK (1 << LC_TIME) -#define LC_COLLATE_MASK (1 << LC_COLLATE) -#define LC_MONETARY_MASK (1 << LC_MONETARY) -#define LC_MESSAGES_MASK (1 << LC_MESSAGES) -#define LC_ALL_MASK 0x7fffffff - -#endif // LLVM_LIBC_MACROS_LOCALE_MACROS_H diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt index 583b84ccaae67c..0fa86e0152f9ba 100644 --- a/libc/include/llvm-libc-types/CMakeLists.txt +++ b/libc/include/llvm-libc-types/CMakeLists.txt @@ -142,5 +142,3 @@ DEPENDS .fsblkcnt_t .fsfilcnt_t ) -add_header(locale_t HDR locale_t.h) -add_header(struct_lconv HDR struct_lconv.h) diff --git a/libc/include/llvm-libc-types/locale_t.h b/libc/include/llvm-libc-types/locale_t.h deleted file mode 100644 index 6d783001acf9f2..00000000000000 --- a/libc/include/llvm-libc-types/locale_t.h +++ /dev/null @@ -1,22 +0,0 @@ -//===-- Definition of type locale_t ---------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_TYPES_LOCALE_T_H -#define LLVM_LIBC_TYPES_LOCALE_T_H - -#define NUM_LOCALE_CATEGORIES 6 - -struct __locale_data; - -struct __locale_t { - struct __locale_data *data[NUM_LOCALE_CATEGORIES]; -}; - -typedef struct __locale_t *locale_t; - -#endif // LLVM_LIBC_TYPES_LOCALE_T_H diff --git a/libc/include/llvm-libc-types/struct_lconv.h b/libc/include/llvm-libc-types/struct_lconv.h deleted file mode 100644 index 9d69f055484dad..00000000000000 --- a/libc/include/llvm-libc-types/struct_lconv.h +++ /dev/null @@ -1,39 +0,0 @@ -//===-- Definition of type lconv ------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_TYPES_LCONV_H -#define LLVM_LIBC_TYPES_LCONV_H - -struct lconv { - char *decimal_point; - char *thousands_sep; - char *grouping; - char *mon_decimal_point; - char *mon_thousands_sep; - char *mon_grouping; - char *positive_sign; - char *negative_sign; - char *currency_symbol; - char frac_digits; - char p_cs_precedes; - char n_cs_precedes; - char p_sep_by_space; - char n_sep_by_space; - char p_sign_posn; - char n_sign_posn; - char *int_curr_symbol; - char int_frac_digits; - char int_p_cs_precedes; - char int_n_cs_precedes; - char int_p_sep_by_space; - char int_n_sep_by_space; - char int_p_sign_posn; - char int_n_sign_posn; -}; - -#endif // LLVM_LIBC_TYPES_LCONV_H diff --git a/libc/include/locale.h.def b/libc/include/locale.h.def deleted file mode 100644 index 516c6e6275e681..00000000000000 --- a/libc/include/locale.h.def +++ /dev/null @@ -1,20 +0,0 @@ -//===-- C standard library header locale.h 
--------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_LOCALE_H -#define LLVM_LIBC_LOCALE_H - -#include "__llvm-libc-common.h" - -#include "llvm-libc-macros/locale-macros.h" -#include "llvm-libc-types/locale_t.h" -#include "llvm-libc-types/struct_lconv.h" - -%%public_api() - -#endif // LLVM_LIBC_LOCALE_H diff --git a/libc/newhdrgen/yaml/ctype.yaml b/libc/newhdrgen/yaml/ctype.yaml index b4823c3e53234a..f3108a34d43377 100644 --- a/libc/newhdrgen/yaml/ctype.yaml +++ b/libc/newhdrgen/yaml/ctype.yaml @@ -1,7 +1,6 @@ header: ctype.h macros: [] -types: - - type_name: locale_t +types: [] enums: [] objects: [] functions: @@ -101,101 +100,4 @@ functions: return_type: int arguments: - type: int - - name: isalnum_l - standards: - - stdc - return_type: int - arguments: - - type: int - - type: locale_t - - name: isalpha_l - standards: - - stdc - return_type: int - arguments: - - type: int - - type: locale_t - - name: isblank_l - standards: - - stdc - return_type: int - arguments: - - type: int - - type: locale_t - - name: iscntrl_l - standards: - - stdc - return_type: int - arguments: - - type: int - - type: locale_t - - name: isdigit_l - standards: - - stdc - return_type: int - arguments: - - type: int - - type: locale_t - - name: isgraph_l - standards: - - stdc - return_type: int - arguments: - - type: int - - type: locale_t - - name: islower_l - standards: - - stdc - return_type: int - arguments: - - type: int - - type: locale_t - - name: isprint_l - standards: - - stdc - return_type: int - arguments: - - type: int - - type: locale_t - - name: ispunct_l - standards: - - stdc - return_type: int - arguments: - - type: int - - type: locale_t - - name: isspace_l - standards: - - stdc - 
return_type: int - arguments: - - type: int - - type: locale_t - - name: isupper_l - standards: - - stdc - return_type: int - arguments: - - type: int - - type: locale_t - - name: isxdigit_l - standards: - - stdc - return_type: int - arguments: - - type: int - - type: locale_t - - name: tolower_l - standards: - - stdc - return_type: int - arguments: - - type: int - - type: locale_t - - name: toupper_l - standards: - - stdc - return_type: int - arguments: - - type: int - - type: locale_t + functions: null diff --git a/libc/newhdrgen/yaml/locale.yaml b/libc/newhdrgen/yaml/locale.yaml deleted file mode 100644 index 7da7966ea730f6..00000000000000 --- a/libc/newhdrgen/yaml/locale.yaml +++ /dev/null @@ -1,41 +0,0 @@ -header: locale.h -functions: - - name: localeconv - standards: - - stdc - return_type: struct lconv * - arguments: - - type: void - - name: duplocale - standards: - - stdc - return_type: locale_t - arguments: - - type: locale_t - - name: freelocale - standards: - - stdc - return_type: void - arguments: - - type: locale_t - - name: newlocale - standards: - - stdc - return_type: locale_t - arguments: - - type: int - - type: const char * - - type: locale_t - - name: setlocale - standards: - - stdc - return_type: char * - arguments: - - type: int - - type: const char * - - name: uselocale - standards: - - stdc - return_type: locale_t - arguments: - - type: locale_t diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 402d8c335470ad..118dcce829be23 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -4,7 +4,6 @@ def StdC : StandardSpec<"stdc"> { PtrType StructTmPtr = PtrType; PtrType TimeTTypePtr = PtrType; NamedType ClockT = NamedType<"clock_t">; - NamedType LocaleT = NamedType<"locale_t">; NamedType DivTType = NamedType<"div_t">; NamedType LDivTType = NamedType<"ldiv_t">; @@ -35,9 +34,7 @@ def StdC : StandardSpec<"stdc"> { HeaderSpec CType = HeaderSpec< "ctype.h", [], // Macros - [ - LocaleT - ], // Types + [], // Types [], // Enumerations [ 
FunctionSpec< @@ -110,76 +107,6 @@ def StdC : StandardSpec<"stdc"> { RetValSpec, [ArgSpec] >, - FunctionSpec< - "isalnum_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "isalpha_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "isblank_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "iscntrl_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "isdigit_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "isgraph_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "islower_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "isprint_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "ispunct_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "isspace_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "isupper_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "isxdigit_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "tolower_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "toupper_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, ] >; @@ -1664,61 +1591,6 @@ def StdC : StandardSpec<"stdc"> { ] >; - - NamedType StructLconv : NamedType<"struct lconv">; - PtrType StructLconvPtr : PtrType; - - HeaderSpec Locale = HeaderSpec< - "locale.h", - [], // Macros - [LocaleT, StructLconv], // Types - [], // Enumerations - [ - FunctionSpec< - "duplocale", - RetValSpec, - [ - ArgSpec - ] - >, - FunctionSpec< - "freelocale", - RetValSpec, - [ - ArgSpec - ] - >, - FunctionSpec< - "localeconv", - RetValSpec, - [] - >, - FunctionSpec< - "newlocale", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec - ] - >, - FunctionSpec< - "setlocale", - RetValSpec, - [ - ArgSpec, - ArgSpec - ] - >, - FunctionSpec< - "uselocale", - RetValSpec, - [ - ArgSpec - ] - > - ] // Functions - >; let Headers = [ Assert, @@ -1741,6 +1613,5 @@ def StdC : StandardSpec<"stdc"> { Time, UChar, WChar, - Locale, ]; } diff --git a/libc/src/CMakeLists.txt 
b/libc/src/CMakeLists.txt index d554c12fb1ec89..9597e2380172b5 100644 --- a/libc/src/CMakeLists.txt +++ b/libc/src/CMakeLists.txt @@ -40,4 +40,3 @@ add_subdirectory(signal) add_subdirectory(spawn) add_subdirectory(threads) add_subdirectory(time) -add_subdirectory(locale) diff --git a/libc/src/ctype/CMakeLists.txt b/libc/src/ctype/CMakeLists.txt index 8830c1bccf9eaa..ae4eec9615dc19 100644 --- a/libc/src/ctype/CMakeLists.txt +++ b/libc/src/ctype/CMakeLists.txt @@ -146,159 +146,3 @@ add_entrypoint_object( DEPENDS libc.src.__support.ctype_utils ) - -# Do not build the locale versions in overlay mode. -if(NOT LLVM_LIBC_FULL_BUILD) - return() -endif() - -add_entrypoint_object( - isalnum_l - SRCS - isalnum_l.cpp - HDRS - isalnum_l.h - DEPENDS - libc.include.ctype - libc.src.__support.ctype_utils - libc.hdr.types.locale_t -) - -add_entrypoint_object( - isalpha_l - SRCS - isalpha_l.cpp - HDRS - isalpha_l.h - DEPENDS - libc.src.__support.ctype_utils - libc.hdr.types.locale_t -) - -add_entrypoint_object( - isblank_l - SRCS - isblank_l.cpp - HDRS - isblank_l.h - DEPENDS - libc.hdr.types.locale_t -) - -add_entrypoint_object( - iscntrl_l - SRCS - iscntrl_l.cpp - HDRS - iscntrl_l.h - DEPENDS - libc.hdr.types.locale_t -) - -add_entrypoint_object( - isdigit_l - SRCS - isdigit_l.cpp - HDRS - isdigit_l.h - DEPENDS - libc.src.__support.ctype_utils - libc.hdr.types.locale_t -) - -add_entrypoint_object( - isgraph_l - SRCS - isgraph_l.cpp - HDRS - isgraph_l.h - DEPENDS - libc.src.__support.ctype_utils - libc.hdr.types.locale_t -) - -add_entrypoint_object( - islower_l - SRCS - islower_l.cpp - HDRS - islower_l.h - DEPENDS - libc.src.__support.ctype_utils - libc.hdr.types.locale_t -) - -add_entrypoint_object( - isprint_l - SRCS - isprint_l.cpp - HDRS - isprint_l.h - DEPENDS - libc.hdr.types.locale_t -) - -add_entrypoint_object( - ispunct_l - SRCS - ispunct_l.cpp - HDRS - ispunct_l.h - DEPENDS - libc.src.__support.ctype_utils - libc.hdr.types.locale_t -) - -add_entrypoint_object( - isspace_l 
- SRCS - isspace_l.cpp - HDRS - isspace_l.h - DEPENDS - libc.hdr.types.locale_t -) - -add_entrypoint_object( - isupper_l - SRCS - isupper_l.cpp - HDRS - isupper_l.h - DEPENDS - libc.src.__support.ctype_utils - libc.hdr.types.locale_t -) - -add_entrypoint_object( - isxdigit_l - SRCS - isxdigit_l.cpp - HDRS - isxdigit_l.h - DEPENDS - libc.src.__support.ctype_utils - libc.hdr.types.locale_t -) - -add_entrypoint_object( - tolower_l - SRCS - tolower_l.cpp - HDRS - tolower_l.h - DEPENDS - libc.src.__support.ctype_utils - libc.hdr.types.locale_t -) - -add_entrypoint_object( - toupper_l - SRCS - toupper_l.cpp - HDRS - toupper_l.h - DEPENDS - libc.src.__support.ctype_utils - libc.hdr.types.locale_t -) diff --git a/libc/src/ctype/isalnum.cpp b/libc/src/ctype/isalnum.cpp index 54a3e357488790..382553c23a6bfb 100644 --- a/libc/src/ctype/isalnum.cpp +++ b/libc/src/ctype/isalnum.cpp @@ -14,6 +14,8 @@ namespace LIBC_NAMESPACE_DECL { +// TODO: Currently restricted to default locale. +// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, isalnum, (int c)) { return static_cast(internal::isalnum(static_cast(c))); } diff --git a/libc/src/ctype/isalnum_l.cpp b/libc/src/ctype/isalnum_l.cpp deleted file mode 100644 index 671d9b75c4c33a..00000000000000 --- a/libc/src/ctype/isalnum_l.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of isalnum -----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/ctype/isalnum_l.h" -#include "src/__support/ctype_utils.h" - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, isalnum_l, (int c, locale_t)) { - return static_cast(internal::isalnum(static_cast(c))); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isalnum_l.h b/libc/src/ctype/isalnum_l.h deleted file mode 100644 index 5bc892e6c8747e..00000000000000 --- a/libc/src/ctype/isalnum_l.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for isalnum_l -----------------------*-C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_CTYPE_ISALNUM_H -#define LLVM_LIBC_SRC_CTYPE_ISALNUM_H - -#include "hdr/types/locale_t.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -int isalnum_l(int c, locale_t locale); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_CTYPE_ISALNUM_H diff --git a/libc/src/ctype/isalpha.cpp b/libc/src/ctype/isalpha.cpp index 78b26f6a486eae..1a63406780b6e0 100644 --- a/libc/src/ctype/isalpha.cpp +++ b/libc/src/ctype/isalpha.cpp @@ -14,6 +14,8 @@ namespace LIBC_NAMESPACE_DECL { +// TODO: Currently restricted to default locale. +// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, isalpha, (int c)) { return static_cast(internal::isalpha(static_cast(c))); } diff --git a/libc/src/ctype/isalpha_l.cpp b/libc/src/ctype/isalpha_l.cpp deleted file mode 100644 index 0619d979bedf22..00000000000000 --- a/libc/src/ctype/isalpha_l.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of isalpha -----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/ctype/isalpha_l.h" - -#include "src/__support/common.h" -#include "src/__support/ctype_utils.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, isalpha_l, (int c, locale_t)) { - return static_cast(internal::isalpha(static_cast(c))); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isalpha_l.h b/libc/src/ctype/isalpha_l.h deleted file mode 100644 index 3591f1175cb9a9..00000000000000 --- a/libc/src/ctype/isalpha_l.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for isalpha_l -----------------------*-C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_CTYPE_ISALPHA_H -#define LLVM_LIBC_SRC_CTYPE_ISALPHA_H - -#include "hdr/types/locale_t.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -int isalpha_l(int c, locale_t locale); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_CTYPE_ISALPHA_H diff --git a/libc/src/ctype/isblank.cpp b/libc/src/ctype/isblank.cpp index e0a20829f86cee..a4f33d265bd2dd 100644 --- a/libc/src/ctype/isblank.cpp +++ b/libc/src/ctype/isblank.cpp @@ -13,6 +13,8 @@ namespace LIBC_NAMESPACE_DECL { +// TODO: Currently restricted to default locale. +// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, isblank, (int c)) { return static_cast(c == ' ' || c == '\t'); } diff --git a/libc/src/ctype/isblank_l.cpp b/libc/src/ctype/isblank_l.cpp deleted file mode 100644 index 4f6b0bfac29724..00000000000000 --- a/libc/src/ctype/isblank_l.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===-- Implementation of isblank -----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/ctype/isblank_l.h" - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, isblank_l, (int c, locale_t)) { - return static_cast(c == ' ' || c == '\t'); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isblank_l.h b/libc/src/ctype/isblank_l.h deleted file mode 100644 index 61ede30ae76775..00000000000000 --- a/libc/src/ctype/isblank_l.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for isblank_l -----------------------*-C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_CTYPE_ISBLANK_H -#define LLVM_LIBC_SRC_CTYPE_ISBLANK_H - -#include "hdr/types/locale_t.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -int isblank_l(int c, locale_t locale); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_CTYPE_ISBLANK_H diff --git a/libc/src/ctype/iscntrl.cpp b/libc/src/ctype/iscntrl.cpp index 2218adfcc33f3b..fb582fd6ef0820 100644 --- a/libc/src/ctype/iscntrl.cpp +++ b/libc/src/ctype/iscntrl.cpp @@ -13,6 +13,8 @@ namespace LIBC_NAMESPACE_DECL { +// TODO: Currently restricted to default locale. +// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, iscntrl, (int c)) { const unsigned ch = static_cast(c); return static_cast(ch < 0x20 || ch == 0x7f); diff --git a/libc/src/ctype/iscntrl_l.cpp b/libc/src/ctype/iscntrl_l.cpp deleted file mode 100644 index 83aa480299fadc..00000000000000 --- a/libc/src/ctype/iscntrl_l.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of iscntrl -----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/ctype/iscntrl_l.h" - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, iscntrl_l, (int c, locale_t)) { - const unsigned ch = static_cast(c); - return static_cast(ch < 0x20 || ch == 0x7f); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/iscntrl_l.h b/libc/src/ctype/iscntrl_l.h deleted file mode 100644 index 7dee44fcd0bebc..00000000000000 --- a/libc/src/ctype/iscntrl_l.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for iscntrl_l -----------------------*-C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_CTYPE_ISCNTRL_H -#define LLVM_LIBC_SRC_CTYPE_ISCNTRL_H - -#include "hdr/types/locale_t.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -int iscntrl_l(int c, locale_t locale); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_CTYPE_ISCNTRL_H diff --git a/libc/src/ctype/isdigit.cpp b/libc/src/ctype/isdigit.cpp index 1f711943861f8b..43c5f1940c7f00 100644 --- a/libc/src/ctype/isdigit.cpp +++ b/libc/src/ctype/isdigit.cpp @@ -13,6 +13,8 @@ namespace LIBC_NAMESPACE_DECL { +// TODO: Currently restricted to default locale. +// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, isdigit, (int c)) { return static_cast(internal::isdigit(static_cast(c))); } diff --git a/libc/src/ctype/isdigit_l.cpp b/libc/src/ctype/isdigit_l.cpp deleted file mode 100644 index ca981362bfe839..00000000000000 --- a/libc/src/ctype/isdigit_l.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===-- Implementation of isdigit -----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/ctype/isdigit_l.h" -#include "src/__support/common.h" -#include "src/__support/ctype_utils.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, isdigit_l, (int c, locale_t)) { - return static_cast(internal::isdigit(static_cast(c))); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isdigit_l.h b/libc/src/ctype/isdigit_l.h deleted file mode 100644 index abeec3464941a0..00000000000000 --- a/libc/src/ctype/isdigit_l.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for isdigit_l -----------------------*-C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_CTYPE_ISDIGIT_H -#define LLVM_LIBC_SRC_CTYPE_ISDIGIT_H - -#include "hdr/types/locale_t.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -int isdigit_l(int c, locale_t locale); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_CTYPE_ISDIGIT_H diff --git a/libc/src/ctype/isgraph.cpp b/libc/src/ctype/isgraph.cpp index 74bb2e75d138e6..a5b6e501b5813f 100644 --- a/libc/src/ctype/isgraph.cpp +++ b/libc/src/ctype/isgraph.cpp @@ -14,6 +14,8 @@ namespace LIBC_NAMESPACE_DECL { +// TODO: Currently restricted to default locale. +// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, isgraph, (int c)) { return static_cast(internal::isgraph(static_cast(c))); } diff --git a/libc/src/ctype/isgraph_l.cpp b/libc/src/ctype/isgraph_l.cpp deleted file mode 100644 index cbef6df148aed6..00000000000000 --- a/libc/src/ctype/isgraph_l.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of isgraph -----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/ctype/isgraph_l.h" - -#include "src/__support/common.h" -#include "src/__support/ctype_utils.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, isgraph_l, (int c, locale_t)) { - return static_cast(internal::isgraph(static_cast(c))); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isgraph_l.h b/libc/src/ctype/isgraph_l.h deleted file mode 100644 index d96a4608655092..00000000000000 --- a/libc/src/ctype/isgraph_l.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for isgraph_l -----------------------*-C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_CTYPE_ISGRAPH_H -#define LLVM_LIBC_SRC_CTYPE_ISGRAPH_H - -#include "hdr/types/locale_t.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -int isgraph_l(int c, locale_t locale); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_CTYPE_ISGRAPH_H diff --git a/libc/src/ctype/islower.cpp b/libc/src/ctype/islower.cpp index 831aad32d3a22e..61ccbcc1db413b 100644 --- a/libc/src/ctype/islower.cpp +++ b/libc/src/ctype/islower.cpp @@ -14,6 +14,8 @@ namespace LIBC_NAMESPACE_DECL { +// TODO: Currently restricted to default locale. +// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, islower, (int c)) { return static_cast(internal::islower(static_cast(c))); } diff --git a/libc/src/ctype/islower_l.cpp b/libc/src/ctype/islower_l.cpp deleted file mode 100644 index b9be6acc81c992..00000000000000 --- a/libc/src/ctype/islower_l.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of islower -----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/ctype/islower_l.h" -#include "src/__support/ctype_utils.h" - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, islower_l, (int c, locale_t)) { - return static_cast(internal::islower(static_cast(c))); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/islower_l.h b/libc/src/ctype/islower_l.h deleted file mode 100644 index 7d3e2f139602b9..00000000000000 --- a/libc/src/ctype/islower_l.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for islower_l -----------------------*-C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_CTYPE_ISLOWER_H -#define LLVM_LIBC_SRC_CTYPE_ISLOWER_H - -#include "hdr/types/locale_t.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -int islower_l(int c, locale_t locale); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_CTYPE_ISLOWER_H diff --git a/libc/src/ctype/isprint.cpp b/libc/src/ctype/isprint.cpp index 349aefe1c17bbd..42ab9cc8d238a1 100644 --- a/libc/src/ctype/isprint.cpp +++ b/libc/src/ctype/isprint.cpp @@ -13,6 +13,8 @@ namespace LIBC_NAMESPACE_DECL { +// TODO: Currently restricted to default locale. +// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, isprint, (int c)) { const unsigned ch = static_cast(c); return static_cast((ch - ' ') < 95); diff --git a/libc/src/ctype/isprint_l.cpp b/libc/src/ctype/isprint_l.cpp deleted file mode 100644 index 8f51f7f0e3e94b..00000000000000 --- a/libc/src/ctype/isprint_l.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of isprint -----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/ctype/isprint_l.h" - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, isprint_l, (int c, locale_t)) { - const unsigned ch = static_cast(c); - return static_cast((ch - ' ') < 95); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isprint_l.h b/libc/src/ctype/isprint_l.h deleted file mode 100644 index bd2ea9354c36a0..00000000000000 --- a/libc/src/ctype/isprint_l.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for isprint_l -----------------------*-C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_CTYPE_ISPRINT_H -#define LLVM_LIBC_SRC_CTYPE_ISPRINT_H - -#include "hdr/types/locale_t.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -int isprint_l(int c, locale_t locale); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_CTYPE_ISPRINT_H diff --git a/libc/src/ctype/ispunct.cpp b/libc/src/ctype/ispunct.cpp index 0635294220b9c3..c1906e3acdd80e 100644 --- a/libc/src/ctype/ispunct.cpp +++ b/libc/src/ctype/ispunct.cpp @@ -14,6 +14,8 @@ namespace LIBC_NAMESPACE_DECL { +// TODO: Currently restricted to default locale. +// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, ispunct, (int c)) { const unsigned ch = static_cast(c); return static_cast(!internal::isalnum(ch) && internal::isgraph(ch)); diff --git a/libc/src/ctype/ispunct_l.cpp b/libc/src/ctype/ispunct_l.cpp deleted file mode 100644 index e825fbe2001b08..00000000000000 --- a/libc/src/ctype/ispunct_l.cpp +++ /dev/null @@ -1,22 +0,0 @@ -//===-- Implementation of ispunct -----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/ctype/ispunct_l.h" - -#include "src/__support/common.h" -#include "src/__support/ctype_utils.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, ispunct_l, (int c, locale_t)) { - const unsigned ch = static_cast(c); - return static_cast(!internal::isalnum(ch) && internal::isgraph(ch)); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/ispunct_l.h b/libc/src/ctype/ispunct_l.h deleted file mode 100644 index 862daf4836f788..00000000000000 --- a/libc/src/ctype/ispunct_l.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for ispunct_l -----------------------*-C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_CTYPE_ISPUNCT_H -#define LLVM_LIBC_SRC_CTYPE_ISPUNCT_H - -#include "hdr/types/locale_t.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -int ispunct_l(int c, locale_t locale); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_CTYPE_ISPUNCT_H diff --git a/libc/src/ctype/isspace.cpp b/libc/src/ctype/isspace.cpp index 005bf460fc1032..f8908493787841 100644 --- a/libc/src/ctype/isspace.cpp +++ b/libc/src/ctype/isspace.cpp @@ -14,6 +14,8 @@ namespace LIBC_NAMESPACE_DECL { +// TODO: Currently restricted to default locale. +// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, isspace, (int c)) { return static_cast(internal::isspace(static_cast(c))); } diff --git a/libc/src/ctype/isspace_l.cpp b/libc/src/ctype/isspace_l.cpp deleted file mode 100644 index 5c46dd68051261..00000000000000 --- a/libc/src/ctype/isspace_l.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of isspace -----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/ctype/isspace_l.h" -#include "src/__support/ctype_utils.h" - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, isspace_l, (int c, locale_t)) { - return static_cast(internal::isspace(static_cast(c))); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isspace_l.h b/libc/src/ctype/isspace_l.h deleted file mode 100644 index 61bbf127956da7..00000000000000 --- a/libc/src/ctype/isspace_l.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for isspace_l -----------------------*-C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_CTYPE_ISSPACE_H -#define LLVM_LIBC_SRC_CTYPE_ISSPACE_H - -#include "hdr/types/locale_t.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -int isspace_l(int c, locale_t locale); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_CTYPE_ISSPACE_H diff --git a/libc/src/ctype/isupper.cpp b/libc/src/ctype/isupper.cpp index 965fa336b28b4d..8f929ea1a009e4 100644 --- a/libc/src/ctype/isupper.cpp +++ b/libc/src/ctype/isupper.cpp @@ -14,6 +14,8 @@ namespace LIBC_NAMESPACE_DECL { +// TODO: Currently restricted to default locale. +// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, isupper, (int c)) { return static_cast(internal::isupper(static_cast(c))); } diff --git a/libc/src/ctype/isupper_l.cpp b/libc/src/ctype/isupper_l.cpp deleted file mode 100644 index 358990261d603f..00000000000000 --- a/libc/src/ctype/isupper_l.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of isupper -----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/ctype/isupper_l.h" -#include "src/__support/ctype_utils.h" - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, isupper_l, (int c, locale_t)) { - return static_cast(internal::isupper(static_cast(c))); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isupper_l.h b/libc/src/ctype/isupper_l.h deleted file mode 100644 index 9bee7ef8c09f59..00000000000000 --- a/libc/src/ctype/isupper_l.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for isupper_l -----------------------*-C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_CTYPE_ISUPPER_H -#define LLVM_LIBC_SRC_CTYPE_ISUPPER_H - -#include "hdr/types/locale_t.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -int isupper_l(int c, locale_t locale); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_CTYPE_ISUPPER_H diff --git a/libc/src/ctype/isxdigit.cpp b/libc/src/ctype/isxdigit.cpp index 6b730c354db083..391c5c53cee1e1 100644 --- a/libc/src/ctype/isxdigit.cpp +++ b/libc/src/ctype/isxdigit.cpp @@ -14,6 +14,8 @@ namespace LIBC_NAMESPACE_DECL { +// TODO: Currently restricted to default locale. +// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, isxdigit, (int c)) { const unsigned ch = static_cast(c); return static_cast(internal::isdigit(ch) || (ch | 32) - 'a' < 6); diff --git a/libc/src/ctype/isxdigit_l.cpp b/libc/src/ctype/isxdigit_l.cpp deleted file mode 100644 index 8a5c7d4d28ab1c..00000000000000 --- a/libc/src/ctype/isxdigit_l.cpp +++ /dev/null @@ -1,22 +0,0 @@ -//===-- Implementation of isxdigit ----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/ctype/isxdigit_l.h" -#include "src/__support/ctype_utils.h" - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, isxdigit_l, (int c, locale_t)) { - const unsigned ch = static_cast(c); - return static_cast(internal::isdigit(ch) || (ch | 32) - 'a' < 6); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isxdigit_l.h b/libc/src/ctype/isxdigit_l.h deleted file mode 100644 index ee847eda4eae9a..00000000000000 --- a/libc/src/ctype/isxdigit_l.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for isxdigit_l ----------------------*-C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_CTYPE_ISXDIGIT_H -#define LLVM_LIBC_SRC_CTYPE_ISXDIGIT_H - -#include "hdr/types/locale_t.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -int isxdigit_l(int c, locale_t locale); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_CTYPE_ISXDIGIT_H diff --git a/libc/src/ctype/tolower.cpp b/libc/src/ctype/tolower.cpp index 3ecad7bc5d5d54..e230428eef2b14 100644 --- a/libc/src/ctype/tolower.cpp +++ b/libc/src/ctype/tolower.cpp @@ -14,6 +14,8 @@ namespace LIBC_NAMESPACE_DECL { +// TODO: Currently restricted to default locale. +// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, tolower, (int c)) { return internal::tolower(c); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/tolower_l.cpp b/libc/src/ctype/tolower_l.cpp deleted file mode 100644 index 7ccf31617e5925..00000000000000 --- a/libc/src/ctype/tolower_l.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of tolower -----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/ctype/tolower_l.h" -#include "src/__support/ctype_utils.h" - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, tolower_l, (int c, locale_t)) { - return internal::tolower(c); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/tolower_l.h b/libc/src/ctype/tolower_l.h deleted file mode 100644 index 6099b8c813469c..00000000000000 --- a/libc/src/ctype/tolower_l.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for tolower_l -----------------------*-C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_CTYPE_TOLOWER_H -#define LLVM_LIBC_SRC_CTYPE_TOLOWER_H - -#include "hdr/types/locale_t.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -int tolower_l(int c, locale_t locale); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_CTYPE_TOLOWER_H diff --git a/libc/src/ctype/toupper.cpp b/libc/src/ctype/toupper.cpp index b5a23fc7f588bd..97c1ac2c02b8c0 100644 --- a/libc/src/ctype/toupper.cpp +++ b/libc/src/ctype/toupper.cpp @@ -14,6 +14,8 @@ namespace LIBC_NAMESPACE_DECL { +// TODO: Currently restricted to default locale. +// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, toupper, (int c)) { if (internal::islower(c)) return c - ('a' - 'A'); diff --git a/libc/src/ctype/toupper_l.cpp b/libc/src/ctype/toupper_l.cpp deleted file mode 100644 index f536ff36236160..00000000000000 --- a/libc/src/ctype/toupper_l.cpp +++ /dev/null @@ -1,23 +0,0 @@ -//===-- Implementation of toupper_l ---------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/ctype/toupper_l.h" -#include "src/__support/ctype_utils.h" - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, toupper_l, (int c, locale_t)) { - if (internal::islower(c)) - return c - ('a' - 'A'); - return c; -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/toupper_l.h b/libc/src/ctype/toupper_l.h deleted file mode 100644 index 8877c35d492bd8..00000000000000 --- a/libc/src/ctype/toupper_l.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for toupper_l -----------------------*-C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_CTYPE_TOUPPER_H -#define LLVM_LIBC_SRC_CTYPE_TOUPPER_H - -#include "hdr/types/locale_t.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -int toupper_l(int c, locale_t locale); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_CTYPE_TOUPPER_H diff --git a/libc/src/locale/CMakeLists.txt b/libc/src/locale/CMakeLists.txt deleted file mode 100644 index 6aaeb2ac31488b..00000000000000 --- a/libc/src/locale/CMakeLists.txt +++ /dev/null @@ -1,76 +0,0 @@ -add_object_library( - locale - SRCS - locale.cpp - HDRS - locale.h - DEPENDS - libc.include.locale -) - -add_entrypoint_object( - localeconv - SRCS - localeconv.cpp - HDRS - localeconv.h - DEPENDS - libc.include.locale - CXX_STANDARD - 20 # For designated initializers -) - -add_entrypoint_object( - newlocale - SRCS - newlocale.cpp - HDRS - newlocale.h - DEPENDS - libc.include.locale - .locale -) - -add_entrypoint_object( - duplocale - SRCS - duplocale.cpp - HDRS - duplocale.h - DEPENDS - libc.include.locale - .locale -) - -add_entrypoint_object( - setlocale - SRCS - setlocale.cpp - HDRS - setlocale.h - DEPENDS - libc.include.locale - .locale -) - -add_entrypoint_object( - uselocale - SRCS - uselocale.cpp - HDRS - uselocale.h - DEPENDS - libc.include.locale - .locale -) - -add_entrypoint_object( - freelocale - SRCS - freelocale.cpp - HDRS - freelocale.h - DEPENDS - libc.include.locale - .locale -) diff --git a/libc/src/locale/duplocale.cpp b/libc/src/locale/duplocale.cpp deleted file mode 100644 index d1bd0835121fcd..00000000000000 --- a/libc/src/locale/duplocale.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of duplocale ---------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/locale/duplocale.h" -#include "include/llvm-libc-macros/locale-macros.h" -#include "src/locale/locale.h" - -#include "src/__support/CPP/string_view.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(locale_t, duplocale, (locale_t loc)) { return loc; } - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/duplocale.h b/libc/src/locale/duplocale.h deleted file mode 100644 index a745383860d834..00000000000000 --- a/libc/src/locale/duplocale.h +++ /dev/null @@ -1,22 +0,0 @@ -//===-- Implementation header for duplocale ---------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_LOCALE_DUPLOCALE_H -#define LLVM_LIBC_SRC_LOCALE_DUPLOCALE_H - -#include "src/__support/macros/config.h" - -#include "hdr/types/locale_t.h" - -namespace LIBC_NAMESPACE_DECL { - -locale_t duplocale(locale_t loc); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_LOCALE_DUPLOCALE_H diff --git a/libc/src/locale/freelocale.cpp b/libc/src/locale/freelocale.cpp deleted file mode 100644 index 2008995f101bf0..00000000000000 --- a/libc/src/locale/freelocale.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of freelocale --------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/locale/freelocale.h" -#include "include/llvm-libc-macros/locale-macros.h" -#include "src/locale/locale.h" - -#include "src/__support/CPP/string_view.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(void, freelocale, (locale_t)) {} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/freelocale.h b/libc/src/locale/freelocale.h deleted file mode 100644 index 77ece304307383..00000000000000 --- a/libc/src/locale/freelocale.h +++ /dev/null @@ -1,22 +0,0 @@ -//===-- Implementation header for freelocale --------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_LOCALE_FREELOCALE_H -#define LLVM_LIBC_SRC_LOCALE_FREELOCALE_H - -#include "src/__support/macros/config.h" - -#include "hdr/types/locale_t.h" - -namespace LIBC_NAMESPACE_DECL { - -void freelocale(locale_t loc); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_LOCALE_FREELOCALE_H diff --git a/libc/src/locale/locale.cpp b/libc/src/locale/locale.cpp deleted file mode 100644 index 18ebc33ad58234..00000000000000 --- a/libc/src/locale/locale.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation of locale ------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/locale/locale.h" - -#include "include/llvm-libc-macros/locale-macros.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -__locale_t c_locale = {nullptr}; - -LIBC_THREAD_LOCAL locale_t locale = nullptr; - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/locale.h b/libc/src/locale/locale.h deleted file mode 100644 index 6d6db2bcacad3f..00000000000000 --- a/libc/src/locale/locale.h +++ /dev/null @@ -1,36 +0,0 @@ -//===-- Implementation header for the locale --------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_LOCALE_LOCALECONV_H -#define LLVM_LIBC_SRC_LOCALE_LOCALECONV_H - -#include "src/__support/macros/attributes.h" -#include "src/__support/macros/config.h" - -#include "hdr/types/locale_t.h" - -#include - -namespace LIBC_NAMESPACE_DECL { - -// We only support the "C" locale right now. -static constexpr size_t MAX_LOCALE_NAME_SIZE = 2; - -struct __locale_data { - char name[MAX_LOCALE_NAME_SIZE]; -}; - -// The pointer to the default "C" locale. -extern __locale_t c_locale; - -// The global locale instance. 
-LIBC_THREAD_LOCAL extern locale_t locale; - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_LOCALE_LOCALECONV_H diff --git a/libc/src/locale/localeconv.cpp b/libc/src/locale/localeconv.cpp deleted file mode 100644 index e4d7536bf1ffb7..00000000000000 --- a/libc/src/locale/localeconv.cpp +++ /dev/null @@ -1,49 +0,0 @@ -//===-- Implementation of localeconv --------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/locale/localeconv.h" - -#include "src/__support/CPP/limits.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -static char DOT_STRING[] = "."; -static char EMPTY_STRING[] = ""; - -static struct lconv C_LCONV = { - .decimal_point = DOT_STRING, - .thousands_sep = EMPTY_STRING, - .grouping = EMPTY_STRING, - .mon_decimal_point = EMPTY_STRING, - .mon_thousands_sep = EMPTY_STRING, - .mon_grouping = EMPTY_STRING, - .positive_sign = EMPTY_STRING, - .negative_sign = EMPTY_STRING, - .currency_symbol = EMPTY_STRING, - .frac_digits = CHAR_MAX, - .p_cs_precedes = CHAR_MAX, - .n_cs_precedes = CHAR_MAX, - .p_sep_by_space = CHAR_MAX, - .n_sep_by_space = CHAR_MAX, - .p_sign_posn = CHAR_MAX, - .n_sign_posn = CHAR_MAX, - .int_curr_symbol = EMPTY_STRING, - .int_frac_digits = CHAR_MAX, - .int_p_cs_precedes = CHAR_MAX, - .int_n_cs_precedes = CHAR_MAX, - .int_p_sep_by_space = CHAR_MAX, - .int_n_sep_by_space = CHAR_MAX, - .int_p_sign_posn = CHAR_MAX, - .int_n_sign_posn = CHAR_MAX, -}; - -LLVM_LIBC_FUNCTION(struct lconv *, localeconv, ()) { return &C_LCONV; } - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/localeconv.h b/libc/src/locale/localeconv.h deleted file mode 100644 index 
a8f7599b572bf8..00000000000000 --- a/libc/src/locale/localeconv.h +++ /dev/null @@ -1,22 +0,0 @@ -//===-- Implementation header for localeconv --------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_LOCALE_LOCALECONV_H -#define LLVM_LIBC_SRC_LOCALE_LOCALECONV_H - -#include "src/__support/macros/config.h" - -#include "include/llvm-libc-types/struct_lconv.h" - -namespace LIBC_NAMESPACE_DECL { - -struct lconv *localeconv(); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_LOCALE_LOCALECONV_H diff --git a/libc/src/locale/newlocale.cpp b/libc/src/locale/newlocale.cpp deleted file mode 100644 index 379e7e6385d09f..00000000000000 --- a/libc/src/locale/newlocale.cpp +++ /dev/null @@ -1,28 +0,0 @@ -//===-- Implementation of newlocale ---------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/locale/newlocale.h" -#include "include/llvm-libc-macros/locale-macros.h" -#include "src/locale/locale.h" - -#include "src/__support/CPP/string_view.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(locale_t, newlocale, - (int category_mask, const char *locale_name, locale_t)) { - cpp::string_view name(locale_name); - if (category_mask > LC_ALL || (!name.empty() && name != "C")) - return nullptr; - - return &c_locale; -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/newlocale.h b/libc/src/locale/newlocale.h deleted file mode 100644 index 08a0071cb7aeaa..00000000000000 --- a/libc/src/locale/newlocale.h +++ /dev/null @@ -1,22 +0,0 @@ -//===-- Implementation header for setlocale ---------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_LOCALE_SETLOCALE_H -#define LLVM_LIBC_SRC_LOCALE_SETLOCALE_H - -#include "src/__support/macros/config.h" - -#include "hdr/types/locale_t.h" - -namespace LIBC_NAMESPACE_DECL { - -locale_t newlocale(int category_mask, const char *locale_name, locale_t base); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_LOCALE_SETLOCALE_H diff --git a/libc/src/locale/setlocale.cpp b/libc/src/locale/setlocale.cpp deleted file mode 100644 index 0950ad73cbe2cf..00000000000000 --- a/libc/src/locale/setlocale.cpp +++ /dev/null @@ -1,28 +0,0 @@ -//===-- Implementation of setlocale ---------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/locale/setlocale.h" -#include "include/llvm-libc-macros/locale-macros.h" -#include "src/locale/locale.h" - -#include "src/__support/CPP/string_view.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(char *, setlocale, (int category, const char *locale_name)) { - cpp::string_view name(locale_name); - if (category > LC_ALL || (!name.empty() && name != "C")) - return nullptr; - - static char locale_str[] = "C"; - return locale_str; -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/setlocale.h b/libc/src/locale/setlocale.h deleted file mode 100644 index a9213cf409a7b6..00000000000000 --- a/libc/src/locale/setlocale.h +++ /dev/null @@ -1,22 +0,0 @@ -//===-- Implementation header for setlocale ---------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache 
License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_LOCALE_SETLOCALE_H -#define LLVM_LIBC_SRC_LOCALE_SETLOCALE_H - -#include "src/__support/macros/config.h" - -#include "hdr/types/locale_t.h" - -namespace LIBC_NAMESPACE_DECL { - -char *setlocale(int category, const char *locale_name); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_LOCALE_SETLOCALE_H diff --git a/libc/src/locale/uselocale.cpp b/libc/src/locale/uselocale.cpp deleted file mode 100644 index d6fdad248f12b2..00000000000000 --- a/libc/src/locale/uselocale.cpp +++ /dev/null @@ -1,23 +0,0 @@ -//===-- Implementation of uselocale ---------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/locale/uselocale.h" -#include "src/locale/locale.h" - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(locale_t, uselocale, (locale_t newloc)) { - if (!newloc) - return locale; - return locale = newloc; -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/uselocale.h b/libc/src/locale/uselocale.h deleted file mode 100644 index 15403490d2f8cc..00000000000000 --- a/libc/src/locale/uselocale.h +++ /dev/null @@ -1,22 +0,0 @@ -//===-- Implementation header for uselocale ---------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_LOCALE_USELOCALE_H -#define LLVM_LIBC_SRC_LOCALE_USELOCALE_H - -#include "src/__support/macros/config.h" - -#include "hdr/types/locale_t.h" - -namespace LIBC_NAMESPACE_DECL { - -locale_t uselocale(locale_t newloc); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_LOCALE_USELOCALE_H diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt index ddc6a5c7f6965f..60ea7e6a90d715 100644 --- a/libc/test/src/CMakeLists.txt +++ b/libc/test/src/CMakeLists.txt @@ -82,7 +82,6 @@ add_subdirectory(setjmp) add_subdirectory(signal) add_subdirectory(spawn) add_subdirectory(time) -add_subdirectory(locale) if(${LIBC_TARGET_OS} STREQUAL "linux") add_subdirectory(pthread) diff --git a/libc/test/src/locale/CMakeLists.txt b/libc/test/src/locale/CMakeLists.txt deleted file mode 100644 index 3192004db26dd6..00000000000000 --- a/libc/test/src/locale/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -add_custom_target(libc-locale-tests) - -add_libc_test( - locale_test - SUITE - libc-locale-tests - SRCS - locale_test.cpp - DEPENDS - libc.include.locale - libc.src.locale.newlocale - libc.src.locale.uselocale - libc.src.locale.freelocale -) - -add_libc_test( - localeconv_test - SUITE - libc-locale-tests - SRCS - localeconv_test.cpp - DEPENDS - libc.include.locale - libc.src.locale.localeconv -) diff --git a/libc/test/src/locale/locale_test.cpp b/libc/test/src/locale/locale_test.cpp deleted file mode 100644 index bc48bb851f4e4c..00000000000000 --- a/libc/test/src/locale/locale_test.cpp +++ /dev/null @@ -1,27 +0,0 @@ -//===-- Unittests for locale ----------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/locale/freelocale.h" -#include "src/locale/newlocale.h" -#include "src/locale/uselocale.h" - -#include "test/UnitTest/Test.h" - -#include "include/llvm-libc-macros/locale-macros.h" - -TEST(LlvmLibcLocale, DefaultLocale) { - locale_t new_locale = LIBC_NAMESPACE::newlocale(LC_ALL, "C", nullptr); - EXPECT_NE(new_locale, static_cast(nullptr)); - - locale_t old_locale = LIBC_NAMESPACE::uselocale(new_locale); - EXPECT_NE(old_locale, static_cast(nullptr)); - - LIBC_NAMESPACE::freelocale(new_locale); - - LIBC_NAMESPACE::uselocale(old_locale); -} diff --git a/libc/test/src/locale/localeconv_test.cpp b/libc/test/src/locale/localeconv_test.cpp deleted file mode 100644 index 79264276dec354..00000000000000 --- a/libc/test/src/locale/localeconv_test.cpp +++ /dev/null @@ -1,17 +0,0 @@ -//===-- Unittests for localeconv ------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "include/llvm-libc-macros/locale-macros.h" -#include "src/locale/localeconv.h" - -#include "test/UnitTest/Test.h" - -TEST(LlvmLibcLocale, DefaultLocale) { - struct lconv *conv = LIBC_NAMESPACE::localeconv(); - EXPECT_STREQ(conv->decimal_point, "."); -} From 78d8ab2ab9e9f48e72597b5642285a5bbfcb75a5 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Aug 2024 12:58:46 -0500 Subject: [PATCH 246/426] [libc] Initial support for 'locale.h' in the LLVM libc (#102689) Summary: This patch adds the macros and entrypoints associated with the `locale.h` entrypoints. 
These are mostly stubs, as we (for now and the forseeable future) only expect to support the C and maybe C.UTF-8 locales in the LLVM libc. --- libc/config/gpu/entrypoints.txt | 9 +++ libc/config/gpu/headers.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 9 +++ libc/config/linux/x86_64/headers.txt | 1 + libc/hdr/types/CMakeLists.txt | 9 +++ libc/hdr/types/locale_t.h | 22 ++++++ libc/include/CMakeLists.txt | 12 +++ libc/include/llvm-libc-macros/CMakeLists.txt | 6 ++ libc/include/llvm-libc-macros/locale-macros.h | 32 ++++++++ libc/include/llvm-libc-types/CMakeLists.txt | 2 + libc/include/llvm-libc-types/locale_t.h | 22 ++++++ libc/include/llvm-libc-types/struct_lconv.h | 39 ++++++++++ libc/include/locale.h.def | 20 +++++ libc/newhdrgen/yaml/locale.yaml | 41 ++++++++++ libc/spec/stdc.td | 57 ++++++++++++++ libc/src/CMakeLists.txt | 1 + libc/src/locale/CMakeLists.txt | 76 +++++++++++++++++++ libc/src/locale/duplocale.cpp | 21 +++++ libc/src/locale/duplocale.h | 22 ++++++ libc/src/locale/freelocale.cpp | 21 +++++ libc/src/locale/freelocale.h | 22 ++++++ libc/src/locale/locale.cpp | 21 +++++ libc/src/locale/locale.h | 36 +++++++++ libc/src/locale/localeconv.cpp | 49 ++++++++++++ libc/src/locale/localeconv.h | 22 ++++++ libc/src/locale/newlocale.cpp | 28 +++++++ libc/src/locale/newlocale.h | 22 ++++++ libc/src/locale/setlocale.cpp | 28 +++++++ libc/src/locale/setlocale.h | 22 ++++++ libc/src/locale/uselocale.cpp | 23 ++++++ libc/src/locale/uselocale.h | 22 ++++++ libc/test/src/CMakeLists.txt | 1 + libc/test/src/locale/CMakeLists.txt | 25 ++++++ libc/test/src/locale/locale_test.cpp | 27 +++++++ libc/test/src/locale/localeconv_test.cpp | 17 +++++ 35 files changed, 788 insertions(+) create mode 100644 libc/hdr/types/locale_t.h create mode 100644 libc/include/llvm-libc-macros/locale-macros.h create mode 100644 libc/include/llvm-libc-types/locale_t.h create mode 100644 libc/include/llvm-libc-types/struct_lconv.h create mode 100644 libc/include/locale.h.def create mode 
100644 libc/newhdrgen/yaml/locale.yaml create mode 100644 libc/src/locale/CMakeLists.txt create mode 100644 libc/src/locale/duplocale.cpp create mode 100644 libc/src/locale/duplocale.h create mode 100644 libc/src/locale/freelocale.cpp create mode 100644 libc/src/locale/freelocale.h create mode 100644 libc/src/locale/locale.cpp create mode 100644 libc/src/locale/locale.h create mode 100644 libc/src/locale/localeconv.cpp create mode 100644 libc/src/locale/localeconv.h create mode 100644 libc/src/locale/newlocale.cpp create mode 100644 libc/src/locale/newlocale.h create mode 100644 libc/src/locale/setlocale.cpp create mode 100644 libc/src/locale/setlocale.h create mode 100644 libc/src/locale/uselocale.cpp create mode 100644 libc/src/locale/uselocale.h create mode 100644 libc/test/src/locale/CMakeLists.txt create mode 100644 libc/test/src/locale/locale_test.cpp create mode 100644 libc/test/src/locale/localeconv_test.cpp diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index d7f35bc1edf5a0..0674f23687c0a5 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -233,6 +233,15 @@ set(TARGET_LIBC_ENTRYPOINTS # wchar.h entrypoints libc.src.wchar.wctob + # locale.h entrypoints + libc.src.locale.localeconv + libc.src.locale.duplocale + libc.src.locale.freelocale + libc.src.locale.localeconv + libc.src.locale.newlocale + libc.src.locale.setlocale + libc.src.locale.uselocale + # gpu/rpc.h entrypoints libc.src.gpu.rpc_host_call ) diff --git a/libc/config/gpu/headers.txt b/libc/config/gpu/headers.txt index 99280b7563a80f..fc952c40f4daa2 100644 --- a/libc/config/gpu/headers.txt +++ b/libc/config/gpu/headers.txt @@ -16,6 +16,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.wchar libc.include.uchar libc.include.features + libc.include.locale # Header for RPC extensions libc.include.gpu_rpc diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 65c5757efe6274..e7c3c7db64abe5 100644 
--- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -982,6 +982,15 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.nanosleep libc.src.time.time + # locale.h entrypoints + libc.src.locale.localeconv + libc.src.locale.duplocale + libc.src.locale.freelocale + libc.src.locale.localeconv + libc.src.locale.newlocale + libc.src.locale.setlocale + libc.src.locale.uselocale + # unistd.h entrypoints libc.src.unistd.__llvm_libc_syscall libc.src.unistd._exit diff --git a/libc/config/linux/x86_64/headers.txt b/libc/config/linux/x86_64/headers.txt index 77e454e64395df..881e149d9c40d3 100644 --- a/libc/config/linux/x86_64/headers.txt +++ b/libc/config/linux/x86_64/headers.txt @@ -33,6 +33,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.unistd libc.include.wchar libc.include.uchar + libc.include.locale libc.include.arpa_inet diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index 4fc28fd82e68db..f41576c07d99be 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -162,3 +162,12 @@ add_proxy_header_library( libc.include.llvm-libc-types.cookie_io_functions_t libc.include.stdio ) + +add_proxy_header_library( + locale_t + HDRS + locale_t.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-types.locale_t + libc.include.locale +) diff --git a/libc/hdr/types/locale_t.h b/libc/hdr/types/locale_t.h new file mode 100644 index 00000000000000..485258b4616962 --- /dev/null +++ b/libc/hdr/types/locale_t.h @@ -0,0 +1,22 @@ +//===-- Definition of macros from locale_t.h ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_LOCALE_T_H +#define LLVM_LIBC_HDR_LOCALE_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/locale_t.h" + +#else // overlay mode + +#error "type not available in overlay mode" + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_LOCALE_T_H diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 4e3ae7f801f4a0..8e00c9f1292e81 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -719,6 +719,18 @@ add_header_macro( .llvm-libc-types.wchar_t ) +add_header_macro( + locale + ../libc/newhdrgen/yaml/locale.yaml + locale.h.def + locale.h + DEPENDS + .llvm_libc_common_h + .llvm-libc-macros.locale_macros + .llvm-libc-types.locale_t + .llvm-libc-types.struct_lconv +) + if(LIBC_TARGET_OS_IS_GPU) file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/gpu) diff --git a/libc/include/llvm-libc-macros/CMakeLists.txt b/libc/include/llvm-libc-macros/CMakeLists.txt index 60a8725f9ef63f..7b980232ba0429 100644 --- a/libc/include/llvm-libc-macros/CMakeLists.txt +++ b/libc/include/llvm-libc-macros/CMakeLists.txt @@ -295,3 +295,9 @@ add_macro_header( HDR elf-macros.h ) + +add_macro_header( + locale_macros + HDR + locale-macros.h +) diff --git a/libc/include/llvm-libc-macros/locale-macros.h b/libc/include/llvm-libc-macros/locale-macros.h new file mode 100644 index 00000000000000..892f8b69f3a777 --- /dev/null +++ b/libc/include/llvm-libc-macros/locale-macros.h @@ -0,0 +1,32 @@ +//===-- Definition of macros from locale.h --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_MACROS_LOCALE_MACROS_H +#define LLVM_LIBC_MACROS_LOCALE_MACROS_H + +#include "../llvm-libc-types/locale_t.h" + +#define LC_CTYPE 0 +#define LC_NUMERIC 1 +#define LC_TIME 2 +#define LC_COLLATE 3 +#define LC_MONETARY 4 +#define LC_MESSAGES 5 +#define LC_ALL 6 + +#define LC_GLOBAL_LOCALE ((locale_t)(-1)) + +#define LC_CTYPE_MASK (1 << LC_CTYPE) +#define LC_NUMERIC_MASK (1 << LC_NUMERIC) +#define LC_TIME_MASK (1 << LC_TIME) +#define LC_COLLATE_MASK (1 << LC_COLLATE) +#define LC_MONETARY_MASK (1 << LC_MONETARY) +#define LC_MESSAGES_MASK (1 << LC_MESSAGES) +#define LC_ALL_MASK 0x7fffffff + +#endif // LLVM_LIBC_MACROS_LOCALE_MACROS_H diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt index 0fa86e0152f9ba..583b84ccaae67c 100644 --- a/libc/include/llvm-libc-types/CMakeLists.txt +++ b/libc/include/llvm-libc-types/CMakeLists.txt @@ -142,3 +142,5 @@ DEPENDS .fsblkcnt_t .fsfilcnt_t ) +add_header(locale_t HDR locale_t.h) +add_header(struct_lconv HDR struct_lconv.h) diff --git a/libc/include/llvm-libc-types/locale_t.h b/libc/include/llvm-libc-types/locale_t.h new file mode 100644 index 00000000000000..6d783001acf9f2 --- /dev/null +++ b/libc/include/llvm-libc-types/locale_t.h @@ -0,0 +1,22 @@ +//===-- Definition of type locale_t ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_TYPES_LOCALE_T_H +#define LLVM_LIBC_TYPES_LOCALE_T_H + +#define NUM_LOCALE_CATEGORIES 6 + +struct __locale_data; + +struct __locale_t { + struct __locale_data *data[NUM_LOCALE_CATEGORIES]; +}; + +typedef struct __locale_t *locale_t; + +#endif // LLVM_LIBC_TYPES_LOCALE_T_H diff --git a/libc/include/llvm-libc-types/struct_lconv.h b/libc/include/llvm-libc-types/struct_lconv.h new file mode 100644 index 00000000000000..9d69f055484dad --- /dev/null +++ b/libc/include/llvm-libc-types/struct_lconv.h @@ -0,0 +1,39 @@ +//===-- Definition of type lconv ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_TYPES_LCONV_H +#define LLVM_LIBC_TYPES_LCONV_H + +struct lconv { + char *decimal_point; + char *thousands_sep; + char *grouping; + char *mon_decimal_point; + char *mon_thousands_sep; + char *mon_grouping; + char *positive_sign; + char *negative_sign; + char *currency_symbol; + char frac_digits; + char p_cs_precedes; + char n_cs_precedes; + char p_sep_by_space; + char n_sep_by_space; + char p_sign_posn; + char n_sign_posn; + char *int_curr_symbol; + char int_frac_digits; + char int_p_cs_precedes; + char int_n_cs_precedes; + char int_p_sep_by_space; + char int_n_sep_by_space; + char int_p_sign_posn; + char int_n_sign_posn; +}; + +#endif // LLVM_LIBC_TYPES_LCONV_H diff --git a/libc/include/locale.h.def b/libc/include/locale.h.def new file mode 100644 index 00000000000000..516c6e6275e681 --- /dev/null +++ b/libc/include/locale.h.def @@ -0,0 +1,20 @@ +//===-- C standard library header locale.h 
--------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_LOCALE_H +#define LLVM_LIBC_LOCALE_H + +#include "__llvm-libc-common.h" + +#include "llvm-libc-macros/locale-macros.h" +#include "llvm-libc-types/locale_t.h" +#include "llvm-libc-types/struct_lconv.h" + +%%public_api() + +#endif // LLVM_LIBC_LOCALE_H diff --git a/libc/newhdrgen/yaml/locale.yaml b/libc/newhdrgen/yaml/locale.yaml new file mode 100644 index 00000000000000..7da7966ea730f6 --- /dev/null +++ b/libc/newhdrgen/yaml/locale.yaml @@ -0,0 +1,41 @@ +header: locale.h +functions: + - name: localeconv + standards: + - stdc + return_type: struct lconv * + arguments: + - type: void + - name: duplocale + standards: + - stdc + return_type: locale_t + arguments: + - type: locale_t + - name: freelocale + standards: + - stdc + return_type: void + arguments: + - type: locale_t + - name: newlocale + standards: + - stdc + return_type: locale_t + arguments: + - type: int + - type: const char * + - type: locale_t + - name: setlocale + standards: + - stdc + return_type: char * + arguments: + - type: int + - type: const char * + - name: uselocale + standards: + - stdc + return_type: locale_t + arguments: + - type: locale_t diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 118dcce829be23..f9573997c65739 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -4,6 +4,7 @@ def StdC : StandardSpec<"stdc"> { PtrType StructTmPtr = PtrType; PtrType TimeTTypePtr = PtrType; NamedType ClockT = NamedType<"clock_t">; + NamedType LocaleT = NamedType<"locale_t">; NamedType DivTType = NamedType<"div_t">; NamedType LDivTType = NamedType<"ldiv_t">; @@ -1591,6 +1592,61 @@ def StdC : StandardSpec<"stdc"> { ] >; + + NamedType StructLconv : 
NamedType<"struct lconv">; + PtrType StructLconvPtr : PtrType; + + HeaderSpec Locale = HeaderSpec< + "locale.h", + [], // Macros + [LocaleT, StructLconv], // Types + [], // Enumerations + [ + FunctionSpec< + "duplocale", + RetValSpec, + [ + ArgSpec + ] + >, + FunctionSpec< + "freelocale", + RetValSpec, + [ + ArgSpec + ] + >, + FunctionSpec< + "localeconv", + RetValSpec, + [] + >, + FunctionSpec< + "newlocale", + RetValSpec, + [ + ArgSpec, + ArgSpec, + ArgSpec + ] + >, + FunctionSpec< + "setlocale", + RetValSpec, + [ + ArgSpec, + ArgSpec + ] + >, + FunctionSpec< + "uselocale", + RetValSpec, + [ + ArgSpec + ] + > + ] // Functions + >; let Headers = [ Assert, @@ -1613,5 +1669,6 @@ def StdC : StandardSpec<"stdc"> { Time, UChar, WChar, + Locale, ]; } diff --git a/libc/src/CMakeLists.txt b/libc/src/CMakeLists.txt index 9597e2380172b5..d554c12fb1ec89 100644 --- a/libc/src/CMakeLists.txt +++ b/libc/src/CMakeLists.txt @@ -40,3 +40,4 @@ add_subdirectory(signal) add_subdirectory(spawn) add_subdirectory(threads) add_subdirectory(time) +add_subdirectory(locale) diff --git a/libc/src/locale/CMakeLists.txt b/libc/src/locale/CMakeLists.txt new file mode 100644 index 00000000000000..6aaeb2ac31488b --- /dev/null +++ b/libc/src/locale/CMakeLists.txt @@ -0,0 +1,76 @@ +add_object_library( + locale + SRCS + locale.cpp + HDRS + locale.h + DEPENDS + libc.include.locale +) + +add_entrypoint_object( + localeconv + SRCS + localeconv.cpp + HDRS + localeconv.h + DEPENDS + libc.include.locale + CXX_STANDARD + 20 # For designated initializers +) + +add_entrypoint_object( + newlocale + SRCS + newlocale.cpp + HDRS + newlocale.h + DEPENDS + libc.include.locale + .locale +) + +add_entrypoint_object( + duplocale + SRCS + duplocale.cpp + HDRS + duplocale.h + DEPENDS + libc.include.locale + .locale +) + +add_entrypoint_object( + setlocale + SRCS + setlocale.cpp + HDRS + setlocale.h + DEPENDS + libc.include.locale + .locale +) + +add_entrypoint_object( + uselocale + SRCS + uselocale.cpp + HDRS + 
uselocale.h + DEPENDS + libc.include.locale + .locale +) + +add_entrypoint_object( + freelocale + SRCS + freelocale.cpp + HDRS + freelocale.h + DEPENDS + libc.include.locale + .locale +) diff --git a/libc/src/locale/duplocale.cpp b/libc/src/locale/duplocale.cpp new file mode 100644 index 00000000000000..d1bd0835121fcd --- /dev/null +++ b/libc/src/locale/duplocale.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of duplocale ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/duplocale.h" +#include "include/llvm-libc-macros/locale-macros.h" +#include "src/locale/locale.h" + +#include "src/__support/CPP/string_view.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(locale_t, duplocale, (locale_t loc)) { return loc; } + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/duplocale.h b/libc/src/locale/duplocale.h new file mode 100644 index 00000000000000..a745383860d834 --- /dev/null +++ b/libc/src/locale/duplocale.h @@ -0,0 +1,22 @@ +//===-- Implementation header for duplocale ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_LOCALE_DUPLOCALE_H +#define LLVM_LIBC_SRC_LOCALE_DUPLOCALE_H + +#include "src/__support/macros/config.h" + +#include "hdr/types/locale_t.h" + +namespace LIBC_NAMESPACE_DECL { + +locale_t duplocale(locale_t loc); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_LOCALE_DUPLOCALE_H diff --git a/libc/src/locale/freelocale.cpp b/libc/src/locale/freelocale.cpp new file mode 100644 index 00000000000000..2008995f101bf0 --- /dev/null +++ b/libc/src/locale/freelocale.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of freelocale --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/freelocale.h" +#include "include/llvm-libc-macros/locale-macros.h" +#include "src/locale/locale.h" + +#include "src/__support/CPP/string_view.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(void, freelocale, (locale_t)) {} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/freelocale.h b/libc/src/locale/freelocale.h new file mode 100644 index 00000000000000..77ece304307383 --- /dev/null +++ b/libc/src/locale/freelocale.h @@ -0,0 +1,22 @@ +//===-- Implementation header for freelocale --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_LOCALE_FREELOCALE_H +#define LLVM_LIBC_SRC_LOCALE_FREELOCALE_H + +#include "src/__support/macros/config.h" + +#include "hdr/types/locale_t.h" + +namespace LIBC_NAMESPACE_DECL { + +void freelocale(locale_t loc); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_LOCALE_FREELOCALE_H diff --git a/libc/src/locale/locale.cpp b/libc/src/locale/locale.cpp new file mode 100644 index 00000000000000..1610fb5dd34004 --- /dev/null +++ b/libc/src/locale/locale.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of locale ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/locale.h" + +#include "include/llvm-libc-macros/locale-macros.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +__locale_t c_locale = {nullptr}; + +locale_t locale = nullptr; + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/locale.h b/libc/src/locale/locale.h new file mode 100644 index 00000000000000..6d6db2bcacad3f --- /dev/null +++ b/libc/src/locale/locale.h @@ -0,0 +1,36 @@ +//===-- Implementation header for the locale --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_LOCALE_LOCALECONV_H +#define LLVM_LIBC_SRC_LOCALE_LOCALECONV_H + +#include "src/__support/macros/attributes.h" +#include "src/__support/macros/config.h" + +#include "hdr/types/locale_t.h" + +#include + +namespace LIBC_NAMESPACE_DECL { + +// We only support the "C" locale right now. +static constexpr size_t MAX_LOCALE_NAME_SIZE = 2; + +struct __locale_data { + char name[MAX_LOCALE_NAME_SIZE]; +}; + +// The pointer to the default "C" locale. +extern __locale_t c_locale; + +// The global locale instance. +LIBC_THREAD_LOCAL extern locale_t locale; + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_LOCALE_LOCALECONV_H diff --git a/libc/src/locale/localeconv.cpp b/libc/src/locale/localeconv.cpp new file mode 100644 index 00000000000000..e4d7536bf1ffb7 --- /dev/null +++ b/libc/src/locale/localeconv.cpp @@ -0,0 +1,49 @@ +//===-- Implementation of localeconv --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/localeconv.h" + +#include "src/__support/CPP/limits.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +static char DOT_STRING[] = "."; +static char EMPTY_STRING[] = ""; + +static struct lconv C_LCONV = { + .decimal_point = DOT_STRING, + .thousands_sep = EMPTY_STRING, + .grouping = EMPTY_STRING, + .mon_decimal_point = EMPTY_STRING, + .mon_thousands_sep = EMPTY_STRING, + .mon_grouping = EMPTY_STRING, + .positive_sign = EMPTY_STRING, + .negative_sign = EMPTY_STRING, + .currency_symbol = EMPTY_STRING, + .frac_digits = CHAR_MAX, + .p_cs_precedes = CHAR_MAX, + .n_cs_precedes = CHAR_MAX, + .p_sep_by_space = CHAR_MAX, + .n_sep_by_space = CHAR_MAX, + .p_sign_posn = CHAR_MAX, + .n_sign_posn = CHAR_MAX, + .int_curr_symbol = EMPTY_STRING, + .int_frac_digits = CHAR_MAX, + .int_p_cs_precedes = CHAR_MAX, + .int_n_cs_precedes = CHAR_MAX, + .int_p_sep_by_space = CHAR_MAX, + .int_n_sep_by_space = CHAR_MAX, + .int_p_sign_posn = CHAR_MAX, + .int_n_sign_posn = CHAR_MAX, +}; + +LLVM_LIBC_FUNCTION(struct lconv *, localeconv, ()) { return &C_LCONV; } + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/localeconv.h b/libc/src/locale/localeconv.h new file mode 100644 index 00000000000000..a8f7599b572bf8 --- /dev/null +++ b/libc/src/locale/localeconv.h @@ -0,0 +1,22 @@ +//===-- Implementation header for localeconv --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_LOCALE_LOCALECONV_H +#define LLVM_LIBC_SRC_LOCALE_LOCALECONV_H + +#include "src/__support/macros/config.h" + +#include "include/llvm-libc-types/struct_lconv.h" + +namespace LIBC_NAMESPACE_DECL { + +struct lconv *localeconv(); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_LOCALE_LOCALECONV_H diff --git a/libc/src/locale/newlocale.cpp b/libc/src/locale/newlocale.cpp new file mode 100644 index 00000000000000..379e7e6385d09f --- /dev/null +++ b/libc/src/locale/newlocale.cpp @@ -0,0 +1,28 @@ +//===-- Implementation of newlocale ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/newlocale.h" +#include "include/llvm-libc-macros/locale-macros.h" +#include "src/locale/locale.h" + +#include "src/__support/CPP/string_view.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(locale_t, newlocale, + (int category_mask, const char *locale_name, locale_t)) { + cpp::string_view name(locale_name); + if (category_mask > LC_ALL || (!name.empty() && name != "C")) + return nullptr; + + return &c_locale; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/newlocale.h b/libc/src/locale/newlocale.h new file mode 100644 index 00000000000000..08a0071cb7aeaa --- /dev/null +++ b/libc/src/locale/newlocale.h @@ -0,0 +1,22 @@ +//===-- Implementation header for setlocale ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_LOCALE_SETLOCALE_H +#define LLVM_LIBC_SRC_LOCALE_SETLOCALE_H + +#include "src/__support/macros/config.h" + +#include "hdr/types/locale_t.h" + +namespace LIBC_NAMESPACE_DECL { + +locale_t newlocale(int category_mask, const char *locale_name, locale_t base); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_LOCALE_SETLOCALE_H diff --git a/libc/src/locale/setlocale.cpp b/libc/src/locale/setlocale.cpp new file mode 100644 index 00000000000000..0950ad73cbe2cf --- /dev/null +++ b/libc/src/locale/setlocale.cpp @@ -0,0 +1,28 @@ +//===-- Implementation of setlocale ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/setlocale.h" +#include "include/llvm-libc-macros/locale-macros.h" +#include "src/locale/locale.h" + +#include "src/__support/CPP/string_view.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(char *, setlocale, (int category, const char *locale_name)) { + cpp::string_view name(locale_name); + if (category > LC_ALL || (!name.empty() && name != "C")) + return nullptr; + + static char locale_str[] = "C"; + return locale_str; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/setlocale.h b/libc/src/locale/setlocale.h new file mode 100644 index 00000000000000..a9213cf409a7b6 --- /dev/null +++ b/libc/src/locale/setlocale.h @@ -0,0 +1,22 @@ +//===-- Implementation header for setlocale ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_LOCALE_SETLOCALE_H +#define LLVM_LIBC_SRC_LOCALE_SETLOCALE_H + +#include "src/__support/macros/config.h" + +#include "hdr/types/locale_t.h" + +namespace LIBC_NAMESPACE_DECL { + +char *setlocale(int category, const char *locale_name); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_LOCALE_SETLOCALE_H diff --git a/libc/src/locale/uselocale.cpp b/libc/src/locale/uselocale.cpp new file mode 100644 index 00000000000000..d6fdad248f12b2 --- /dev/null +++ b/libc/src/locale/uselocale.cpp @@ -0,0 +1,23 @@ +//===-- Implementation of uselocale ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/uselocale.h" +#include "src/locale/locale.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(locale_t, uselocale, (locale_t newloc)) { + if (!newloc) + return locale; + return locale = newloc; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/locale/uselocale.h b/libc/src/locale/uselocale.h new file mode 100644 index 00000000000000..15403490d2f8cc --- /dev/null +++ b/libc/src/locale/uselocale.h @@ -0,0 +1,22 @@ +//===-- Implementation header for uselocale ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_LOCALE_USELOCALE_H +#define LLVM_LIBC_SRC_LOCALE_USELOCALE_H + +#include "src/__support/macros/config.h" + +#include "hdr/types/locale_t.h" + +namespace LIBC_NAMESPACE_DECL { + +locale_t uselocale(locale_t newloc); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_LOCALE_USELOCALE_H diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt index 60ea7e6a90d715..ddc6a5c7f6965f 100644 --- a/libc/test/src/CMakeLists.txt +++ b/libc/test/src/CMakeLists.txt @@ -82,6 +82,7 @@ add_subdirectory(setjmp) add_subdirectory(signal) add_subdirectory(spawn) add_subdirectory(time) +add_subdirectory(locale) if(${LIBC_TARGET_OS} STREQUAL "linux") add_subdirectory(pthread) diff --git a/libc/test/src/locale/CMakeLists.txt b/libc/test/src/locale/CMakeLists.txt new file mode 100644 index 00000000000000..3192004db26dd6 --- /dev/null +++ b/libc/test/src/locale/CMakeLists.txt @@ -0,0 +1,25 @@ +add_custom_target(libc-locale-tests) + +add_libc_test( + locale_test + SUITE + libc-locale-tests + SRCS + locale_test.cpp + DEPENDS + libc.include.locale + libc.src.locale.newlocale + libc.src.locale.uselocale + libc.src.locale.freelocale +) + +add_libc_test( + localeconv_test + SUITE + libc-locale-tests + SRCS + localeconv_test.cpp + DEPENDS + libc.include.locale + libc.src.locale.localeconv +) diff --git a/libc/test/src/locale/locale_test.cpp b/libc/test/src/locale/locale_test.cpp new file mode 100644 index 00000000000000..bc48bb851f4e4c --- /dev/null +++ b/libc/test/src/locale/locale_test.cpp @@ -0,0 +1,27 @@ +//===-- Unittests for locale ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/locale/freelocale.h" +#include "src/locale/newlocale.h" +#include "src/locale/uselocale.h" + +#include "test/UnitTest/Test.h" + +#include "include/llvm-libc-macros/locale-macros.h" + +TEST(LlvmLibcLocale, DefaultLocale) { + locale_t new_locale = LIBC_NAMESPACE::newlocale(LC_ALL, "C", nullptr); + EXPECT_NE(new_locale, static_cast(nullptr)); + + locale_t old_locale = LIBC_NAMESPACE::uselocale(new_locale); + EXPECT_NE(old_locale, static_cast(nullptr)); + + LIBC_NAMESPACE::freelocale(new_locale); + + LIBC_NAMESPACE::uselocale(old_locale); +} diff --git a/libc/test/src/locale/localeconv_test.cpp b/libc/test/src/locale/localeconv_test.cpp new file mode 100644 index 00000000000000..79264276dec354 --- /dev/null +++ b/libc/test/src/locale/localeconv_test.cpp @@ -0,0 +1,17 @@ +//===-- Unittests for localeconv ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "include/llvm-libc-macros/locale-macros.h" +#include "src/locale/localeconv.h" + +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcLocale, DefaultLocale) { + struct lconv *conv = LIBC_NAMESPACE::localeconv(); + EXPECT_STREQ(conv->decimal_point, "."); +} From f3a47b9e25e7aca0dc2cd2dec30cedd9aeffaecf Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Thu, 22 Aug 2024 10:58:16 -0700 Subject: [PATCH 247/426] [NFC] [Docs] add missing space --- llvm/docs/ProgrammersManual.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst index 41d1388e5bf7e9..98803ddffd0823 100644 --- a/llvm/docs/ProgrammersManual.rst +++ b/llvm/docs/ProgrammersManual.rst @@ -1363,7 +1363,7 @@ Whatever code you want that control, use ``DebugCounter::shouldExecute`` to cont I->eraseFromParent(); That's all you have to do. Now, using opt, you can control when this code triggers using -the '``--debug-counter``' Options.To specify when to execute the codepath. +the '``--debug-counter``' Options. To specify when to execute the codepath. .. code-block:: none From 518b1f02835c4face8aaaf646a0f3878c2382b0b Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Aug 2024 13:09:27 -0500 Subject: [PATCH 248/426] [libc] Fix leftover thread local --- libc/src/locale/locale.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/src/locale/locale.h b/libc/src/locale/locale.h index 6d6db2bcacad3f..14befa6e7a9c24 100644 --- a/libc/src/locale/locale.h +++ b/libc/src/locale/locale.h @@ -29,7 +29,7 @@ struct __locale_data { extern __locale_t c_locale; // The global locale instance. 
-LIBC_THREAD_LOCAL extern locale_t locale; +extern locale_t locale; } // namespace LIBC_NAMESPACE_DECL From 319c7a42ba2e5be56757d622747ba317d3b9e9ad Mon Sep 17 00:00:00 2001 From: Greg Roth Date: Thu, 22 Aug 2024 12:13:52 -0600 Subject: [PATCH 249/426] [HLSL][SPIRV]Add SPIRV generation for HLSL dot (#104656) This adds the SPIRV fdot, sdot, and udot intrinsics and allows them to be created at codegen depending on the target architecture. This required moving some of the DXIL-specific choices to DXIL instruction expansion out of codegen and providing it with at a more generic fdot intrinsic as well. Removed some stale comments that gave the obsolete impression that type conversions should be expected to match overloads. The SPIRV intrinsic handling involves generating multiply and add operations for integers and the existing OpDot operation for floating point. New tests for generating SPIRV float and integer dot intrinsics are added as well as expanding HLSL tests to include SPIRV generation Used new dot product intrinsic generation to implement normalize() in SPIRV Incidentally changed existing dot intrinsic definitions to use DefaultAttrsIntrinsic to match the newly added inrinsics Fixes #88056 --- clang/lib/CodeGen/CGBuiltin.cpp | 47 ++-- clang/lib/CodeGen/CGHLSLRuntime.h | 3 + .../CodeGenHLSL/builtins/dot-builtin.hlsl | 12 +- clang/test/CodeGenHLSL/builtins/dot.hlsl | 207 +++++++++--------- llvm/include/llvm/IR/IntrinsicsDirectX.td | 34 +-- llvm/include/llvm/IR/IntrinsicsSPIRV.td | 12 + llvm/lib/Target/DirectX/DXIL.td | 6 +- .../Target/DirectX/DXILIntrinsicExpansion.cpp | 102 ++++++--- .../Target/SPIRV/SPIRVInstructionSelector.cpp | 102 +++++++++ llvm/test/CodeGen/DirectX/fdot.ll | 119 +++++----- llvm/test/CodeGen/DirectX/idot.ll | 24 +- .../CodeGen/SPIRV/hlsl-intrinsics/fdot.ll | 75 +++++++ .../CodeGen/SPIRV/hlsl-intrinsics/idot.ll | 88 ++++++++ 13 files changed, 579 insertions(+), 252 deletions(-) create mode 100644 
llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fdot.ll create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/idot.ll diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 3d77b118235ca0..2a733e4d834cfa 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18497,22 +18497,14 @@ llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments, return Arg; } -Intrinsic::ID getDotProductIntrinsic(QualType QT, int elementCount) { - if (QT->hasFloatingRepresentation()) { - switch (elementCount) { - case 2: - return Intrinsic::dx_dot2; - case 3: - return Intrinsic::dx_dot3; - case 4: - return Intrinsic::dx_dot4; - } - } - if (QT->hasSignedIntegerRepresentation()) - return Intrinsic::dx_sdot; - - assert(QT->hasUnsignedIntegerRepresentation()); - return Intrinsic::dx_udot; +// Return dot product intrinsic that corresponds to the QT scalar type +Intrinsic::ID getDotProductIntrinsic(CGHLSLRuntime &RT, QualType QT) { + if (QT->isFloatingType()) + return RT.getFDotIntrinsic(); + if (QT->isSignedIntegerType()) + return RT.getSDotIntrinsic(); + assert(QT->isUnsignedIntegerType()); + return RT.getUDotIntrinsic(); } Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, @@ -18555,37 +18547,38 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, Value *Op1 = EmitScalarExpr(E->getArg(1)); llvm::Type *T0 = Op0->getType(); llvm::Type *T1 = Op1->getType(); + + // If the arguments are scalars, just emit a multiply if (!T0->isVectorTy() && !T1->isVectorTy()) { if (T0->isFloatingPointTy()) - return Builder.CreateFMul(Op0, Op1, "dx.dot"); + return Builder.CreateFMul(Op0, Op1, "hlsl.dot"); if (T0->isIntegerTy()) - return Builder.CreateMul(Op0, Op1, "dx.dot"); + return Builder.CreateMul(Op0, Op1, "hlsl.dot"); - // Bools should have been promoted llvm_unreachable( "Scalar dot product is only supported on ints and floats."); } + // For vectors, validate types and emit the appropriate intrinsic + 
// A VectorSplat should have happened assert(T0->isVectorTy() && T1->isVectorTy() && "Dot product of vector and scalar is not supported."); - // A vector sext or sitofp should have happened - assert(T0->getScalarType() == T1->getScalarType() && - "Dot product of vectors need the same element types."); - auto *VecTy0 = E->getArg(0)->getType()->getAs(); [[maybe_unused]] auto *VecTy1 = E->getArg(1)->getType()->getAs(); - // A HLSLVectorTruncation should have happend + + assert(VecTy0->getElementType() == VecTy1->getElementType() && + "Dot product of vectors need the same element types."); + assert(VecTy0->getNumElements() == VecTy1->getNumElements() && "Dot product requires vectors to be of the same size."); return Builder.CreateIntrinsic( /*ReturnType=*/T0->getScalarType(), - getDotProductIntrinsic(E->getArg(0)->getType(), - VecTy0->getNumElements()), - ArrayRef{Op0, Op1}, nullptr, "dx.dot"); + getDotProductIntrinsic(CGM.getHLSLRuntime(), VecTy0->getElementType()), + ArrayRef{Op0, Op1}, nullptr, "hlsl.dot"); } break; case Builtin::BI__builtin_hlsl_lerp: { Value *X = EmitScalarExpr(E->getArg(0)); diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index b1455b5779acf9..55a4b97c160cd6 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -81,6 +81,9 @@ class CGHLSLRuntime { GENERATE_HLSL_INTRINSIC_FUNCTION(Rsqrt, rsqrt) GENERATE_HLSL_INTRINSIC_FUNCTION(Saturate, saturate) GENERATE_HLSL_INTRINSIC_FUNCTION(ThreadId, thread_id) + GENERATE_HLSL_INTRINSIC_FUNCTION(FDot, fdot) + GENERATE_HLSL_INTRINSIC_FUNCTION(SDot, sdot) + GENERATE_HLSL_INTRINSIC_FUNCTION(UDot, udot) //===----------------------------------------------------------------------===// // End of reserved area for HLSL intrinsic getters. 
diff --git a/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl index b0b95074c972d5..482f089d4770fd 100644 --- a/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl @@ -2,8 +2,8 @@ // CHECK-LABEL: builtin_bool_to_float_type_promotion // CHECK: %conv1 = uitofp i1 %loadedv to double -// CHECK: %dx.dot = fmul double %conv, %conv1 -// CHECK: %conv2 = fptrunc double %dx.dot to float +// CHECK: %hlsl.dot = fmul double %conv, %conv1 +// CHECK: %conv2 = fptrunc double %hlsl.dot to float // CHECK: ret float %conv2 float builtin_bool_to_float_type_promotion ( float p0, bool p1 ) { return __builtin_hlsl_dot ( p0, p1 ); @@ -12,8 +12,8 @@ float builtin_bool_to_float_type_promotion ( float p0, bool p1 ) { // CHECK-LABEL: builtin_bool_to_float_arg1_type_promotion // CHECK: %conv = uitofp i1 %loadedv to double // CHECK: %conv1 = fpext float %1 to double -// CHECK: %dx.dot = fmul double %conv, %conv1 -// CHECK: %conv2 = fptrunc double %dx.dot to float +// CHECK: %hlsl.dot = fmul double %conv, %conv1 +// CHECK: %conv2 = fptrunc double %hlsl.dot to float // CHECK: ret float %conv2 float builtin_bool_to_float_arg1_type_promotion ( bool p0, float p1 ) { return __builtin_hlsl_dot ( p0, p1 ); @@ -22,8 +22,8 @@ float builtin_bool_to_float_arg1_type_promotion ( bool p0, float p1 ) { // CHECK-LABEL: builtin_dot_int_to_float_promotion // CHECK: %conv = fpext float %0 to double // CHECK: %conv1 = sitofp i32 %1 to double -// CHECK: dx.dot = fmul double %conv, %conv1 -// CHECK: %conv2 = fptrunc double %dx.dot to float +// CHECK: dot = fmul double %conv, %conv1 +// CHECK: %conv2 = fptrunc double %hlsl.dot to float // CHECK: ret float %conv2 float builtin_dot_int_to_float_promotion ( float p0, int p1 ) { return __builtin_hlsl_dot ( p0, p1 ); diff --git a/clang/test/CodeGenHLSL/builtins/dot.hlsl b/clang/test/CodeGenHLSL/builtins/dot.hlsl index ae6e45c3f9482a..2b76fae61147b4 100644 --- 
a/clang/test/CodeGenHLSL/builtins/dot.hlsl +++ b/clang/test/CodeGenHLSL/builtins/dot.hlsl @@ -1,161 +1,172 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF +// RUN: --check-prefixes=CHECK,DXCHECK,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,DXCHECK,NO_HALF -#ifdef __HLSL_ENABLE_16_BIT -// NATIVE_HALF: %dx.dot = mul i16 %0, %1 -// NATIVE_HALF: ret i16 %dx.dot -int16_t test_dot_short(int16_t p0, int16_t p1) { return dot(p0, p1); } - -// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.sdot.v2i16(<2 x i16> %0, <2 x i16> %1) -// NATIVE_HALF: ret i16 %dx.dot -int16_t test_dot_short2(int16_t2 p0, int16_t2 p1) { return dot(p0, p1); } - -// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.sdot.v3i16(<3 x i16> %0, <3 x i16> %1) -// NATIVE_HALF: ret i16 %dx.dot -int16_t test_dot_short3(int16_t3 p0, int16_t3 p1) { return dot(p0, p1); } - -// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.sdot.v4i16(<4 x i16> %0, <4 x i16> %1) -// NATIVE_HALF: ret i16 %dx.dot -int16_t test_dot_short4(int16_t4 p0, int16_t4 p1) { return dot(p0, p1); } - -// NATIVE_HALF: %dx.dot = mul i16 %0, %1 -// NATIVE_HALF: ret i16 %dx.dot -uint16_t test_dot_ushort(uint16_t p0, uint16_t p1) { return dot(p0, p1); } - -// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.udot.v2i16(<2 x i16> %0, <2 x i16> %1) -// NATIVE_HALF: ret i16 %dx.dot -uint16_t test_dot_ushort2(uint16_t2 p0, uint16_t2 p1) { return dot(p0, p1); } - -// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.udot.v3i16(<3 x i16> %0, <3 x i16> %1) -// NATIVE_HALF: ret i16 %dx.dot -uint16_t test_dot_ushort3(uint16_t3 p0, uint16_t3 p1) { return dot(p0, p1); } +// RUN: %clang_cc1 
-finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,SPVCHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,SPVCHECK,NO_HALF -// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.udot.v4i16(<4 x i16> %0, <4 x i16> %1) -// NATIVE_HALF: ret i16 %dx.dot -uint16_t test_dot_ushort4(uint16_t4 p0, uint16_t4 p1) { return dot(p0, p1); } -#endif -// CHECK: %dx.dot = mul i32 %0, %1 -// CHECK: ret i32 %dx.dot +// CHECK: %hlsl.dot = mul i32 +// CHECK: ret i32 %hlsl.dot int test_dot_int(int p0, int p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call i32 @llvm.dx.sdot.v2i32(<2 x i32> %0, <2 x i32> %1) -// CHECK: ret i32 %dx.dot +// Capture the expected interchange format so not every check needs to be duplicated +// DXCHECK: %hlsl.dot = call i32 @llvm.[[ICF:dx]].sdot.v2i32(<2 x i32> +// SPVCHECK: %hlsl.dot = call i32 @llvm.[[ICF:spv]].sdot.v2i32(<2 x i32> +// CHECK: ret i32 %hlsl.dot int test_dot_int2(int2 p0, int2 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call i32 @llvm.dx.sdot.v3i32(<3 x i32> %0, <3 x i32> %1) -// CHECK: ret i32 %dx.dot +// CHECK: %hlsl.dot = call i32 @llvm.[[ICF]].sdot.v3i32(<3 x i32> +// CHECK: ret i32 %hlsl.dot int test_dot_int3(int3 p0, int3 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call i32 @llvm.dx.sdot.v4i32(<4 x i32> %0, <4 x i32> %1) -// CHECK: ret i32 %dx.dot +// CHECK: %hlsl.dot = call i32 @llvm.[[ICF]].sdot.v4i32(<4 x i32> +// CHECK: ret i32 %hlsl.dot int test_dot_int4(int4 p0, int4 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = mul i32 %0, %1 -// CHECK: ret i32 %dx.dot +// CHECK: %hlsl.dot = mul i32 +// CHECK: ret i32 %hlsl.dot uint test_dot_uint(uint p0, uint p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call i32 @llvm.dx.udot.v2i32(<2 x 
i32> %0, <2 x i32> %1) -// CHECK: ret i32 %dx.dot +// CHECK: %hlsl.dot = call i32 @llvm.[[ICF]].udot.v2i32(<2 x i32> +// CHECK: ret i32 %hlsl.dot uint test_dot_uint2(uint2 p0, uint2 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call i32 @llvm.dx.udot.v3i32(<3 x i32> %0, <3 x i32> %1) -// CHECK: ret i32 %dx.dot +// CHECK: %hlsl.dot = call i32 @llvm.[[ICF]].udot.v3i32(<3 x i32> +// CHECK: ret i32 %hlsl.dot uint test_dot_uint3(uint3 p0, uint3 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call i32 @llvm.dx.udot.v4i32(<4 x i32> %0, <4 x i32> %1) -// CHECK: ret i32 %dx.dot +// CHECK: %hlsl.dot = call i32 @llvm.[[ICF]].udot.v4i32(<4 x i32> +// CHECK: ret i32 %hlsl.dot uint test_dot_uint4(uint4 p0, uint4 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = mul i64 %0, %1 -// CHECK: ret i64 %dx.dot +// CHECK: %hlsl.dot = mul i64 +// CHECK: ret i64 %hlsl.dot int64_t test_dot_long(int64_t p0, int64_t p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call i64 @llvm.dx.sdot.v2i64(<2 x i64> %0, <2 x i64> %1) -// CHECK: ret i64 %dx.dot +// CHECK: %hlsl.dot = call i64 @llvm.[[ICF]].sdot.v2i64(<2 x i64> +// CHECK: ret i64 %hlsl.dot int64_t test_dot_long2(int64_t2 p0, int64_t2 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call i64 @llvm.dx.sdot.v3i64(<3 x i64> %0, <3 x i64> %1) -// CHECK: ret i64 %dx.dot +// CHECK: %hlsl.dot = call i64 @llvm.[[ICF]].sdot.v3i64(<3 x i64> +// CHECK: ret i64 %hlsl.dot int64_t test_dot_long3(int64_t3 p0, int64_t3 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call i64 @llvm.dx.sdot.v4i64(<4 x i64> %0, <4 x i64> %1) -// CHECK: ret i64 %dx.dot +// CHECK: %hlsl.dot = call i64 @llvm.[[ICF]].sdot.v4i64(<4 x i64> +// CHECK: ret i64 %hlsl.dot int64_t test_dot_long4(int64_t4 p0, int64_t4 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = mul i64 %0, %1 -// CHECK: ret i64 %dx.dot +// CHECK: %hlsl.dot = mul i64 +// CHECK: ret i64 %hlsl.dot uint64_t test_dot_ulong(uint64_t p0, uint64_t p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call i64 
@llvm.dx.udot.v2i64(<2 x i64> %0, <2 x i64> %1) -// CHECK: ret i64 %dx.dot +// CHECK: %hlsl.dot = call i64 @llvm.[[ICF]].udot.v2i64(<2 x i64> +// CHECK: ret i64 %hlsl.dot uint64_t test_dot_ulong2(uint64_t2 p0, uint64_t2 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call i64 @llvm.dx.udot.v3i64(<3 x i64> %0, <3 x i64> %1) -// CHECK: ret i64 %dx.dot +// CHECK: %hlsl.dot = call i64 @llvm.[[ICF]].udot.v3i64(<3 x i64> +// CHECK: ret i64 %hlsl.dot uint64_t test_dot_ulong3(uint64_t3 p0, uint64_t3 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call i64 @llvm.dx.udot.v4i64(<4 x i64> %0, <4 x i64> %1) -// CHECK: ret i64 %dx.dot +// CHECK: %hlsl.dot = call i64 @llvm.[[ICF]].udot.v4i64(<4 x i64> +// CHECK: ret i64 %hlsl.dot uint64_t test_dot_ulong4(uint64_t4 p0, uint64_t4 p1) { return dot(p0, p1); } -// NATIVE_HALF: %dx.dot = fmul half %0, %1 -// NATIVE_HALF: ret half %dx.dot -// NO_HALF: %dx.dot = fmul float %0, %1 -// NO_HALF: ret float %dx.dot +#ifdef __HLSL_ENABLE_16_BIT +// NATIVE_HALF: %hlsl.dot = mul i16 +// NATIVE_HALF: ret i16 %hlsl.dot +int16_t test_dot_short(int16_t p0, int16_t p1) { return dot(p0, p1); } + +// NATIVE_HALF: %hlsl.dot = call i16 @llvm.[[ICF]].sdot.v2i16(<2 x i16> +// NATIVE_HALF: ret i16 %hlsl.dot +int16_t test_dot_short2(int16_t2 p0, int16_t2 p1) { return dot(p0, p1); } + +// NATIVE_HALF: %hlsl.dot = call i16 @llvm.[[ICF]].sdot.v3i16(<3 x i16> +// NATIVE_HALF: ret i16 %hlsl.dot +int16_t test_dot_short3(int16_t3 p0, int16_t3 p1) { return dot(p0, p1); } + +// NATIVE_HALF: %hlsl.dot = call i16 @llvm.[[ICF]].sdot.v4i16(<4 x i16> +// NATIVE_HALF: ret i16 %hlsl.dot +int16_t test_dot_short4(int16_t4 p0, int16_t4 p1) { return dot(p0, p1); } + +// NATIVE_HALF: %hlsl.dot = mul i16 +// NATIVE_HALF: ret i16 %hlsl.dot +uint16_t test_dot_ushort(uint16_t p0, uint16_t p1) { return dot(p0, p1); } + +// NATIVE_HALF: %hlsl.dot = call i16 @llvm.[[ICF]].udot.v2i16(<2 x i16> +// NATIVE_HALF: ret i16 %hlsl.dot +uint16_t test_dot_ushort2(uint16_t2 p0, uint16_t2 p1) { 
return dot(p0, p1); } + +// NATIVE_HALF: %hlsl.dot = call i16 @llvm.[[ICF]].udot.v3i16(<3 x i16> +// NATIVE_HALF: ret i16 %hlsl.dot +uint16_t test_dot_ushort3(uint16_t3 p0, uint16_t3 p1) { return dot(p0, p1); } + +// NATIVE_HALF: %hlsl.dot = call i16 @llvm.[[ICF]].udot.v4i16(<4 x i16> +// NATIVE_HALF: ret i16 %hlsl.dot +uint16_t test_dot_ushort4(uint16_t4 p0, uint16_t4 p1) { return dot(p0, p1); } +#endif + +// NATIVE_HALF: %hlsl.dot = fmul half +// NATIVE_HALF: ret half %hlsl.dot +// NO_HALF: %hlsl.dot = fmul float +// NO_HALF: ret float %hlsl.dot half test_dot_half(half p0, half p1) { return dot(p0, p1); } -// NATIVE_HALF: %dx.dot = call half @llvm.dx.dot2.v2f16(<2 x half> %0, <2 x half> %1) -// NATIVE_HALF: ret half %dx.dot -// NO_HALF: %dx.dot = call float @llvm.dx.dot2.v2f32(<2 x float> %0, <2 x float> %1) -// NO_HALF: ret float %dx.dot +// NATIVE_HALF: %hlsl.dot = call half @llvm.[[ICF]].fdot.v2f16(<2 x half> +// NATIVE_HALF: ret half %hlsl.dot +// NO_HALF: %hlsl.dot = call float @llvm.[[ICF]].fdot.v2f32(<2 x float> +// NO_HALF: ret float %hlsl.dot half test_dot_half2(half2 p0, half2 p1) { return dot(p0, p1); } -// NATIVE_HALF: %dx.dot = call half @llvm.dx.dot3.v3f16(<3 x half> %0, <3 x half> %1) -// NATIVE_HALF: ret half %dx.dot -// NO_HALF: %dx.dot = call float @llvm.dx.dot3.v3f32(<3 x float> %0, <3 x float> %1) -// NO_HALF: ret float %dx.dot +// NATIVE_HALF: %hlsl.dot = call half @llvm.[[ICF]].fdot.v3f16(<3 x half> +// NATIVE_HALF: ret half %hlsl.dot +// NO_HALF: %hlsl.dot = call float @llvm.[[ICF]].fdot.v3f32(<3 x float> +// NO_HALF: ret float %hlsl.dot half test_dot_half3(half3 p0, half3 p1) { return dot(p0, p1); } -// NATIVE_HALF: %dx.dot = call half @llvm.dx.dot4.v4f16(<4 x half> %0, <4 x half> %1) -// NATIVE_HALF: ret half %dx.dot -// NO_HALF: %dx.dot = call float @llvm.dx.dot4.v4f32(<4 x float> %0, <4 x float> %1) -// NO_HALF: ret float %dx.dot +// NATIVE_HALF: %hlsl.dot = call half @llvm.[[ICF]].fdot.v4f16(<4 x half> +// NATIVE_HALF: ret half 
%hlsl.dot +// NO_HALF: %hlsl.dot = call float @llvm.[[ICF]].fdot.v4f32(<4 x float> +// NO_HALF: ret float %hlsl.dot half test_dot_half4(half4 p0, half4 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = fmul float %0, %1 -// CHECK: ret float %dx.dot +// CHECK: %hlsl.dot = fmul float +// CHECK: ret float %hlsl.dot float test_dot_float(float p0, float p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call float @llvm.dx.dot2.v2f32(<2 x float> %0, <2 x float> %1) -// CHECK: ret float %dx.dot +// CHECK: %hlsl.dot = call float @llvm.[[ICF]].fdot.v2f32(<2 x float> +// CHECK: ret float %hlsl.dot float test_dot_float2(float2 p0, float2 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call float @llvm.dx.dot3.v3f32(<3 x float> %0, <3 x float> %1) -// CHECK: ret float %dx.dot +// CHECK: %hlsl.dot = call float @llvm.[[ICF]].fdot.v3f32(<3 x float> +// CHECK: ret float %hlsl.dot float test_dot_float3(float3 p0, float3 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call float @llvm.dx.dot4.v4f32(<4 x float> %0, <4 x float> %1) -// CHECK: ret float %dx.dot +// CHECK: %hlsl.dot = call float @llvm.[[ICF]].fdot.v4f32(<4 x float> +// CHECK: ret float %hlsl.dot float test_dot_float4(float4 p0, float4 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call float @llvm.dx.dot2.v2f32(<2 x float> %splat.splat, <2 x float> %1) -// CHECK: ret float %dx.dot +// CHECK: %hlsl.dot = call float @llvm.[[ICF]].fdot.v2f32(<2 x float> %splat.splat, <2 x float> +// CHECK: ret float %hlsl.dot float test_dot_float2_splat(float p0, float2 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call float @llvm.dx.dot3.v3f32(<3 x float> %splat.splat, <3 x float> %1) -// CHECK: ret float %dx.dot +// CHECK: %hlsl.dot = call float @llvm.[[ICF]].fdot.v3f32(<3 x float> %splat.splat, <3 x float> +// CHECK: ret float %hlsl.dot float test_dot_float3_splat(float p0, float3 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = call float @llvm.dx.dot4.v4f32(<4 x float> %splat.splat, <4 x float> %1) -// CHECK: ret float %dx.dot 
+// CHECK: %hlsl.dot = call float @llvm.[[ICF]].fdot.v4f32(<4 x float> %splat.splat, <4 x float> +// CHECK: ret float %hlsl.dot float test_dot_float4_splat(float p0, float4 p1) { return dot(p0, p1); } -// CHECK: %dx.dot = fmul double %0, %1 -// CHECK: ret double %dx.dot +// CHECK: %hlsl.dot = fmul double +// CHECK: ret double %hlsl.dot double test_dot_double(double p0, double p1) { return dot(p0, p1); } diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index a0807a01ea5ab2..e959e70dc1cd4f 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -36,26 +36,30 @@ def int_dx_clamp : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, LLVMM def int_dx_uclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; def int_dx_saturate : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; -def int_dx_dot2 : - Intrinsic<[LLVMVectorElementType<0>], +def int_dx_dot2 : + DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyfloat_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], - [IntrNoMem, IntrWillReturn, Commutative] >; -def int_dx_dot3 : - Intrinsic<[LLVMVectorElementType<0>], + [IntrNoMem, Commutative] >; +def int_dx_dot3 : + DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyfloat_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], - [IntrNoMem, IntrWillReturn, Commutative] >; -def int_dx_dot4 : - Intrinsic<[LLVMVectorElementType<0>], + [IntrNoMem, Commutative] >; +def int_dx_dot4 : + DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyfloat_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], - [IntrNoMem, IntrWillReturn, Commutative] >; -def int_dx_sdot : - Intrinsic<[LLVMVectorElementType<0>], + [IntrNoMem, Commutative] >; +def int_dx_fdot : + DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyfloat_ty, LLVMScalarOrSameVectorWidth<0, 
LLVMVectorElementType<0>>], + [IntrNoMem, Commutative] >; +def int_dx_sdot : + DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], - [IntrNoMem, IntrWillReturn, Commutative] >; -def int_dx_udot : - Intrinsic<[LLVMVectorElementType<0>], + [IntrNoMem, Commutative] >; +def int_dx_udot : + DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], - [IntrNoMem, IntrWillReturn, Commutative] >; + [IntrNoMem, Commutative] >; def int_dx_frac : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 4e130ad0c907d9..63d9ba43a1183b 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -67,4 +67,16 @@ let TargetPrefix = "spv" in { def int_spv_normalize : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty]>; def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty]>; def int_spv_saturate : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_spv_fdot : + DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyfloat_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], + [IntrNoMem, Commutative] >; + def int_spv_sdot : + DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], + [IntrNoMem, Commutative] >; + def int_spv_udot : + DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], + [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], + [IntrNoMem, Commutative] >; } diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 0c5bc20ccc0bb8..c4b278c109dbb9 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -652,7 +652,7 @@ def UMad : DXILOp<49, tertiary> { def Dot2 : 
DXILOp<54, dot2> { let Doc = "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + " - "a[n]*b[n] where n is between 0 and 1"; + "a[n]*b[n] where n is 0 to 1 inclusive"; let LLVMIntrinsic = int_dx_dot2; let arguments = !listsplat(OverloadTy, 4); let result = OverloadTy; @@ -663,7 +663,7 @@ def Dot2 : DXILOp<54, dot2> { def Dot3 : DXILOp<55, dot3> { let Doc = "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + " - "a[n]*b[n] where n is between 0 and 2"; + "a[n]*b[n] where n is 0 to 2 inclusive"; let LLVMIntrinsic = int_dx_dot3; let arguments = !listsplat(OverloadTy, 6); let result = OverloadTy; @@ -674,7 +674,7 @@ def Dot3 : DXILOp<55, dot3> { def Dot4 : DXILOp<56, dot4> { let Doc = "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + " - "a[n]*b[n] where n is between 0 and 3"; + "a[n]*b[n] where n is 0 to 3 inclusive"; let LLVMIntrinsic = int_dx_dot4; let arguments = !listsplat(OverloadTy, 8); let result = OverloadTy; diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index 2c481d15be5bde..e49169cff8aa86 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -44,6 +44,7 @@ static bool isIntrinsicExpansion(Function &F) { case Intrinsic::dx_lerp: case Intrinsic::dx_length: case Intrinsic::dx_normalize: + case Intrinsic::dx_fdot: case Intrinsic::dx_sdot: case Intrinsic::dx_udot: return true; @@ -68,25 +69,73 @@ static Value *expandAbs(CallInst *Orig) { "dx.max"); } -static Value *expandIntegerDot(CallInst *Orig, Intrinsic::ID DotIntrinsic) { +// Create appropriate DXIL float dot intrinsic for the given A and B operands +// The appropriate opcode will be determined by the size of the operands +// The dot product is placed in the position indicated by Orig +static Value *expandFloatDotIntrinsic(CallInst *Orig, Value *A, Value *B) { + Type *ATy = A->getType(); + [[maybe_unused]] Type *BTy = B->getType(); + 
assert(ATy->isVectorTy() && BTy->isVectorTy()); + + IRBuilder<> Builder(Orig); + + auto *AVec = dyn_cast(ATy); + + assert(ATy->getScalarType()->isFloatingPointTy()); + + Intrinsic::ID DotIntrinsic = Intrinsic::dx_dot4; + switch (AVec->getNumElements()) { + case 2: + DotIntrinsic = Intrinsic::dx_dot2; + break; + case 3: + DotIntrinsic = Intrinsic::dx_dot3; + break; + case 4: + DotIntrinsic = Intrinsic::dx_dot4; + break; + default: + report_fatal_error( + Twine("Invalid dot product input vector: length is outside 2-4"), + /* gen_crash_diag=*/false); + return nullptr; + } + return Builder.CreateIntrinsic(ATy->getScalarType(), DotIntrinsic, + ArrayRef{A, B}, nullptr, "dot"); +} + +// Create the appropriate DXIL float dot intrinsic for the operands of Orig +// The appropriate opcode will be determined by the size of the operands +// The dot product is placed in the position indicated by Orig +static Value *expandFloatDotIntrinsic(CallInst *Orig) { + return expandFloatDotIntrinsic(Orig, Orig->getOperand(0), + Orig->getOperand(1)); +} + +// Expand integer dot product to multiply and add ops +static Value *expandIntegerDotIntrinsic(CallInst *Orig, + Intrinsic::ID DotIntrinsic) { assert(DotIntrinsic == Intrinsic::dx_sdot || DotIntrinsic == Intrinsic::dx_udot); - Intrinsic::ID MadIntrinsic = DotIntrinsic == Intrinsic::dx_sdot - ? Intrinsic::dx_imad - : Intrinsic::dx_umad; Value *A = Orig->getOperand(0); Value *B = Orig->getOperand(1); - [[maybe_unused]] Type *ATy = A->getType(); + Type *ATy = A->getType(); [[maybe_unused]] Type *BTy = B->getType(); assert(ATy->isVectorTy() && BTy->isVectorTy()); - IRBuilder<> Builder(Orig->getParent()); - Builder.SetInsertPoint(Orig); + IRBuilder<> Builder(Orig); + + auto *AVec = dyn_cast(ATy); - auto *AVec = dyn_cast(A->getType()); + assert(ATy->getScalarType()->isIntegerTy()); + + Value *Result; + Intrinsic::ID MadIntrinsic = DotIntrinsic == Intrinsic::dx_sdot + ? 
Intrinsic::dx_imad + : Intrinsic::dx_umad; Value *Elt0 = Builder.CreateExtractElement(A, (uint64_t)0); Value *Elt1 = Builder.CreateExtractElement(B, (uint64_t)0); - Value *Result = Builder.CreateMul(Elt0, Elt1); + Result = Builder.CreateMul(Elt0, Elt1); for (unsigned I = 1; I < AVec->getNumElements(); I++) { Elt0 = Builder.CreateExtractElement(A, I); Elt1 = Builder.CreateExtractElement(B, I); @@ -211,6 +260,8 @@ static Value *expandLog10Intrinsic(CallInst *Orig) { return expandLogIntrinsic(Orig, numbers::ln2f / numbers::ln10f); } +// Use dot product of vector operand with itself to calculate the length. +// Divide the vector by that length to normalize it. static Value *expandNormalizeIntrinsic(CallInst *Orig) { Value *X = Orig->getOperand(0); Type *Ty = Orig->getType(); @@ -229,30 +280,7 @@ static Value *expandNormalizeIntrinsic(CallInst *Orig) { return Builder.CreateFDiv(X, X); } - unsigned XVecSize = XVec->getNumElements(); - Value *DotProduct = nullptr; - // use the dot intrinsic corresponding to the vector size - switch (XVecSize) { - case 1: - report_fatal_error(Twine("Invalid input vector: length is zero"), - /* gen_crash_diag=*/false); - break; - case 2: - DotProduct = Builder.CreateIntrinsic( - EltTy, Intrinsic::dx_dot2, ArrayRef{X, X}, nullptr, "dx.dot2"); - break; - case 3: - DotProduct = Builder.CreateIntrinsic( - EltTy, Intrinsic::dx_dot3, ArrayRef{X, X}, nullptr, "dx.dot3"); - break; - case 4: - DotProduct = Builder.CreateIntrinsic( - EltTy, Intrinsic::dx_dot4, ArrayRef{X, X}, nullptr, "dx.dot4"); - break; - default: - report_fatal_error(Twine("Invalid input vector: vector size is invalid."), - /* gen_crash_diag=*/false); - } + Value *DotProduct = expandFloatDotIntrinsic(Orig, X, X); // verify that the length is non-zero // (if the dot product is non-zero, then the length is non-zero) @@ -267,7 +295,8 @@ static Value *expandNormalizeIntrinsic(CallInst *Orig) { ArrayRef{DotProduct}, nullptr, "dx.rsqrt"); - Value *MultiplicandVec = 
Builder.CreateVectorSplat(XVecSize, Multiplicand); + Value *MultiplicandVec = + Builder.CreateVectorSplat(XVec->getNumElements(), Multiplicand); return Builder.CreateFMul(X, MultiplicandVec); } @@ -363,9 +392,12 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) { case Intrinsic::dx_normalize: Result = expandNormalizeIntrinsic(Orig); break; + case Intrinsic::dx_fdot: + Result = expandFloatDotIntrinsic(Orig); + break; case Intrinsic::dx_sdot: case Intrinsic::dx_udot: - Result = expandIntegerDot(Orig, F.getIntrinsicID()); + Result = expandIntegerDotIntrinsic(Orig, F.getIntrinsicID()); break; } diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 1104b6a7212935..9e10d947081cc3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -184,6 +184,12 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectRsqrt(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; + bool selectFloatDot(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + + bool selectIntegerDot(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + void renderImm32(MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const; void renderFImm64(MachineInstrBuilder &MIB, const MachineInstr &I, @@ -1484,6 +1490,97 @@ bool SPIRVInstructionSelector::selectRsqrt(Register ResVReg, .constrainAllUses(TII, TRI, RBI); } +// Select the OpDot instruction for the given float dot +bool SPIRVInstructionSelector::selectFloatDot(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + assert(I.getNumOperands() == 4); + assert(I.getOperand(2).isReg()); + assert(I.getOperand(3).isReg()); + + [[maybe_unused]] SPIRVType *VecType = + GR.getSPIRVTypeForVReg(I.getOperand(2).getReg()); + + assert(VecType->getOpcode() == SPIRV::OpTypeVector && + GR.getScalarOrVectorComponentCount(VecType) > 1 && 
+ "dot product requires a vector of at least 2 components"); + + [[maybe_unused]] SPIRVType *EltType = + GR.getSPIRVTypeForVReg(VecType->getOperand(1).getReg()); + + assert(EltType->getOpcode() == SPIRV::OpTypeFloat); + + MachineBasicBlock &BB = *I.getParent(); + return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpDot)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(I.getOperand(2).getReg()) + .addUse(I.getOperand(3).getReg()) + .constrainAllUses(TII, TRI, RBI); +} + +// Since pre-1.6 SPIRV has no integer dot implementation, +// expand by piecewise multiplying and adding the results +bool SPIRVInstructionSelector::selectIntegerDot(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + assert(I.getNumOperands() == 4); + assert(I.getOperand(2).isReg()); + assert(I.getOperand(3).isReg()); + MachineBasicBlock &BB = *I.getParent(); + + // Multiply the vectors, then sum the results + Register Vec0 = I.getOperand(2).getReg(); + Register Vec1 = I.getOperand(3).getReg(); + Register TmpVec = MRI->createVirtualRegister(&SPIRV::IDRegClass); + SPIRVType *VecType = GR.getSPIRVTypeForVReg(Vec0); + + bool Result = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIMulV)) + .addDef(TmpVec) + .addUse(GR.getSPIRVTypeID(VecType)) + .addUse(Vec0) + .addUse(Vec1) + .constrainAllUses(TII, TRI, RBI); + + assert(VecType->getOpcode() == SPIRV::OpTypeVector && + GR.getScalarOrVectorComponentCount(VecType) > 1 && + "dot product requires a vector of at least 2 components"); + + Register Res = MRI->createVirtualRegister(&SPIRV::IDRegClass); + Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract)) + .addDef(Res) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(TmpVec) + .addImm(0) + .constrainAllUses(TII, TRI, RBI); + + for (unsigned i = 1; i < GR.getScalarOrVectorComponentCount(VecType); i++) { + Register Elt = MRI->createVirtualRegister(&SPIRV::IDRegClass); + + Result |= + BuildMI(BB, I, I.getDebugLoc(), 
TII.get(SPIRV::OpCompositeExtract)) + .addDef(Elt) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(TmpVec) + .addImm(i) + .constrainAllUses(TII, TRI, RBI); + + Register Sum = i < GR.getScalarOrVectorComponentCount(VecType) - 1 + ? MRI->createVirtualRegister(&SPIRV::IDRegClass) + : ResVReg; + + Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIAddS)) + .addDef(Sum) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(Res) + .addUse(Elt) + .constrainAllUses(TII, TRI, RBI); + Res = Sum; + } + + return Result; +} + /// Transform saturate(x) to clamp(x, 0.0f, 1.0f) as SPIRV /// does not have a saturate builtin. bool SPIRVInstructionSelector::selectSaturate(Register ResVReg, @@ -2223,6 +2320,11 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, break; case Intrinsic::spv_thread_id: return selectSpvThreadId(ResVReg, ResType, I); + case Intrinsic::spv_fdot: + return selectFloatDot(ResVReg, ResType, I); + case Intrinsic::spv_udot: + case Intrinsic::spv_sdot: + return selectIntegerDot(ResVReg, ResType, I); case Intrinsic::spv_all: return selectAll(ResVReg, ResType, I); case Intrinsic::spv_any: diff --git a/llvm/test/CodeGen/DirectX/fdot.ll b/llvm/test/CodeGen/DirectX/fdot.ll index 56817a172ff9e3..aa1b15972e266d 100644 --- a/llvm/test/CodeGen/DirectX/fdot.ll +++ b/llvm/test/CodeGen/DirectX/fdot.ll @@ -1,94 +1,101 @@ -; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s +; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK +; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK -; Make sure dxil operation function calls for dot are generated for int/uint vectors. +; Make sure dxil operation function calls for dot are generated for float type vectors. 
; CHECK-LABEL: dot_half2 define noundef half @dot_half2(<2 x half> noundef %a, <2 x half> noundef %b) { entry: -; CHECK: extractelement <2 x half> %a, i32 0 -; CHECK: extractelement <2 x half> %a, i32 1 -; CHECK: extractelement <2 x half> %b, i32 0 -; CHECK: extractelement <2 x half> %b, i32 1 -; CHECK: call half @dx.op.dot2.f16(i32 54, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) - %dx.dot = call half @llvm.dx.dot2.v2f16(<2 x half> %a, <2 x half> %b) +; DOPCHECK: extractelement <2 x half> %a, i32 0 +; DOPCHECK: extractelement <2 x half> %a, i32 1 +; DOPCHECK: extractelement <2 x half> %b, i32 0 +; DOPCHECK: extractelement <2 x half> %b, i32 1 +; DOPCHECK: call half @dx.op.dot2.f16(i32 54, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) +; EXPCHECK: call half @llvm.dx.dot2.v2f16(<2 x half> %a, <2 x half> %b) + %dx.dot = call half @llvm.dx.fdot.v2f16(<2 x half> %a, <2 x half> %b) ret half %dx.dot } ; CHECK-LABEL: dot_half3 define noundef half @dot_half3(<3 x half> noundef %a, <3 x half> noundef %b) { entry: -; CHECK: extractelement <3 x half> %a, i32 0 -; CHECK: extractelement <3 x half> %a, i32 1 -; CHECK: extractelement <3 x half> %a, i32 2 -; CHECK: extractelement <3 x half> %b, i32 0 -; CHECK: extractelement <3 x half> %b, i32 1 -; CHECK: extractelement <3 x half> %b, i32 2 -; CHECK: call half @dx.op.dot3.f16(i32 55, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) - %dx.dot = call half @llvm.dx.dot3.v3f16(<3 x half> %a, <3 x half> %b) +; DOPCHECK: extractelement <3 x half> %a, i32 0 +; DOPCHECK: extractelement <3 x half> %a, i32 1 +; DOPCHECK: extractelement <3 x half> %a, i32 2 +; DOPCHECK: extractelement <3 x half> %b, i32 0 +; DOPCHECK: extractelement <3 x half> %b, i32 1 +; DOPCHECK: extractelement <3 x half> %b, i32 2 +; DOPCHECK: call half @dx.op.dot3.f16(i32 55, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) +; EXPCHECK: call half @llvm.dx.dot3.v3f16(<3 x half> %a, 
<3 x half> %b) + %dx.dot = call half @llvm.dx.fdot.v3f16(<3 x half> %a, <3 x half> %b) ret half %dx.dot } ; CHECK-LABEL: dot_half4 define noundef half @dot_half4(<4 x half> noundef %a, <4 x half> noundef %b) { entry: -; CHECK: extractelement <4 x half> %a, i32 0 -; CHECK: extractelement <4 x half> %a, i32 1 -; CHECK: extractelement <4 x half> %a, i32 2 -; CHECK: extractelement <4 x half> %a, i32 3 -; CHECK: extractelement <4 x half> %b, i32 0 -; CHECK: extractelement <4 x half> %b, i32 1 -; CHECK: extractelement <4 x half> %b, i32 2 -; CHECK: extractelement <4 x half> %b, i32 3 -; CHECK: call half @dx.op.dot4.f16(i32 56, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) - %dx.dot = call half @llvm.dx.dot4.v4f16(<4 x half> %a, <4 x half> %b) +; DOPCHECK: extractelement <4 x half> %a, i32 0 +; DOPCHECK: extractelement <4 x half> %a, i32 1 +; DOPCHECK: extractelement <4 x half> %a, i32 2 +; DOPCHECK: extractelement <4 x half> %a, i32 3 +; DOPCHECK: extractelement <4 x half> %b, i32 0 +; DOPCHECK: extractelement <4 x half> %b, i32 1 +; DOPCHECK: extractelement <4 x half> %b, i32 2 +; DOPCHECK: extractelement <4 x half> %b, i32 3 +; DOPCHECK: call half @dx.op.dot4.f16(i32 56, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) +; EXPCHECK: call half @llvm.dx.dot4.v4f16(<4 x half> %a, <4 x half> %b) + %dx.dot = call half @llvm.dx.fdot.v4f16(<4 x half> %a, <4 x half> %b) ret half %dx.dot } ; CHECK-LABEL: dot_float2 define noundef float @dot_float2(<2 x float> noundef %a, <2 x float> noundef %b) { entry: -; CHECK: extractelement <2 x float> %a, i32 0 -; CHECK: extractelement <2 x float> %a, i32 1 -; CHECK: extractelement <2 x float> %b, i32 0 -; CHECK: extractelement <2 x float> %b, i32 1 -; CHECK: call float @dx.op.dot2.f32(i32 54, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) - %dx.dot = call float @llvm.dx.dot2.v2f32(<2 x float> %a, <2 
x float> %b) +; DOPCHECK: extractelement <2 x float> %a, i32 0 +; DOPCHECK: extractelement <2 x float> %a, i32 1 +; DOPCHECK: extractelement <2 x float> %b, i32 0 +; DOPCHECK: extractelement <2 x float> %b, i32 1 +; DOPCHECK: call float @dx.op.dot2.f32(i32 54, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) +; EXPCHECK: call float @llvm.dx.dot2.v2f32(<2 x float> %a, <2 x float> %b) + %dx.dot = call float @llvm.dx.fdot.v2f32(<2 x float> %a, <2 x float> %b) ret float %dx.dot } ; CHECK-LABEL: dot_float3 define noundef float @dot_float3(<3 x float> noundef %a, <3 x float> noundef %b) { entry: -; CHECK: extractelement <3 x float> %a, i32 0 -; CHECK: extractelement <3 x float> %a, i32 1 -; CHECK: extractelement <3 x float> %a, i32 2 -; CHECK: extractelement <3 x float> %b, i32 0 -; CHECK: extractelement <3 x float> %b, i32 1 -; CHECK: extractelement <3 x float> %b, i32 2 -; CHECK: call float @dx.op.dot3.f32(i32 55, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) - %dx.dot = call float @llvm.dx.dot3.v3f32(<3 x float> %a, <3 x float> %b) +; DOPCHECK: extractelement <3 x float> %a, i32 0 +; DOPCHECK: extractelement <3 x float> %a, i32 1 +; DOPCHECK: extractelement <3 x float> %a, i32 2 +; DOPCHECK: extractelement <3 x float> %b, i32 0 +; DOPCHECK: extractelement <3 x float> %b, i32 1 +; DOPCHECK: extractelement <3 x float> %b, i32 2 +; DOPCHECK: call float @dx.op.dot3.f32(i32 55, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) +; EXPCHECK: call float @llvm.dx.dot3.v3f32(<3 x float> %a, <3 x float> %b) + %dx.dot = call float @llvm.dx.fdot.v3f32(<3 x float> %a, <3 x float> %b) ret float %dx.dot } ; CHECK-LABEL: dot_float4 define noundef float @dot_float4(<4 x float> noundef %a, <4 x float> noundef %b) { entry: -; CHECK: extractelement <4 x float> %a, i32 0 -; CHECK: extractelement <4 x float> %a, i32 1 -; CHECK: extractelement <4 x float> %a, i32 2 -; CHECK: extractelement <4 x 
float> %a, i32 3 -; CHECK: extractelement <4 x float> %b, i32 0 -; CHECK: extractelement <4 x float> %b, i32 1 -; CHECK: extractelement <4 x float> %b, i32 2 -; CHECK: extractelement <4 x float> %b, i32 3 -; CHECK: call float @dx.op.dot4.f32(i32 56, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) - %dx.dot = call float @llvm.dx.dot4.v4f32(<4 x float> %a, <4 x float> %b) +; DOPCHECK: extractelement <4 x float> %a, i32 0 +; DOPCHECK: extractelement <4 x float> %a, i32 1 +; DOPCHECK: extractelement <4 x float> %a, i32 2 +; DOPCHECK: extractelement <4 x float> %a, i32 3 +; DOPCHECK: extractelement <4 x float> %b, i32 0 +; DOPCHECK: extractelement <4 x float> %b, i32 1 +; DOPCHECK: extractelement <4 x float> %b, i32 2 +; DOPCHECK: extractelement <4 x float> %b, i32 3 +; DOPCHECK: call float @dx.op.dot4.f32(i32 56, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) +; EXPCHECK: call float @llvm.dx.dot4.v4f32(<4 x float> %a, <4 x float> %b) + %dx.dot = call float @llvm.dx.fdot.v4f32(<4 x float> %a, <4 x float> %b) ret float %dx.dot } -declare half @llvm.dx.dot.v2f16(<2 x half> , <2 x half> ) -declare half @llvm.dx.dot.v3f16(<3 x half> , <3 x half> ) -declare half @llvm.dx.dot.v4f16(<4 x half> , <4 x half> ) -declare float @llvm.dx.dot.v2f32(<2 x float>, <2 x float>) -declare float @llvm.dx.dot.v3f32(<3 x float>, <3 x float>) -declare float @llvm.dx.dot.v4f32(<4 x float>, <4 x float>) +declare half @llvm.dx.fdot.v2f16(<2 x half> , <2 x half> ) +declare half @llvm.dx.fdot.v3f16(<3 x half> , <3 x half> ) +declare half @llvm.dx.fdot.v4f16(<4 x half> , <4 x half> ) +declare float @llvm.dx.fdot.v2f32(<2 x float>, <2 x float>) +declare float @llvm.dx.fdot.v3f32(<3 x float>, <3 x float>) +declare float @llvm.dx.fdot.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/DirectX/idot.ll b/llvm/test/CodeGen/DirectX/idot.ll index 
eac1b91106ddef..5848868ed0556a 100644 --- a/llvm/test/CodeGen/DirectX/idot.ll +++ b/llvm/test/CodeGen/DirectX/idot.ll @@ -13,12 +13,12 @@ entry: ; CHECK: extractelement <2 x i16> %b, i64 1 ; EXPCHECK: call i16 @llvm.dx.imad.i16(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) ; DOPCHECK: call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) - %dx.dot = call i16 @llvm.dx.sdot.v3i16(<2 x i16> %a, <2 x i16> %b) - ret i16 %dx.dot + %dot = call i16 @llvm.dx.sdot.v3i16(<2 x i16> %a, <2 x i16> %b) + ret i16 %dot } -; CHECK-LABEL: sdot_int4 -define noundef i32 @sdot_int4(<4 x i32> noundef %a, <4 x i32> noundef %b) { +; CHECK-LABEL: dot_int4 +define noundef i32 @dot_int4(<4 x i32> noundef %a, <4 x i32> noundef %b) { entry: ; CHECK: extractelement <4 x i32> %a, i64 0 ; CHECK: extractelement <4 x i32> %b, i64 0 @@ -35,8 +35,8 @@ entry: ; CHECK: extractelement <4 x i32> %b, i64 3 ; EXPCHECK: call i32 @llvm.dx.imad.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) ; DOPCHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %dx.dot = call i32 @llvm.dx.sdot.v4i32(<4 x i32> %a, <4 x i32> %b) - ret i32 %dx.dot + %dot = call i32 @llvm.dx.sdot.v4i32(<4 x i32> %a, <4 x i32> %b) + ret i32 %dot } ; CHECK-LABEL: dot_uint16_t3 @@ -53,8 +53,8 @@ entry: ; CHECK: extractelement <3 x i16> %b, i64 2 ; EXPCHECK: call i16 @llvm.dx.umad.i16(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) ; DOPCHECK: call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}) - %dx.dot = call i16 @llvm.dx.udot.v3i16(<3 x i16> %a, <3 x i16> %b) - ret i16 %dx.dot + %dot = call i16 @llvm.dx.udot.v3i16(<3 x i16> %a, <3 x i16> %b) + ret i16 %dot } ; CHECK-LABEL: dot_uint4 @@ -75,8 +75,8 @@ entry: ; CHECK: extractelement <4 x i32> %b, i64 3 ; EXPCHECK: call i32 @llvm.dx.umad.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) ; DOPCHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) - %dx.dot = call i32 @llvm.dx.udot.v4i32(<4 x i32> %a, <4 x i32> %b) - ret 
i32 %dx.dot + %dot = call i32 @llvm.dx.udot.v4i32(<4 x i32> %a, <4 x i32> %b) + ret i32 %dot } ; CHECK-LABEL: dot_uint64_t4 @@ -89,8 +89,8 @@ entry: ; CHECK: extractelement <2 x i64> %b, i64 1 ; EXPCHECK: call i64 @llvm.dx.umad.i64(i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) ; DOPCHECK: call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}) - %dx.dot = call i64 @llvm.dx.udot.v2i64(<2 x i64> %a, <2 x i64> %b) - ret i64 %dx.dot + %dot = call i64 @llvm.dx.udot.v2i64(<2 x i64> %a, <2 x i64> %b) + ret i64 %dot } declare i16 @llvm.dx.sdot.v2i16(<2 x i16>, <2 x i16>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fdot.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fdot.ll new file mode 100644 index 00000000000000..5a8d4581aa0cdb --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fdot.ll @@ -0,0 +1,75 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure dxil operation function calls for dot are generated for float type vectors. 
+ +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec2_float_16:]] = OpTypeVector %[[#float_16]] 2 +; CHECK-DAG: %[[#vec3_float_16:]] = OpTypeVector %[[#float_16]] 3 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#vec2_float_32:]] = OpTypeVector %[[#float_32]] 2 +; CHECK-DAG: %[[#vec3_float_32:]] = OpTypeVector %[[#float_32]] 3 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 + + +define noundef half @dot_half2(<2 x half> noundef %a, <2 x half> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_float_16]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec2_float_16]] +; CHECK: OpDot %[[#float_16]] %[[#arg0:]] %[[#arg1:]] + %dx.dot = call half @llvm.spv.fdot.v2f16(<2 x half> %a, <2 x half> %b) + ret half %dx.dot +} + +define noundef half @dot_half3(<3 x half> noundef %a, <3 x half> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_16]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_16]] +; CHECK: OpDot %[[#float_16]] %[[#arg0:]] %[[#arg1:]] + %dx.dot = call half @llvm.spv.fdot.v3f16(<3 x half> %a, <3 x half> %b) + ret half %dx.dot +} + +define noundef half @dot_half4(<4 x half> noundef %a, <4 x half> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]] +; CHECK: OpDot %[[#float_16]] %[[#arg0:]] %[[#arg1:]] + %dx.dot = call half @llvm.spv.fdot.v4f16(<4 x half> %a, <4 x half> %b) + ret half %dx.dot +} + +define noundef float @dot_float2(<2 x float> noundef %a, <2 x float> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_float_32]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec2_float_32]] +; CHECK: OpDot %[[#float_32]] %[[#arg0:]] %[[#arg1:]] + %dx.dot = call float @llvm.spv.fdot.v2f32(<2 x float> %a, <2 x float> %b) + ret float %dx.dot +} + +define noundef 
float @dot_float3(<3 x float> noundef %a, <3 x float> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_32]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_32]] +; CHECK: OpDot %[[#float_32]] %[[#arg0:]] %[[#arg1:]] + %dx.dot = call float @llvm.spv.fdot.v3f32(<3 x float> %a, <3 x float> %b) + ret float %dx.dot +} + +define noundef float @dot_float4(<4 x float> noundef %a, <4 x float> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] +; CHECK: OpDot %[[#float_32]] %[[#arg0:]] %[[#arg1:]] + %dx.dot = call float @llvm.spv.fdot.v4f32(<4 x float> %a, <4 x float> %b) + ret float %dx.dot +} + +declare half @llvm.spv.fdot.v2f16(<2 x half> , <2 x half> ) +declare half @llvm.spv.fdot.v3f16(<3 x half> , <3 x half> ) +declare half @llvm.spv.fdot.v4f16(<4 x half> , <4 x half> ) +declare float @llvm.spv.fdot.v2f32(<2 x float>, <2 x float>) +declare float @llvm.spv.fdot.v3f32(<3 x float>, <3 x float>) +declare float @llvm.spv.fdot.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/idot.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/idot.ll new file mode 100644 index 00000000000000..22b6ed6bdfcbc5 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/idot.ll @@ -0,0 +1,88 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure dxil operation function calls for dot are generated for int/uint vectors. 
+ +; CHECK-DAG: %[[#int_16:]] = OpTypeInt 16 +; CHECK-DAG: %[[#vec2_int_16:]] = OpTypeVector %[[#int_16]] 2 +; CHECK-DAG: %[[#vec3_int_16:]] = OpTypeVector %[[#int_16]] 3 +; CHECK-DAG: %[[#int_32:]] = OpTypeInt 32 +; CHECK-DAG: %[[#vec4_int_32:]] = OpTypeVector %[[#int_32]] 4 +; CHECK-DAG: %[[#int_64:]] = OpTypeInt 64 +; CHECK-DAG: %[[#vec2_int_64:]] = OpTypeVector %[[#int_64]] 2 + +define noundef i16 @dot_int16_t2(<2 x i16> noundef %a, <2 x i16> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_int_16]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec2_int_16]] +; CHECK: %[[#mul_vec:]] = OpIMul %[[#vec2_int_16]] %[[#arg0]] %[[#arg1]] +; CHECK: %[[#elt0:]] = OpCompositeExtract %[[#int_16]] %[[#mul_vec]] 0 +; CHECK: %[[#elt1:]] = OpCompositeExtract %[[#int_16]] %[[#mul_vec]] 1 +; CHECK: %[[#sum:]] = OpIAdd %[[#int_16]] %[[#elt0]] %[[#elt1]] + %dot = call i16 @llvm.spv.sdot.v3i16(<2 x i16> %a, <2 x i16> %b) + ret i16 %dot +} + +define noundef i32 @dot_int4(<4 x i32> noundef %a, <4 x i32> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_int_32]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_int_32]] +; CHECK: %[[#mul_vec:]] = OpIMul %[[#vec4_int_32]] %[[#arg0]] %[[#arg1]] +; CHECK: %[[#elt0:]] = OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 0 +; CHECK: %[[#elt1:]] = OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 1 +; CHECK: %[[#sum0:]] = OpIAdd %[[#int_32]] %[[#elt0]] %[[#elt1]] +; CHECK: %[[#elt2:]] = OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 2 +; CHECK: %[[#sum1:]] = OpIAdd %[[#int_32]] %[[#sum0]] %[[#elt2]] +; CHECK: %[[#elt3:]] = OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 3 +; CHECK: %[[#sum2:]] = OpIAdd %[[#int_32]] %[[#sum1]] %[[#elt3]] + %dot = call i32 @llvm.spv.sdot.v4i32(<4 x i32> %a, <4 x i32> %b) + ret i32 %dot +} + +define noundef i16 @dot_uint16_t3(<3 x i16> noundef %a, <3 x i16> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_int_16]] +; CHECK: %[[#arg1:]] = 
OpFunctionParameter %[[#vec3_int_16]] +; CHECK: %[[#mul_vec:]] = OpIMul %[[#vec3_int_16]] %[[#arg0]] %[[#arg1]] +; CHECK: %[[#elt0:]] = OpCompositeExtract %[[#int_16]] %[[#mul_vec]] 0 +; CHECK: %[[#elt1:]] = OpCompositeExtract %[[#int_16]] %[[#mul_vec]] 1 +; CHECK: %[[#sum0:]] = OpIAdd %[[#int_16]] %[[#elt0]] %[[#elt1]] +; CHECK: %[[#elt2:]] = OpCompositeExtract %[[#int_16]] %[[#mul_vec]] 2 +; CHECK: %[[#sum1:]] = OpIAdd %[[#int_16]] %[[#sum0]] %[[#elt2]] + %dot = call i16 @llvm.spv.udot.v3i16(<3 x i16> %a, <3 x i16> %b) + ret i16 %dot +} + +define noundef i32 @dot_uint4(<4 x i32> noundef %a, <4 x i32> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_int_32]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_int_32]] +; CHECK: %[[#mul_vec:]] = OpIMul %[[#vec4_int_32]] %[[#arg0]] %[[#arg1]] +; CHECK: %[[#elt0:]] = OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 0 +; CHECK: %[[#elt1:]] = OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 1 +; CHECK: %[[#sum0:]] = OpIAdd %[[#int_32]] %[[#elt0]] %[[#elt1]] +; CHECK: %[[#elt2:]] = OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 2 +; CHECK: %[[#sum1:]] = OpIAdd %[[#int_32]] %[[#sum0]] %[[#elt2]] +; CHECK: %[[#elt3:]] = OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 3 +; CHECK: %[[#sum2:]] = OpIAdd %[[#int_32]] %[[#sum1]] %[[#elt3]] + %dot = call i32 @llvm.spv.udot.v4i32(<4 x i32> %a, <4 x i32> %b) + ret i32 %dot +} + +define noundef i64 @dot_uint64_t4(<2 x i64> noundef %a, <2 x i64> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_int_64]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec2_int_64]] +; CHECK: %[[#mul_vec:]] = OpIMul %[[#vec2_int_64]] %[[#arg0]] %[[#arg1]] +; CHECK: %[[#elt0:]] = OpCompositeExtract %[[#int_64]] %[[#mul_vec]] 0 +; CHECK: %[[#elt1:]] = OpCompositeExtract %[[#int_64]] %[[#mul_vec]] 1 +; CHECK: %[[#sum0:]] = OpIAdd %[[#int_64]] %[[#elt0]] %[[#elt1]] + %dot = call i64 @llvm.spv.udot.v2i64(<2 x i64> %a, <2 x i64> %b) + ret i64 %dot +} + +declare i16 
@llvm.spv.sdot.v2i16(<2 x i16>, <2 x i16>) +declare i32 @llvm.spv.sdot.v4i32(<4 x i32>, <4 x i32>) +declare i16 @llvm.spv.udot.v3i32(<3 x i16>, <3 x i16>) +declare i32 @llvm.spv.udot.v4i32(<4 x i32>, <4 x i32>) +declare i64 @llvm.spv.udot.v2i64(<2 x i64>, <2 x i64>) From e5140aed275fe60b83188143f39011d5c0ee5bb0 Mon Sep 17 00:00:00 2001 From: jeffreytan81 Date: Thu, 22 Aug 2024 11:30:34 -0700 Subject: [PATCH 250/426] Fix dap stacktrace perf issue (#104874) We have got several customer reporting of slow stepping over the past year in VSCode. Profiling shows the slow stepping is caused by `stackTrace` request which can take around 1 second for certain targets. Since VSCode sends `stackTrace` during each stop event, the slow `stackTrace` request would slow down stepping in VSCode. Below is the hot path: ``` |--68.75%--lldb_dap::DAP::HandleObject(llvm::json::Object const&) | | | |--57.70%--(anonymous namespace)::request_stackTrace(llvm::json::Object const&) | | | | | |--54.43%--lldb::SBThread::GetCurrentExceptionBacktrace() | | | lldb_private::Thread::GetCurrentExceptionBacktrace() | | | lldb_private::Thread::GetCurrentException() | | | lldb_private::ItaniumABILanguageRuntime::GetExceptionObjectForThread(std::shared_ptr) | | | | | | | |--53.43%--lldb_private::FunctionCaller::ExecuteFunction(lldb_private::ExecutionContext&, unsigned long*, lldb_private::EvaluateExpressionOptions const&, lldb_private::DiagnosticManager&, lldb_private::Value&) | | | | | | | | | |--25.23%--lldb_private::FunctionCaller::InsertFunction(lldb_private::ExecutionContext&, unsigned long&, lldb_private::DiagnosticManager&) | | | | | | | | | | | |--24.56%--lldb_private::FunctionCaller::WriteFunctionWrapper(lldb_private::ExecutionContext&, lldb_private::DiagnosticManager&) | | | | | | | | | | | | | |--19.73%--lldb_private::ExpressionParser::PrepareForExecution(unsigned long&, unsigned long&, std::shared_ptr&, lldb_private::ExecutionContext&, bool&, lldb_private::ExecutionPolicy) | | | | | | | 
lldb_private::ClangExpressionParser::DoPrepareForExecution(unsigned long&, unsigned long&, std::shared_ptr&, lldb_private::ExecutionContext&, bool&, lldb_private::ExecutionPolicy) | | | | | | | lldb_private::IRExecutionUnit::GetRunnableInfo(lldb_private::Status&, unsigned long&, unsigned long&) | | | | | | | | ``` The hot path is added by https://reviews.llvm.org/D156465 which should at least be disabled for Linux. Note: I am seeing similar performance hot path on Mac. This PR hides the feature behind `enableDisplayExtendedBacktrace` option which needs to be enabled on-demand. --------- Co-authored-by: jeffreytan81 --- lldb/tools/lldb-dap/DAP.cpp | 1 + lldb/tools/lldb-dap/DAP.h | 1 + lldb/tools/lldb-dap/lldb-dap.cpp | 15 +++++++++++---- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index 1fd560f21904ab..57b93c28ce9301 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -36,6 +36,7 @@ DAP::DAP() focus_tid(LLDB_INVALID_THREAD_ID), stop_at_entry(false), is_attach(false), enable_auto_variable_summaries(false), enable_synthetic_child_debugging(false), + enable_display_extended_backtrace(false), restarting_process_id(LLDB_INVALID_PROCESS_ID), configuration_done_sent(false), waiting_for_run_in_terminal(false), progress_event_reporter( diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index 27ea6c7ff8423f..0fc77ac1e81683 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -181,6 +181,7 @@ struct DAP { bool is_attach; bool enable_auto_variable_summaries; bool enable_synthetic_child_debugging; + bool enable_display_extended_backtrace; // The process event thread normally responds to process exited events by // shutting down the entire adapter. When we're restarting, we keep the id of // the old process here so we can detect this case and keep running. 
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index 7b83767d1afeab..495ed0256120e8 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -701,6 +701,8 @@ void request_attach(const llvm::json::Object &request) { GetBoolean(arguments, "enableAutoVariableSummaries", false); g_dap.enable_synthetic_child_debugging = GetBoolean(arguments, "enableSyntheticChildDebugging", false); + g_dap.enable_display_extended_backtrace = + GetBoolean(arguments, "enableDisplayExtendedBacktrace", false); g_dap.command_escape_prefix = GetString(arguments, "commandEscapePrefix", "`"); g_dap.SetFrameFormat(GetString(arguments, "customFrameFormat")); @@ -1925,6 +1927,8 @@ void request_launch(const llvm::json::Object &request) { GetBoolean(arguments, "enableAutoVariableSummaries", false); g_dap.enable_synthetic_child_debugging = GetBoolean(arguments, "enableSyntheticChildDebugging", false); + g_dap.enable_display_extended_backtrace = + GetBoolean(arguments, "enableDisplayExtendedBacktrace", false); g_dap.command_escape_prefix = GetString(arguments, "commandEscapePrefix", "`"); g_dap.SetFrameFormat(GetString(arguments, "customFrameFormat")); @@ -3111,8 +3115,9 @@ void request_stackTrace(const llvm::json::Object &request) { // This will always return an invalid thread when // libBacktraceRecording.dylib is not loaded or if there is no extended // backtrace. - lldb::SBThread queue_backtrace_thread = - thread.GetExtendedBacktraceThread("libdispatch"); + lldb::SBThread queue_backtrace_thread; + if (g_dap.enable_display_extended_backtrace) + queue_backtrace_thread = thread.GetExtendedBacktraceThread("libdispatch"); if (queue_backtrace_thread.IsValid()) { // One extra frame as a label to mark the enqueued thread. 
totalFrames += queue_backtrace_thread.GetNumFrames() + 1; @@ -3120,8 +3125,10 @@ void request_stackTrace(const llvm::json::Object &request) { // This will always return an invalid thread when there is no exception in // the current thread. - lldb::SBThread exception_backtrace_thread = - thread.GetCurrentExceptionBacktrace(); + lldb::SBThread exception_backtrace_thread; + if (g_dap.enable_display_extended_backtrace) + exception_backtrace_thread = thread.GetCurrentExceptionBacktrace(); + if (exception_backtrace_thread.IsValid()) { // One extra frame as a label to mark the exception thread. totalFrames += exception_backtrace_thread.GetNumFrames() + 1; From 7bcf4d63cf3b7bcc789808ea4e9c8369e94467dc Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Thu, 22 Aug 2024 11:38:24 -0700 Subject: [PATCH 251/426] [AMDGPU] Correctly insert s_nops for dst forwarding hazard (#100276) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MI300 ISA section 4.5 states there is a hazard between "VALU op which uses OPSEL or SDWA with changes the result’s bit position" and "VALU op consumes result of that op" This includes the case where the second op is SDWA with same dest and dst_sel != DWORD && dst_unused == UNUSED_PRESERVE. In this case, there is an implicit read of the first op dst and the compiler needs to resolve this hazard. Confirmed with HW team. We model dst_unused == UNUSED_PRESERVE as tied-def of implicit operand, so this PR checks for that. MI300_SP_MAS section 1.3.9.2 specifies that CVT_SR_FP8_F32 and CVT_SR_BF8_F32 with opsel[3:2] !=0 have dest forwarding issue. 
Currently, we only add check for CVT_SR_FP8_F32 with opsel[3] != 0 -- this
PR adds support opsel[2] != 0 as well
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 134 +++++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |  10 +
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  12 +
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |   3 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    |   2 +
 llvm/lib/Target/AMDGPU/VOPInstructions.td     |   2 +
 llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir   | 436 ++++++++++++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll     |   2 +
 8 files changed, 579 insertions(+), 22 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a8b171aa82840a..a6b7264405ade1 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -876,6 +876,7 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
     return DataIdx >= 0 &&
            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
   };
+
   int WaitStatesNeededForDef =
     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
@@ -883,6 +884,70 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
   return WaitStatesNeeded;
 }
 
+/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
+/// pack the computed value into correct bit position of the dest register. This
+/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
+/// dst_sel that is not aligned to the register. This function analyzes the \p
+/// MI and \returns an operand with dst forwarding issue, or nullptr if
+/// none exists.
+static const MachineOperand *
+getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
+  if (!SIInstrInfo::isVALU(MI))
+    return nullptr;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  unsigned Opcode = MI.getOpcode();
+
+  // There are three different types of instructions
+  // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
+  // which write hi bits (e.g. op_sel[3] == 1), and 3. CVT_SR_FP8_F32 and
+  // CVT_SR_BF8_F32 with op_sel[3:2]
+  // != 0
+  if (SIInstrInfo::isSDWA(MI)) {
+    // Type 1: SDWA with dst_sel != DWORD
+    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
+      if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
+        return nullptr;
+  } else {
+    // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
+    // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
+    if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
+        !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
+              SISrcMods::DST_OP_SEL ||
+          (AMDGPU::isFP8DstSelInst(Opcode) &&
+           (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
+            SISrcMods::OP_SEL_0))))
+      return nullptr;
+  }
+
+  return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+}
+
+/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
+/// forwarding issue \p Dst . We may "consume" the Dst via a standard explicit
+/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW)
+static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
+                                            const MachineOperand *Dst,
+                                            const SIRegisterInfo *TRI) {
+  // We must consider implicit reads of the VALU. SDWA with dst_sel and
+  // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
+  // and we must account for that hazard.
+  // We also must account for WAW hazards. In particular, WAW with dest
+  // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
+  // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
+  // check for ECC.
Without accounting for this hazard, the ECC will be + // wrong. + // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e. + // complete zeroesHigh16BitsOfDest) + for (auto &Operand : VALU->operands()) { + if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) { + return true; + } + } + return false; +} + int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { int WaitStatesNeeded = 0; @@ -913,27 +978,18 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { if (ST.hasDstSelForwardingHazard()) { const int Shift16DefWaitstates = 1; - auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) { - if (!SIInstrInfo::isVALU(MI)) - return false; - const SIInstrInfo *TII = ST.getInstrInfo(); - if (SIInstrInfo::isSDWA(MI)) { - if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel)) - if (DstSel->getImm() == AMDGPU::SDWA::DWORD) - return false; - } else { - if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) || - !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers) - ->getImm() & - SISrcMods::DST_OP_SEL)) - return false; - } + auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) { const SIRegisterInfo *TRI = ST.getRegisterInfo(); - if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { - Register Def = Dst->getReg(); + const MachineOperand *ForwardedDst = + getDstSelForwardingOperand(ProducerMI, ST); + if (ForwardedDst) { + return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI); + } - for (const MachineOperand &Use : VALU->explicit_uses()) { - if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) + if (ProducerMI.isInlineAsm()) { + // Assume inline asm has dst forwarding hazard + for (auto &Def : ProducerMI.all_defs()) { + if (consumesDstSelForwardingOperand(VALU, &Def, TRI)) return true; } } @@ -1030,7 +1086,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { // problematic thus far. 
// see checkVALUHazards() - if (!ST.has12DWordStoreHazard()) + if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard()) return 0; const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -1039,11 +1095,45 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { for (const MachineOperand &Op : llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) { if (Op.isReg() && Op.isDef()) { - WaitStatesNeeded = - std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI)); + if (!TRI.isVectorRegister(MRI, Op.getReg())) + continue; + + if (ST.has12DWordStoreHazard()) { + WaitStatesNeeded = + std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI)); + } } } + if (ST.hasDstSelForwardingHazard()) { + const int Shift16DefWaitstates = 1; + + auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) { + const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST); + // Assume inline asm reads the dst + if (Dst) + return IA->modifiesRegister(Dst->getReg(), &TRI) || + IA->readsRegister(Dst->getReg(), &TRI); + + if (ProducerMI.isInlineAsm()) { + // If MI is inline asm, assume it has dst forwarding hazard + for (auto &Def : ProducerMI.all_defs()) { + if (IA->modifiesRegister(Def.getReg(), &TRI) || + IA->readsRegister(Def.getReg(), &TRI)) { + return true; + } + } + } + + return false; + }; + + int WaitStatesNeededForDef = + Shift16DefWaitstates - + getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + return WaitStatesNeeded; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 85281713e22b1f..2b54429dc9a03f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2342,6 +2342,7 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field bit IsFP8SrcByteSel = 0; field bit IsFP8DstByteSel = 0; + field bit HasFP8DstByteSel = 0; field bit IsFP8ByteSel = 
!or(IsFP8SrcByteSel, IsFP8DstByteSel); field bit HasDst = !ne(DstVT.Value, untyped.Value); @@ -2921,6 +2922,15 @@ def getVCMPXOpFromVCMP : InstrMapping { let ValueCols = [["1"]]; } +def FP8DstByteSelTable : GenericTable { + let FilterClass = "VOP3_Pseudo"; + let CppTypeName = "FP8DstByteSelInfo"; + let Fields = ["Opcode", "HasFP8DstByteSel"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getFP8DstByteSelHelper"; +} + def VOPDComponentTable : GenericTable { let FilterClass = "VOPD_Component"; let CppTypeName = "VOPDComponentInfo"; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 5b41a2cd731607..cda664a151ef54 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -385,6 +385,13 @@ struct SingleUseExceptionInfo { bool IsInvalidSingleUseProducer; }; +struct FP8DstByteSelInfo { + uint16_t Opcode; + bool HasFP8DstByteSel; +}; + +#define GET_FP8DstByteSelTable_DECL +#define GET_FP8DstByteSelTable_IMPL #define GET_MTBUFInfoTable_DECL #define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL @@ -629,6 +636,11 @@ bool isInvalidSingleUseProducerInst(unsigned Opc) { return Info && Info->IsInvalidSingleUseProducer; } +bool isFP8DstSelInst(unsigned Opc) { + const FP8DstByteSelInfo *Info = getFP8DstByteSelHelper(Opc); + return Info ? Info->HasFP8DstByteSel : false; +} + unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) { const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc); return Info ? 
Info->Opcode3Addr : ~0u; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index a4e6a7ebe0558b..35c080d8e0bebc 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -861,6 +861,9 @@ getVOPDInstInfo(unsigned VOPDOpcode, const MCInstrInfo *InstrInfo); LLVM_READONLY bool isTrue16Inst(unsigned Opc); +LLVM_READONLY +bool isFP8DstSelInst(unsigned Opc); + LLVM_READONLY bool isInvalidSingleUseConsumerInst(unsigned Opc); diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 6748eff9376b0d..466114b95f9f90 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -568,6 +568,7 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, let HasSrc2Mods = 1; let HasExtVOP3DPP = 1; let HasOpSel = 1; + let HasFP8DstByteSel = 1; let AsmVOP3OpSel = !subst(", $src2_modifiers", "", getAsmVOP3OpSel<3, HasClamp, HasOMod, HasSrc0FloatMods, HasSrc1FloatMods, @@ -587,6 +588,7 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, class VOP3_CVT_SR_F8_ByteSel_Profile : VOP3_Profile> { let IsFP8DstByteSel = 1; + let HasFP8DstByteSel = 1; let HasClamp = 0; defvar bytesel = (ins VGPR_32:$vdst_in, ByteSel:$byte_sel); let Ins64 = !con(getIns64 pattern = [], let IsWMMA = P.IsWMMA; let IsSWMMAC = P.IsSWMMAC; + bit HasFP8DstByteSel = P.HasFP8DstByteSel; + let AsmOperands = !if(isVop3OpSel, P.AsmVOP3OpSel, !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64)); diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir new file mode 100644 index 00000000000000..e24817078d8bc9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir @@ -0,0 +1,436 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass post-RA-hazard-rec -o - %s | FileCheck 
-check-prefix=HAZARD %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=NOHAZARD %s + +--- +name: sdwa_opsel_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: sdwa_opsel_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: sdwa_opsel_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: sdwa_lo_opsel_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: sdwa_lo_opsel_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: sdwa_lo_opsel_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: opsel_sdwa_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: opsel_sdwa_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: opsel_sdwa_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... 
+ + +# TODO -- there is no reason for s_nop (V_ADD_U16 doesn't preserve the dest) + +--- +name: opsel_no_sdwa_no_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: opsel_no_sdwa_no_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: opsel_no_sdwa_no_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: no_opsel_sdwa_no_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: no_opsel_sdwa_no_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: no_opsel_sdwa_no_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_MAD_U16_e64 killed $vgpr3, killed $vgpr4, killed $vgpr2, 0, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... 
+ +--- +name: opsel_opsel_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: opsel_opsel_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: opsel_opsel_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + renamable $vgpr0 = V_MAD_U16_gfx9_e64 4, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + S_ENDPGM 0 +... 
+ +# TODO -- there is no reason for s_nop + +--- +name: opsel_opsel_no_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: opsel_opsel_no_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: opsel_opsel_no_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + renamable $vgpr0 = V_MAD_U16_gfx9_e64 12, killed $vgpr3, 4, killed $vgpr4, 4, killed $vgpr2, 0, 0, implicit $exec + S_ENDPGM 0 +... 
+ +# DS_READ_U16_D16 has dest preserve semantics, but only VALU consumers have hazard + +--- +name: sdwa_loadsel_no_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: sdwa_loadsel_no_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 3, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = DS_READ_U16_D16 killed renamable $vgpr3, 0, 0, killed renamable $vgpr0, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: sdwa_loadsel_no_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 3, 0, 3, 3, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = DS_READ_U16_D16 killed renamable $vgpr3, 0, 0, killed renamable $vgpr0, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 3, 0, 3, 3, implicit $exec + renamable $vgpr0 = DS_READ_U16_D16 killed renamable $vgpr3, 0, 0, killed renamable $vgpr0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: sdwa_sdwa_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: sdwa_sdwa_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: sdwa_sdwa_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... 
+ +--- +name: cvt_sdwa_hazard_1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: cvt_sdwa_hazard_1 + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: cvt_sdwa_hazard_1 + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... 
+ +--- +name: cvt_sdwa_hazard_2 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: cvt_sdwa_hazard_2 + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: cvt_sdwa_hazard_2 + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... 
+ +--- +name: cvt_sdwa_hazard_3 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: cvt_sdwa_hazard_3 + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: cvt_sdwa_hazard_3 + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 8, killed $vgpr3, 0, killed $vgpr1, 4, $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... 
+ +--- +name: cvt_sdwa_no_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: cvt_sdwa_no_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: cvt_sdwa_no_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_CVT_SR_FP8_F32_e64 0, killed $vgpr3, 0, killed $vgpr1, 0, $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... 
+ +# TODO -- there is no reason for s_nop (V_ADD_U16 doesn't preserve the dest) + +--- +name: sdwa_nosdwa_no_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: sdwa_nosdwa_no_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: sdwa_nosdwa_no_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec + S_ENDPGM 0 +... 
+ +--- +name: inline_sdwa_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: inline_sdwa_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: inline_sdwa_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: S_ENDPGM 0 + INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1 + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + S_ENDPGM 0 +... 
+ +--- +name: sdwa_inline_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: sdwa_inline_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: sdwa_inline_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; NOHAZARD-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) + INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1 + S_ENDPGM 0 +... 
+ + +--- +name: inline_inline_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + + ; HAZARD-LABEL: name: inline_inline_hazard + ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; HAZARD-NEXT: {{ $}} + ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; HAZARD-NEXT: S_NOP 0 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; HAZARD-NEXT: S_ENDPGM 0 + ; + ; NOHAZARD-LABEL: name: inline_inline_hazard + ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode + ; NOHAZARD-NEXT: {{ $}} + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 %0, 0, %1", 32 /* isconvergent attdialect */, 327690 /* regdef:SReg_1_with_sub0 */, def $vgpr0, 327689 /* reguse:SReg_1_with_sub0 */, $vgpr1 + ; NOHAZARD-NEXT: S_ENDPGM 0 + INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1 + INLINEASM &"v_or_b32 %0, 0, %1", 32, 327690, def $vgpr0, 327689, $vgpr1 + S_ENDPGM 0 +... 
+ diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll index d3fc96d7ff8012..8313f5b655efba 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll @@ -375,6 +375,7 @@ define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -469,6 +470,7 @@ define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; From a7c8f41f2bec74b7dcd84932136bea801723de04 Mon Sep 17 00:00:00 2001 From: Raghu Maddhipatla <7686592+raghavendhra@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:50:08 -0500 Subject: [PATCH 252/426] [NFC] [MLIR] [OpenMP] Fixing typo of clause. 
(#105712) --- .../LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 6d14d77c440e67..83a14290bcf64b 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2147,8 +2147,8 @@ convertToDeviceClauseKind(mlir::omp::DeclareTargetDeviceType deviceClause) { static llvm::OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind convertToCaptureClauseKind( - mlir::omp::DeclareTargetCaptureClause captureClasue) { - switch (captureClasue) { + mlir::omp::DeclareTargetCaptureClause captureClause) { + switch (captureClause) { case mlir::omp::DeclareTargetCaptureClause::to: return llvm::OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo; case mlir::omp::DeclareTargetCaptureClause::link: From 856dadb33c38f4e3be592f11c3d67e7337e288c7 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Aug 2024 13:34:48 -0500 Subject: [PATCH 253/426] [libc] Add `ctype.h` locale variants (#102711) Summary: This patch adds all the libc ctype variants. These ignore the locale ingormation completely, so they're pretty much just stubs. Because these use locale information, which is system scope, we do not enable building them outisde of full build mode. 
--- libc/config/gpu/entrypoints.txt | 14 ++ libc/config/linux/x86_64/entrypoints.txt | 16 +++ libc/include/CMakeLists.txt | 1 + libc/newhdrgen/yaml/ctype.yaml | 102 ++++++++++++++- libc/spec/stdc.td | 74 ++++++++++- libc/src/ctype/CMakeLists.txt | 156 +++++++++++++++++++++++ libc/src/ctype/isalnum.cpp | 2 - libc/src/ctype/isalnum_l.cpp | 21 +++ libc/src/ctype/isalnum_l.h | 21 +++ libc/src/ctype/isalpha.cpp | 2 - libc/src/ctype/isalpha_l.cpp | 21 +++ libc/src/ctype/isalpha_l.h | 21 +++ libc/src/ctype/isblank.cpp | 2 - libc/src/ctype/isblank_l.cpp | 20 +++ libc/src/ctype/isblank_l.h | 21 +++ libc/src/ctype/iscntrl.cpp | 2 - libc/src/ctype/iscntrl_l.cpp | 21 +++ libc/src/ctype/iscntrl_l.h | 21 +++ libc/src/ctype/isdigit.cpp | 2 - libc/src/ctype/isdigit_l.cpp | 20 +++ libc/src/ctype/isdigit_l.h | 21 +++ libc/src/ctype/isgraph.cpp | 2 - libc/src/ctype/isgraph_l.cpp | 21 +++ libc/src/ctype/isgraph_l.h | 21 +++ libc/src/ctype/islower.cpp | 2 - libc/src/ctype/islower_l.cpp | 21 +++ libc/src/ctype/islower_l.h | 21 +++ libc/src/ctype/isprint.cpp | 2 - libc/src/ctype/isprint_l.cpp | 21 +++ libc/src/ctype/isprint_l.h | 21 +++ libc/src/ctype/ispunct.cpp | 2 - libc/src/ctype/ispunct_l.cpp | 22 ++++ libc/src/ctype/ispunct_l.h | 21 +++ libc/src/ctype/isspace.cpp | 2 - libc/src/ctype/isspace_l.cpp | 21 +++ libc/src/ctype/isspace_l.h | 21 +++ libc/src/ctype/isupper.cpp | 2 - libc/src/ctype/isupper_l.cpp | 21 +++ libc/src/ctype/isupper_l.h | 21 +++ libc/src/ctype/isxdigit.cpp | 2 - libc/src/ctype/isxdigit_l.cpp | 22 ++++ libc/src/ctype/isxdigit_l.h | 21 +++ libc/src/ctype/tolower.cpp | 2 - libc/src/ctype/tolower_l.cpp | 21 +++ libc/src/ctype/tolower_l.h | 21 +++ libc/src/ctype/toupper.cpp | 2 - libc/src/ctype/toupper_l.cpp | 23 ++++ libc/src/ctype/toupper_l.h | 21 +++ 48 files changed, 950 insertions(+), 31 deletions(-) create mode 100644 libc/src/ctype/isalnum_l.cpp create mode 100644 libc/src/ctype/isalnum_l.h create mode 100644 libc/src/ctype/isalpha_l.cpp create mode 100644 
libc/src/ctype/isalpha_l.h create mode 100644 libc/src/ctype/isblank_l.cpp create mode 100644 libc/src/ctype/isblank_l.h create mode 100644 libc/src/ctype/iscntrl_l.cpp create mode 100644 libc/src/ctype/iscntrl_l.h create mode 100644 libc/src/ctype/isdigit_l.cpp create mode 100644 libc/src/ctype/isdigit_l.h create mode 100644 libc/src/ctype/isgraph_l.cpp create mode 100644 libc/src/ctype/isgraph_l.h create mode 100644 libc/src/ctype/islower_l.cpp create mode 100644 libc/src/ctype/islower_l.h create mode 100644 libc/src/ctype/isprint_l.cpp create mode 100644 libc/src/ctype/isprint_l.h create mode 100644 libc/src/ctype/ispunct_l.cpp create mode 100644 libc/src/ctype/ispunct_l.h create mode 100644 libc/src/ctype/isspace_l.cpp create mode 100644 libc/src/ctype/isspace_l.h create mode 100644 libc/src/ctype/isupper_l.cpp create mode 100644 libc/src/ctype/isupper_l.h create mode 100644 libc/src/ctype/isxdigit_l.cpp create mode 100644 libc/src/ctype/isxdigit_l.h create mode 100644 libc/src/ctype/tolower_l.cpp create mode 100644 libc/src/ctype/tolower_l.h create mode 100644 libc/src/ctype/toupper_l.cpp create mode 100644 libc/src/ctype/toupper_l.h diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index 0674f23687c0a5..7b869902074d8e 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -4,21 +4,35 @@ set(TARGET_LIBC_ENTRYPOINTS # ctype.h entrypoints libc.src.ctype.isalnum + libc.src.ctype.isalnum_l libc.src.ctype.isalpha + libc.src.ctype.isalpha_l libc.src.ctype.isascii libc.src.ctype.isblank + libc.src.ctype.isblank_l libc.src.ctype.iscntrl + libc.src.ctype.iscntrl_l libc.src.ctype.isdigit + libc.src.ctype.isdigit_l libc.src.ctype.isgraph + libc.src.ctype.isgraph_l libc.src.ctype.islower + libc.src.ctype.islower_l libc.src.ctype.isprint + libc.src.ctype.isprint_l libc.src.ctype.ispunct + libc.src.ctype.ispunct_l libc.src.ctype.isspace + libc.src.ctype.isspace_l libc.src.ctype.isupper + libc.src.ctype.isupper_l 
libc.src.ctype.isxdigit + libc.src.ctype.isxdigit_l libc.src.ctype.toascii libc.src.ctype.tolower + libc.src.ctype.tolower_l libc.src.ctype.toupper + libc.src.ctype.toupper_l # string.h entrypoints libc.src.string.bcmp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index e7c3c7db64abe5..bac1e3cfa85da7 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -782,6 +782,22 @@ endif() if(LLVM_LIBC_FULL_BUILD) list(APPEND TARGET_LIBC_ENTRYPOINTS + # ctype.h entrypoints + libc.src.ctype.isalnum_l + libc.src.ctype.isalpha_l + libc.src.ctype.isblank_l + libc.src.ctype.iscntrl_l + libc.src.ctype.isdigit_l + libc.src.ctype.isgraph_l + libc.src.ctype.islower_l + libc.src.ctype.isprint_l + libc.src.ctype.ispunct_l + libc.src.ctype.isspace_l + libc.src.ctype.isupper_l + libc.src.ctype.isxdigit_l + libc.src.ctype.tolower_l + libc.src.ctype.toupper_l + # assert.h entrypoints libc.src.assert.__assert_fail diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 8e00c9f1292e81..910f9eea015f27 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -45,6 +45,7 @@ add_header_macro( ctype.h DEPENDS .llvm_libc_common_h + .llvm-libc-types.locale_t ) add_header_macro( diff --git a/libc/newhdrgen/yaml/ctype.yaml b/libc/newhdrgen/yaml/ctype.yaml index f3108a34d43377..b4823c3e53234a 100644 --- a/libc/newhdrgen/yaml/ctype.yaml +++ b/libc/newhdrgen/yaml/ctype.yaml @@ -1,6 +1,7 @@ header: ctype.h macros: [] -types: [] +types: + - type_name: locale_t enums: [] objects: [] functions: @@ -100,4 +101,101 @@ functions: return_type: int arguments: - type: int - functions: null + - name: isalnum_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: isalpha_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: isblank_l + standards: + - stdc + return_type: int + arguments: + 
- type: int + - type: locale_t + - name: iscntrl_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: isdigit_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: isgraph_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: islower_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: isprint_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: ispunct_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: isspace_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: isupper_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: isxdigit_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: tolower_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t + - name: toupper_l + standards: + - stdc + return_type: int + arguments: + - type: int + - type: locale_t diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index f9573997c65739..402d8c335470ad 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -35,7 +35,9 @@ def StdC : StandardSpec<"stdc"> { HeaderSpec CType = HeaderSpec< "ctype.h", [], // Macros - [], // Types + [ + LocaleT + ], // Types [], // Enumerations [ FunctionSpec< @@ -108,6 +110,76 @@ def StdC : StandardSpec<"stdc"> { RetValSpec, [ArgSpec] >, + FunctionSpec< + "isalnum_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "isalpha_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "isblank_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "iscntrl_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "isdigit_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + 
"isgraph_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "islower_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "isprint_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "ispunct_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "isspace_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "isupper_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "isxdigit_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "tolower_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "toupper_l", + RetValSpec, + [ArgSpec, ArgSpec] + >, ] >; diff --git a/libc/src/ctype/CMakeLists.txt b/libc/src/ctype/CMakeLists.txt index ae4eec9615dc19..8830c1bccf9eaa 100644 --- a/libc/src/ctype/CMakeLists.txt +++ b/libc/src/ctype/CMakeLists.txt @@ -146,3 +146,159 @@ add_entrypoint_object( DEPENDS libc.src.__support.ctype_utils ) + +# Do not build the locale versions in overlay mode. +if(NOT LLVM_LIBC_FULL_BUILD) + return() +endif() + +add_entrypoint_object( + isalnum_l + SRCS + isalnum_l.cpp + HDRS + isalnum_l.h + DEPENDS + libc.include.ctype + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isalpha_l + SRCS + isalpha_l.cpp + HDRS + isalpha_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isblank_l + SRCS + isblank_l.cpp + HDRS + isblank_l.h + DEPENDS + libc.hdr.types.locale_t +) + +add_entrypoint_object( + iscntrl_l + SRCS + iscntrl_l.cpp + HDRS + iscntrl_l.h + DEPENDS + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isdigit_l + SRCS + isdigit_l.cpp + HDRS + isdigit_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isgraph_l + SRCS + isgraph_l.cpp + HDRS + isgraph_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + islower_l + SRCS + islower_l.cpp + HDRS + islower_l.h + DEPENDS + 
libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isprint_l + SRCS + isprint_l.cpp + HDRS + isprint_l.h + DEPENDS + libc.hdr.types.locale_t +) + +add_entrypoint_object( + ispunct_l + SRCS + ispunct_l.cpp + HDRS + ispunct_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isspace_l + SRCS + isspace_l.cpp + HDRS + isspace_l.h + DEPENDS + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isupper_l + SRCS + isupper_l.cpp + HDRS + isupper_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + isxdigit_l + SRCS + isxdigit_l.cpp + HDRS + isxdigit_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + tolower_l + SRCS + tolower_l.cpp + HDRS + tolower_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) + +add_entrypoint_object( + toupper_l + SRCS + toupper_l.cpp + HDRS + toupper_l.h + DEPENDS + libc.src.__support.ctype_utils + libc.hdr.types.locale_t +) diff --git a/libc/src/ctype/isalnum.cpp b/libc/src/ctype/isalnum.cpp index 382553c23a6bfb..54a3e357488790 100644 --- a/libc/src/ctype/isalnum.cpp +++ b/libc/src/ctype/isalnum.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, isalnum, (int c)) { return static_cast(internal::isalnum(static_cast(c))); } diff --git a/libc/src/ctype/isalnum_l.cpp b/libc/src/ctype/isalnum_l.cpp new file mode 100644 index 00000000000000..671d9b75c4c33a --- /dev/null +++ b/libc/src/ctype/isalnum_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of isalnum -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isalnum_l.h" +#include "src/__support/ctype_utils.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isalnum_l, (int c, locale_t)) { + return static_cast(internal::isalnum(static_cast(c))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isalnum_l.h b/libc/src/ctype/isalnum_l.h new file mode 100644 index 00000000000000..5bc892e6c8747e --- /dev/null +++ b/libc/src/ctype/isalnum_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isalnum_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISALNUM_H +#define LLVM_LIBC_SRC_CTYPE_ISALNUM_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isalnum_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISALNUM_H diff --git a/libc/src/ctype/isalpha.cpp b/libc/src/ctype/isalpha.cpp index 1a63406780b6e0..78b26f6a486eae 100644 --- a/libc/src/ctype/isalpha.cpp +++ b/libc/src/ctype/isalpha.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, isalpha, (int c)) { return static_cast(internal::isalpha(static_cast(c))); } diff --git a/libc/src/ctype/isalpha_l.cpp b/libc/src/ctype/isalpha_l.cpp new file mode 100644 index 00000000000000..0619d979bedf22 --- /dev/null +++ b/libc/src/ctype/isalpha_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of isalpha -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isalpha_l.h" + +#include "src/__support/common.h" +#include "src/__support/ctype_utils.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isalpha_l, (int c, locale_t)) { + return static_cast(internal::isalpha(static_cast(c))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isalpha_l.h b/libc/src/ctype/isalpha_l.h new file mode 100644 index 00000000000000..3591f1175cb9a9 --- /dev/null +++ b/libc/src/ctype/isalpha_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isalpha_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISALPHA_H +#define LLVM_LIBC_SRC_CTYPE_ISALPHA_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isalpha_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISALPHA_H diff --git a/libc/src/ctype/isblank.cpp b/libc/src/ctype/isblank.cpp index a4f33d265bd2dd..e0a20829f86cee 100644 --- a/libc/src/ctype/isblank.cpp +++ b/libc/src/ctype/isblank.cpp @@ -13,8 +13,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, isblank, (int c)) { return static_cast(c == ' ' || c == '\t'); } diff --git a/libc/src/ctype/isblank_l.cpp b/libc/src/ctype/isblank_l.cpp new file mode 100644 index 00000000000000..4f6b0bfac29724 --- /dev/null +++ b/libc/src/ctype/isblank_l.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of isblank -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isblank_l.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isblank_l, (int c, locale_t)) { + return static_cast(c == ' ' || c == '\t'); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isblank_l.h b/libc/src/ctype/isblank_l.h new file mode 100644 index 00000000000000..61ede30ae76775 --- /dev/null +++ b/libc/src/ctype/isblank_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isblank_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISBLANK_H +#define LLVM_LIBC_SRC_CTYPE_ISBLANK_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isblank_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISBLANK_H diff --git a/libc/src/ctype/iscntrl.cpp b/libc/src/ctype/iscntrl.cpp index fb582fd6ef0820..2218adfcc33f3b 100644 --- a/libc/src/ctype/iscntrl.cpp +++ b/libc/src/ctype/iscntrl.cpp @@ -13,8 +13,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, iscntrl, (int c)) { const unsigned ch = static_cast(c); return static_cast(ch < 0x20 || ch == 0x7f); diff --git a/libc/src/ctype/iscntrl_l.cpp b/libc/src/ctype/iscntrl_l.cpp new file mode 100644 index 00000000000000..83aa480299fadc --- /dev/null +++ b/libc/src/ctype/iscntrl_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of iscntrl -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/iscntrl_l.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, iscntrl_l, (int c, locale_t)) { + const unsigned ch = static_cast(c); + return static_cast(ch < 0x20 || ch == 0x7f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/iscntrl_l.h b/libc/src/ctype/iscntrl_l.h new file mode 100644 index 00000000000000..7dee44fcd0bebc --- /dev/null +++ b/libc/src/ctype/iscntrl_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for iscntrl_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISCNTRL_H +#define LLVM_LIBC_SRC_CTYPE_ISCNTRL_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int iscntrl_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISCNTRL_H diff --git a/libc/src/ctype/isdigit.cpp b/libc/src/ctype/isdigit.cpp index 43c5f1940c7f00..1f711943861f8b 100644 --- a/libc/src/ctype/isdigit.cpp +++ b/libc/src/ctype/isdigit.cpp @@ -13,8 +13,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, isdigit, (int c)) { return static_cast(internal::isdigit(static_cast(c))); } diff --git a/libc/src/ctype/isdigit_l.cpp b/libc/src/ctype/isdigit_l.cpp new file mode 100644 index 00000000000000..ca981362bfe839 --- /dev/null +++ b/libc/src/ctype/isdigit_l.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of isdigit -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isdigit_l.h" +#include "src/__support/common.h" +#include "src/__support/ctype_utils.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isdigit_l, (int c, locale_t)) { + return static_cast(internal::isdigit(static_cast(c))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isdigit_l.h b/libc/src/ctype/isdigit_l.h new file mode 100644 index 00000000000000..abeec3464941a0 --- /dev/null +++ b/libc/src/ctype/isdigit_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isdigit_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISDIGIT_H +#define LLVM_LIBC_SRC_CTYPE_ISDIGIT_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isdigit_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISDIGIT_H diff --git a/libc/src/ctype/isgraph.cpp b/libc/src/ctype/isgraph.cpp index a5b6e501b5813f..74bb2e75d138e6 100644 --- a/libc/src/ctype/isgraph.cpp +++ b/libc/src/ctype/isgraph.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, isgraph, (int c)) { return static_cast(internal::isgraph(static_cast(c))); } diff --git a/libc/src/ctype/isgraph_l.cpp b/libc/src/ctype/isgraph_l.cpp new file mode 100644 index 00000000000000..cbef6df148aed6 --- /dev/null +++ b/libc/src/ctype/isgraph_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of isgraph -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isgraph_l.h" + +#include "src/__support/common.h" +#include "src/__support/ctype_utils.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isgraph_l, (int c, locale_t)) { + return static_cast(internal::isgraph(static_cast(c))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isgraph_l.h b/libc/src/ctype/isgraph_l.h new file mode 100644 index 00000000000000..d96a4608655092 --- /dev/null +++ b/libc/src/ctype/isgraph_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isgraph_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISGRAPH_H +#define LLVM_LIBC_SRC_CTYPE_ISGRAPH_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isgraph_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISGRAPH_H diff --git a/libc/src/ctype/islower.cpp b/libc/src/ctype/islower.cpp index 61ccbcc1db413b..831aad32d3a22e 100644 --- a/libc/src/ctype/islower.cpp +++ b/libc/src/ctype/islower.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, islower, (int c)) { return static_cast(internal::islower(static_cast(c))); } diff --git a/libc/src/ctype/islower_l.cpp b/libc/src/ctype/islower_l.cpp new file mode 100644 index 00000000000000..b9be6acc81c992 --- /dev/null +++ b/libc/src/ctype/islower_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of islower -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/islower_l.h" +#include "src/__support/ctype_utils.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, islower_l, (int c, locale_t)) { + return static_cast(internal::islower(static_cast(c))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/islower_l.h b/libc/src/ctype/islower_l.h new file mode 100644 index 00000000000000..7d3e2f139602b9 --- /dev/null +++ b/libc/src/ctype/islower_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for islower_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISLOWER_H +#define LLVM_LIBC_SRC_CTYPE_ISLOWER_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int islower_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISLOWER_H diff --git a/libc/src/ctype/isprint.cpp b/libc/src/ctype/isprint.cpp index 42ab9cc8d238a1..349aefe1c17bbd 100644 --- a/libc/src/ctype/isprint.cpp +++ b/libc/src/ctype/isprint.cpp @@ -13,8 +13,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, isprint, (int c)) { const unsigned ch = static_cast(c); return static_cast((ch - ' ') < 95); diff --git a/libc/src/ctype/isprint_l.cpp b/libc/src/ctype/isprint_l.cpp new file mode 100644 index 00000000000000..8f51f7f0e3e94b --- /dev/null +++ b/libc/src/ctype/isprint_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of isprint -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isprint_l.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isprint_l, (int c, locale_t)) { + const unsigned ch = static_cast(c); + return static_cast((ch - ' ') < 95); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isprint_l.h b/libc/src/ctype/isprint_l.h new file mode 100644 index 00000000000000..bd2ea9354c36a0 --- /dev/null +++ b/libc/src/ctype/isprint_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isprint_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISPRINT_H +#define LLVM_LIBC_SRC_CTYPE_ISPRINT_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isprint_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISPRINT_H diff --git a/libc/src/ctype/ispunct.cpp b/libc/src/ctype/ispunct.cpp index c1906e3acdd80e..0635294220b9c3 100644 --- a/libc/src/ctype/ispunct.cpp +++ b/libc/src/ctype/ispunct.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, ispunct, (int c)) { const unsigned ch = static_cast(c); return static_cast(!internal::isalnum(ch) && internal::isgraph(ch)); diff --git a/libc/src/ctype/ispunct_l.cpp b/libc/src/ctype/ispunct_l.cpp new file mode 100644 index 00000000000000..e825fbe2001b08 --- /dev/null +++ b/libc/src/ctype/ispunct_l.cpp @@ -0,0 +1,22 @@ +//===-- Implementation of ispunct -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/ispunct_l.h" + +#include "src/__support/common.h" +#include "src/__support/ctype_utils.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, ispunct_l, (int c, locale_t)) { + const unsigned ch = static_cast(c); + return static_cast(!internal::isalnum(ch) && internal::isgraph(ch)); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/ispunct_l.h b/libc/src/ctype/ispunct_l.h new file mode 100644 index 00000000000000..862daf4836f788 --- /dev/null +++ b/libc/src/ctype/ispunct_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for ispunct_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISPUNCT_H +#define LLVM_LIBC_SRC_CTYPE_ISPUNCT_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int ispunct_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISPUNCT_H diff --git a/libc/src/ctype/isspace.cpp b/libc/src/ctype/isspace.cpp index f8908493787841..005bf460fc1032 100644 --- a/libc/src/ctype/isspace.cpp +++ b/libc/src/ctype/isspace.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, isspace, (int c)) { return static_cast(internal::isspace(static_cast(c))); } diff --git a/libc/src/ctype/isspace_l.cpp b/libc/src/ctype/isspace_l.cpp new file mode 100644 index 00000000000000..5c46dd68051261 --- /dev/null +++ b/libc/src/ctype/isspace_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of isspace -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isspace_l.h" +#include "src/__support/ctype_utils.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isspace_l, (int c, locale_t)) { + return static_cast(internal::isspace(static_cast(c))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isspace_l.h b/libc/src/ctype/isspace_l.h new file mode 100644 index 00000000000000..61bbf127956da7 --- /dev/null +++ b/libc/src/ctype/isspace_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isspace_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISSPACE_H +#define LLVM_LIBC_SRC_CTYPE_ISSPACE_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isspace_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISSPACE_H diff --git a/libc/src/ctype/isupper.cpp b/libc/src/ctype/isupper.cpp index 8f929ea1a009e4..965fa336b28b4d 100644 --- a/libc/src/ctype/isupper.cpp +++ b/libc/src/ctype/isupper.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, isupper, (int c)) { return static_cast(internal::isupper(static_cast(c))); } diff --git a/libc/src/ctype/isupper_l.cpp b/libc/src/ctype/isupper_l.cpp new file mode 100644 index 00000000000000..358990261d603f --- /dev/null +++ b/libc/src/ctype/isupper_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of isupper -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isupper_l.h" +#include "src/__support/ctype_utils.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isupper_l, (int c, locale_t)) { + return static_cast(internal::isupper(static_cast(c))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isupper_l.h b/libc/src/ctype/isupper_l.h new file mode 100644 index 00000000000000..9bee7ef8c09f59 --- /dev/null +++ b/libc/src/ctype/isupper_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isupper_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISUPPER_H +#define LLVM_LIBC_SRC_CTYPE_ISUPPER_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isupper_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISUPPER_H diff --git a/libc/src/ctype/isxdigit.cpp b/libc/src/ctype/isxdigit.cpp index 391c5c53cee1e1..6b730c354db083 100644 --- a/libc/src/ctype/isxdigit.cpp +++ b/libc/src/ctype/isxdigit.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, isxdigit, (int c)) { const unsigned ch = static_cast(c); return static_cast(internal::isdigit(ch) || (ch | 32) - 'a' < 6); diff --git a/libc/src/ctype/isxdigit_l.cpp b/libc/src/ctype/isxdigit_l.cpp new file mode 100644 index 00000000000000..8a5c7d4d28ab1c --- /dev/null +++ b/libc/src/ctype/isxdigit_l.cpp @@ -0,0 +1,22 @@ +//===-- Implementation of isxdigit ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isxdigit_l.h" +#include "src/__support/ctype_utils.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, isxdigit_l, (int c, locale_t)) { + const unsigned ch = static_cast(c); + return static_cast(internal::isdigit(ch) || (ch | 32) - 'a' < 6); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isxdigit_l.h b/libc/src/ctype/isxdigit_l.h new file mode 100644 index 00000000000000..ee847eda4eae9a --- /dev/null +++ b/libc/src/ctype/isxdigit_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for isxdigit_l ----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISXDIGIT_H +#define LLVM_LIBC_SRC_CTYPE_ISXDIGIT_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int isxdigit_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_ISXDIGIT_H diff --git a/libc/src/ctype/tolower.cpp b/libc/src/ctype/tolower.cpp index e230428eef2b14..3ecad7bc5d5d54 100644 --- a/libc/src/ctype/tolower.cpp +++ b/libc/src/ctype/tolower.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. LLVM_LIBC_FUNCTION(int, tolower, (int c)) { return internal::tolower(c); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/tolower_l.cpp b/libc/src/ctype/tolower_l.cpp new file mode 100644 index 00000000000000..7ccf31617e5925 --- /dev/null +++ b/libc/src/ctype/tolower_l.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of tolower -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/tolower_l.h" +#include "src/__support/ctype_utils.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, tolower_l, (int c, locale_t)) { + return internal::tolower(c); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/tolower_l.h b/libc/src/ctype/tolower_l.h new file mode 100644 index 00000000000000..6099b8c813469c --- /dev/null +++ b/libc/src/ctype/tolower_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for tolower_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_TOLOWER_H +#define LLVM_LIBC_SRC_CTYPE_TOLOWER_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int tolower_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_TOLOWER_H diff --git a/libc/src/ctype/toupper.cpp b/libc/src/ctype/toupper.cpp index 97c1ac2c02b8c0..b5a23fc7f588bd 100644 --- a/libc/src/ctype/toupper.cpp +++ b/libc/src/ctype/toupper.cpp @@ -14,8 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -// TODO: Currently restricted to default locale. -// These should be extended using locale information. 
LLVM_LIBC_FUNCTION(int, toupper, (int c)) { if (internal::islower(c)) return c - ('a' - 'A'); diff --git a/libc/src/ctype/toupper_l.cpp b/libc/src/ctype/toupper_l.cpp new file mode 100644 index 00000000000000..f536ff36236160 --- /dev/null +++ b/libc/src/ctype/toupper_l.cpp @@ -0,0 +1,23 @@ +//===-- Implementation of toupper_l ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/toupper_l.h" +#include "src/__support/ctype_utils.h" + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, toupper_l, (int c, locale_t)) { + if (internal::islower(c)) + return c - ('a' - 'A'); + return c; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/toupper_l.h b/libc/src/ctype/toupper_l.h new file mode 100644 index 00000000000000..8877c35d492bd8 --- /dev/null +++ b/libc/src/ctype/toupper_l.h @@ -0,0 +1,21 @@ +//===-- Implementation header for toupper_l -----------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_TOUPPER_H +#define LLVM_LIBC_SRC_CTYPE_TOUPPER_H + +#include "hdr/types/locale_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int toupper_l(int c, locale_t locale); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_CTYPE_TOUPPER_H From c2a96a243b26d93090b859f851f8c219cffeaeaa Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Aug 2024 13:51:42 -0500 Subject: [PATCH 254/426] [libc] Fix locale structs with old headergen --- libc/spec/stdc.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 402d8c335470ad..6d8be9f8e4016d 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -1665,8 +1665,8 @@ def StdC : StandardSpec<"stdc"> { >; - NamedType StructLconv : NamedType<"struct lconv">; - PtrType StructLconvPtr : PtrType; + NamedType StructLconv = NamedType<"struct lconv">; + PtrType StructLconvPtr = PtrType; HeaderSpec Locale = HeaderSpec< "locale.h", From e738c816f2079e2f0fdc395e53070cc1afd8bfac Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Thu, 22 Aug 2024 11:55:31 -0700 Subject: [PATCH 255/426] =?UTF-8?q?Revert=20"[MCA][X86]=20Add=20missing=20?= =?UTF-8?q?512-bit=20vpscatterqd/vscatterqps=20schedu=E2=80=A6=20(#105716)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …le data" This reverts commit 2c1f0642a2647883f35463aebf4f90a6b1f158c1. 
Many build failures in: CodeGen/X86/scatter-schedule.ll Example of a build failure: https://lab.llvm.org/buildbot/#/builders/155/builds/1675 --- llvm/lib/Target/X86/X86SchedIceLake.td | 2 -- llvm/lib/Target/X86/X86SchedSkylakeServer.td | 2 -- .../llvm-mca/X86/IceLakeServer/resources-avx512.s | 10 +++++----- .../llvm-mca/X86/SkylakeServer/resources-avx512.s | 10 +++++----- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td index 29b1464e19a32b..fd372ba4656eba 100644 --- a/llvm/lib/Target/X86/X86SchedIceLake.td +++ b/llvm/lib/Target/X86/X86SchedIceLake.td @@ -1524,10 +1524,8 @@ def ICXWriteResGroup113 : SchedWriteRes<[ICXPort0,ICXPort49,ICXPort78,ICXPort015 let ReleaseAtCycles = [1,8,8,2]; } def: InstRW<[ICXWriteResGroup113], (instrs VPSCATTERDQZmr, - VPSCATTERQDZmr, VPSCATTERQQZmr, VSCATTERDPDZmr, - VSCATTERQPSZmr, VSCATTERQPDZmr)>; def ICXWriteResGroup114 : SchedWriteRes<[ICXPort0,ICXPort49,ICXPort5,ICXPort78,ICXPort0156]> { diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 2423602d06c470..4fded44085e897 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -1499,10 +1499,8 @@ def SKXWriteResGroup113 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort015 let ReleaseAtCycles = [1,8,8,2]; } def: InstRW<[SKXWriteResGroup113], (instrs VPSCATTERDQZmr, - VPSCATTERQDZmr, VPSCATTERQQZmr, VSCATTERDPDZmr, - VSCATTERQPSZmr, VSCATTERQPDZmr)>; def SKXWriteResGroup114 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> { diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s index c509e766540b15..c4df992f3aebca 100644 --- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s @@ -1804,7 +1804,7 
@@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 2 10 1.00 * vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 36 8 8.00 * vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 4.00 * vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: 19 7 4.00 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 1 1 0.50 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 4.00 * vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 1 1.00 vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: 2 8 1.00 * vpshufd $0, (%rax), %zmm19 @@ -1871,7 +1871,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 2 8 1.00 * vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 36 7 8.00 * vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 4.00 * vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: 19 7 4.00 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 1 1 0.50 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 4.00 * vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 3 1.00 vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 2 10 1.00 * vshuff32x4 $0, (%rax), %zmm17, %zmm19 @@ -2054,7 +2054,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: - 612.00 411.17 103.67 327.50 327.50 48.50 593.17 6.00 48.50 48.50 48.50 +# CHECK-NEXT: - 612.00 408.17 102.67 327.50 327.50 41.50 592.17 5.00 41.50 41.50 41.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -2774,7 +2774,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 1.50 0.50 - - 8.00 1.50 0.50 8.00 8.00 8.00 vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} 
-# CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - - - - - 0.50 - - 0.50 0.50 0.50 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - - - vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpshufd $0, (%rax), %zmm19 @@ -2841,7 +2841,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 1.50 0.50 - - 8.00 1.50 0.50 8.00 8.00 8.00 vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - - - - - 0.50 - - 0.50 0.50 0.50 vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - - - vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vshuff32x4 $0, (%rax), %zmm17, %zmm19 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s index 9c006d4ebb077d..5eaa0f91fdaaba 100644 --- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s @@ -1804,7 +1804,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 2 10 1.00 * vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 36 8 16.00 * vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 8.00 * vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: 19 7 8.00 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} # 
CHECK-NEXT: 19 7 8.00 * vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 1 1.00 vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: 2 8 1.00 * vpshufd $0, (%rax), %zmm19 @@ -1871,7 +1871,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 2 8 1.00 * vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 36 7 16.00 * vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 8.00 * vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: 19 7 8.00 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 1 1 1.00 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 8.00 * vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 3 1.00 vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 2 10 1.00 * vshuff32x4 $0, (%rax), %zmm17, %zmm19 @@ -2052,7 +2052,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - 612.00 352.67 103.67 359.83 359.83 97.00 651.67 6.00 32.33 +# CHECK-NEXT: - 612.00 349.67 102.67 355.17 355.17 83.00 650.67 5.00 27.67 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -2772,7 +2772,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 1.50 0.50 5.33 5.33 16.00 1.50 0.50 5.33 vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpshufd $0, (%rax), %zmm19 @@ -2839,7 +2839,7 @@ 
vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 1.50 0.50 5.33 5.33 16.00 1.50 0.50 5.33 vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vshuff32x4 $0, (%rax), %zmm17, %zmm19 From 3082a381f57ef2885c270f41f2955e08c79634c5 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 22 Aug 2024 12:06:47 -0700 Subject: [PATCH 256/426] [LTO] Introduce helper functions to add GUIDs to ImportList (NFC) (#105555) The new helper functions make the intent clearer while hiding implementation details, including how we handle previously added entries. Note that: - If we are adding a GUID as a GlobalValueSummary::Definition, then we override a previously added GlobalValueSummary::Declaration entry for the same GUID. - If we are adding a GUID as a GlobalValueSummary::Declaration, then a previously added GlobalValueSummary::Definition entry for the same GUID takes precedence, and no change is made. 
--- .../llvm/Transforms/IPO/FunctionImport.h | 27 ++++++++ llvm/lib/LTO/LTOBackend.cpp | 10 +-- llvm/lib/Transforms/IPO/FunctionImport.cpp | 63 ++++++++++--------- llvm/tools/llvm-link/llvm-link.cpp | 6 +- 4 files changed, 64 insertions(+), 42 deletions(-) diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h index 5dad572532c8ae..5ab8c6d130b60a 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h @@ -122,6 +122,33 @@ class FunctionImporter { /// Import functions in Module \p M based on the supplied import list. Expected importFunctions(Module &M, const ImportMapTy &ImportList); + enum class AddDefinitionStatus { + NoChange, + Inserted, + ChangedToDefinition, + }; + + // Add the given GUID to ImportList as a definition. If the same GUID has + // been added as a declaration previously, that entry is overridden. + static AddDefinitionStatus addDefinition(ImportMapTy &ImportList, + StringRef FromModule, + GlobalValue::GUID GUID); + + // Add the given GUID to ImportList as a declaration. If the same GUID has + // been added as a definition previously, that entry takes precedence, and no + // change is made. + static void maybeAddDeclaration(ImportMapTy &ImportList, StringRef FromModule, + GlobalValue::GUID GUID); + + static void addGUID(ImportMapTy &ImportList, StringRef FromModule, + GlobalValue::GUID GUID, + GlobalValueSummary::ImportKind ImportKind) { + if (ImportKind == GlobalValueSummary::Definition) + addDefinition(ImportList, FromModule, GUID); + else + maybeAddDeclaration(ImportList, FromModule, GUID); + } + private: /// The summaries index used to trigger importing. 
const ModuleSummaryIndex &Index; diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index effaed2d9bfb60..ae46d946ae06a6 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -726,14 +726,8 @@ bool lto::initImportList(const Module &M, if (Summary->modulePath() == M.getModuleIdentifier()) continue; // Add an entry to provoke importing by thinBackend. - // Try emplace the entry first. If an entry with the same key already - // exists, set the value to 'std::min(existing-value, new-value)' to make - // sure a definition takes precedence over a declaration. - auto [Iter, Inserted] = ImportList[Summary->modulePath()].try_emplace( - GUID, Summary->importType()); - - if (!Inserted) - Iter->second = std::min(Iter->second, Summary->importType()); + FunctionImporter::addGUID(ImportList, Summary->modulePath(), GUID, + Summary->importType()); } } return true; diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 92371720e0eceb..354ad0fde092a7 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -334,6 +334,25 @@ using EdgeInfo = std::tuple; } // anonymous namespace +FunctionImporter::AddDefinitionStatus +FunctionImporter::addDefinition(ImportMapTy &ImportList, StringRef FromModule, + GlobalValue::GUID GUID) { + auto [It, Inserted] = + ImportList[FromModule].try_emplace(GUID, GlobalValueSummary::Definition); + if (Inserted) + return AddDefinitionStatus::Inserted; + if (It->second == GlobalValueSummary::Definition) + return AddDefinitionStatus::NoChange; + It->second = GlobalValueSummary::Definition; + return AddDefinitionStatus::ChangedToDefinition; +} + +void FunctionImporter::maybeAddDeclaration(ImportMapTy &ImportList, + StringRef FromModule, + GlobalValue::GUID GUID) { + ImportList[FromModule].try_emplace(GUID, GlobalValueSummary::Declaration); +} + /// Import globals referenced by a function or other globals that are being 
/// imported, if importing such global is possible. class GlobalsImporter final { @@ -392,17 +411,13 @@ class GlobalsImporter final { // If there isn't an entry for GUID, insert pair. // Otherwise, definition should take precedence over declaration. - auto [Iter, Inserted] = - ImportList[RefSummary->modulePath()].try_emplace( - VI.getGUID(), GlobalValueSummary::Definition); + if (FunctionImporter::addDefinition( + ImportList, RefSummary->modulePath(), VI.getGUID()) != + FunctionImporter::AddDefinitionStatus::Inserted) + break; + // Only update stat and exports if we haven't already imported this // variable. - if (!Inserted) { - // Set the value to 'std::min(existing-value, new-value)' to make - // sure a definition takes precedence over a declaration. - Iter->second = std::min(GlobalValueSummary::Definition, Iter->second); - break; - } NumImportedGlobalVarsThinLink++; // Any references made by this variable will be marked exported // later, in ComputeCrossModuleImport, after import decisions are @@ -882,13 +897,10 @@ static void computeImportForFunction( if (ImportDeclaration && SummaryForDeclImport) { StringRef DeclSourceModule = SummaryForDeclImport->modulePath(); - // Since definition takes precedence over declaration for the same VI, - // try emplace pair without checking insert result. - // If insert doesn't happen, there must be an existing entry keyed by - // VI. Note `ExportLists` only keeps track of exports due to imported + // Note `ExportLists` only keeps track of exports due to imported // definitions. - ImportList[DeclSourceModule].try_emplace( - VI.getGUID(), GlobalValueSummary::Declaration); + FunctionImporter::maybeAddDeclaration(ImportList, DeclSourceModule, + VI.getGUID()); } // Update with new larger threshold if this was a retry (otherwise // we would have already inserted with NewThreshold above). Also @@ -937,12 +949,9 @@ static void computeImportForFunction( // Try emplace the definition entry, and update stats based on insertion // status. 
- auto [Iter, Inserted] = ImportList[ExportModulePath].try_emplace( - VI.getGUID(), GlobalValueSummary::Definition); - - // We previously decided to import this GUID definition if it was already - // inserted in the set of imports from the exporting module. - if (Inserted || Iter->second == GlobalValueSummary::Declaration) { + if (FunctionImporter::addDefinition(ImportList, ExportModulePath, + VI.getGUID()) != + FunctionImporter::AddDefinitionStatus::NoChange) { NumImportedFunctionsThinLink++; if (IsHotCallsite) NumImportedHotFunctionsThinLink++; @@ -950,9 +959,6 @@ static void computeImportForFunction( NumImportedCriticalFunctionsThinLink++; } - if (Iter->second == GlobalValueSummary::Declaration) - Iter->second = GlobalValueSummary::Definition; - // Any calls/references made by this function will be marked exported // later, in ComputeCrossModuleImport, after import decisions are // complete, which is more efficient than adding them here. @@ -1300,13 +1306,8 @@ static void ComputeCrossModuleImportForModuleFromIndexForTest( if (Summary->modulePath() == ModulePath) continue; // Add an entry to provoke importing by thinBackend. - auto [Iter, Inserted] = ImportList[Summary->modulePath()].try_emplace( - GUID, Summary->importType()); - if (!Inserted) { - // Use 'std::min' to make sure definition (with enum value 0) takes - // precedence over declaration (with enum value 1). - Iter->second = std::min(Iter->second, Summary->importType()); - } + FunctionImporter::addGUID(ImportList, Summary->modulePath(), GUID, + Summary->importType()); } #ifndef NDEBUG dumpImportListForModule(Index, ModulePath, ImportList); diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp index b311820ce58709..ef6f85d38fede6 100644 --- a/llvm/tools/llvm-link/llvm-link.cpp +++ b/llvm/tools/llvm-link/llvm-link.cpp @@ -381,9 +381,9 @@ static bool importFunctions(const char *argv0, Module &DestModule) { // definition, so make the import type definition directly. 
// FIXME: A follow-up patch should add test coverage for import declaration // in `llvm-link` CLI (e.g., by introducing a new command line option). - auto &Entry = - ImportList[FileNameStringCache.insert(FileName).first->getKey()]; - Entry[F->getGUID()] = GlobalValueSummary::Definition; + FunctionImporter::addDefinition( + ImportList, FileNameStringCache.insert(FileName).first->getKey(), + F->getGUID()); } auto CachedModuleLoader = [&](StringRef Identifier) { return ModuleLoaderCache.takeModule(std::string(Identifier)); From ee08d9cba5615937acf28087da841886cc6a0144 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 22 Aug 2024 23:27:33 +0400 Subject: [PATCH 257/426] AMDGPU: Remove global/flat atomic fadd intrinics (#97051) These have been replaced with atomicrmw. --- llvm/docs/ReleaseNotes.rst | 5 + llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 3 - llvm/lib/IR/AutoUpgrade.cpp | 4 +- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 5 - .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 - .../Target/AMDGPU/AMDGPUSearchableTables.td | 2 - .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 - llvm/lib/Target/AMDGPU/DSInstructions.td | 6 +- llvm/lib/Target/AMDGPU/FLATInstructions.td | 13 - llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 - llvm/test/Bitcode/amdgcn-atomic.ll | 32 ++ .../AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll | 8 +- .../AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll | 46 +- .../GlobalISel/flat-atomic-fadd.v2f16.ll | 41 +- .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll | 55 +- .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 150 ++--- .../global-atomic-fadd.f32-no-rtn.ll | 535 ++++++++++------- .../GlobalISel/global-atomic-fadd.f32-rtn.ll | 161 +----- .../GlobalISel/global-atomic-fadd.f64.ll | 167 ------ .../global-atomic-fadd.v2f16-no-rtn.ll | 114 +--- .../global-atomic-fadd.v2f16-rtn.ll | 72 +-- ...llvm.amdgcn.global.atomic.fadd-with-ret.ll | 21 - .../llvm.amdgcn.global.atomic.fadd.ll | 126 ---- .../AMDGPU/cgp-addressing-modes-gfx908.ll | 38 +- 
.../CodeGen/AMDGPU/flat-atomic-fadd.f32.ll | 8 +- .../CodeGen/AMDGPU/flat-atomic-fadd.f64.ll | 4 +- .../CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll | 63 -- .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 106 +--- llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 293 +--------- .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 544 +++--------------- .../CodeGen/AMDGPU/gep-const-address-space.ll | 10 +- .../AMDGPU/global-atomic-fadd.f32-no-rtn.ll | 115 ---- .../AMDGPU/global-atomic-fadd.f32-rtn.ll | 120 ---- .../CodeGen/AMDGPU/global-atomic-fadd.f64.ll | 171 ------ .../AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll | 58 +- .../AMDGPU/global-atomic-fadd.v2f16-rtn.ll | 39 +- .../AMDGPU/global-saddr-atomics.gfx908.ll | 23 +- .../llvm.amdgcn.global.atomic.fadd.gfx90a.ll | 56 -- .../AMDGPU/llvm.amdgcn.global.atomic.fadd.ll | 77 --- .../test/CodeGen/AMDGPU/shl_add_ptr_global.ll | 6 +- .../CodeGen/AMDGPU/unsupported-atomics.ll | 28 - .../AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll | 34 -- .../InferAddressSpaces/AMDGPU/flat_atomic.ll | 105 ++-- 43 files changed, 688 insertions(+), 2784 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.gfx90a.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index c9eb5eea896905..2b68c54de8beb7 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -75,6 +75,11 @@ Changes to the AArch64 Backend Changes to the AMDGPU Backend ----------------------------- +* Removed ``llvm.amdgcn.flat.atomic.fadd`` and + ``llvm.amdgcn.global.atomic.fadd`` intrinsics. 
Users should use the + :ref:`atomicrmw ` instruction with `fadd` and + addrspace(0) or addrspace(1) instead. + Changes to the ARM Backend -------------------------- diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 539410f1ed05e6..dc13a35c66f9ab 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2909,8 +2909,6 @@ def int_amdgcn_dot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic; // gfx908 intrinsics // ===----------------------------------------------------------------------===// -def int_amdgcn_global_atomic_fadd : AMDGPUAtomicRtn; - // llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp class AMDGPUMfmaIntrinsic : ClangBuiltin, @@ -2949,7 +2947,6 @@ def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic; def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn; -def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn; def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn; def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index c6963edf5288ae..8dd5b9b3ec3d1f 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1035,8 +1035,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") || Name.starts_with("ds.fmax") || - Name.starts_with("global.atomic.fadd.v2bf16") || - Name.starts_with("flat.atomic.fadd.v2bf16")) { + Name.starts_with("global.atomic.fadd") || + Name.starts_with("flat.atomic.fadd")) { // Replaced with atomicrmw fadd/fmin/fmax, so there's no new // declaration. 
NewFn = nullptr; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index db8b44149cf47e..aa5b151adef3a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -618,16 +618,11 @@ multiclass local_addr_space_atomic_op { } } -defm int_amdgcn_flat_atomic_fadd : noret_op; -defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op; defm int_amdgcn_flat_atomic_fmin : noret_op; defm int_amdgcn_flat_atomic_fmax : noret_op; -defm int_amdgcn_global_atomic_fadd : global_addr_space_atomic_op; -defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op; defm int_amdgcn_global_atomic_fmin : noret_op; defm int_amdgcn_global_atomic_fmax : noret_op; defm int_amdgcn_global_atomic_csub : noret_op; -defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op; defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op; defm int_amdgcn_flat_atomic_fmin_num : noret_op; defm int_amdgcn_flat_atomic_fmax_num : noret_op; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 12aa6ee2a2536a..69a1936a11fe05 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4896,13 +4896,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } - case Intrinsic::amdgcn_global_atomic_fadd: case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_fmax_num: - case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin_num: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td 
b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 48fb786ed97206..95c4859674ecc4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -239,13 +239,11 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; -def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; -def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index c38c2dc0f5f618..cb3fbdb850c1ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1045,7 +1045,6 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl &OpIndexes, switch (IID) { case Intrinsic::amdgcn_is_shared: case Intrinsic::amdgcn_is_private: - case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax_num: @@ -1107,7 +1106,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy}, {NewV, MaskOp}); } - case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax_num: diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 3c76fb2f7961f7..e9283fde85a48d 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1135,11 +1135,7 @@ class DSAtomicRetPatIntrinsic.ret:$value, Offset:$offset, (i1 gds))> { } - -def : DSAtomicRetPatIntrinsic; -let AddedComplexity = 1 in -def : 
DSAtomicRetPatIntrinsic; -} +} // End SubtargetPredicate = HasLdsAtomicAddF64 let SubtargetPredicate = HasAtomicDsPkAdd16Insts in { defm : DSAtomicRetNoRetPat_mc; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 8067090636a9aa..7b3822067072e5 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1625,25 +1625,17 @@ let OtherPredicates = [isGFX12Only] in { let OtherPredicates = [HasAtomicFaddNoRtnInsts] in { defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; -defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>; -defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>; } let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in { defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>; -defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; -defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; } let OtherPredicates = [HasAtomicFaddRtnInsts] in { defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; -defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>; -defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>; } let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in { -defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; -defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", 
"int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>; } @@ -1661,19 +1653,14 @@ defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", let OtherPredicates = [HasFlatBufferGlobalAtomicFaddF64Inst] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; -defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>; -defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>; defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; -defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>; } let OtherPredicates = [HasFlatAtomicFaddF32Inst] in { defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; -defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>; } let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in { -defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>; defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>; defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d02d0bbb52e567..c954c0aa71f734 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1351,13 +1351,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MODereferenceable; return true; } - case Intrinsic::amdgcn_global_atomic_fadd: case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_fmax_num: case 
Intrinsic::amdgcn_global_atomic_ordered_add_b64: - case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin_num: @@ -1464,13 +1462,11 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_ds_consume: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: - case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_csub: - case Intrinsic::amdgcn_global_atomic_fadd: case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fmin: diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll index 9563d178e64330..d642372799f56b 100644 --- a/llvm/test/Bitcode/amdgcn-atomic.ll +++ b/llvm/test/Bitcode/amdgcn-atomic.ll @@ -322,4 +322,36 @@ define <2 x i16> @upgrade_amdgcn_global_atomic_fadd_v2bf16_p1(ptr addrspace(1) % ret <2 x i16> %result } +declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr nocapture, <2 x half>) #0 + +define <2 x half> @upgrade_amdgcn_flat_atomic_fadd_v2f16_p0_v2f16(ptr %ptr, <2 x half> %data) { + ; CHECK: %{{.+}} = atomicrmw fadd ptr %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) + ret <2 x half> %result +} + +declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) nocapture, <2 x half>) #0 + +define <2 x half> @upgrade_amdgcn_global_atomic_fadd_v2f16_p1_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { + ; CHECK: %{{.+}} = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, 
!amdgpu.no.fine.grained.memory !{{[0-9]+$}} + %result = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + ret <2 x half> %result +} + +declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr nocapture, float) #0 + +define float @upgrade_amdgcn_flat_atomic_fadd_f32_p0_f32(ptr %ptr, float %data) { + ; CHECK: %{{.+}} = atomicrmw fadd ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}} + %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) + ret float %result +} + +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) #0 + +define float @upgrade_amdgcn_global_atomic_fadd_f32_p1_f32(ptr addrspace(1) %ptr, float %data) { + ; CHECK: %{{.+}} = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}} + %result = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) + ret float %result +} + attributes #0 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll index 820f9ee1ce7f4e..7a97ac8211f672 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll @@ -12,7 +12,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load 
store (s32) on %ir.ptr) + ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX940-NEXT: S_ENDPGM 0 ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic @@ -23,7 +23,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX11-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data) ret void @@ -38,7 +38,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; @@ -50,7 +50,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, 
float %data ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll index b2a96fb948797a..c1cb74cb0e25a8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll @@ -1,46 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s - -define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_intrinsic(ptr %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; 
GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data) - ret void -} - -define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(ptr %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data) - ret double %ret -} +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %data) { ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw @@ -82,6 +42,4 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da ret double %ret } -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double) - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll index f2f00823e0278e..0896e4dc7af14f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll @@ -1,38 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX940 %s -define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn_intrinsic(ptr %ptr, <2 x half> %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_no_rtn_intrinsic - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY 
$vgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: FLAT_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr) - ; GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data) - ret void -} - -define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn_intrinsic(ptr %ptr, <2 x half> %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_rtn_intrinsic - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr) - ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]] - ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) #0 { +define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_rtn ; GFX940: bb.1 (%ir-block.0): ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -48,7 +17,7 @@ define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %da ret <2 x half> %ret } -define amdgpu_ps <2 x half> 
@flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) #0 { +define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) { ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_saddr_rtn ; GFX940: bb.1 (%ir-block.0): ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -65,7 +34,7 @@ define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 ret <2 x half> %ret } -define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn(ptr %ptr, <2 x half> %data) #0 { +define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn(ptr %ptr, <2 x half> %data) { ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_no_rtn ; GFX940: bb.1 (%ir-block.0): ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -80,7 +49,7 @@ define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn(ptr %ptr, <2 x half> %data) ret void } -define amdgpu_ps void @flat_atomic_fadd_v2f16_saddr_no_rtn(ptr inreg %ptr, <2 x half> %data) #0 { +define amdgpu_ps void @flat_atomic_fadd_v2f16_saddr_no_rtn(ptr inreg %ptr, <2 x half> %data) { ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_saddr_no_rtn ; GFX940: bb.1 (%ir-block.0): ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -96,6 +65,4 @@ define amdgpu_ps void @flat_atomic_fadd_v2f16_saddr_no_rtn(ptr inreg %ptr, <2 x ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" } - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index 031a3633bd3757..25c7fc6463c33d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -1,22 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 - -declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) -declare <2 x half> 
@llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - -define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_endpgm - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) - ret void -} +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck %s -check-prefix=GFX940 define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: @@ -50,17 +33,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ret void } -define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) - ret float %ret -} - define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat: ; GFX940: ; %bb.0: @@ -75,31 +47,6 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ret float %ret } -define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: 
flat_atomic_pk_add_f16 v[0:1], v2 -; GFX940-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - ret void -} - -define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - ret <2 x half> %ret -} - define <2 x half> @local_atomic_fadd_ret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) { ; GFX940-LABEL: local_atomic_fadd_ret_v2f16_offset: ; GFX940: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index cfd4e1211bb01d..eb39ca2d7daa7f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -14,10 +14,8 @@ declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32 declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg) declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32 immarg) -declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) declare double 
@llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) @@ -1017,29 +1015,6 @@ main_body: ret void } -define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fadd_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: global_atomic_fadd_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body @@ -1095,7 +1070,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB39_2 +; GFX90A-NEXT: s_cbranch_execz .LBB38_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1108,7 +1083,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB39_2: +; GFX90A-NEXT: .LBB38_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: @@ -1119,7 +1094,7 @@ define 
amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB39_2 +; GFX940-NEXT: s_cbranch_execz .LBB38_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1131,7 +1106,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: .LBB39_2: +; GFX940-NEXT: .LBB38_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1147,7 +1122,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB40_2 +; GFX90A-NEXT: s_cbranch_execz .LBB39_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1158,7 +1133,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB40_2: +; GFX90A-NEXT: .LBB39_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: @@ -1169,7 +1144,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB40_2 +; GFX940-NEXT: s_cbranch_execz .LBB39_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 
s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1181,7 +1156,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB40_2: +; GFX940-NEXT: .LBB39_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1197,7 +1172,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB41_2 +; GFX90A-NEXT: s_cbranch_execz .LBB40_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1210,7 +1185,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB41_2: +; GFX90A-NEXT: .LBB40_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: @@ -1221,7 +1196,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB41_2 +; GFX940-NEXT: s_cbranch_execz .LBB40_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1233,7 +1208,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: .LBB41_2: +; GFX940-NEXT: .LBB40_2: ; GFX940-NEXT: 
s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1249,7 +1224,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB42_2 +; GFX90A-NEXT: s_cbranch_execz .LBB41_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1260,7 +1235,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB42_2: +; GFX90A-NEXT: .LBB41_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: @@ -1271,7 +1246,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB42_2 +; GFX940-NEXT: s_cbranch_execz .LBB41_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1283,32 +1258,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB42_2: +; GFX940-NEXT: .LBB41_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define double @global_atomic_fadd_f64_rtn(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fadd_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_atomic_fadd_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %data) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat: ; GFX90A: ; %bb.0: ; %main_body @@ -1435,7 +1391,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB49_2 +; GFX90A-NEXT: s_cbranch_execz .LBB47_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1446,7 +1402,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB49_2: +; GFX90A-NEXT: .LBB47_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: @@ -1457,7 +1413,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB49_2 +; GFX940-NEXT: s_cbranch_execz .LBB47_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ 
-1469,7 +1425,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB49_2: +; GFX940-NEXT: .LBB47_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1648,48 +1604,6 @@ main_body: ret double %ret } -define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fadd_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: flat_atomic_fadd_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) - ret void -} - -define double @flat_atomic_fadd_f64_rtn(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fadd_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 
s[30:31] -main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) - ret double %ret -} - define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body @@ -1812,7 +1726,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB63_2 +; GFX90A-NEXT: s_cbranch_execz .LBB59_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1822,7 +1736,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB63_2: +; GFX90A-NEXT: .LBB59_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: @@ -1833,7 +1747,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB63_2 +; GFX940-NEXT: s_cbranch_execz .LBB59_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1843,7 +1757,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB63_2: +; GFX940-NEXT: .LBB59_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1859,7 +1773,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr 
addrspace(3 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB64_2 +; GFX90A-NEXT: s_cbranch_execz .LBB60_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1869,7 +1783,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB64_2: +; GFX90A-NEXT: .LBB60_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: @@ -1880,7 +1794,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB64_2 +; GFX940-NEXT: s_cbranch_execz .LBB60_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1890,7 +1804,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB64_2: +; GFX940-NEXT: .LBB60_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1906,7 +1820,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB65_2 +; GFX90A-NEXT: s_cbranch_execz .LBB61_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1916,7 +1830,7 @@ define amdgpu_kernel void 
@local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB65_2: +; GFX90A-NEXT: .LBB61_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: @@ -1927,7 +1841,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB65_2 +; GFX940-NEXT: s_cbranch_execz .LBB61_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1937,7 +1851,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB65_2: +; GFX940-NEXT: .LBB61_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll index 2bbb8e86f936ab..fcd8c6fb0fe7c6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll @@ -1,223 +1,342 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel 
-mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s -define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic - ; GFX908_GFX11: bb.1 (%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908_GFX11-NEXT: 
GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic - ; GFX908_GFX11: bb.1 (%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: 
global_atomic_fadd_f32_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic - ; GFX908_GFX11: bb.1 (%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 +define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) { + ; GFX908-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX908: bb.1 (%ir-block.0): + ; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; 
GFX908-NEXT: {{ $}} + ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX908-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic - ; GFX908_GFX11: bb.1 (%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) %ptr, 
float %data) { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw - ; GFX908_GFX11: bb.1 (%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load 
store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE - ; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A_GFX940-NEXT: S_BRANCH %bb.2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.5): - ; GFX90A_GFX940-NEXT: successors: %bb.3(0x40000000), 
%bb.4(0x40000000) - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 - ; GFX90A_GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec - ; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp 
[[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]] - ; GFX90A_GFX940-NEXT: 
[[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec - ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A_GFX940-NEXT: S_BRANCH %bb.3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.31): - ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000) - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.4.Flow: - ; GFX90A_GFX940-NEXT: successors: %bb.5(0x80000000) - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.33): - ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX908-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw + ; GFX908: bb.1 (%ir-block.0): + ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 
+ ; GFX908-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX908-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908-NEXT: S_BRANCH %bb.2 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.2 (%ir-block.5): + ; GFX908-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 + ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX908-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX908-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX908-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, 
[[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX908-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX908-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX908-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX908-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX908-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX908-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = 
V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]] + ; GFX908-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] + ; GFX908-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec + ; GFX908-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec + ; GFX908-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908-NEXT: S_BRANCH %bb.3 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.3 (%ir-block.31): + ; GFX908-NEXT: successors: %bb.4(0x80000000) + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX908-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.4.Flow: + ; GFX908-NEXT: successors: %bb.5(0x80000000) + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.5 (%ir-block.33): + ; GFX908-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908-NEXT: S_ENDPGM 0 + ; + ; GFX90A-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE 
+ ; GFX90A-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: S_BRANCH %bb.2 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.2 (%ir-block.5): + ; GFX90A-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 + ; GFX90A-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec + ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec + ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; 
GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]] + ; GFX90A-NEXT: 
[[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] + ; GFX90A-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec + ; GFX90A-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec + ; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: S_BRANCH %bb.3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.3 (%ir-block.31): + ; GFX90A-NEXT: successors: %bb.4(0x80000000) + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.4.Flow: + ; GFX90A-NEXT: successors: %bb.5(0x80000000) + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.5 (%ir-block.33): + ; GFX90A-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF 
[[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940-NEXT: S_BRANCH %bb.2 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.2 (%ir-block.5): + ; GFX940-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 + ; GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec + ; GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec + ; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; 
GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]] + ; GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] + ; 
GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec + ; GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec + ; GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940-NEXT: S_BRANCH %bb.3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.3 (%ir-block.31): + ; GFX940-NEXT: successors: %bb.4(0x80000000) + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.4.Flow: + ; GFX940-NEXT: successors: %bb.5(0x80000000) + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.5 (%ir-block.33): + ; GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940-NEXT: S_ENDPGM 0 + ; + ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_32_xm0_xexec = SI_PS_LIVE + ; GFX11-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit 
$exec + ; GFX11-NEXT: S_BRANCH %bb.2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.2 (%ir-block.5): + ; GFX11-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $exec_lo + ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec + ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY8]], [[V_ADD_F32_e64_]], 354, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY9]], [[V_ADD_F32_e64_1]], 356, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: 
[[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY10]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_ADD_F32_e64_4]], implicit $exec + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY11]], implicit $exec + ; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX11-NEXT: S_BRANCH %bb.3 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.3 (%ir-block.24): + ; GFX11-NEXT: successors: %bb.4(0x80000000) + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.4.Flow: + ; GFX11-NEXT: successors: %bb.5(0x80000000) + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.5 (%ir-block.26): + ; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd 
ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) -declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll index 212bc84d1349de..a8f9ed2e6fba93 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll @@ -1,137 +1,33 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX90A %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s - -define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic - ; GFX11: bb.1 (%ir-block.0): - ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret float %ret -} - -define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: 
[[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic - ; GFX11: bb.1 (%ir-block.0): - ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret float %ret -} - -define amdgpu_ps float @global_atomic_fadd_f32_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic - ; GFX11: bb.1 (%ir-block.0): - ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret float %ret -} - -define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: 
[[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic - ; GFX11: bb.1 (%ir-block.0): - ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret float %ret -} +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s 
| FileCheck -check-prefix=GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX940-LABEL: name: 
global_atomic_fadd_f32_rtn_atomicrmw + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw ; GFX11: bb.1 (%ir-block.0): @@ -426,7 +322,4 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ret float %ret } -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) -declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll index 8007cc1f54a790..80fa24471a459f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll @@ -2,170 +2,6 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX90A %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX940 %s -define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: 
global_atomic_fadd_f64_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: 
(volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - -define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double 
@llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, 
double %data) - ret double %ret -} - -define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, 
[[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - -define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], 
[[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY 
[[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw ; GFX90A: bb.1 (%ir-block.0): @@ -319,7 +155,4 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac ret double %ret } -declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1), double) -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1), double) - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll index ace66e6a234aae..db508b5aea8c56 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll @@ -3,115 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic - ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, 
implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX908-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic - ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX908-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; 
GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic - ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX908-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic - ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX908-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 x half> %data) #0 { +define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -137,7 +29,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 ret void } -define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) #0 { +define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) { ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -165,6 +57,4 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn(ptr addrspace(1) in ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" } - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll index e895d6f18c9791..f11196be89bb1d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll @@ -2,73 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 
%s -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN [[V_MOV_B32_e32_]], 
[[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY 
$sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) #0 { +define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -84,7 +18,7 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, ret <2 x half> %ret } -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) #0 { +define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) { ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -101,6 +35,4 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn(ptr addrspace(1) ret <2 x half> %ret } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" } - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll deleted file mode 100644 index 9a66fe10ccdf34..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s -; RUN: not --crash llc -global-isel < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 - -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.f32.p1.v2f16(ptr addrspace(1) nocapture, <2 x half>) - -; GFX908: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.global.atomic.fadd) - -; GFX90A-LABEL: {{^}}global_atomic_fadd_f32_rtn: -; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc -define float @global_atomic_fadd_f32_rtn(ptr addrspace(1) %ptr, float %data) { - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret float %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_fadd_v2f16_rtn: -; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc -define <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.f32.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret <2 x half> %ret -} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll deleted file mode 100644 index de91c45000f137..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll +++ /dev/null @@ -1,126 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 
-verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s - -define void @global_atomic_fadd_f32(ptr addrspace(1) %ptr, float %data) { -; GFX908-LABEL: global_atomic_fadd_f32: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_atomic_fadd_f32: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -define void @global_atomic_fadd_f32_off_2048(ptr addrspace(1) %ptr, float %data) { -; GFX908-LABEL: global_atomic_fadd_f32_off_2048: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2048 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_atomic_fadd_f32_off_2048: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2048 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 512 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %gep, float %data) - ret void -} - -define void @global_atomic_fadd_f32_off_neg2047(ptr addrspace(1) %ptr, float %data) { -; GFX908-LABEL: global_atomic_fadd_f32_off_neg2047: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2044 -; 
GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_atomic_fadd_f32_off_neg2047: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2044 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -511 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %gep, float %data) - ret void -} - -define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(ptr addrspace(1) %ptr, float %data) { -; GFX908-LABEL: global_atomic_fadd_f32_off_ss: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX908-NEXT: v_mov_b32_e32 v1, 0 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, s2 -; GFX908-NEXT: global_atomic_add_f32 v1, v0, s[0:1] offset:2048 -; GFX908-NEXT: s_endpgm -; -; GFX90A-LABEL: global_atomic_fadd_f32_off_ss: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s2 -; GFX90A-NEXT: global_atomic_add_f32 v1, v0, s[0:1] offset:2048 -; GFX90A-NEXT: s_endpgm - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 512 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %gep, float %data) - ret void -} - -define void @global_atomic_fadd_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { -; GFX908-LABEL: global_atomic_fadd_v2f16: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_atomic_fadd_v2f16: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: 
global_atomic_pk_add_f16 v[0:1], v2, off -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret void -} - -define void @global_atomic_fadd_v2f16_off_neg2047(ptr addrspace(1) %ptr, <2 x half> %data) { -; GFX908-LABEL: global_atomic_fadd_v2f16_off_neg2047: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2044 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_atomic_fadd_v2f16_off_neg2047: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2044 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -511 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %gep, <2 x half> %data) - ret void -} - -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) #0 -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) nocapture, <2 x half>) #0 - -attributes #0 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll index 21e2a85ab18d98..7587b81e9936da 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -1,26 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=OPT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx908 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefix=GCN %s ; Make sure we match the addressing mode offset of globla.atomic.fadd intrinsics across blocks. define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; OPT-LABEL: @test_sink_small_offset_global_atomic_fadd_f32( -; OPT-NEXT: entry: -; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]] +; OPT-LABEL: define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32( +; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[ENTRY:.*]]: +; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR2:[0-9]+]] ; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0 -; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]] -; OPT: if: -; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[IN:%.*]], i32 7 -; OPT-NEXT: [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) [[IN_GEP]], float 2.000000e+00) +; OPT-NEXT: br i1 [[CMP]], label %[[ENDIF:.*]], label %[[IF:.*]] +; OPT: [[IF]]: +; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[IN]], i32 7 +; OPT-NEXT: [[FADD2:%.*]] = atomicrmw fadd ptr addrspace(1) [[IN_GEP]], float 2.000000e+00 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]], !amdgpu.ignore.denormal.mode [[META0]] ; OPT-NEXT: [[VAL:%.*]] = load volatile float, ptr addrspace(1) undef, align 4 -; OPT-NEXT: br label [[ENDIF]] -; OPT: endif: -; OPT-NEXT: [[X:%.*]] = phi float [ [[VAL]], [[IF]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[OUT:%.*]], i32 999999 +; OPT-NEXT: br label %[[ENDIF]] +; OPT: [[ENDIF]]: +; OPT-NEXT: [[X:%.*]] = phi float [ [[VAL]], %[[IF]] ], [ 0.000000e+00, %[[ENTRY]] ] +; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr 
float, ptr addrspace(1) [[OUT]], i32 999999 ; OPT-NEXT: store float [[X]], ptr addrspace(1) [[OUT_GEP]], align 4 -; OPT-NEXT: br label [[DONE:%.*]] -; OPT: done: +; OPT-NEXT: br label %[[DONE:.*]] +; OPT: [[DONE]]: ; OPT-NEXT: ret void ; ; GCN-LABEL: test_sink_small_offset_global_atomic_fadd_f32: @@ -36,6 +37,8 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add ; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] offset:28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: global_load_dword v0, v[0:1], off glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB0_2: ; %endif @@ -51,7 +54,7 @@ entry: if: %in.gep = getelementptr float, ptr addrspace(1) %in, i32 7 - %fadd2 = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %in.gep, float 2.0) + %fadd2 = atomicrmw fadd ptr addrspace(1) %in.gep, float 2.000000e+00 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 %val = load volatile float, ptr addrspace(1) undef br label %endif @@ -71,3 +74,8 @@ declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapt attributes #0 = { argmemonly nounwind } attributes #1 = { nounwind readnone willreturn } attributes #2 = { argmemonly nounwind willreturn } + +!0 = !{} +;. +; OPT: [[META0]] = !{} +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll index 3e9e5056f87d6e..ef180cef7ed2a7 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX940-NEXT: S_ENDPGM 0 ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic @@ -26,7 +26,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX11-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data) ret void @@ -42,7 +42,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = 
REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -55,7 +55,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data) diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll index 07293db896f440..e5dcf9ce309cd8 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll @@ -21,7 +21,7 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_intrinsic(ptr %ptr, double %d ; GFX90A_GFX940-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr) + ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s64) on %ir.ptr) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data) ret void @@ -44,7 +44,7 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(ptr %ptr, double %da ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr) + ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s64) on %ir.ptr) ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll deleted file mode 100644 index 
647c5b568b7ad5..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll +++ /dev/null @@ -1,63 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX12 %s - -define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn_intrinsic(ptr %ptr, <2 x half> %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_no_rtn_intrinsic - ; GFX940: bb.0 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: FLAT_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) - ; GFX940-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: flat_atomic_fadd_v2f16_no_rtn_intrinsic - ; GFX12: bb.0 (%ir-block.0): - ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX12-NEXT: FLAT_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) - ; GFX12-NEXT: S_ENDPGM 0 - %ret = call <2 x 
half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data) - ret void -} - -define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn_intrinsic(ptr %ptr, <2 x half> %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_rtn_intrinsic - ; GFX940: bb.0 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) - ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]] - ; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - ; - ; GFX12-LABEL: name: flat_atomic_fadd_v2f16_rtn_intrinsic - ; GFX12: bb.0 (%ir-block.0): - ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX12-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) - ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]] - ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data) - ret <2 x half> %ret -} 
- -declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr, <2 x half>) diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll index 05259b4f51310d..e94e2ee9f37d8e 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -1,113 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12-SDAG -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12-GISEL +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 | FileCheck %s -check-prefix=GFX12-SDAG +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 | FileCheck %s -check-prefix=GFX12-GISEL declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, <4 x i32>, i32, i32, i32, i32 immarg) declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32) -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) -declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - -define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { -; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX12-SDAG-NEXT: s_endpgm -; -; 
GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX12-GISEL-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - ret void -} - -define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { -; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_rtn: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_rtn: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { -; GFX12-SDAG-LABEL: global_atomic_pk_add_v2f16: -; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; 
GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16: -; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret void -} - -define <2 x half> @global_atomic_pk_add_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { -; GFX12-SDAG-LABEL: global_atomic_pk_add_v2f16_rtn: -; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16_rtn: -; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret <2 x half> %ret -} define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret_offset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_noret_offset: diff --git 
a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index 5322a283d3de4d..88a95937b9c906 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -1,35 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s -check-prefix=GFX940 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefix=GFX12 -declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) -declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1) declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) -define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_endpgm -; -; GFX12-LABEL: flat_atomic_fadd_f32_noret: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX12-NEXT: s_endpgm - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) - ret void -} - define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; 
GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: @@ -85,29 +60,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0 ret void } - -define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_rtn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) - ret float %ret -} - define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat: ; GFX940: ; %bb.0: @@ -137,51 +89,6 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ret float %ret } -define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX940-NEXT: s_endpgm -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_noret: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_mov_b32_e32 v2, s2 -; 
GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX12-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - ret void -} - -define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_rtn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - ret <2 x half> %ret -} - define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: @@ -282,202 +189,6 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ret <2 x i16> %ret } -define float @flat_atomic_fadd_f32_intrinsic_ret__posoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, 
v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 1023 - %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret float %result -} - -define float @flat_atomic_fadd_f32_intrinsic_ret__negoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -256 - %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret float %result -} - -define void @flat_atomic_fadd_f32_intrinsic_noret__posoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; 
GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 1023 - %unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret void -} - -define void @flat_atomic_fadd_f32_intrinsic_noret__negoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-1024 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -256 - %unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret void -} - -define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__posoffset(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: 
flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 1023 - %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret <2 x half> %result -} - -define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__negoffset(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -256 - %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret <2 x half> %result -} - -define void @flat_atomic_fadd_v2f16_intrinsic_noret__posoffset(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 
0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 1023 - %unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret void -} - -define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-1024 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -256 - %unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret void -} - attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 2615147b488d43..957c10ddf85e5d 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s 
-check-prefix=GFX90A -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX90A +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX940 declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) @@ -14,11 +14,8 @@ declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32 declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg) declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32 immarg) -declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1) @@ -1019,31 +1016,6 @@ main_body: ret void } -define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fadd_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s7 -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: global_atomic_fadd_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body @@ -1097,47 +1069,28 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB39_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: 
buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB39_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB39_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: .LBB39_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1147,45 +1100,26 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB40_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; 
GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB40_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB40_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB40_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1195,47 +1129,28 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB41_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX90A-NEXT: 
s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB41_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB41_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: .LBB41_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1245,70 +1160,32 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec 
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB42_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB42_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB42_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB42_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } 
-define double @global_atomic_fadd_f64_rtn(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fadd_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_atomic_fadd_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %data) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat: ; GFX90A: ; %bb.0: ; %main_body @@ -1429,57 +1306,37 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB49_3 -; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: 
.LBB49_2: ; %atomicrmw.start +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_2 -; GFX90A-NEXT: .LBB49_3: +; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB49_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB49_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1658,52 +1515,6 @@ main_body: 
ret double %ret } -define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fadd_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NEXT: v_mov_b32_e32 v2, s6 -; GFX90A-NEXT: v_mov_b32_e32 v3, s7 -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: flat_atomic_fadd_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) - ret void -} - -define double @flat_atomic_fadd_f64_rtn(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fadd_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) - ret double %ret -} - define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body @@ -1713,7 +1524,7 @@ define amdgpu_kernel void 
@flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 @@ -1724,7 +1535,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; @@ -1839,85 +1650,16 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, double %data) { ; GFX90A-LABEL: local_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB63_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s6 -; GFX90A-NEXT: ds_add_f64 v2, v[0:1] -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB63_2: -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: local_atomic_fadd_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 
v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB63_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; GFX940-NEXT: s_load_dword s6, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NEXT: ds_add_f64 v2, v[0:1] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB63_2: -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) - ret void -} - -define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) { -; GFX90A-LABEL: local_atomic_fadd_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: local_atomic_fadd_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) - ret double %ret -} - -define amdgpu_kernel void @local_atomic_fadd_f64_noret_from_flat_intrinsic(ptr addrspace(3) %ptr, double %data) { -; GFX90A-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic: -; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dword s4, s[2:3], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic: +; GFX940-LABEL: local_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dword s4, s[2:3], 0x24 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c @@ -1925,14 +1667,15 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_from_flat_intrinsic(ptr a ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: ds_add_f64 v2, v[0:1] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) %ptr, double %data) + %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret void } -define double @local_atomic_fadd_f64_rtn_from_flat_intrinsic(ptr addrspace(3) %ptr, double %data) { -; GFX90A-LABEL: local_atomic_fadd_f64_rtn_from_flat_intrinsic: +define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) { +; GFX90A-LABEL: local_atomic_fadd_f64_rtn: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 @@ -1941,7 +1684,7 @@ define double @local_atomic_fadd_f64_rtn_from_flat_intrinsic(ptr addrspace(3) %p ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: local_atomic_fadd_f64_rtn_from_flat_intrinsic: +; GFX940-LABEL: local_atomic_fadd_f64_rtn: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 @@ -1950,49 +1693,30 @@ define double @local_atomic_fadd_f64_rtn_from_flat_intrinsic(ptr addrspace(3) %p ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) 
%ptr, double %data) + %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret double %ret } define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB67_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB67_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB67_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB67_2: ; GFX940-NEXT: 
s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -2002,42 +1726,23 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB68_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB68_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB68_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX940-NEXT: .LBB68_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -2047,42 +1752,23 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB69_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB69_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB69_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, 
v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB69_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst @@ -2157,92 +1843,6 @@ main_body: ret double %ret } -define double @flat_atomic_fadd_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__posoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 511 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) - ret double %ret -} - -define double @flat_atomic_fadd_f64_intrinsic_rtn__negoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__negoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, 
i64 -511 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %gep, double %data) - ret double %ret -} - -define void @flat_atomic_fadd_f64_intrinsic_noret__posoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_noret__posoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 511 - %unused = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) - ret void -} - -define void @flat_atomic_fadd_f64_intrinsic_noret__negoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_noret__negoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -511 - %unused = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %gep, double %data) - ret void -} - define double 
@flat_atomic_fmin_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 { ; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__posoffset: ; GFX90A: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll index d70d45d44af0fd..c8bbafbfa44d85 100644 --- a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s - -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr nocapture, double) #8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck %s define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: IllegalGEPConst: @@ -17,14 +15,16 @@ define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b ; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_addc_u32 s1, s5, s1 ; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1 %i.2 = sext i32 %i to i64 %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2 %i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr - %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.4, double %c) #8 + %i.5 = atomicrmw fadd ptr %i.4, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } -attributes #8 = { argmemonly mustprogress nounwind willreturn "target-cpu"="gfx90a" } +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll index c1b333c6927497..ad3f920eadc91f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll +++ 
b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll @@ -5,118 +5,6 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s -define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic - ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0): - ; GFX908_GFX11_GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908_GFX11_GFX12-NEXT: {{ $}} - ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11_GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - 
; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic - ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0): - ; GFX908_GFX11_GFX12-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908_GFX11_GFX12-NEXT: {{ $}} - ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR 
killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic - ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0): - ; GFX908_GFX11_GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908_GFX11_GFX12-NEXT: {{ $}} - ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11_GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store 
(s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic - ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0): - ; GFX908_GFX11_GFX12-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908_GFX11_GFX12-NEXT: {{ $}} - ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store 
(s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret void -} - define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) { ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0): @@ -324,7 +212,4 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ret void } -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) -declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index 1b955eae891590..3951e02d46a8f3 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -4,126 +4,6 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s -define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY 
[[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic - ; GFX11: bb.0 (%ir-block.0): - ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret float %ret -} - -define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 
0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic - ; GFX11: bb.0 (%ir-block.0): - ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret float %ret -} - -define amdgpu_ps float @global_atomic_fadd_f32_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = 
REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic - ; GFX11: bb.0 (%ir-block.0): - ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret float %ret -} - -define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 
= REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic - ; GFX11: bb.0 (%ir-block.0): - ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret float %ret -} - define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) { ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw ; GFX90A_GFX940: bb.0 (%ir-block.0): diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll index 50bd5bf57dd9f2..ba94a53dff03bd 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll 
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll @@ -2,174 +2,6 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX90A %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX940 %s -define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; 
GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - -define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = 
REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - 
; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - -define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; 
GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - -define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: 
[[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed 
[[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) { ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -299,7 +131,4 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac ret double %ret } -declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1), double) -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1), double) - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll index 60345c0b44339b..02e425e6d10a8d 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll @@ -1,13 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s 
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic + +define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { + ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn ; GFX908: bb.0 (%ir-block.0): ; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX908-NEXT: {{ $}} @@ -16,10 +17,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1 ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic + ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX90A_GFX940-NEXT: {{ $}} @@ -28,14 +29,14 @@ 
define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1 ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic +define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) { + ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn ; GFX908: bb.0 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX908-NEXT: {{ $}} @@ -44,10 +45,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrs ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; 
GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic + ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX90A_GFX940-NEXT: {{ $}} @@ -56,14 +57,14 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrs ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic +define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat(ptr addrspace(1) %ptr, <2 x half> %data) { + ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat ; GFX908: bb.0 (%ir-block.0): ; GFX908-NEXT: liveins: $vgpr0, 
$vgpr1, $vgpr2 ; GFX908-NEXT: {{ $}} @@ -72,10 +73,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrsp ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic + ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX90A_GFX940-NEXT: {{ $}} @@ -84,14 +85,14 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrsp ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define amdgpu_ps 
void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic +define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat(ptr addrspace(1) inreg %ptr, <2 x half> %data) { + ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat ; GFX908: bb.0 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX908-NEXT: {{ $}} @@ -100,10 +101,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic + ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX90A_GFX940-NEXT: {{ $}} @@ -112,11 +113,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], 
killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) -declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll index c8caf1fe365b15..794a52b6900ea1 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -define amdgpu_ps <2 x half> 
@global_atomic_fadd_v2f16_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_intrinsic +define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { + ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX90A_GFX940-NEXT: {{ $}} @@ -14,15 +14,15 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_intrinsic(ptr addrspac ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_intrinsic +define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) { + ; GFX90A_GFX940-LABEL: name: 
global_atomic_fadd_v2f16_saddr_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX90A_GFX940-NEXT: {{ $}} @@ -31,15 +31,15 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_intrinsic(ptr ad ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_flat_intrinsic +define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat(ptr addrspace(1) %ptr, <2 x half> %data) { + ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_flat ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX90A_GFX940-NEXT: {{ $}} @@ -48,15 +48,15 @@ define amdgpu_ps <2 x half> 
@global_atomic_fadd_v2f16_rtn_flat_intrinsic(ptr add ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic +define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat(ptr addrspace(1) inreg %ptr, <2 x half> %data) { + ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX90A_GFX940-NEXT: {{ $}} @@ -65,12 +65,11 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic(p ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) -declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll index 0147084a6996f3..84065c4675ab1f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck --check-prefix=GCN %s ; Test using saddr addressing mode of global_* flat atomic instructions. 
@@ -11,10 +11,12 @@ define amdgpu_ps void @global_fadd_saddr_f32_nortn(ptr addrspace(1) inreg %sbase ; GCN-LABEL: global_fadd_saddr_f32_nortn: ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) %gep0, float %data) + %ret = atomicrmw fadd ptr addrspace(1) %gep0, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } @@ -22,11 +24,13 @@ define amdgpu_ps void @global_fadd_saddr_f32_nortn_neg128(ptr addrspace(1) inreg ; GCN-LABEL: global_fadd_saddr_f32_nortn_neg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) %gep1, float %data) + %ret = atomicrmw fadd ptr addrspace(1) %gep1, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } @@ -34,10 +38,12 @@ define amdgpu_ps void @global_fadd_saddr_v2f16_nortn(ptr addrspace(1) inreg %sba ; GCN-LABEL: global_fadd_saddr_v2f16_nortn: ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_pk_add_f16 v0, v1, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1(ptr addrspace(1) %gep0, <2 x half> %data) + %ret = atomicrmw fadd ptr 
addrspace(1) %gep0, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -45,15 +51,14 @@ define amdgpu_ps void @global_fadd_saddr_v2f16_nortn_neg128(ptr addrspace(1) inr ; GCN-LABEL: global_fadd_saddr_v2f16_nortn_neg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_pk_add_f16 v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1(ptr addrspace(1) %gep1, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %gep1, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) nocapture, float) #0 -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1(ptr addrspace(1) nocapture, <2 x half>) #0 - -attributes #0 = { argmemonly nounwind willreturn } +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.gfx90a.ll deleted file mode 100644 index af841057471891..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.gfx90a.ll +++ /dev/null @@ -1,56 +0,0 @@ -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A - -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) - -; GFX90A-LABEL: {{^}}global_atomic_add_f32: -; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc -define amdgpu_ps float @global_atomic_add_f32(ptr addrspace(1) %ptr, float %data) { -main_body: - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr 
addrspace(1) %ptr, float %data) - ret float %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_add_f32_off4: -; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:4 glc -define amdgpu_ps float @global_atomic_add_f32_off4(ptr addrspace(1) %ptr, float %data) { -main_body: - %p = getelementptr float, ptr addrspace(1) %ptr, i64 1 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data) - ret float %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_add_f32_offneg4: -; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:-4 glc -define amdgpu_ps float @global_atomic_add_f32_offneg4(ptr addrspace(1) %ptr, float %data) { -main_body: - %p = getelementptr float, ptr addrspace(1) %ptr, i64 -1 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data) - ret float %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16: -; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc -define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16_off4: -; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:4 glc -define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_off4(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 1 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data) - ret <2 x half> %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4: -; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-4 glc -define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_offneg4(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -1 - %ret = call <2 x half> 
@llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data) - ret <2 x half> %ret -} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.ll deleted file mode 100644 index 0c3ce3308dd8fe..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.ll +++ /dev/null @@ -1,77 +0,0 @@ -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN - -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) -declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr, float) - -; GCN-LABEL: {{^}}global_atomic_add_f32: -; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @global_atomic_add_f32(ptr addrspace(1) %ptr, float %data) { -main_body: - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_add_f32_off4: -; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 -define amdgpu_kernel void @global_atomic_add_f32_off4(ptr addrspace(1) %ptr, float %data) { -main_body: - %p = getelementptr float, ptr addrspace(1) %ptr, i64 1 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_add_f32_offneg4: -; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4 -define amdgpu_kernel void @global_atomic_add_f32_offneg4(ptr addrspace(1) %ptr, float %data) { -main_body: - %p = getelementptr float, ptr addrspace(1) %ptr, i64 -1 - %ret = call 
float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16: -; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_off4: -; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 -define amdgpu_kernel void @global_atomic_pk_add_v2f16_off4(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 1 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4: -; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4{{$}} -define amdgpu_kernel void @global_atomic_pk_add_v2f16_offneg4(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -1 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data) - ret void -} - -; Make sure this artificially selects with an incorrect subtarget, but -; the feature set. 
-; GCN-LABEL: {{^}}global_atomic_fadd_f32_wrong_subtarget: -; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @global_atomic_fadd_f32_wrong_subtarget(ptr addrspace(1) %ptr, float %data) #0 { - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -; GCN-LABEL: {{^}}flat_atomic_fadd_f32_wrong_subtarget: -; GCN: flat_atomic_add_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} -define amdgpu_kernel void @flat_atomic_fadd_f32_wrong_subtarget(ptr %ptr, float %data) #1 { - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) - ret void -} - -attributes #0 = { "target-cpu"="gfx803" "target-features"="+atomic-fadd-no-rtn-insts"} -attributes #1 = { "target-cpu"="gfx803" "target-features"="+flat-atomic-fadd-f32-inst"} diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll index 39541537b3647a..8ea83da78f889d 100644 --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll @@ -40,12 +40,12 @@ define void @shl_base_global_ptr_global_atomic_fadd(ptr addrspace(1) %out, ptr a %cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64 %shl = shl i64 %cast, 2 %castback = inttoptr i64 %shl to ptr addrspace(1) - call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %castback, float 100.0) + %unused = atomicrmw fadd ptr addrspace(1) %castback, float 100.0 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 store volatile i64 %cast, ptr addrspace(1) %extra.use, align 4 ret void } -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) #1 - attributes #0 = { nounwind } attributes #1 = { argmemonly nounwind willreturn } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll b/llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll deleted file 
mode 100644 index 4b84f9175307f5..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX906 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s -; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX1030 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100 %s - -; GFX906: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.global.atomic.fadd - -; GFX908-LABEL: fadd_test: -; GFX908: global_atomic_add_f32 - -; GFX90A-LABEL: fadd_test: -; GFX90A: global_atomic_add_f32 - -; GFX940-LABEL: fadd_test: -; GFX940: global_atomic_add_f32 - -; GFX1030: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.global.atomic.fadd - -; GFX1100-LABEL: fadd_test: -; GFX1100: global_atomic_add_f32 - -define fastcc void @fadd_test(ptr addrspace(1) nocapture noundef %0, float noundef %1) unnamed_addr { - %3 = tail call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) noundef %0, float noundef %1) - ret void -} -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll index b05caa440ddcb5..3d529a2c6ef69a 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll @@ -1,20 +1,17 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s | FileCheck %s -declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) declare float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %ptr, float %data) declare float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %ptr, float %data) define amdgpu_kernel void @flat_atomic_fadd_f32_p1(ptr addrspace(1) %ptr, float %data) { ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p1 ; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p1.f32(ptr addrspace(1) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p1.f32(ptr addrspace(1) [[PTR]], float [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(1) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void @@ -24,13 +21,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p2(ptr addrspace(2) %ptr, float ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p2 ; CHECK-SAME: (ptr addrspace(2) [[PTR:%.*]], float [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: ret void ; %cast = 
addrspacecast ptr addrspace(2) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void @@ -40,13 +35,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p3(ptr addrspace(3) %ptr, float ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p3 ; CHECK-SAME: (ptr addrspace(3) [[PTR:%.*]], float [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(3) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void @@ -55,13 +48,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p3(ptr addrspace(3) %ptr, float define amdgpu_kernel void @flat_atomic_fadd_f32_p4(ptr addrspace(4) %ptr, float %data) { ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p4 ; CHECK-SAME: (ptr addrspace(4) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p4.f32(ptr addrspace(4) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p4.f32(ptr addrspace(4) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p4.f32(ptr addrspace(4) [[PTR]], float [[DATA]]) ; CHECK-NEXT: ret void ; %cast = 
addrspacecast ptr addrspace(4) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void @@ -71,13 +62,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p5(ptr addrspace(5) %ptr, float ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p5 ; CHECK-SAME: (ptr addrspace(5) [[PTR:%.*]], float [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(5) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void @@ -86,13 +75,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p5(ptr addrspace(5) %ptr, float define amdgpu_kernel void @flat_atomic_fadd_f32_p6(ptr addrspace(6) %ptr, float %data) { ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p6 ; CHECK-SAME: (ptr addrspace(6) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p6.f32(ptr addrspace(6) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p6.f32(ptr addrspace(6) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p6.f32(ptr addrspace(6) [[PTR]], float [[DATA]]) ; CHECK-NEXT: ret void ; %cast = 
addrspacecast ptr addrspace(6) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void @@ -102,13 +89,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p7(ptr addrspace(7) %ptr, float ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p7 ; CHECK-SAME: (ptr addrspace(7) [[PTR:%.*]], float [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(7) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(7) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void @@ -117,32 +102,27 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p7(ptr addrspace(7) %ptr, float define amdgpu_kernel void @flat_atomic_fadd_f32_p99(ptr addrspace(99) %ptr, float %data) { ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p99 ; CHECK-SAME: (ptr addrspace(99) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p99.f32(ptr addrspace(99) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p99.f32(ptr addrspace(99) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p99.f32(ptr addrspace(99) [[PTR]], float [[DATA]]) ; CHECK-NEXT: ret 
void ; %cast = addrspacecast ptr addrspace(99) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void } -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) define amdgpu_kernel void @flat_atomic_fadd_f64_p1(ptr addrspace(1) %ptr, double %data) { ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p1 ; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) [[PTR]], double [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p1.f64(ptr addrspace(1) [[PTR]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p1.f64(ptr addrspace(1) [[PTR]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(1) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void @@ -152,13 +132,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p2(ptr addrspace(2) %ptr, double ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p2 ; CHECK-SAME: (ptr addrspace(2) [[PTR:%.*]], double [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call double 
@llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(2) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void @@ -168,13 +146,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p3(ptr addrspace(3) %ptr, double ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p3 ; CHECK-SAME: (ptr addrspace(3) [[PTR:%.*]], double [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(3) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void @@ -183,13 +159,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p3(ptr addrspace(3) %ptr, double define amdgpu_kernel void @flat_atomic_fadd_f64_p4(ptr addrspace(4) %ptr, double %data) { ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p4 ; CHECK-SAME: (ptr addrspace(4) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p4.f64(ptr addrspace(4) [[PTR]], double [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = 
call double @llvm.amdgcn.flat.atomic.fmax.f64.p4.f64(ptr addrspace(4) [[PTR]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p4.f64(ptr addrspace(4) [[PTR]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(4) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void @@ -199,13 +173,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p5(ptr addrspace(5) %ptr, double ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p5 ; CHECK-SAME: (ptr addrspace(5) [[PTR:%.*]], double [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(5) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void @@ -214,13 +186,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p5(ptr addrspace(5) %ptr, double define amdgpu_kernel void @flat_atomic_fadd_f64_p6(ptr addrspace(6) %ptr, double %data) { ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p6 ; CHECK-SAME: (ptr addrspace(6) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p6.f64(ptr addrspace(6) [[PTR]], double 
[[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p6.f64(ptr addrspace(6) [[PTR]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p6.f64(ptr addrspace(6) [[PTR]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(6) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void @@ -230,13 +200,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p7(ptr addrspace(7) %ptr, double ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p7 ; CHECK-SAME: (ptr addrspace(7) [[PTR:%.*]], double [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(7) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(7) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void @@ -245,13 +213,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p7(ptr addrspace(7) %ptr, double define amdgpu_kernel void @flat_atomic_fadd_f64_p99(ptr addrspace(99) %ptr, double %data) { ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p99 ; CHECK-SAME: (ptr addrspace(99) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call double 
@llvm.amdgcn.flat.atomic.fadd.f64.p99.f64(ptr addrspace(99) [[PTR]], double [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p99.f64(ptr addrspace(99) [[PTR]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p99.f64(ptr addrspace(99) [[PTR]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(99) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index d9c3c4b17090bd..57e6fdb35113e6 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -1,9 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr nocapture, double) #8 -declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr nocapture, double) #8 -declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr nocapture, double) #8 +declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr nocapture, double) #0 +declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr nocapture, double) #0 define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-LABEL: InferNothing: @@ -21,37 +20,49 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: buffer_wbinvl1_vol ; 
CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1 %i.2 = sext i32 %i to i64 %i.3 = getelementptr inbounds double, ptr %b, i64 %i.2 - %i.4 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.3, double %c) #8 + %i.4 = atomicrmw fadd ptr %i.3, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } - define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: InferFadd: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 +; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB1_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[2:3], 0x24 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CHECK-NEXT: s_add_u32 s0, s4, s0 -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 -; CHECK-NEXT: s_addc_u32 s1, s5, s1 -; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-8 +; CHECK-NEXT: s_ashr_i32 s9, s8, 31 +; CHECK-NEXT: s_lshl_b64 s[2:3], s[8:9], 3 +; CHECK-NEXT: s_add_u32 s2, s4, s2 +; CHECK-NEXT: s_addc_u32 s3, s5, s3 +; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; CHECK-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1] +; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] offset:-8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1 %i.2 = sext i32 %i to i64 %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2 %i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr - %i.5 = tail call contract double 
@llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.4, double %c) #8 + %0 = atomicrmw fadd ptr %i.4, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } @@ -75,7 +86,7 @@ entry: %i.2 = sext i32 %i to i64 %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2 %i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr - %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %i.4, double %c) #8 + %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %i.4, double %c) #1 ret void } @@ -99,27 +110,43 @@ entry: %i.2 = sext i32 %i to i64 %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2 %i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr - %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %i.4, double %c) #8 + %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %i.4, double %c) #1 ret void } define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, double %c, ptr %d) { ; CHECK-LABEL: InferMixed: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 ; CHECK-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x3c ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_mov_b64 s[0:1], exec ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; CHECK-NEXT: v_mov_b32_e32 v0, s8 ; CHECK-NEXT: v_mov_b32_e32 v1, s9 -; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; CHECK-NEXT: s_addc_u32 s1, s5, s1 ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; CHECK-NEXT: global_atomic_add_f64 v4, v[2:3], s[0:1] offset:-7 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: 
s_and_saveexec_b64 s[8:9], vcc +; CHECK-NEXT: s_cbranch_execz .LBB4_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s2, s[2:3], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[2:3], s[2:3], 3 +; CHECK-NEXT: s_add_u32 s2, s4, s2 +; CHECK-NEXT: s_addc_u32 s3, s5, s3 +; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; CHECK-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1] +; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] offset:-7 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK-NEXT: .LBB4_2: ; CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1 @@ -127,13 +154,13 @@ entry: %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2 br label %bb1 -bb1: +bb1: ; preds = %entry %i.7 = ptrtoint ptr addrspace(1) %i.3 to i64 %i.8 = add nsw i64 %i.7, 1 %i.9 = inttoptr i64 %i.8 to ptr addrspace(1) - %i.10 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %d, double %c) #23 + %0 = atomicrmw fadd ptr %d, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 %i.11 = addrspacecast ptr addrspace(1) %i.9 to ptr - %i.12 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.11, double %c) #23 + %1 = atomicrmw fadd ptr %i.11, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } @@ -158,10 +185,21 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccnz .LBB5_1 ; CHECK-NEXT: ; %bb.2: ; %bb1 -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB5_4 +; 
CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; CHECK-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK-NEXT: .LBB5_4: ; CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1 @@ -170,7 +208,7 @@ entry: %i.4 = ptrtoint ptr addrspace(1) %i.3 to i64 br label %bb0 -bb0: +bb0: ; preds = %bb0, %entry %phi = phi ptr addrspace(1) [ %i.3, %entry ], [ %i.9, %bb0 ] %i.7 = ptrtoint ptr addrspace(1) %phi to i64 %i.8 = sub nsw i64 %i.7, 1 @@ -178,12 +216,13 @@ bb0: %i.9 = inttoptr i64 %i.7 to ptr addrspace(1) br i1 %cmp2, label %bb1, label %bb0 -bb1: +bb1: ; preds = %bb0 %i.10 = addrspacecast ptr addrspace(1) %i.9 to ptr - %i.11 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.10, double %c) #23 + %0 = atomicrmw fadd ptr %i.10, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } +attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #1 = { mustprogress nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" } -attributes #8 = { argmemonly mustprogress nounwind willreturn "target-cpu"="gfx90a" } - +!0 = !{} From e454d3103739c19a863a210701cc03528c96dd68 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 22 Aug 2024 20:40:16 +0100 Subject: [PATCH 258/426] [VPlan] Factor out precomputing costs from LVP::cost (NFC). Move the logic for pre-computing costs of certain instructions to a separate helper function, allowing re-use in a follow-up patch. 
--- .../Vectorize/LoopVectorizationPlanner.h | 6 ++++++ .../Transforms/Vectorize/LoopVectorize.cpp | 21 ++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 3bb7a8e651a3f6..b5f87e458833d6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -344,6 +344,12 @@ class LoopVectorizationPlanner { /// been retired. InstructionCost cost(VPlan &Plan, ElementCount VF) const; + /// Precompute costs for certain instructions using the legacy cost model. The + /// function is used to bring up the VPlan-based cost model to initially avoid + /// taking different decisions due to inaccuracies in the legacy cost model. + InstructionCost precomputeCosts(VPlan &Plan, ElementCount VF, + VPCostContext &CostCtx) const; + public: LoopVectorizationPlanner( Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 23d0f39ad93ebe..8e9324ba718b39 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7072,13 +7072,10 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { SkipCostComputation.contains(UI); } -InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, - ElementCount VF) const { - InstructionCost Cost = 0; - LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext(); - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), - LLVMCtx, CM); - +InstructionCost +LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, + VPCostContext &CostCtx) const { + InstructionCost Cost; // Cost modeling for inductions is inaccurate in the legacy cost model // compared to the recipes that are generated. 
To match here initially during // VPlan cost model bring up directly use the induction costs from the legacy @@ -7224,6 +7221,16 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF); Cost += BranchCost; } + return Cost; +} + +InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, + ElementCount VF) const { + LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext(); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), + LLVMCtx, CM); + InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); + // Now compute and add the VPlan-based cost. Cost += Plan.cost(VF, CostCtx); LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n"); From a2d8743cc86f96f6b1cbd85798328bd3fb2bf4de Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Thu, 22 Aug 2024 22:03:05 +0200 Subject: [PATCH 259/426] [LLD][COFF] Generate X64 thunks for ARM64EC entry points and patchable functions. (#105499) This implements Fast-Forward Sequences documented in ARM64EC ABI https://learn.microsoft.com/en-us/windows/arm/arm64ec-abi. There are two conditions when linker should generate such thunks: - For each exported ARM64EC functions. It applies only to ARM64EC functions (we may also have pure x64 functions, for which no thunk is needed). MSVC linker creates `EXP+` symbol in those cases that points to the thunk and uses that symbol for the export. It's observable from the module: it's possible to reference such symbols as I did in the test. Note that it uses export name, not name of the symbol that's exported (as in `foo` in `/EXPORT:foo=bar`). This implies that if the same function is exported multiple times, it will have multiple thunks. I followed this MSVC behavior. - For hybrid_patchable functions. The linker tries to generate a thunk for each undefined `EXP+*` symbol (and such symbols are created by the compiler as a target of weak alias from the demangled name). 
MSVC linker tries to find corresponding `*$hp_target` symbol and if fails to do so, it outputs a cryptic error like `LINK : fatal error LNK1000: Internal error during IMAGE::BuildImage`. I just skip generating the thunk in such case (which causes undefined reference error). MSVC linker additionally checks that the symbol complex type is a function (see also #102898). We generally don't do such checks in LLD, so I made it less strict. It should be fine: if it's some data symbol, it will not have `$hp_target` symbol, so we will skip it anyway. --- lld/COFF/Chunks.cpp | 5 + lld/COFF/Chunks.h | 20 +++ lld/COFF/Driver.cpp | 69 +++++++ lld/COFF/Driver.h | 4 + lld/COFF/SymbolTable.cpp | 3 + lld/COFF/SymbolTable.h | 3 + lld/COFF/Writer.cpp | 11 ++ lld/test/COFF/arm64ec-export-thunks.test | 188 ++++++++++++++++++++ lld/test/COFF/arm64ec-exports.s | 12 +- lld/test/COFF/arm64ec-patchable-thunks.test | 86 +++++++++ 10 files changed, 395 insertions(+), 6 deletions(-) create mode 100644 lld/test/COFF/arm64ec-export-thunks.test create mode 100644 lld/test/COFF/arm64ec-patchable-thunks.test diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index 2807c894520512..be44950a1720e3 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -1073,4 +1073,9 @@ void AbsolutePointerChunk::writeTo(uint8_t *buf) const { } } +void ECExportThunkChunk::writeTo(uint8_t *buf) const { + memcpy(buf, ECExportThunkCode, sizeof(ECExportThunkCode)); + write32le(buf + 10, target->getRVA() - rva - 14); +} + } // namespace lld::coff diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h index df311524a8d185..5443d4619a977e 100644 --- a/lld/COFF/Chunks.h +++ b/lld/COFF/Chunks.h @@ -749,6 +749,26 @@ class ECCodeMapChunk : public NonSectionChunk { std::vector ↦ }; +static const uint8_t ECExportThunkCode[] = { + 0x48, 0x8b, 0xc4, // movq %rsp, %rax + 0x48, 0x89, 0x58, 0x20, // movq %rbx, 0x20(%rax) + 0x55, // pushq %rbp + 0x5d, // popq %rbp + 0xe9, 0, 0, 0, 0, // jmp *0x0 + 0xcc, // int3 + 0xcc // int3 +}; 
+ +class ECExportThunkChunk : public NonSectionCodeChunk { +public: + explicit ECExportThunkChunk(Defined *targetSym) : target(targetSym) {} + size_t getSize() const override { return sizeof(ECExportThunkCode); }; + void writeTo(uint8_t *buf) const override; + MachineTypes getMachine() const override { return AMD64; } + + Defined *target; +}; + // MinGW specific, for the "automatic import of variables from DLLs" feature. // This provides the table of runtime pseudo relocations, for variable // references that turned out to need to be imported from a DLL even though diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 9e28b1c50be504..c09c91fe4b1719 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -1317,6 +1317,72 @@ void LinkerDriver::convertResources() { f->includeResourceChunks(); } +void LinkerDriver::maybeCreateECExportThunk(StringRef name, Symbol *&sym) { + Defined *def; + if (!sym) + return; + if (auto undef = dyn_cast(sym)) + def = undef->getWeakAlias(); + else + def = dyn_cast(sym); + if (!def) + return; + + if (def->getChunk()->getArm64ECRangeType() != chpe_range_type::Arm64EC) + return; + StringRef expName; + if (auto mangledName = getArm64ECMangledFunctionName(name)) + expName = saver().save("EXP+" + *mangledName); + else + expName = saver().save("EXP+" + name); + sym = addUndefined(expName); + if (auto undef = dyn_cast(sym)) { + if (!undef->getWeakAlias()) { + auto thunk = make(def); + replaceSymbol(undef, undef->getName(), thunk); + } + } +} + +void LinkerDriver::createECExportThunks() { + // Check if EXP+ symbols have corresponding $hp_target symbols and use them + // to create export thunks when available. 
+ for (Symbol *s : ctx.symtab.expSymbols) { + if (!s->isUsedInRegularObj) + continue; + assert(s->getName().starts_with("EXP+")); + std::string targetName = + (s->getName().substr(strlen("EXP+")) + "$hp_target").str(); + Symbol *sym = ctx.symtab.find(targetName); + if (!sym) + continue; + Defined *targetSym; + if (auto undef = dyn_cast(sym)) + targetSym = undef->getWeakAlias(); + else + targetSym = dyn_cast(sym); + if (!targetSym) + continue; + + auto *undef = dyn_cast(s); + if (undef && !undef->getWeakAlias()) { + auto thunk = make(targetSym); + replaceSymbol(undef, undef->getName(), thunk); + } + if (!targetSym->isGCRoot) { + targetSym->isGCRoot = true; + ctx.config.gcroot.push_back(targetSym); + } + } + + if (ctx.config.entry) + maybeCreateECExportThunk(ctx.config.entry->getName(), ctx.config.entry); + for (Export &e : ctx.config.exports) { + if (!e.data) + maybeCreateECExportThunk(e.extName.empty() ? e.name : e.extName, e.sym); + } +} + // In MinGW, if no symbols are chosen to be exported, then all symbols are // automatically exported by default. This behavior can be forced by the // -export-all-symbols option, so that it happens even when exports are @@ -2520,6 +2586,9 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { if (!wrapped.empty()) wrapSymbols(ctx, wrapped); + if (isArm64EC(config->machine)) + createECExportThunks(); + // Resolve remaining undefined symbols and warn about imported locals. ctx.symtab.resolveRemainingUndefines(); if (errorCount()) diff --git a/lld/COFF/Driver.h b/lld/COFF/Driver.h index fa54de05befb58..b5cf8e2f18fd4e 100644 --- a/lld/COFF/Driver.h +++ b/lld/COFF/Driver.h @@ -270,6 +270,10 @@ class LinkerDriver { // Convert Windows resource files (.res files) to a .obj file. MemoryBufferRef convertResToCOFF(ArrayRef mbs, ArrayRef objs); + + // Create export thunks for exported and patchable Arm64EC function symbols. 
+ void createECExportThunks(); + void maybeCreateECExportThunk(StringRef name, Symbol *&sym); }; // Create enum with OPT_xxx values for each option in Options.td diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 6c3c4e3931aa84..1dfff0a90f4aee 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -551,6 +551,9 @@ std::pair SymbolTable::insert(StringRef name) { sym->pendingArchiveLoad = false; sym->canInline = true; inserted = true; + + if (isArm64EC(ctx.config.machine) && name.starts_with("EXP+")) + expSymbols.push_back(sym); } return {sym, inserted}; } diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h index 93b376b69f7ecf..b5f95d2ad7f112 100644 --- a/lld/COFF/SymbolTable.h +++ b/lld/COFF/SymbolTable.h @@ -116,6 +116,9 @@ class SymbolTable { // A list of chunks which to be added to .rdata. std::vector localImportChunks; + // A list of EC EXP+ symbols. + std::vector expSymbols; + // Iterates symbols in non-determinstic hash table order. template void forEachSymbol(T callback) { for (auto &pair : symMap) diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index f776c76a47ae96..776595d98c391d 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -304,6 +304,7 @@ class Writer { uint64_t sizeOfHeaders; OutputSection *textSec; + OutputSection *hexpthkSec; OutputSection *rdataSec; OutputSection *buildidSec; OutputSection *dataSec; @@ -984,6 +985,8 @@ void Writer::createSections() { // Try to match the section order used by link.exe. textSec = createSection(".text", code | r | x); + if (isArm64EC(ctx.config.machine)) + hexpthkSec = createSection(".hexpthk", code | r | x); createSection(".bss", bss | r | w); rdataSec = createSection(".rdata", data | r); buildidSec = createSection(".buildid", data | r); @@ -2046,6 +2049,14 @@ void Writer::maybeAddRVATable(SymbolRVASet tableSymbols, StringRef tableSym, // Create CHPE metadata chunks. 
void Writer::createECChunks() { + for (Symbol *s : ctx.symtab.expSymbols) { + auto sym = dyn_cast(s); + if (!sym || !sym->getChunk()) + continue; + if (auto thunk = dyn_cast(sym->getChunk())) + hexpthkSec->addChunk(thunk); + } + auto codeMapChunk = make(codeMap); rdataSec->addChunk(codeMapChunk); Symbol *codeMapSym = ctx.symtab.findUnderscore("__hybrid_code_map"); diff --git a/lld/test/COFF/arm64ec-export-thunks.test b/lld/test/COFF/arm64ec-export-thunks.test new file mode 100644 index 00000000000000..6ed0514d4b17f3 --- /dev/null +++ b/lld/test/COFF/arm64ec-export-thunks.test @@ -0,0 +1,188 @@ +REQUIRES: aarch64, x86 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows arm64ec-data.s -o arm64ec-data.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows arm64ec-func.s -o arm64ec-func.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows antidep-func.s -o antidep-func.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows arm64ec-data-sym.s -o arm64ec-data-sym.obj +RUN: llvm-mc -filetype=obj -triple=x86_64-windows x86_64-func.s -o x86_64-func.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj + +RUN: lld-link -out:exports.dll -machine:arm64ec arm64ec-func.obj x86_64-func.obj loadconfig-arm64ec.obj \ +RUN: arm64ec-data.obj -dll -noentry -export:arm64ec_func -export:func=arm64ec_func \ +RUN: -export:x86_64_func -export:data_sym,DATA + +RUN: llvm-objdump -d exports.dll | FileCheck -check-prefix=EXP-DISASM %s +EXP-DISASM: Disassembly of section .text: +EXP-DISASM-EMPTY: +EXP-DISASM-NEXT: 0000000180001000 <.text>: +EXP-DISASM-NEXT: 180001000: 90000008 adrp x8, 0x180001000 <.text> +EXP-DISASM-NEXT: 180001004: 52800040 mov w0, #0x2 +EXP-DISASM-NEXT: 180001008: d65f03c0 ret +EXP-DISASM-NEXT: ... 
+EXP-DISASM-EMPTY: +EXP-DISASM-NEXT: 0000000180002000 : +EXP-DISASM-NEXT: 180002000: e8 fb ef ff ff callq 0x180001000 <.text> +EXP-DISASM-NEXT: 180002005: b8 03 00 00 00 movl $0x3, %eax +EXP-DISASM-NEXT: 18000200a: c3 retq +EXP-DISASM-EMPTY: +EXP-DISASM-NEXT: Disassembly of section .hexpthk: +EXP-DISASM-EMPTY: +EXP-DISASM-NEXT: 0000000180003000 : +EXP-DISASM-NEXT: 180003000: 48 8b c4 movq %rsp, %rax +EXP-DISASM-NEXT: 180003003: 48 89 58 20 movq %rbx, 0x20(%rax) +EXP-DISASM-NEXT: 180003007: 55 pushq %rbp +EXP-DISASM-NEXT: 180003008: 5d popq %rbp +EXP-DISASM-NEXT: 180003009: e9 f2 df ff ff jmp 0x180001000 <.text> +EXP-DISASM-NEXT: 18000300e: cc int3 +EXP-DISASM-NEXT: 18000300f: cc int3 +EXP-DISASM-EMPTY: +EXP-DISASM-NEXT: 0000000180003010 : +EXP-DISASM-NEXT: 180003010: 48 8b c4 movq %rsp, %rax +EXP-DISASM-NEXT: 180003013: 48 89 58 20 movq %rbx, 0x20(%rax) +EXP-DISASM-NEXT: 180003017: 55 pushq %rbp +EXP-DISASM-NEXT: 180003018: 5d popq %rbp +EXP-DISASM-NEXT: 180003019: e9 e2 df ff ff jmp 0x180001000 <.text> +EXP-DISASM-NEXT: 18000301e: cc int3 +EXP-DISASM-NEXT: 18000301f: cc int3 + +RUN: llvm-objdump -p exports.dll | FileCheck -check-prefix=EXP-EXPORT %s +EXP-EXPORT: Ordinal RVA Name +EXP-EXPORT-NEXT: 1 0x3010 arm64ec_func +EXP-EXPORT-NEXT: 2 0x6000 data_sym +EXP-EXPORT-NEXT: 3 0x3000 func +EXP-EXPORT-NEXT: 4 0x2000 x86_64_func + +RUN: llvm-readobj --coff-load-config exports.dll | FileCheck -check-prefix=EXP-CHPE %s +EXP-CHPE: CodeMap [ +EXP-CHPE-NEXT: 0x1000 - 0x100C ARM64EC +EXP-CHPE-NEXT: 0x2000 - 0x3020 X64 +EXP-CHPE-NEXT: ] + +RUN: llvm-objdump -s --section=.test exports.dll | FileCheck --check-prefix=EXP-DATA %s +EXP-DATA: 180006000 00300000 10300000 + +RUN: lld-link -out:exports2.dll -machine:arm64ec antidep-func.obj x86_64-func.obj loadconfig-arm64ec.obj \ +RUN: arm64ec-data.obj -dll -noentry -export:arm64ec_func -export:func=arm64ec_func \ +RUN: -export:x86_64_func -export:data_sym,DATA + +RUN: llvm-objdump -d exports2.dll | FileCheck -check-prefix=EXP-DISASM 
%s +RUN: llvm-objdump -p exports2.dll | FileCheck -check-prefix=EXP-EXPORT %s +RUN: llvm-objdump -s --section=.test exports2.dll | FileCheck --check-prefix=EXP-DATA %s +RUN: llvm-readobj --coff-load-config exports2.dll | FileCheck -check-prefix=EXP-CHPE %s + +RUN: lld-link -out:entry.dll -machine:arm64ec arm64ec-func.obj loadconfig-arm64ec.obj -dll -entry:arm64ec_func + +RUN: llvm-objdump -d entry.dll | FileCheck -check-prefix=ENTRY-DISASM %s +ENTRY-DISASM: Disassembly of section .text: +ENTRY-DISASM-EMPTY: +ENTRY-DISASM-NEXT: 0000000180001000 <.text>: +ENTRY-DISASM-NEXT: 180001000: 90000008 adrp x8, 0x180001000 <.text> +ENTRY-DISASM-NEXT: 180001004: 52800040 mov w0, #0x2 // =2 +ENTRY-DISASM-NEXT: 180001008: d65f03c0 ret +ENTRY-DISASM-EMPTY: +ENTRY-DISASM-NEXT: Disassembly of section .hexpthk: +ENTRY-DISASM-EMPTY: +ENTRY-DISASM-NEXT: 0000000180002000 <.hexpthk>: +ENTRY-DISASM-NEXT: 180002000: 48 8b c4 movq %rsp, %rax +ENTRY-DISASM-NEXT: 180002003: 48 89 58 20 movq %rbx, 0x20(%rax) +ENTRY-DISASM-NEXT: 180002007: 55 pushq %rbp +ENTRY-DISASM-NEXT: 180002008: 5d popq %rbp +ENTRY-DISASM-NEXT: 180002009: e9 f2 ef ff ff jmp 0x180001000 <.text> +ENTRY-DISASM-NEXT: 18000200e: cc int3 +ENTRY-DISASM-NEXT: 18000200f: cc int3 + +RUN: llvm-readobj --headers entry.dll | FileCheck -check-prefix=ENTRY %s +ENTRY: AddressOfEntryPoint: 0x2000 + +RUN: llvm-readobj --coff-load-config entry.dll | FileCheck -check-prefix=ENTRY-CHPE %s +ENTRY-CHPE: CodeMap [ +ENTRY-CHPE-NEXT: 0x1000 - 0x100C ARM64EC +ENTRY-CHPE-NEXT: 0x2000 - 0x2010 X64 +ENTRY-CHPE-NEXT: ] + + +Test exporting data symbol as a function: + +RUN: lld-link -out:data-func.dll -machine:arm64ec arm64ec-data-sym.obj loadconfig-arm64ec.obj -dll -noentry -export:data_sym + +RUN: llvm-readobj --hex-dump=.test data-func.dll | FileCheck --check-prefix=DATAFUNC-TEST %s +DATAFUNC-TEST: Hex dump of section '.test': +DATAFUNC-TEST-NEXT: 0x180003000 00000000 .... 
+ +RUN: llvm-readobj --coff-exports --hex-dump=.test data-func.dll | FileCheck --check-prefix=DATAFUNC-EXP %s +DATAFUNC-EXP: Export { +DATAFUNC-EXP-NEXT: Ordinal: 1 +DATAFUNC-EXP-NEXT: Name: data_sym +DATAFUNC-EXP-NEXT: RVA: 0x3000 +DATAFUNC-EXP-NEXT: } + + +Test mingw-style auto-export: + +RUN: lld-link -out:export-all.dll -machine:arm64ec arm64ec-func.obj loadconfig-arm64ec.obj -dll -noentry -lldmingw +RUN: llvm-objdump -d export-all.dll | FileCheck --check-prefix=EXPORT-ALL %s + +EXPORT-ALL: Disassembly of section .text: +EXPORT-ALL-EMPTY: +EXPORT-ALL-NEXT: 0000000180001000 <.text>: +EXPORT-ALL-NEXT: 180001000: 90000008 adrp x8, 0x180001000 <.text> +EXPORT-ALL-NEXT: 180001004: 52800040 mov w0, #0x2 // =2 +EXPORT-ALL-NEXT: 180001008: d65f03c0 ret +EXPORT-ALL-EMPTY: +EXPORT-ALL-NEXT: Disassembly of section .hexpthk: +EXPORT-ALL-EMPTY: +EXPORT-ALL-NEXT: 0000000180002000 : +EXPORT-ALL-NEXT: 180002000: 48 8b c4 movq %rsp, %rax +EXPORT-ALL-NEXT: 180002003: 48 89 58 20 movq %rbx, 0x20(%rax) +EXPORT-ALL-NEXT: 180002007: 55 pushq %rbp +EXPORT-ALL-NEXT: 180002008: 5d popq %rbp +EXPORT-ALL-NEXT: 180002009: e9 f2 ef ff ff jmp 0x180001000 <.text> +EXPORT-ALL-NEXT: 18000200e: cc int3 +EXPORT-ALL-NEXT: 18000200f: cc int3 + + +#--- arm64ec-func.s + .text + .globl arm64ec_func + .p2align 2, 0x0 +arm64ec_func: + adrp x8,arm64ec_func + mov w0, #2 + ret + +#--- antidep-func.s + .text + .globl "#arm64ec_func" + .p2align 2, 0x0 +"#arm64ec_func": + adrp x8,arm64ec_func + mov w0, #2 + ret + + .weak_anti_dep arm64ec_func +arm64ec_func = "#arm64ec_func" + +#--- arm64ec-data.s + .section .test, "r" + .globl data_sym + .p2align 2, 0x0 +data_sym: + .rva "EXP+#func" + .rva "EXP+#arm64ec_func" + +#--- x86_64-func.s + .text + .globl x86_64_func + .p2align 2, 0x0 +x86_64_func: + call arm64ec_func + movl $3, %eax + retq + +#--- arm64ec-data-sym.s + .section .test, "r" + .globl data_sym + .p2align 2, 0x0 +data_sym: + .word 0 diff --git a/lld/test/COFF/arm64ec-exports.s 
b/lld/test/COFF/arm64ec-exports.s index a48211e6fb76c1..870540d6104621 100644 --- a/lld/test/COFF/arm64ec-exports.s +++ b/lld/test/COFF/arm64ec-exports.s @@ -16,32 +16,32 @@ ; EXP: Export { ; EXP-NEXT: Ordinal: 1 ; EXP-NEXT: Name: #mangled_data_sym -; EXP-NEXT: RVA: 0x3000 +; EXP-NEXT: RVA: 0x4000 ; EXP-NEXT: } ; EXP-NEXT: Export { ; EXP-NEXT: Ordinal: 2 ; EXP-NEXT: Name: ?cxx_func@@YAHXZ -; EXP-NEXT: RVA: 0x1018 +; EXP-NEXT: RVA: 0x2030 ; EXP-NEXT: } ; EXP-NEXT: Export { ; EXP-NEXT: Ordinal: 3 ; EXP-NEXT: Name: data_sym -; EXP-NEXT: RVA: 0x3004 +; EXP-NEXT: RVA: 0x4004 ; EXP-NEXT: } ; EXP-NEXT: Export { ; EXP-NEXT: Ordinal: 4 ; EXP-NEXT: Name: exportas_func -; EXP-NEXT: RVA: 0x1010 +; EXP-NEXT: RVA: 0x2020 ; EXP-NEXT: } ; EXP-NEXT: Export { ; EXP-NEXT: Ordinal: 5 ; EXP-NEXT: Name: mangled_func -; EXP-NEXT: RVA: 0x1008 +; EXP-NEXT: RVA: 0x2010 ; EXP-NEXT: } ; EXP-NEXT: Export { ; EXP-NEXT: Ordinal: 6 ; EXP-NEXT: Name: unmangled_func -; EXP-NEXT: RVA: 0x1000 +; EXP-NEXT: RVA: 0x2000 ; EXP-NEXT: } ; RUN: llvm-nm --print-armap out.lib | FileCheck --check-prefix=IMPLIB %s diff --git a/lld/test/COFF/arm64ec-patchable-thunks.test b/lld/test/COFF/arm64ec-patchable-thunks.test new file mode 100644 index 00000000000000..cccd42eebfd367 --- /dev/null +++ b/lld/test/COFF/arm64ec-patchable-thunks.test @@ -0,0 +1,86 @@ +REQUIRES: aarch64, x86 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows arm64ec-patchable.s -o arm64ec-patchable.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows arm64ec-alias.s -o arm64ec-alias.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows test-sec.s -o test-sec.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj + +RUN: lld-link -out:test.dll -machine:arm64ec arm64ec-patchable.obj test-sec.obj loadconfig-arm64ec.obj -dll -noentry + +RUN: llvm-objdump -d test.dll | FileCheck -check-prefix=PATCH-DISASM %s +PATCH-DISASM: Disassembly of section 
.text: +PATCH-DISASM-EMPTY: +PATCH-DISASM-NEXT: 0000000180001000 <.text>: +PATCH-DISASM-NEXT: 180001000: 52800040 mov w0, #0x2 // =2 +PATCH-DISASM-NEXT: 180001004: d65f03c0 ret +PATCH-DISASM-EMPTY: +PATCH-DISASM-NEXT: Disassembly of section .hexpthk: +PATCH-DISASM-EMPTY: +PATCH-DISASM-NEXT: 0000000180002000 +PATCH-DISASM-NEXT: 180002000: 48 8b c4 movq %rsp, %rax +PATCH-DISASM-NEXT: 180002003: 48 89 58 20 movq %rbx, 0x20(%rax) +PATCH-DISASM-NEXT: 180002007: 55 pushq %rbp +PATCH-DISASM-NEXT: 180002008: 5d popq %rbp +PATCH-DISASM-NEXT: 180002009: e9 f2 ef ff ff jmp 0x180001000 <.text> +PATCH-DISASM-NEXT: 18000200e: cc int3 +PATCH-DISASM-NEXT: 18000200f: cc int3 + +RUN: llvm-readobj --hex-dump=.test test.dll | FileCheck -check-prefix=RVA %s +RVA: 0x180005000 00200000 + +RUN: llvm-readobj --coff-load-config test.dll | FileCheck -check-prefix=PATCH-CHPE %s +PATCH-CHPE: CodeMap [ +PATCH-CHPE-NEXT: 0x1000 - 0x1008 ARM64EC +PATCH-CHPE-NEXT: 0x2000 - 0x2010 X64 +PATCH-CHPE-NEXT: ] + + +RUN: lld-link -out:test2.dll -machine:arm64ec arm64ec-alias.obj test-sec.obj loadconfig-arm64ec.obj -dll -noentry + +RUN: llvm-objdump -d test2.dll | FileCheck -check-prefix=PATCH-DISASM %s +RUN: llvm-readobj --hex-dump=.test test2.dll | FileCheck -check-prefix=RVA %s +RUN: llvm-readobj --coff-load-config test2.dll | FileCheck -check-prefix=PATCH-CHPE %s + +RUN: lld-link -out:test3.dll -machine:arm64ec arm64ec-alias.obj test-sec.obj loadconfig-arm64ec.obj -dll -noentry -export:patchable_func + +RUN: llvm-objdump -d test3.dll | FileCheck -check-prefix=PATCH-DISASM %s +RUN: llvm-readobj --hex-dump=.test test3.dll | FileCheck -check-prefix=RVA %s +RUN: llvm-readobj --coff-load-config test3.dll | FileCheck -check-prefix=PATCH-CHPE %s + + +RUN: not lld-link -out:test4.dll -machine:arm64ec test-sec.obj loadconfig-arm64ec.obj -dll -noentry 2>&1 | FileCheck --check-prefix=ERR %s +ERR: error: undefined symbol: EXP+#patchable_func + + +#--- arm64ec-patchable.s + .section ".text", "x", discard, 
"#patchable_func$hp_target" + .globl "#patchable_func$hp_target" + .p2align 2, 0x0 +"#patchable_func$hp_target": + mov w0, #2 + ret + + .def "EXP+#patchable_func" + .scl 2 + .type 32 + .endef + +#--- arm64ec-alias.s + .section ".text", "x", discard, "#patchable_func$hp_target" + .globl "#patchable_func$hp_target" + .p2align 2, 0x0 +"#patchable_func$hp_target": + mov w0, #2 + ret + + .def "EXP+#patchable_func" + .scl 2 + .type 32 + .endef + .weak patchable_func +patchable_func = "EXP+#patchable_func" + +#--- test-sec.s + .section ".test", "rd" + .rva "EXP+#patchable_func" From cb4efe1d078144a72306732a56afea3885650e8d Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 22 Aug 2024 21:38:06 +0100 Subject: [PATCH 260/426] [VPlan] Don't trigger VF assertion if VPlan has extra simplifications. There are cases where VPlans contain some simplifications that are very hard to accurately account for up-front in the legacy cost model. Those cases are caused by un-simplified inputs, which trigger the assert ensuring both the legacy and VPlan-based cost model agree on the VF. To avoid false positives due to missed simplifications in general, only trigger the assert if the chosen VPlan doesn't contain any additional simplifications. Fixes https://github.com/llvm/llvm-project/issues/104714. Fixes https://github.com/llvm/llvm-project/issues/105713. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 65 +++++++++- .../RISCV/blocks-with-dead-instructions.ll | 122 ++++++++++++++++++ 2 files changed, 186 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8e9324ba718b39..32e8f331257801 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7237,6 +7237,56 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, return Cost; } +/// Return true if the original loop \ TheLoop contains any instructions that do +/// not have corresponding recipes in \p Plan and are not marked to be ignored +/// in \p CostCtx. This means the VPlan contains simplification that the legacy +/// cost-model did not account for. +static bool +planContainsAdditionalSimplifications(VPlan &Plan, ElementCount VF, + VPCostContext &CostCtx, Loop *TheLoop, + LoopVectorizationCostModel &CM) { + // First collect all instructions for the recipes in Plan. + auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction q { + if (auto *S = dyn_cast(R)) + return dyn_cast_or_null(S->getUnderlyingValue()); + if (auto *WidenMem = dyn_cast(R)) + return &WidenMem->getIngredient(); + return nullptr; + }; + + DenseSet SeenInstrs; + auto Iter = vp_depth_first_deep(Plan.getEntry()); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { + for (VPRecipeBase &R : *VPBB) { + if (auto *IR = dyn_cast(&R)) { + auto *IG = IR->getInterleaveGroup(); + unsigned NumMembers = IG->getNumMembers(); + for (unsigned I = 0; I != NumMembers; ++I) { + if (Instruction *M = IG->getMember(I)) + SeenInstrs.insert(M); + } + continue; + } + if (Instruction *UI = GetInstructionForCost(&R)) + SeenInstrs.insert(UI); + } + } + + // Return true if the loop contains any instructions that are not also part of + // the VPlan or are skipped for VPlan-based cost computations. 
This indicates + // that the VPlan contains extra simplifications. + return any_of( + TheLoop->blocks(), [&SeenInstrs, VF, &CostCtx, &CM](BasicBlock *BB) { + return any_of(*BB, [&SeenInstrs, VF, &CostCtx, &CM](Instruction &I) { + if (isa(&I)) + return false; + return !SeenInstrs.contains(&I) && + !CostCtx.skipCostComputation(&I, true) && + !CM.canTruncateToMinimalBitwidth(&I, VF); + }); + }); +} + VectorizationFactor LoopVectorizationPlanner::computeBestVF() { if (VPlans.empty()) return VectorizationFactor::Disabled(); @@ -7292,7 +7342,20 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // cost-model and will be retired once the VPlan-based cost-model is // stabilized. VectorizationFactor LegacyVF = selectVectorizationFactor(); - assert(BestFactor.Width == LegacyVF.Width && + VPlan &BestPlan = getPlanFor(BestFactor.Width); + + // Pre-compute the cost and use it to check if BestPlan contains any + // simplifications not accounted for in the legacy cost model. If that's the + // case, don't trigger the assertion, as the extra simplifications may cause a + // different VF to be picked by the VPlan-based cost model. 
+ LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext(); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), + LLVMCtx, CM); + precomputeCosts(BestPlan, BestFactor.Width, CostCtx); + assert((BestFactor.Width == LegacyVF.Width || + planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), + BestFactor.Width, CostCtx, + OrigLoop, CM)) && " VPlan cost model and legacy cost model disagreed"); assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) && "when vectorizing, the scalar cost must be computed."); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll index d970b427d035da..b629b964a70c04 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll @@ -827,6 +827,120 @@ exit: ret void } +; Test case for https://github.com/llvm/llvm-project/issues/100591. 
+define void @dead_load_in_block(ptr %dst, ptr %src, i8 %N, i64 %x) #0 { +; CHECK-LABEL: define void @dead_load_in_block( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i8 [[N:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[N_EXT:%.*]] = zext i8 [[N]] to i64 +; CHECK-NEXT: [[UMIN7:%.*]] = call i64 @llvm.umin.i64(i64 [[N_EXT]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[N_EXT]], [[UMIN7]] +; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[UMIN7]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.umax.i64(i64 40, i64 [[TMP5]]) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP6]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N_EXT]], i64 1) +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N_EXT]], [[UMIN]] +; CHECK-NEXT: [[TMP8:%.*]] = udiv i64 [[TMP7]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[UMIN]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 12 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = shl i64 [[X]], 2 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 4 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DST]], 
[[SCEVGEP3]] +; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP15]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 3 +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2 +; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP19:%.*]] = add [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = mul [[TMP19]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 2 +; CHECK-NEXT: [[TMP23:%.*]] = mul i64 3, [[TMP22]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP23]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0( zeroinitializer, [[TMP24]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)), !alias.scope [[META18:![0-9]+]], 
!noalias [[META21:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[L_0:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-NEXT: [[C_0:%.*]] = icmp eq i32 [[L_0]], 0 +; CHECK-NEXT: br i1 [[C_0]], label %[[LOOP_LATCH]], label %[[THEN:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[GEP_SRC_X:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[X]] +; CHECK-NEXT: [[L_DEAD:%.*]] = load i32, ptr [[GEP_SRC_X]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 0, ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 3 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N_EXT]] +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %N.ext = zext i8 %N to i64 + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %l.0 = load i32, ptr %src, align 4 + %c.0 = icmp eq i32 %l.0, 0 + br i1 %c.0, label %loop.latch , label %then + +then: + %gep.src.x = getelementptr i32, ptr %src, i64 %x + %l.dead = load i32, ptr %gep.src.x, align 4 + 
br label %loop.latch + +loop.latch: + %gep.dst = getelementptr i32, ptr %dst, i64 %iv + store i32 0, ptr %gep.dst, align 4 + %iv.next = add i64 %iv, 3 + %cmp = icmp ult i64 %iv, %N.ext + br i1 %cmp, label %loop.header, label %exit + +exit: + ret void +} + attributes #0 = { "target-features"="+64bit,+v" } ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} @@ -847,4 +961,12 @@ attributes #0 = { "target-features"="+64bit,+v" } ; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} ; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} ; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]} +; CHECK: [[META18]] = !{[[META19:![0-9]+]]} +; CHECK: [[META19]] = distinct !{[[META19]], [[META20:![0-9]+]]} +; CHECK: [[META20]] = distinct !{[[META20]], !"LVerDomain"} +; CHECK: [[META21]] = !{[[META22:![0-9]+]], [[META23:![0-9]+]]} +; CHECK: [[META22]] = distinct !{[[META22]], [[META20]]} +; CHECK: [[META23]] = distinct !{[[META23]], [[META20]]} +; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]} +; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]]} ;. From 768dba71fe0caf2b7e698a1c29c86a48bbd00149 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 22 Aug 2024 21:42:16 +0100 Subject: [PATCH 261/426] [VPlan] Fix typo in cb4efe1d. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 32e8f331257801..f2de38f46c86aa 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7246,7 +7246,7 @@ planContainsAdditionalSimplifications(VPlan &Plan, ElementCount VF, VPCostContext &CostCtx, Loop *TheLoop, LoopVectorizationCostModel &CM) { // First collect all instructions for the recipes in Plan. 
- auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction q { + auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * { if (auto *S = dyn_cast(R)) return dyn_cast_or_null(S->getUnderlyingValue()); if (auto *WidenMem = dyn_cast(R)) From 172c4a4a147833f1c08df1555f3170aa9ccb6cbe Mon Sep 17 00:00:00 2001 From: Ian Anderson Date: Thu, 22 Aug 2024 13:44:58 -0700 Subject: [PATCH 262/426] [libunwind] Stop installing the mach-o module map (#105616) libunwind shouldn't know that compact_unwind_encoding.h is part of a MachO module that it doesn't own. Delete the mach-o module map, and let whatever is in charge of the mach-o directory be the one to say how its module is organized and where compact_unwind_encoding.h fits in. --- libunwind/include/CMakeLists.txt | 1 - libunwind/include/mach-o/compact_unwind_encoding.modulemap | 4 ---- 2 files changed, 5 deletions(-) delete mode 100644 libunwind/include/mach-o/compact_unwind_encoding.modulemap diff --git a/libunwind/include/CMakeLists.txt b/libunwind/include/CMakeLists.txt index 51065d68afd4ea..6796d67a3354ff 100644 --- a/libunwind/include/CMakeLists.txt +++ b/libunwind/include/CMakeLists.txt @@ -3,7 +3,6 @@ set(files libunwind.h libunwind.modulemap mach-o/compact_unwind_encoding.h - mach-o/compact_unwind_encoding.modulemap unwind_arm_ehabi.h unwind_itanium.h unwind.h diff --git a/libunwind/include/mach-o/compact_unwind_encoding.modulemap b/libunwind/include/mach-o/compact_unwind_encoding.modulemap deleted file mode 100644 index 6eae657d31b5c5..00000000000000 --- a/libunwind/include/mach-o/compact_unwind_encoding.modulemap +++ /dev/null @@ -1,4 +0,0 @@ -module MachO.compact_unwind_encoding [system] { - header "compact_unwind_encoding.h" - export * -} From d010ec6af8162a8ae4e42d2cac5282f83db0ce07 Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Thu, 22 Aug 2024 14:08:24 -0700 Subject: [PATCH 263/426] [clang][rtsan] Introduce realtime sanitizer codegen and driver (#102622) Introduce the 
`-fsanitize=realtime` flag in clang driver Plug in the RealtimeSanitizer PassManager pass in Codegen, and attribute a function based on if it has the `[[clang::nonblocking]]` function effect. --- clang/docs/RealtimeSanitizer.rst | 85 +++++++++++++++++++ clang/docs/ReleaseNotes.rst | 5 ++ clang/docs/UsersManual.rst | 2 + clang/docs/index.rst | 1 + clang/include/clang/Basic/Sanitizers.def | 3 + clang/include/clang/Driver/SanitizerArgs.h | 1 + clang/lib/CodeGen/BackendUtil.cpp | 8 ++ clang/lib/CodeGen/CodeGenFunction.cpp | 7 ++ clang/lib/Driver/SanitizerArgs.cpp | 14 +-- clang/lib/Driver/ToolChains/CommonArgs.cpp | 6 ++ clang/lib/Driver/ToolChains/Darwin.cpp | 8 ++ clang/lib/Driver/ToolChains/Linux.cpp | 1 + clang/test/CodeGen/rtsan_attribute_inserted.c | 7 ++ .../test/CodeGen/rtsan_entry_exit_insertion.c | 13 +++ .../rtsan_no_attribute_sanitizer_disabled.c | 6 ++ clang/test/Driver/fsanitize.c | 46 ++++++++++ 16 files changed, 208 insertions(+), 5 deletions(-) create mode 100644 clang/docs/RealtimeSanitizer.rst create mode 100644 clang/test/CodeGen/rtsan_attribute_inserted.c create mode 100644 clang/test/CodeGen/rtsan_entry_exit_insertion.c create mode 100644 clang/test/CodeGen/rtsan_no_attribute_sanitizer_disabled.c diff --git a/clang/docs/RealtimeSanitizer.rst b/clang/docs/RealtimeSanitizer.rst new file mode 100644 index 00000000000000..799cd43509c6e6 --- /dev/null +++ b/clang/docs/RealtimeSanitizer.rst @@ -0,0 +1,85 @@ +================= +RealtimeSanitizer +================= + +.. contents:: + :local: + +Introduction +============ +RealtimeSanitizer (a.k.a. RTSan) is a real-time safety testing tool for C and C++ +projects. RTSan can be used to detect real-time violations, i.e. calls to methods +that are not safe for use in functions with deterministic runtime requirements. +RTSan considers any function marked with the ``[[clang::nonblocking]]`` attribute +to be a real-time function. 
If RTSan detects a call to ``malloc``, ``free``, +``pthread_mutex_lock``, or anything else that could have a non-deterministic +execution time in a function marked ``[[clang::nonblocking]]`` +RTSan raises an error. + +The runtime slowdown introduced by RealtimeSanitizer is negligible. + +How to build +============ + +Build LLVM/Clang with `CMake ` and enable the +``compiler-rt`` runtime. An example CMake configuration that will allow for the +use/testing of RealtimeSanitizer: + +.. code-block:: console + + $ cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="compiler-rt" /llvm + +Usage +===== + +There are two requirements: + +1. The code must be compiled with the ``-fsanitize=realtime`` flag. +2. Functions that are subject to real-time constraints must be marked + with the ``[[clang::nonblocking]]`` attribute. + +Typically, these attributes should be added onto the functions that are entry +points for threads with real-time priority. These threads are subject to a fixed +callback time, such as audio callback threads or rendering loops in video game +code. + +.. code-block:: console + + % cat example_realtime_violation.cpp + #include + + void violation() [[clang::nonblocking]]{ + std::vector v; + v.resize(100); + } + + int main() { + violation(); + return 0; + } + # Compile and link + % clang++ -fsanitize=realtime -g example_realtime_violation.cpp + +If a real-time safety violation is detected in a ``[[clang::nonblocking]]`` +context, or any function invoked by that function, the program will exit with a +non-zero exit code. + +.. code-block:: console + + % clang++ -fsanitize=realtime -g example_realtime_violation.cpp + % ./a.out + Real-time violation: intercepted call to real-time unsafe function `malloc` in real-time context! 
Stack trace: + #0 0x000102893034 in __rtsan::PrintStackTrace() rtsan_stack.cpp:45 + #1 0x000102892e64 in __rtsan::Context::ExpectNotRealtime(char const*) rtsan_context.cpp:78 + #2 0x00010289397c in malloc rtsan_interceptors.cpp:286 + #3 0x000195bd7bd0 in operator new(unsigned long)+0x1c (libc++abi.dylib:arm64+0x16bd0) + #4 0x5c7f00010230f07c () + #5 0x00010230f058 in std::__1::__libcpp_allocate[abi:ue170006](unsigned long, unsigned long) new:324 + #6 0x00010230effc in std::__1::allocator::allocate[abi:ue170006](unsigned long) allocator.h:114 + ... snip ... + #10 0x00010230e4bc in std::__1::vector>::__append(unsigned long) vector:1162 + #11 0x00010230dcdc in std::__1::vector>::resize(unsigned long) vector:1981 + #12 0x00010230dc28 in violation() main.cpp:5 + #13 0x00010230dd64 in main main.cpp:9 + #14 0x0001958960dc () + #15 0x2f557ffffffffffc () diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 12a924acc14331..5f5bf51849e602 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -452,6 +452,11 @@ Moved checkers Sanitizers ---------- +- Introduced Realtime Sanitizer, activated by using the -fsanitize=realtime + flag. This sanitizer detects unsafe system library calls, such as memory + allocations and mutex locks. If any such function is called during invocation + of a function marked with the ``[[clang::nonblocking]]`` attribute, an error + is printed to the console and the process exits non-zero. - Added the ``-fsanitize-undefined-ignore-overflow-pattern`` flag which can be used to disable specific overflow-dependent code patterns. The supported diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index d19b77ae40b0d7..069ecba875cd59 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2068,6 +2068,8 @@ are listed below. integrity. - ``-fsanitize=safe-stack``: :doc:`safe stack ` protection against stack-based memory corruption errors. 
+ - ``-fsanitize=realtime``: :doc:`RealtimeSanitizer`, + a real-time safety checker. There are more fine-grained checks available: see the :ref:`list ` of specific kinds of diff --git a/clang/docs/index.rst b/clang/docs/index.rst index 9bae0bd83243bd..4a497f4d9bcc3c 100644 --- a/clang/docs/index.rst +++ b/clang/docs/index.rst @@ -32,6 +32,7 @@ Using Clang as a Compiler UndefinedBehaviorSanitizer DataFlowSanitizer LeakSanitizer + RealtimeSanitizer SanitizerCoverage SanitizerStats SanitizerSpecialCaseList diff --git a/clang/include/clang/Basic/Sanitizers.def b/clang/include/clang/Basic/Sanitizers.def index bee35e9dca7c39..9223f62b3639a7 100644 --- a/clang/include/clang/Basic/Sanitizers.def +++ b/clang/include/clang/Basic/Sanitizers.def @@ -79,6 +79,9 @@ SANITIZER("thread", Thread) // Numerical stability sanitizer. SANITIZER("numerical", NumericalStability) +// RealtimeSanitizer +SANITIZER("realtime", Realtime) + // LeakSanitizer SANITIZER("leak", Leak) diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h index e64ec463ca8907..0c6f3869549ef7 100644 --- a/clang/include/clang/Driver/SanitizerArgs.h +++ b/clang/include/clang/Driver/SanitizerArgs.h @@ -107,6 +107,7 @@ class SanitizerArgs { bool needsNsanRt() const { return Sanitizers.has(SanitizerKind::NumericalStability); } + bool needsRtsanRt() const { return Sanitizers.has(SanitizerKind::Realtime); } bool hasMemTag() const { return hasMemtagHeap() || hasMemtagStack() || hasMemtagGlobals(); diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index fdd89edd72e109..026f16484c0949 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -78,6 +78,7 @@ #include "llvm/Transforms/Instrumentation/MemorySanitizer.h" #include "llvm/Transforms/Instrumentation/NumericalStabilitySanitizer.h" #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" +#include "llvm/Transforms/Instrumentation/RealtimeSanitizer.h" 
#include "llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h" #include "llvm/Transforms/Instrumentation/SanitizerCoverage.h" #include "llvm/Transforms/Instrumentation/ThreadSanitizer.h" @@ -990,6 +991,13 @@ void EmitAssemblyHelper::RunOptimizationPipeline( FPM.addPass(BoundsCheckingPass()); }); + if (LangOpts.Sanitize.has(SanitizerKind::Realtime)) + PB.registerScalarOptimizerLateEPCallback( + [](FunctionPassManager &FPM, OptimizationLevel Level) { + RealtimeSanitizerOptions Opts; + FPM.addPass(RealtimeSanitizerPass(Opts)); + }); + // Don't add sanitizers if we are here from ThinLTO PostLink. That already // done on PreLink stage. if (!IsThinLTOPostLink) { diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index eff8c9f5694084..c89eaa0f4e3bfc 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -845,6 +845,13 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, if (SanOpts.has(SanitizerKind::ShadowCallStack)) Fn->addFnAttr(llvm::Attribute::ShadowCallStack); + if (SanOpts.has(SanitizerKind::Realtime)) + if (FD && FD->getASTContext().hasAnyFunctionEffects()) + for (const FunctionEffectWithCondition &Fe : FD->getFunctionEffects()) { + if (Fe.Effect.kind() == FunctionEffect::Kind::NonBlocking) + Fn->addFnAttr(llvm::Attribute::SanitizeRealtime); + } + // Apply fuzzing attribute to the function. 
if (SanOpts.hasOneOf(SanitizerKind::Fuzzer | SanitizerKind::FuzzerNoLink)) Fn->addFnAttr(llvm::Attribute::OptForFuzzing); diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 9d9ad79d51d7f8..09262f40b5b50c 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -558,11 +558,15 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC, SanitizerKind::Leak | SanitizerKind::Thread | SanitizerKind::Memory | SanitizerKind::KernelAddress | SanitizerKind::Scudo | SanitizerKind::SafeStack), - std::make_pair(SanitizerKind::MemTag, - SanitizerKind::Address | SanitizerKind::KernelAddress | - SanitizerKind::HWAddress | - SanitizerKind::KernelHWAddress), - std::make_pair(SanitizerKind::KCFI, SanitizerKind::Function)}; + std::make_pair(SanitizerKind::MemTag, SanitizerKind::Address | + SanitizerKind::KernelAddress | + SanitizerKind::HWAddress | + SanitizerKind::KernelHWAddress), + std::make_pair(SanitizerKind::KCFI, SanitizerKind::Function), + std::make_pair(SanitizerKind::Realtime, + SanitizerKind::Address | SanitizerKind::Thread | + SanitizerKind::Undefined | SanitizerKind::Memory)}; + // Enable toolchain specific default sanitizers if not explicitly disabled. SanitizerMask Default = TC.getDefaultSanitizers() & ~AllRemove; diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 0738ed18f54078..0601016c3b14b8 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1456,6 +1456,8 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, if (!Args.hasArg(options::OPT_shared)) HelperStaticRuntimes.push_back("hwasan-preinit"); } + if (SanArgs.needsRtsanRt() && SanArgs.linkRuntimes()) + SharedRuntimes.push_back("rtsan"); } // The stats_client library is also statically linked into DSOs. 
@@ -1481,6 +1483,10 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, StaticRuntimes.push_back("asan_cxx"); } + if (!SanArgs.needsSharedRt() && SanArgs.needsRtsanRt() && + SanArgs.linkRuntimes()) + StaticRuntimes.push_back("rtsan"); + if (!SanArgs.needsSharedRt() && SanArgs.needsMemProfRt()) { StaticRuntimes.push_back("memprof"); if (SanArgs.linkCXXRuntimes()) diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index 2550541a438481..5e7f9290e2009d 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -1519,6 +1519,8 @@ void DarwinClang::AddLinkRuntimeLibArgs(const ArgList &Args, const char *sanitizer = nullptr; if (Sanitize.needsUbsanRt()) { sanitizer = "UndefinedBehaviorSanitizer"; + } else if (Sanitize.needsRtsanRt()) { + sanitizer = "RealtimeSanitizer"; } else if (Sanitize.needsAsanRt()) { sanitizer = "AddressSanitizer"; } else if (Sanitize.needsTsanRt()) { @@ -1541,6 +1543,11 @@ void DarwinClang::AddLinkRuntimeLibArgs(const ArgList &Args, AddLinkSanitizerLibArgs(Args, CmdArgs, "asan"); } } + if (Sanitize.needsRtsanRt()) { + assert(Sanitize.needsSharedRt() && + "Static sanitizer runtimes not supported"); + AddLinkSanitizerLibArgs(Args, CmdArgs, "rtsan"); + } if (Sanitize.needsLsanRt()) AddLinkSanitizerLibArgs(Args, CmdArgs, "lsan"); if (Sanitize.needsUbsanRt()) { @@ -3539,6 +3546,7 @@ SanitizerMask Darwin::getSupportedSanitizers() const { Res |= SanitizerKind::Address; Res |= SanitizerKind::PointerCompare; Res |= SanitizerKind::PointerSubtract; + Res |= SanitizerKind::Realtime; Res |= SanitizerKind::Leak; Res |= SanitizerKind::Fuzzer; Res |= SanitizerKind::FuzzerNoLink; diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index 2265138edbffbe..96680b3412a2db 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -800,6 +800,7 @@ SanitizerMask Linux::getSupportedSanitizers() 
const { Res |= SanitizerKind::Address; Res |= SanitizerKind::PointerCompare; Res |= SanitizerKind::PointerSubtract; + Res |= SanitizerKind::Realtime; Res |= SanitizerKind::Fuzzer; Res |= SanitizerKind::FuzzerNoLink; Res |= SanitizerKind::KernelAddress; diff --git a/clang/test/CodeGen/rtsan_attribute_inserted.c b/clang/test/CodeGen/rtsan_attribute_inserted.c new file mode 100644 index 00000000000000..05a1d9a8c2047a --- /dev/null +++ b/clang/test/CodeGen/rtsan_attribute_inserted.c @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=realtime %s -emit-llvm -o - %s | FileCheck %s + +float process(float *a) [[clang::nonblocking]] { return *a; } + +// CHECK-LABEL: @process{{.*}}#0 { +// CHECK: attributes #0 = { +// CHECK-SAME: {{.*sanitize_realtime.*}} diff --git a/clang/test/CodeGen/rtsan_entry_exit_insertion.c b/clang/test/CodeGen/rtsan_entry_exit_insertion.c new file mode 100644 index 00000000000000..9ba0103ca1e353 --- /dev/null +++ b/clang/test/CodeGen/rtsan_entry_exit_insertion.c @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -fsanitize=realtime -emit-llvm -o - %s | FileCheck %s + +int foo(int *a) [[clang::nonblocking]] { return *a; } + +// The first instruction after the function is entred should be a call to +// enable the realtime sanitizer stack. +// CHECK-LABEL: define{{.*}}@foo +// CHECK-NEXT: entry: +// CHECK-NEXT: call{{.*}}__rtsan_realtime_enter + +// __rtsan_realtime_exit should be inserted at all function returns. 
+// CHECK-LABEL: call{{.*}}__rtsan_realtime_exit +// CHECK-NEXT: ret diff --git a/clang/test/CodeGen/rtsan_no_attribute_sanitizer_disabled.c b/clang/test/CodeGen/rtsan_no_attribute_sanitizer_disabled.c new file mode 100644 index 00000000000000..43ad6ed1a429ee --- /dev/null +++ b/clang/test/CodeGen/rtsan_no_attribute_sanitizer_disabled.c @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s + +float process(float *a) [[clang::nonblocking]] { return *a; } + +// Without the -fsanitize=realtime flag, we shouldn't attach the attribute. +// CHECK-NOT: {{.*sanitize_realtime.*}} diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c index 678fa432fb0a0a..f86c978f221cd4 100644 --- a/clang/test/Driver/fsanitize.c +++ b/clang/test/Driver/fsanitize.c @@ -1040,3 +1040,49 @@ // RUN: not %clang --target=aarch64-none-elf -fsanitize=dataflow %s -### 2>&1 | FileCheck %s -check-prefix=UNSUPPORTED-BAREMETAL // RUN: not %clang --target=arm-arm-none-eabi -fsanitize=shadow-call-stack %s -### 2>&1 | FileCheck %s -check-prefix=UNSUPPORTED-BAREMETAL // UNSUPPORTED-BAREMETAL: unsupported option '-fsanitize={{.*}}' for target + +// RUN: %clang --target=x86_64-apple-darwin -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-DARWIN +// CHECK-RTSAN-X86-64-DARWIN-NOT: unsupported option + +// RUN: %clang --target=x86_64-apple-darwin -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-DARWIN +// CHECK-RTSAN-X86-64-DARWIN-NOT: unsupported option +// RUN: %clang --target=x86_64-apple-macos -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-MACOS +// CHECK-RTSAN-X86-64-MACOS-NOT: unsupported option +// RUN: %clang --target=arm64-apple-macos -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-ARM64-MACOS +// CHECK-RTSAN-ARM64-MACOS-NOT: unsupported option + +// RUN: %clang --target=arm64-apple-ios-simulator 
-fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-ARM64-IOSSIMULATOR +// CHECK-RTSAN-ARM64-IOSSIMULATOR-NOT: unsupported option + +// RUN: %clang --target=arm64-apple-watchos-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-ARM64-WATCHOSSIMULATOR +// CHECK-RTSAN-ARM64-WATCHOSSIMULATOR-NOT: unsupported option + +// RUN: %clang --target=arm64-apple-tvos-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-ARM64-TVOSSIMULATOR +// CHECK-RTSAN-ARM64-TVOSSIMULATOR-NOT: unsupported option + +// RUN: %clang --target=x86_64-apple-ios-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-IOSSIMULATOR +// CHECK-RTSAN-X86-64-IOSSIMULATOR-NOT: unsupported option + +// RUN: %clang --target=x86_64-apple-watchos-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-WATCHOSSIMULATOR +// CHECK-RTSAN-X86-64-WATCHOSSIMULATOR-NOT: unsupported option + +// RUN: %clang --target=x86_64-apple-tvos-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-TVOSSIMULATOR +// CHECK-RTSAN-X86-64-TVOSSIMULATOR-NOT: unsupported option + +// RUN: %clang --target=x86_64-linux-gnu -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-LINUX +// CHECK-RTSAN-X86-64-LINUX-NOT: unsupported option + +// RUN: not %clang --target=i386-pc-openbsd -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-OPENBSD +// CHECK-RTSAN-OPENBSD: unsupported option '-fsanitize=realtime' for target 'i386-pc-openbsd' + +// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=realtime,thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-REALTIME-TSAN +// CHECK-REALTIME-TSAN: error: invalid argument '-fsanitize=realtime' not allowed with '-fsanitize=thread' + +// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=realtime,address %s -### 2>&1 | FileCheck %s 
--check-prefix=CHECK-REALTIME-ASAN +// CHECK-REALTIME-ASAN: error: invalid argument '-fsanitize=realtime' not allowed with '-fsanitize=address' + +// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=realtime,memory %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-REALTIME-MSAN +// CHECK-REALTIME-MSAN: error: invalid argument '-fsanitize=realtime' not allowed with '-fsanitize=memory' + +// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=realtime,undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-REALTIME-UBSAN +// CHECK-REALTIME-UBSAN: error: invalid argument '-fsanitize=realtime' not allowed with '-fsanitize=undefined' From 8b5f606612de30ece5e113517decacca0d8ccb35 Mon Sep 17 00:00:00 2001 From: Sirraide Date: Thu, 22 Aug 2024 23:33:40 +0200 Subject: [PATCH 264/426] [Clang] [Parser] Improve diagnostic for `friend concept` (#105121) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Diagnose this early after parsing declaration specifiers; this allows us to issue a better diagnostic. This also checks for `concept friend` and concept declarations w/o a template-head because it’s easiest to do that at the same time. Fixes #45182. --- clang/docs/ReleaseNotes.rst | 2 ++ clang/include/clang/Basic/DiagnosticParseKinds.td | 3 +++ clang/lib/Parse/ParseDeclCXX.cpp | 13 +++++++++++++ clang/test/Parser/friend-concept.cpp | 13 +++++++++++++ 4 files changed, 31 insertions(+) create mode 100644 clang/test/Parser/friend-concept.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5f5bf51849e602..e0ede62b80c2ee 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -241,6 +241,8 @@ Improvements to Clang's diagnostics - Don't emit duplicated dangling diagnostics. (#GH93386). +- Improved diagnostic when trying to befriend a concept. (#GH45182). 
+ Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 464f08637332d4..0b8ab4bf092509 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -974,6 +974,9 @@ def warn_cxx23_variadic_friends : Warning< "variadic 'friend' declarations are incompatible with C++ standards before C++2c">, DefaultIgnore, InGroup; +def err_friend_concept : Error< + "friend declaration cannot be a concept">; + // C++11 default member initialization def ext_nonstatic_member_init : ExtWarn< "default member initializer for non-static data member is a C++11 " diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index 18c5fe6056b472..7ca27d00c0bcbf 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -3139,6 +3139,19 @@ Parser::DeclGroupPtrTy Parser::ParseCXXClassMemberDeclaration( return Actions.BuildDeclaratorGroup(Decls); } + // Befriending a concept is invalid and would already fail if + // we did nothing here, but this allows us to issue a more + // helpful diagnostic. + if (Tok.is(tok::kw_concept)) { + Diag(Tok.getLocation(), + DS.isFriendSpecified() || NextToken().is(tok::kw_friend) + ? 
diag::err_friend_concept + : diag:: + err_concept_decls_may_only_appear_in_global_namespace_scope); + SkipUntil(tok::semi, tok::r_brace, StopBeforeMatch); + return nullptr; + } + ParsingDeclarator DeclaratorInfo(*this, DS, DeclAttrs, DeclaratorContext::Member); if (TemplateInfo.TemplateParams) diff --git a/clang/test/Parser/friend-concept.cpp b/clang/test/Parser/friend-concept.cpp new file mode 100644 index 00000000000000..d771ca4d4178ed --- /dev/null +++ b/clang/test/Parser/friend-concept.cpp @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++20 %s + +template +concept fooable = true; + +struct S { + template friend concept x = requires { requires true; }; // expected-error {{friend declaration cannot be a concept}} + template friend concept fooable; // expected-error {{friend declaration cannot be a concept}} + template concept friend fooable; // expected-error {{expected unqualified-id}} + friend concept fooable; // expected-error {{friend declaration cannot be a concept}} + concept friend fooable; // expected-error {{friend declaration cannot be a concept}} + concept fooable; // expected-error {{concept declarations may only appear in global or namespace scope}} +}; From 42d06b8e555727e8e043d5ea9240ad103d950192 Mon Sep 17 00:00:00 2001 From: Harini0924 <79345568+Harini0924@users.noreply.github.com> Date: Thu, 22 Aug 2024 14:39:23 -0700 Subject: [PATCH 265/426] [compiler-rt][test] Change tests to remove the use of `unset` command in lit internal shell (#104880) This patch rewrites tests to remove the use of the `unset` command, which is not supported in the lit internal shell. The tests now use the `env -u` to unset environment variables. The `unset` command is used in shell environments to remove the environment variable. However, because the lit internal shell does not support the `unset` command, using it in tests would result in errors or other unexpected behavior. 
To overcome this limitation, the tests have been updated to use the `env -u` command instead. `env -u` is supported by lit and effectively removes specified environment variables. This allows the tests to achieve the same goal of unsetting environment variables while ensuring compatibility with the lit internal shell. This change is relevant for [[RFC] Enabling the Lit Internal Shell by Default](https://discourse.llvm.org/t/rfc-enabling-the-lit-internal-shell-by-default/80179/3) Fixes: #102397 --- .../test/fuzzer/afl-driver-close-fd-mask.test | 13 ++++++------- compiler-rt/test/fuzzer/afl-driver-stderr.test | 7 +++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/compiler-rt/test/fuzzer/afl-driver-close-fd-mask.test b/compiler-rt/test/fuzzer/afl-driver-close-fd-mask.test index 71f74e27ec4fe6..98c7f08697c174 100644 --- a/compiler-rt/test/fuzzer/afl-driver-close-fd-mask.test +++ b/compiler-rt/test/fuzzer/afl-driver-close-fd-mask.test @@ -3,29 +3,28 @@ RUN: %no_fuzzer_cpp_compiler %S/AFLDriverTest.cpp %libfuzzer_src/afl/afl_driver. ; Test that not specifying AFL_DRIVER_CLOSE_FD_MASK works as intended. RUN: echo -n "abc" > %t.file3 -RUN: unset AFL_DRIVER_CLOSE_FD_MASK -RUN: %run %t-AFLDriverTest < %t.file3 2>&1 | FileCheck %s --check-prefixes=STDERR,STDOUT +RUN: env -u AFL_DRIVER_CLOSE_FD_MASK %run %t-AFLDriverTest < %t.file3 2>&1 | FileCheck %s --check-prefixes=STDERR,STDOUT STDOUT: STDOUT MESSAGE STDERR: STDERR MESSAGE ; Test that stdout is closed properly. -RUN: AFL_DRIVER_CLOSE_FD_MASK=1 %run %t-AFLDriverTest < %t.file3 2>&1 | FileCheck %s --check-prefixes=NOT_STDOUT,STDERR +RUN: env AFL_DRIVER_CLOSE_FD_MASK=1 %run %t-AFLDriverTest < %t.file3 2>&1 | FileCheck %s --check-prefixes=NOT_STDOUT,STDERR NOT_STDOUT-NOT: STDOUT MESSAGE ; Test that stderr is closed properly. 
-RUN: AFL_DRIVER_CLOSE_FD_MASK=2 %run %t-AFLDriverTest < %t.file3 2>&1 | FileCheck %s --check-prefixes=NOT_STDERR,STDOUT +RUN: env AFL_DRIVER_CLOSE_FD_MASK=2 %run %t-AFLDriverTest < %t.file3 2>&1 | FileCheck %s --check-prefixes=NOT_STDERR,STDOUT NOT_STDERR-NOT: STDERR MESSAGE ; Test that both are closed properly. -RUN: AFL_DRIVER_CLOSE_FD_MASK=3 %run %t-AFLDriverTest < %t.file3 2>&1 | FileCheck %s --check-prefixes=NOT_STDERR,NOT_STDOUT +RUN: env AFL_DRIVER_CLOSE_FD_MASK=3 %run %t-AFLDriverTest < %t.file3 2>&1 | FileCheck %s --check-prefixes=NOT_STDERR,NOT_STDOUT ; Test that a stack is printed when we close stderr RUN: echo -n "abcd" > %t.file4 -RUN: AFL_DRIVER_CLOSE_FD_MASK=2 not %run %t-AFLDriverTest < %t.file4 2>&1 | FileCheck %s --check-prefixes=ASAN_CRASH,STDOUT,NOT_STDERR +RUN: env AFL_DRIVER_CLOSE_FD_MASK=2 not %run %t-AFLDriverTest < %t.file4 2>&1 | FileCheck %s --check-prefixes=ASAN_CRASH,STDOUT,NOT_STDERR ASAN_CRASH: ERROR: AddressSanitizer ; Test that a stack is written to the stderr duplicate file when we close stderr ; and specify a duplicate. RUN: rm -f %t.stderr -RUN: AFL_DRIVER_STDERR_DUPLICATE_FILENAME=%t.stderr AFL_DRIVER_CLOSE_FD_MASK=2 not %run %t-AFLDriverTest < %t.file4 +RUN: env AFL_DRIVER_STDERR_DUPLICATE_FILENAME=%t.stderr AFL_DRIVER_CLOSE_FD_MASK=2 not %run %t-AFLDriverTest < %t.file4 RUN: cat %t.stderr | FileCheck %s --check-prefixes=ASAN_CRASH,NOT_STDERR diff --git a/compiler-rt/test/fuzzer/afl-driver-stderr.test b/compiler-rt/test/fuzzer/afl-driver-stderr.test index 84f21d4da5cbda..4b0c3b40221afd 100644 --- a/compiler-rt/test/fuzzer/afl-driver-stderr.test +++ b/compiler-rt/test/fuzzer/afl-driver-stderr.test @@ -4,12 +4,11 @@ XFAIL: ios RUN: %no_fuzzer_cpp_compiler %S/AFLDriverTest.cpp %libfuzzer_src/afl/afl_driver.cpp -o %t-AFLDriverTest ; Test that not specifying a stderr file isn't broken. 
-RUN: unset AFL_DRIVER_STDERR_DUPLICATE_FILENAME -RUN: %run %t-AFLDriverTest +RUN: env -u AFL_DRIVER_STDERR_DUPLICATE_FILENAME %run %t-AFLDriverTest ; Test that specifying an invalid file causes a crash. -RUN: ASAN_OPTIONS= AFL_DRIVER_STDERR_DUPLICATE_FILENAME="%T" not --crash %run %t-AFLDriverTest +RUN: env ASAN_OPTIONS= AFL_DRIVER_STDERR_DUPLICATE_FILENAME="%T" not --crash %run %t-AFLDriverTest ; Test that a file is created when specified as the duplicate stderr. -RUN: AFL_DRIVER_STDERR_DUPLICATE_FILENAME=%t %run %t-AFLDriverTest +RUN: env AFL_DRIVER_STDERR_DUPLICATE_FILENAME=%t %run %t-AFLDriverTest RUN: stat %t From d7fc779aacd4b5538bc42139892812aad8c6d528 Mon Sep 17 00:00:00 2001 From: Amir Bishara <139038766+amirBish@users.noreply.github.com> Date: Fri, 23 Aug 2024 00:39:43 +0300 Subject: [PATCH 266/426] [mlir][SCF]-Fix loop coalescing with iteration arguements (#105488) Fix a bug found when coalescing loops which have iteration arguments, such that the inner loop's terminator may have operands of the inner loop iteration arguments which are about to be replaced by the outer loop's iteration arguments. The current flow leads to crush within the IR code. --- mlir/lib/Dialect/SCF/Utils/Utils.cpp | 12 ++ mlir/test/Dialect/Affine/loop-coalescing.mlir | 120 ++++++++++++++++++ 2 files changed, 132 insertions(+) diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp index 40f82557d2eb8a..9545610f10be7c 100644 --- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp +++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp @@ -864,6 +864,18 @@ LogicalResult mlir::coalesceLoops(RewriterBase &rewriter, Operation *innerTerminator = innerLoop.getBody()->getTerminator(); auto yieldedVals = llvm::to_vector(innerTerminator->getOperands()); + assert(llvm::equal(outerLoop.getRegionIterArgs(), innerLoop.getInitArgs())); + for (Value &yieldedVal : yieldedVals) { + // The yielded value may be an iteration argument of the inner loop + // which is about to be inlined. 
+ auto iter = llvm::find(innerLoop.getRegionIterArgs(), yieldedVal); + if (iter != innerLoop.getRegionIterArgs().end()) { + unsigned iterArgIndex = iter - innerLoop.getRegionIterArgs().begin(); + // `outerLoop` iter args identical to the `innerLoop` init args. + assert(iterArgIndex < innerLoop.getInitArgs().size()); + yieldedVal = innerLoop.getInitArgs()[iterArgIndex]; + } + } rewriter.eraseOp(innerTerminator); SmallVector innerBlockArgs; diff --git a/mlir/test/Dialect/Affine/loop-coalescing.mlir b/mlir/test/Dialect/Affine/loop-coalescing.mlir index 0235000aeac538..45dd299295f640 100644 --- a/mlir/test/Dialect/Affine/loop-coalescing.mlir +++ b/mlir/test/Dialect/Affine/loop-coalescing.mlir @@ -114,6 +114,126 @@ func.func @unnormalized_loops() { return } +func.func @noramalized_loops_with_yielded_iter_args() { + // CHECK: %[[orig_lb:.*]] = arith.constant 0 + // CHECK: %[[orig_step:.*]] = arith.constant 1 + // CHECK: %[[orig_ub_k:.*]] = arith.constant 3 + // CHECK: %[[orig_ub_i:.*]] = arith.constant 42 + // CHECK: %[[orig_ub_j:.*]] = arith.constant 56 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c42 = arith.constant 42 : index + %c56 = arith.constant 56 : index + // The range of the new scf. + // CHECK: %[[partial_range:.*]] = arith.muli %[[orig_ub_i]], %[[orig_ub_j]] + // CHECK-NEXT:%[[range:.*]] = arith.muli %[[partial_range]], %[[orig_ub_k]] + + // Updated loop bounds. + // CHECK: scf.for %[[i:.*]] = %[[orig_lb]] to %[[range]] step %[[orig_step]] iter_args(%[[VAL_1:.*]] = %[[orig_lb]]) -> (index) { + %2:1 = scf.for %i = %c0 to %c42 step %c1 iter_args(%arg0 = %c0) -> (index) { + // Inner loops must have been removed. + // CHECK-NOT: scf.for + + // Reconstruct original IVs from the linearized one. 
+ // CHECK: %[[orig_k:.*]] = arith.remsi %[[i]], %[[orig_ub_k]] + // CHECK: %[[div:.*]] = arith.divsi %[[i]], %[[orig_ub_k]] + // CHECK: %[[orig_j:.*]] = arith.remsi %[[div]], %[[orig_ub_j]] + // CHECK: %[[orig_i:.*]] = arith.divsi %[[div]], %[[orig_ub_j]] + %1:1 = scf.for %j = %c0 to %c56 step %c1 iter_args(%arg1 = %arg0) -> (index){ + %0:1 = scf.for %k = %c0 to %c3 step %c1 iter_args(%arg2 = %arg1) -> (index) { + // CHECK: "use"(%[[orig_i]], %[[orig_j]], %[[orig_k]]) + "use"(%i, %j, %k) : (index, index, index) -> () + // CHECK: scf.yield %[[VAL_1]] : index + scf.yield %arg2 : index + } + scf.yield %0#0 : index + } + scf.yield %1#0 : index + } + return +} + +func.func @noramalized_loops_with_shuffled_yielded_iter_args() { + // CHECK: %[[orig_lb:.*]] = arith.constant 0 + // CHECK: %[[orig_step:.*]] = arith.constant 1 + // CHECK: %[[orig_ub_k:.*]] = arith.constant 3 + // CHECK: %[[orig_ub_i:.*]] = arith.constant 42 + // CHECK: %[[orig_ub_j:.*]] = arith.constant 56 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c42 = arith.constant 42 : index + %c56 = arith.constant 56 : index + // The range of the new scf. + // CHECK: %[[partial_range:.*]] = arith.muli %[[orig_ub_i]], %[[orig_ub_j]] + // CHECK-NEXT:%[[range:.*]] = arith.muli %[[partial_range]], %[[orig_ub_k]] + + // Updated loop bounds. + // CHECK: scf.for %[[i:.*]] = %[[orig_lb]] to %[[range]] step %[[orig_step]] iter_args(%[[VAL_1:.*]] = %[[orig_lb]], %[[VAL_2:.*]] = %[[orig_lb]]) -> (index, index) { + %2:2 = scf.for %i = %c0 to %c42 step %c1 iter_args(%arg0 = %c0, %arg1 = %c0) -> (index, index) { + // Inner loops must have been removed. + // CHECK-NOT: scf.for + + // Reconstruct original IVs from the linearized one. 
+ // CHECK: %[[orig_k:.*]] = arith.remsi %[[i]], %[[orig_ub_k]] + // CHECK: %[[div:.*]] = arith.divsi %[[i]], %[[orig_ub_k]] + // CHECK: %[[orig_j:.*]] = arith.remsi %[[div]], %[[orig_ub_j]] + // CHECK: %[[orig_i:.*]] = arith.divsi %[[div]], %[[orig_ub_j]] + %1:2 = scf.for %j = %c0 to %c56 step %c1 iter_args(%arg2 = %arg0, %arg3 = %arg1) -> (index, index){ + %0:2 = scf.for %k = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2, %arg5 = %arg3) -> (index, index) { + // CHECK: "use"(%[[orig_i]], %[[orig_j]], %[[orig_k]]) + "use"(%i, %j, %k) : (index, index, index) -> () + // CHECK: scf.yield %[[VAL_2]], %[[VAL_1]] : index, index + scf.yield %arg5, %arg4 : index, index + } + scf.yield %0#0, %0#1 : index, index + } + scf.yield %1#0, %1#1 : index, index + } + return +} + +func.func @noramalized_loops_with_yielded_non_iter_args() { + // CHECK: %[[orig_lb:.*]] = arith.constant 0 + // CHECK: %[[orig_step:.*]] = arith.constant 1 + // CHECK: %[[orig_ub_k:.*]] = arith.constant 3 + // CHECK: %[[orig_ub_i:.*]] = arith.constant 42 + // CHECK: %[[orig_ub_j:.*]] = arith.constant 56 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c42 = arith.constant 42 : index + %c56 = arith.constant 56 : index + // The range of the new scf. + // CHECK: %[[partial_range:.*]] = arith.muli %[[orig_ub_i]], %[[orig_ub_j]] + // CHECK-NEXT:%[[range:.*]] = arith.muli %[[partial_range]], %[[orig_ub_k]] + + // Updated loop bounds. + // CHECK: scf.for %[[i:.*]] = %[[orig_lb]] to %[[range]] step %[[orig_step]] iter_args(%[[VAL_1:.*]] = %[[orig_lb]]) -> (index) { + %2:1 = scf.for %i = %c0 to %c42 step %c1 iter_args(%arg0 = %c0) -> (index) { + // Inner loops must have been removed. + // CHECK-NOT: scf.for + + // Reconstruct original IVs from the linearized one. 
+ // CHECK: %[[orig_k:.*]] = arith.remsi %[[i]], %[[orig_ub_k]] + // CHECK: %[[div:.*]] = arith.divsi %[[i]], %[[orig_ub_k]] + // CHECK: %[[orig_j:.*]] = arith.remsi %[[div]], %[[orig_ub_j]] + // CHECK: %[[orig_i:.*]] = arith.divsi %[[div]], %[[orig_ub_j]] + %1:1 = scf.for %j = %c0 to %c56 step %c1 iter_args(%arg1 = %arg0) -> (index){ + %0:1 = scf.for %k = %c0 to %c3 step %c1 iter_args(%arg2 = %arg1) -> (index) { + // CHECK: %[[res:.*]] = "use"(%[[orig_i]], %[[orig_j]], %[[orig_k]]) + %res = "use"(%i, %j, %k) : (index, index, index) -> (index) + // CHECK: scf.yield %[[res]] : index + scf.yield %res : index + } + scf.yield %0#0 : index + } + scf.yield %1#0 : index + } + return +} + // Check with parametric loop bounds and steps, capture the bounds here. // CHECK-LABEL: @parametric // CHECK-SAME: %[[orig_lb1:[A-Za-z0-9]+]]: From 911e246fe8fd35bd82fc11db001513a1e2f6990c Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 22 Aug 2024 14:44:35 -0700 Subject: [PATCH 267/426] [NFC][ADT] Add reverse iterators and `value_type` to StringRef (#105579) - Add reverse iterators and `value_type` to StringRef. - Add unit test for all 4 iterator flavors. - This prepares StringRef to be used with `SequenceToOffsetTable`. --- llvm/include/llvm/ADT/StringRef.h | 12 ++++++++++++ llvm/unittests/ADT/StringRefTest.cpp | 14 +++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/ADT/StringRef.h b/llvm/include/llvm/ADT/StringRef.h index 049f22b03e46e8..952d6485dafc1a 100644 --- a/llvm/include/llvm/ADT/StringRef.h +++ b/llvm/include/llvm/ADT/StringRef.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,9 @@ namespace llvm { using iterator = const char *; using const_iterator = const char *; using size_type = size_t; + using value_type = char; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; private: /// The start of the string, in an external buffer. 
@@ -112,6 +116,14 @@ namespace llvm { iterator end() const { return Data + Length; } + reverse_iterator rbegin() const { + return std::make_reverse_iterator(end()); + } + + reverse_iterator rend() const { + return std::make_reverse_iterator(begin()); + } + const unsigned char *bytes_begin() const { return reinterpret_cast(begin()); } diff --git a/llvm/unittests/ADT/StringRefTest.cpp b/llvm/unittests/ADT/StringRefTest.cpp index a0529b03ae8c22..ec9cdc197597d0 100644 --- a/llvm/unittests/ADT/StringRefTest.cpp +++ b/llvm/unittests/ADT/StringRefTest.cpp @@ -57,9 +57,17 @@ TEST(StringRefTest, EmptyInitializerList) { TEST(StringRefTest, Iteration) { StringRef S("hello"); - const char *p = "hello"; - for (const char *it = S.begin(), *ie = S.end(); it != ie; ++it, ++p) - EXPECT_EQ(*it, *p); + constexpr StringLiteral CS("hello"); + + // Note: Cannot use literal strings in equal() as iteration over a literal + // string includes the null terminator. + const std::string_view RefFwd("hello"); + const std::string_view RefRev("olleh"); + + EXPECT_TRUE(equal(S, RefFwd)); + EXPECT_TRUE(equal(CS, RefFwd)); + EXPECT_TRUE(equal(make_range(S.rbegin(), S.rend()), RefRev)); + EXPECT_TRUE(equal(make_range(CS.rbegin(), CS.rend()), RefRev)); } TEST(StringRefTest, StringOps) { From a1e9b7e646b76bf844e8a9a101ebd27de11992ff Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Thu, 22 Aug 2024 15:19:41 -0700 Subject: [PATCH 268/426] =?UTF-8?q?Revert=20"[clang][rtsan]=20Introduce=20?= =?UTF-8?q?realtime=20sanitizer=20codegen=20and=20drive=E2=80=A6=20(#10574?= =?UTF-8?q?4)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …r (#102622)" This reverts commit d010ec6af8162a8ae4e42d2cac5282f83db0ce07. 
Build failure: https://lab.llvm.org/buildbot/#/builders/159/builds/4466 --- clang/docs/RealtimeSanitizer.rst | 85 ------------------- clang/docs/ReleaseNotes.rst | 5 -- clang/docs/UsersManual.rst | 2 - clang/docs/index.rst | 1 - clang/include/clang/Basic/Sanitizers.def | 3 - clang/include/clang/Driver/SanitizerArgs.h | 1 - clang/lib/CodeGen/BackendUtil.cpp | 8 -- clang/lib/CodeGen/CodeGenFunction.cpp | 7 -- clang/lib/Driver/SanitizerArgs.cpp | 14 ++- clang/lib/Driver/ToolChains/CommonArgs.cpp | 6 -- clang/lib/Driver/ToolChains/Darwin.cpp | 8 -- clang/lib/Driver/ToolChains/Linux.cpp | 1 - clang/test/CodeGen/rtsan_attribute_inserted.c | 7 -- .../test/CodeGen/rtsan_entry_exit_insertion.c | 13 --- .../rtsan_no_attribute_sanitizer_disabled.c | 6 -- clang/test/Driver/fsanitize.c | 46 ---------- 16 files changed, 5 insertions(+), 208 deletions(-) delete mode 100644 clang/docs/RealtimeSanitizer.rst delete mode 100644 clang/test/CodeGen/rtsan_attribute_inserted.c delete mode 100644 clang/test/CodeGen/rtsan_entry_exit_insertion.c delete mode 100644 clang/test/CodeGen/rtsan_no_attribute_sanitizer_disabled.c diff --git a/clang/docs/RealtimeSanitizer.rst b/clang/docs/RealtimeSanitizer.rst deleted file mode 100644 index 799cd43509c6e6..00000000000000 --- a/clang/docs/RealtimeSanitizer.rst +++ /dev/null @@ -1,85 +0,0 @@ -================= -RealtimeSanitizer -================= - -.. contents:: - :local: - -Introduction -============ -RealtimeSanitizer (a.k.a. RTSan) is a real-time safety testing tool for C and C++ -projects. RTSan can be used to detect real-time violations, i.e. calls to methods -that are not safe for use in functions with deterministic runtime requirements. -RTSan considers any function marked with the ``[[clang::nonblocking]]`` attribute -to be a real-time function. 
If RTSan detects a call to ``malloc``, ``free``, -``pthread_mutex_lock``, or anything else that could have a non-deterministic -execution time in a function marked ``[[clang::nonblocking]]`` -RTSan raises an error. - -The runtime slowdown introduced by RealtimeSanitizer is negligible. - -How to build -============ - -Build LLVM/Clang with `CMake ` and enable the -``compiler-rt`` runtime. An example CMake configuration that will allow for the -use/testing of RealtimeSanitizer: - -.. code-block:: console - - $ cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="compiler-rt" /llvm - -Usage -===== - -There are two requirements: - -1. The code must be compiled with the ``-fsanitize=realtime`` flag. -2. Functions that are subject to real-time constraints must be marked - with the ``[[clang::nonblocking]]`` attribute. - -Typically, these attributes should be added onto the functions that are entry -points for threads with real-time priority. These threads are subject to a fixed -callback time, such as audio callback threads or rendering loops in video game -code. - -.. code-block:: console - - % cat example_realtime_violation.cpp - #include - - void violation() [[clang::nonblocking]]{ - std::vector v; - v.resize(100); - } - - int main() { - violation(); - return 0; - } - # Compile and link - % clang++ -fsanitize=realtime -g example_realtime_violation.cpp - -If a real-time safety violation is detected in a ``[[clang::nonblocking]]`` -context, or any function invoked by that function, the program will exit with a -non-zero exit code. - -.. code-block:: console - - % clang++ -fsanitize=realtime -g example_realtime_violation.cpp - % ./a.out - Real-time violation: intercepted call to real-time unsafe function `malloc` in real-time context! 
Stack trace: - #0 0x000102893034 in __rtsan::PrintStackTrace() rtsan_stack.cpp:45 - #1 0x000102892e64 in __rtsan::Context::ExpectNotRealtime(char const*) rtsan_context.cpp:78 - #2 0x00010289397c in malloc rtsan_interceptors.cpp:286 - #3 0x000195bd7bd0 in operator new(unsigned long)+0x1c (libc++abi.dylib:arm64+0x16bd0) - #4 0x5c7f00010230f07c () - #5 0x00010230f058 in std::__1::__libcpp_allocate[abi:ue170006](unsigned long, unsigned long) new:324 - #6 0x00010230effc in std::__1::allocator::allocate[abi:ue170006](unsigned long) allocator.h:114 - ... snip ... - #10 0x00010230e4bc in std::__1::vector>::__append(unsigned long) vector:1162 - #11 0x00010230dcdc in std::__1::vector>::resize(unsigned long) vector:1981 - #12 0x00010230dc28 in violation() main.cpp:5 - #13 0x00010230dd64 in main main.cpp:9 - #14 0x0001958960dc () - #15 0x2f557ffffffffffc () diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e0ede62b80c2ee..34f6680c85e87e 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -454,11 +454,6 @@ Moved checkers Sanitizers ---------- -- Introduced Realtime Sanitizer, activated by using the -fsanitize=realtime - flag. This sanitizer detects unsafe system library calls, such as memory - allocations and mutex locks. If any such function is called during invocation - of a function marked with the ``[[clang::nonblocking]]`` attribute, an error - is printed to the console and the process exits non-zero. - Added the ``-fsanitize-undefined-ignore-overflow-pattern`` flag which can be used to disable specific overflow-dependent code patterns. The supported diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 069ecba875cd59..d19b77ae40b0d7 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2068,8 +2068,6 @@ are listed below. integrity. - ``-fsanitize=safe-stack``: :doc:`safe stack ` protection against stack-based memory corruption errors. 
- - ``-fsanitize=realtime``: :doc:`RealtimeSanitizer`, - a real-time safety checker. There are more fine-grained checks available: see the :ref:`list ` of specific kinds of diff --git a/clang/docs/index.rst b/clang/docs/index.rst index 4a497f4d9bcc3c..9bae0bd83243bd 100644 --- a/clang/docs/index.rst +++ b/clang/docs/index.rst @@ -32,7 +32,6 @@ Using Clang as a Compiler UndefinedBehaviorSanitizer DataFlowSanitizer LeakSanitizer - RealtimeSanitizer SanitizerCoverage SanitizerStats SanitizerSpecialCaseList diff --git a/clang/include/clang/Basic/Sanitizers.def b/clang/include/clang/Basic/Sanitizers.def index 9223f62b3639a7..bee35e9dca7c39 100644 --- a/clang/include/clang/Basic/Sanitizers.def +++ b/clang/include/clang/Basic/Sanitizers.def @@ -79,9 +79,6 @@ SANITIZER("thread", Thread) // Numerical stability sanitizer. SANITIZER("numerical", NumericalStability) -// RealtimeSanitizer -SANITIZER("realtime", Realtime) - // LeakSanitizer SANITIZER("leak", Leak) diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h index 0c6f3869549ef7..e64ec463ca8907 100644 --- a/clang/include/clang/Driver/SanitizerArgs.h +++ b/clang/include/clang/Driver/SanitizerArgs.h @@ -107,7 +107,6 @@ class SanitizerArgs { bool needsNsanRt() const { return Sanitizers.has(SanitizerKind::NumericalStability); } - bool needsRtsanRt() const { return Sanitizers.has(SanitizerKind::Realtime); } bool hasMemTag() const { return hasMemtagHeap() || hasMemtagStack() || hasMemtagGlobals(); diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 026f16484c0949..fdd89edd72e109 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -78,7 +78,6 @@ #include "llvm/Transforms/Instrumentation/MemorySanitizer.h" #include "llvm/Transforms/Instrumentation/NumericalStabilitySanitizer.h" #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" -#include "llvm/Transforms/Instrumentation/RealtimeSanitizer.h" 
#include "llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h" #include "llvm/Transforms/Instrumentation/SanitizerCoverage.h" #include "llvm/Transforms/Instrumentation/ThreadSanitizer.h" @@ -991,13 +990,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline( FPM.addPass(BoundsCheckingPass()); }); - if (LangOpts.Sanitize.has(SanitizerKind::Realtime)) - PB.registerScalarOptimizerLateEPCallback( - [](FunctionPassManager &FPM, OptimizationLevel Level) { - RealtimeSanitizerOptions Opts; - FPM.addPass(RealtimeSanitizerPass(Opts)); - }); - // Don't add sanitizers if we are here from ThinLTO PostLink. That already // done on PreLink stage. if (!IsThinLTOPostLink) { diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index c89eaa0f4e3bfc..eff8c9f5694084 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -845,13 +845,6 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, if (SanOpts.has(SanitizerKind::ShadowCallStack)) Fn->addFnAttr(llvm::Attribute::ShadowCallStack); - if (SanOpts.has(SanitizerKind::Realtime)) - if (FD && FD->getASTContext().hasAnyFunctionEffects()) - for (const FunctionEffectWithCondition &Fe : FD->getFunctionEffects()) { - if (Fe.Effect.kind() == FunctionEffect::Kind::NonBlocking) - Fn->addFnAttr(llvm::Attribute::SanitizeRealtime); - } - // Apply fuzzing attribute to the function. 
if (SanOpts.hasOneOf(SanitizerKind::Fuzzer | SanitizerKind::FuzzerNoLink)) Fn->addFnAttr(llvm::Attribute::OptForFuzzing); diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 09262f40b5b50c..9d9ad79d51d7f8 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -558,15 +558,11 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC, SanitizerKind::Leak | SanitizerKind::Thread | SanitizerKind::Memory | SanitizerKind::KernelAddress | SanitizerKind::Scudo | SanitizerKind::SafeStack), - std::make_pair(SanitizerKind::MemTag, SanitizerKind::Address | - SanitizerKind::KernelAddress | - SanitizerKind::HWAddress | - SanitizerKind::KernelHWAddress), - std::make_pair(SanitizerKind::KCFI, SanitizerKind::Function), - std::make_pair(SanitizerKind::Realtime, - SanitizerKind::Address | SanitizerKind::Thread | - SanitizerKind::Undefined | SanitizerKind::Memory)}; - + std::make_pair(SanitizerKind::MemTag, + SanitizerKind::Address | SanitizerKind::KernelAddress | + SanitizerKind::HWAddress | + SanitizerKind::KernelHWAddress), + std::make_pair(SanitizerKind::KCFI, SanitizerKind::Function)}; // Enable toolchain specific default sanitizers if not explicitly disabled. SanitizerMask Default = TC.getDefaultSanitizers() & ~AllRemove; diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 0601016c3b14b8..0738ed18f54078 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1456,8 +1456,6 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, if (!Args.hasArg(options::OPT_shared)) HelperStaticRuntimes.push_back("hwasan-preinit"); } - if (SanArgs.needsRtsanRt() && SanArgs.linkRuntimes()) - SharedRuntimes.push_back("rtsan"); } // The stats_client library is also statically linked into DSOs. 
@@ -1483,10 +1481,6 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, StaticRuntimes.push_back("asan_cxx"); } - if (!SanArgs.needsSharedRt() && SanArgs.needsRtsanRt() && - SanArgs.linkRuntimes()) - StaticRuntimes.push_back("rtsan"); - if (!SanArgs.needsSharedRt() && SanArgs.needsMemProfRt()) { StaticRuntimes.push_back("memprof"); if (SanArgs.linkCXXRuntimes()) diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index 5e7f9290e2009d..2550541a438481 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -1519,8 +1519,6 @@ void DarwinClang::AddLinkRuntimeLibArgs(const ArgList &Args, const char *sanitizer = nullptr; if (Sanitize.needsUbsanRt()) { sanitizer = "UndefinedBehaviorSanitizer"; - } else if (Sanitize.needsRtsanRt()) { - sanitizer = "RealtimeSanitizer"; } else if (Sanitize.needsAsanRt()) { sanitizer = "AddressSanitizer"; } else if (Sanitize.needsTsanRt()) { @@ -1543,11 +1541,6 @@ void DarwinClang::AddLinkRuntimeLibArgs(const ArgList &Args, AddLinkSanitizerLibArgs(Args, CmdArgs, "asan"); } } - if (Sanitize.needsRtsanRt()) { - assert(Sanitize.needsSharedRt() && - "Static sanitizer runtimes not supported"); - AddLinkSanitizerLibArgs(Args, CmdArgs, "rtsan"); - } if (Sanitize.needsLsanRt()) AddLinkSanitizerLibArgs(Args, CmdArgs, "lsan"); if (Sanitize.needsUbsanRt()) { @@ -3546,7 +3539,6 @@ SanitizerMask Darwin::getSupportedSanitizers() const { Res |= SanitizerKind::Address; Res |= SanitizerKind::PointerCompare; Res |= SanitizerKind::PointerSubtract; - Res |= SanitizerKind::Realtime; Res |= SanitizerKind::Leak; Res |= SanitizerKind::Fuzzer; Res |= SanitizerKind::FuzzerNoLink; diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index 96680b3412a2db..2265138edbffbe 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -800,7 +800,6 @@ SanitizerMask Linux::getSupportedSanitizers() 
const { Res |= SanitizerKind::Address; Res |= SanitizerKind::PointerCompare; Res |= SanitizerKind::PointerSubtract; - Res |= SanitizerKind::Realtime; Res |= SanitizerKind::Fuzzer; Res |= SanitizerKind::FuzzerNoLink; Res |= SanitizerKind::KernelAddress; diff --git a/clang/test/CodeGen/rtsan_attribute_inserted.c b/clang/test/CodeGen/rtsan_attribute_inserted.c deleted file mode 100644 index 05a1d9a8c2047a..00000000000000 --- a/clang/test/CodeGen/rtsan_attribute_inserted.c +++ /dev/null @@ -1,7 +0,0 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=realtime %s -emit-llvm -o - %s | FileCheck %s - -float process(float *a) [[clang::nonblocking]] { return *a; } - -// CHECK-LABEL: @process{{.*}}#0 { -// CHECK: attributes #0 = { -// CHECK-SAME: {{.*sanitize_realtime.*}} diff --git a/clang/test/CodeGen/rtsan_entry_exit_insertion.c b/clang/test/CodeGen/rtsan_entry_exit_insertion.c deleted file mode 100644 index 9ba0103ca1e353..00000000000000 --- a/clang/test/CodeGen/rtsan_entry_exit_insertion.c +++ /dev/null @@ -1,13 +0,0 @@ -// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -fsanitize=realtime -emit-llvm -o - %s | FileCheck %s - -int foo(int *a) [[clang::nonblocking]] { return *a; } - -// The first instruction after the function is entred should be a call to -// enable the realtime sanitizer stack. -// CHECK-LABEL: define{{.*}}@foo -// CHECK-NEXT: entry: -// CHECK-NEXT: call{{.*}}__rtsan_realtime_enter - -// __rtsan_realtime_exit should be inserted at all function returns. 
-// CHECK-LABEL: call{{.*}}__rtsan_realtime_exit -// CHECK-NEXT: ret diff --git a/clang/test/CodeGen/rtsan_no_attribute_sanitizer_disabled.c b/clang/test/CodeGen/rtsan_no_attribute_sanitizer_disabled.c deleted file mode 100644 index 43ad6ed1a429ee..00000000000000 --- a/clang/test/CodeGen/rtsan_no_attribute_sanitizer_disabled.c +++ /dev/null @@ -1,6 +0,0 @@ -// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s - -float process(float *a) [[clang::nonblocking]] { return *a; } - -// Without the -fsanitize=realtime flag, we shouldn't attach the attribute. -// CHECK-NOT: {{.*sanitize_realtime.*}} diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c index f86c978f221cd4..678fa432fb0a0a 100644 --- a/clang/test/Driver/fsanitize.c +++ b/clang/test/Driver/fsanitize.c @@ -1040,49 +1040,3 @@ // RUN: not %clang --target=aarch64-none-elf -fsanitize=dataflow %s -### 2>&1 | FileCheck %s -check-prefix=UNSUPPORTED-BAREMETAL // RUN: not %clang --target=arm-arm-none-eabi -fsanitize=shadow-call-stack %s -### 2>&1 | FileCheck %s -check-prefix=UNSUPPORTED-BAREMETAL // UNSUPPORTED-BAREMETAL: unsupported option '-fsanitize={{.*}}' for target - -// RUN: %clang --target=x86_64-apple-darwin -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-DARWIN -// CHECK-RTSAN-X86-64-DARWIN-NOT: unsupported option - -// RUN: %clang --target=x86_64-apple-darwin -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-DARWIN -// CHECK-RTSAN-X86-64-DARWIN-NOT: unsupported option -// RUN: %clang --target=x86_64-apple-macos -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-MACOS -// CHECK-RTSAN-X86-64-MACOS-NOT: unsupported option -// RUN: %clang --target=arm64-apple-macos -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-ARM64-MACOS -// CHECK-RTSAN-ARM64-MACOS-NOT: unsupported option - -// RUN: %clang --target=arm64-apple-ios-simulator 
-fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-ARM64-IOSSIMULATOR -// CHECK-RTSAN-ARM64-IOSSIMULATOR-NOT: unsupported option - -// RUN: %clang --target=arm64-apple-watchos-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-ARM64-WATCHOSSIMULATOR -// CHECK-RTSAN-ARM64-WATCHOSSIMULATOR-NOT: unsupported option - -// RUN: %clang --target=arm64-apple-tvos-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-ARM64-TVOSSIMULATOR -// CHECK-RTSAN-ARM64-TVOSSIMULATOR-NOT: unsupported option - -// RUN: %clang --target=x86_64-apple-ios-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-IOSSIMULATOR -// CHECK-RTSAN-X86-64-IOSSIMULATOR-NOT: unsupported option - -// RUN: %clang --target=x86_64-apple-watchos-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-WATCHOSSIMULATOR -// CHECK-RTSAN-X86-64-WATCHOSSIMULATOR-NOT: unsupported option - -// RUN: %clang --target=x86_64-apple-tvos-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-TVOSSIMULATOR -// CHECK-RTSAN-X86-64-TVOSSIMULATOR-NOT: unsupported option - -// RUN: %clang --target=x86_64-linux-gnu -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-LINUX -// CHECK-RTSAN-X86-64-LINUX-NOT: unsupported option - -// RUN: not %clang --target=i386-pc-openbsd -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-OPENBSD -// CHECK-RTSAN-OPENBSD: unsupported option '-fsanitize=realtime' for target 'i386-pc-openbsd' - -// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=realtime,thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-REALTIME-TSAN -// CHECK-REALTIME-TSAN: error: invalid argument '-fsanitize=realtime' not allowed with '-fsanitize=thread' - -// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=realtime,address %s -### 2>&1 | FileCheck %s 
--check-prefix=CHECK-REALTIME-ASAN -// CHECK-REALTIME-ASAN: error: invalid argument '-fsanitize=realtime' not allowed with '-fsanitize=address' - -// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=realtime,memory %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-REALTIME-MSAN -// CHECK-REALTIME-MSAN: error: invalid argument '-fsanitize=realtime' not allowed with '-fsanitize=memory' - -// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=realtime,undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-REALTIME-UBSAN -// CHECK-REALTIME-UBSAN: error: invalid argument '-fsanitize=realtime' not allowed with '-fsanitize=undefined' From 4e6ff75efa14e0156c005ffcf3d7964dc754b792 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 22 Aug 2024 15:35:05 -0700 Subject: [PATCH 269/426] [Vectorize] Fix a warning This patch fixes: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp:7245:1: error: unused function 'planContainsAdditionalSimplifications' [-Werror,-Wunused-function] --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f2de38f46c86aa..f3fb888f20cbbd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7241,7 +7241,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, /// not have corresponding recipes in \p Plan and are not marked to be ignored /// in \p CostCtx. This means the VPlan contains simplification that the legacy /// cost-model did not account for. 
-static bool +[[maybe_unused]] static bool planContainsAdditionalSimplifications(VPlan &Plan, ElementCount VF, VPCostContext &CostCtx, Loop *TheLoop, LoopVectorizationCostModel &CM) { From ca48b015a1719ba7be2d357056f348473d495d3d Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 22 Aug 2024 16:01:36 -0700 Subject: [PATCH 270/426] [LTO] Use a helper function to add a definition (NFC) (#105721) I missed this one when I introduced helper functions in: commit 3082a381f57ef2885c270f41f2955e08c79634c5 Author: Kazu Hirata Date: Thu Aug 22 12:06:47 2024 -0700 --- llvm/lib/Transforms/IPO/FunctionImport.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 354ad0fde092a7..55803670071d16 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -600,8 +600,8 @@ class WorkloadImportsManager : public ModuleImportsManager { LLVM_DEBUG(dbgs() << "[Workload][Including]" << VI.name() << " from " << ExportingModule << " : " << Function::getGUID(VI.name()) << "\n"); - ImportList[ExportingModule][VI.getGUID()] = - GlobalValueSummary::Definition; + FunctionImporter::addDefinition(ImportList, ExportingModule, + VI.getGUID()); GVI.onImportingSummary(*GVS); if (ExportLists) (*ExportLists)[ExportingModule].insert(VI); From 424b87b8d6f0fab0898cb5cfdf9b1bbf06ddee03 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 22 Aug 2024 16:19:48 -0700 Subject: [PATCH 271/426] [RISCV][TTI] Use legalized element types when costing casts (#105723) This fixes a crash introduced by my ac6e1fd0c089043fe60bd0040ba3cad884f00206. I had failed to consider the case where a vector is truncated to an illegal element type. The resulting intermediate VT wasn't an MVT and we'd fail an assertion. Surprisingly, SLP does query illegal element types in some cases. 
--- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 11 +++++------ llvm/test/Analysis/CostModel/RISCV/cast.ll | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 781e3d7929aa43..1af873f85d03c7 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1055,13 +1055,12 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) - - (int)Log2_32(Src->getScalarSizeInBits()); + int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) - + (int)Log2_32(SrcLT.second.getScalarSizeInBits()); switch (ISD) { case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: { - const unsigned SrcEltSize = Src->getScalarSizeInBits(); - if (SrcEltSize == 1) { + if (Src->getScalarSizeInBits() == 1) { // We do not use vsext/vzext to extend from mask vector. // Instead we use the following instructions to extend from mask vector: // vmv.v.i v8, 0 @@ -1091,8 +1090,8 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, case ISD::FP_EXTEND: case ISD::FP_ROUND: { // Counts of narrow/widen instructions. - unsigned SrcEltSize = Src->getScalarSizeInBits(); - unsigned DstEltSize = Dst->getScalarSizeInBits(); + unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits(); + unsigned DstEltSize = DstLT.second.getScalarSizeInBits(); unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI : (ISD == ISD::FP_EXTEND) ? 
RISCV::VFWCVT_F_F_V diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll index 68e633d6f05505..e90fab9fbc8c46 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll @@ -1016,6 +1016,11 @@ define void @zext() { define void @trunc() { ; RV32-LABEL: 'trunc' +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i2 = trunc <2 x i16> undef to <2 x i2> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i4 = trunc <2 x i16> undef to <2 x i4> +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6> +; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2> +; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8> @@ -1169,6 +1174,11 @@ define void @trunc() { ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'trunc' +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i2 = trunc <2 x i16> undef to <2 x i2> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i4 = trunc <2 x i16> undef to <2 x i4> +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6> +; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2> +; RV64-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8> @@ -1321,6 +1331,13 @@ define void @trunc() { ; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i64_nxv64i1 = trunc undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + + %v2i16_v2i2 = trunc <2 x i16> undef to <2 x i2> + %v2i16_v2i4 = trunc <2 x i16> undef to <2 x i4> + %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6> + %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2> + %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4> + %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8> %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8> %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8> From 0d21c2b3e516617ee0fe60e2e5368e0c447b17ad Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 22 Aug 2024 16:20:51 -0700 Subject: [PATCH 272/426] [SandboxIR] Implement CatchReturnInst (#105605) This patch implements sandboxir::CatchReturnInst mirroring llvm::CatchReturnInst. 
--- llvm/include/llvm/SandboxIR/SandboxIR.h | 29 +++++++++ .../llvm/SandboxIR/SandboxIRValues.def | 1 + llvm/lib/SandboxIR/SandboxIR.cpp | 54 +++++++++++++++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 65 +++++++++++++++++++ llvm/unittests/SandboxIR/TrackerTest.cpp | 50 ++++++++++++++ 5 files changed, 199 insertions(+) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index ed5b6f9c9da852..c09e167d67bb1c 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -130,6 +130,7 @@ class CallBrInst; class FuncletPadInst; class CatchPadInst; class CleanupPadInst; +class CatchReturnInst; class GetElementPtrInst; class CastInst; class PtrToIntInst; @@ -262,6 +263,7 @@ class Value { friend class FuncletPadInst; // For getting `Val`. friend class CatchPadInst; // For getting `Val`. friend class CleanupPadInst; // For getting `Val`. + friend class CatchReturnInst; // For getting `Val`. friend class GetElementPtrInst; // For getting `Val`. friend class CatchSwitchInst; // For getting `Val`. friend class SwitchInst; // For getting `Val`. @@ -687,6 +689,7 @@ class Instruction : public sandboxir::User { friend class CallBrInst; // For getTopmostLLVMInstruction(). friend class CatchPadInst; // For getTopmostLLVMInstruction(). friend class CleanupPadInst; // For getTopmostLLVMInstruction(). + friend class CatchReturnInst; // For getTopmostLLVMInstruction(). friend class GetElementPtrInst; // For getTopmostLLVMInstruction(). friend class CatchSwitchInst; // For getTopmostLLVMInstruction(). friend class SwitchInst; // For getTopmostLLVMInstruction(). @@ -1914,6 +1917,30 @@ class CleanupPadInst : public FuncletPadInst { } }; +class CatchReturnInst + : public SingleLLVMInstructionImpl { + CatchReturnInst(llvm::CatchReturnInst *CRI, Context &Ctx) + : SingleLLVMInstructionImpl(ClassID::CatchRet, Opcode::CatchRet, CRI, + Ctx) {} + friend class Context; // For constructor. 
+ +public: + static CatchReturnInst *create(CatchPadInst *CatchPad, BasicBlock *BB, + BBIterator WhereIt, BasicBlock *WhereBB, + Context &Ctx); + CatchPadInst *getCatchPad() const; + void setCatchPad(CatchPadInst *CatchPad); + BasicBlock *getSuccessor() const; + void setSuccessor(BasicBlock *NewSucc); + unsigned getNumSuccessors() { + return cast(Val)->getNumSuccessors(); + } + Value *getCatchSwitchParentPad() const; + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::CatchRet; + } +}; + class GetElementPtrInst final : public SingleLLVMInstructionImpl { /// Use Context::createGetElementPtrInst(). Don't call @@ -2820,6 +2847,8 @@ class Context { friend CatchPadInst; // For createCatchPadInst() CleanupPadInst *createCleanupPadInst(llvm::CleanupPadInst *I); friend CleanupPadInst; // For createCleanupPadInst() + CatchReturnInst *createCatchReturnInst(llvm::CatchReturnInst *I); + friend CatchReturnInst; // For createCatchReturnInst() GetElementPtrInst *createGetElementPtrInst(llvm::GetElementPtrInst *I); friend GetElementPtrInst; // For createGetElementPtrInst() CatchSwitchInst *createCatchSwitchInst(llvm::CatchSwitchInst *I); diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index a75f872bc88acb..b7b396e30dc3ca 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -48,6 +48,7 @@ DEF_INSTR(Invoke, OP(Invoke), InvokeInst) DEF_INSTR(CallBr, OP(CallBr), CallBrInst) DEF_INSTR(CatchPad, OP(CatchPad), CatchPadInst) DEF_INSTR(CleanupPad, OP(CleanupPad), CleanupPadInst) +DEF_INSTR(CatchRet, OP(CatchRet), CatchReturnInst) DEF_INSTR(GetElementPtr, OP(GetElementPtr), GetElementPtrInst) DEF_INSTR(CatchSwitch, OP(CatchSwitch), CatchSwitchInst) DEF_INSTR(Switch, OP(Switch), SwitchInst) diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 1ff82a968a717f..b953e68c33180e 100644 --- 
a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -1105,6 +1105,50 @@ CleanupPadInst *CleanupPadInst::create(Value *ParentPad, ArrayRef Args, return Ctx.createCleanupPadInst(LLVMI); } +CatchReturnInst *CatchReturnInst::create(CatchPadInst *CatchPad, BasicBlock *BB, + BBIterator WhereIt, + BasicBlock *WhereBB, Context &Ctx) { + auto &Builder = Ctx.getLLVMIRBuilder(); + if (WhereIt != WhereBB->end()) + Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction()); + else + Builder.SetInsertPoint(cast(WhereBB->Val)); + llvm::CatchReturnInst *LLVMI = Builder.CreateCatchRet( + cast(CatchPad->Val), cast(BB->Val)); + return Ctx.createCatchReturnInst(LLVMI); +} + +CatchPadInst *CatchReturnInst::getCatchPad() const { + return cast( + Ctx.getValue(cast(Val)->getCatchPad())); +} + +void CatchReturnInst::setCatchPad(CatchPadInst *CatchPad) { + Ctx.getTracker() + .emplaceIfTracking>(this); + cast(Val)->setCatchPad( + cast(CatchPad->Val)); +} + +BasicBlock *CatchReturnInst::getSuccessor() const { + return cast( + Ctx.getValue(cast(Val)->getSuccessor())); +} + +void CatchReturnInst::setSuccessor(BasicBlock *NewSucc) { + Ctx.getTracker() + .emplaceIfTracking>(this); + cast(Val)->setSuccessor( + cast(NewSucc->Val)); +} + +Value *CatchReturnInst::getCatchSwitchParentPad() const { + return Ctx.getValue( + cast(Val)->getCatchSwitchParentPad()); +} + Value *GetElementPtrInst::create(Type *Ty, Value *Ptr, ArrayRef IdxList, BasicBlock::iterator WhereIt, @@ -2138,6 +2182,12 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { std::unique_ptr(new CleanupPadInst(LLVMCPI, *this)); return It->second.get(); } + case llvm::Instruction::CatchRet: { + auto *LLVMCRI = cast(LLVMV); + It->second = + std::unique_ptr(new CatchReturnInst(LLVMCRI, *this)); + return It->second.get(); + } case llvm::Instruction::GetElementPtr: { auto *LLVMGEP = cast(LLVMV); It->second = std::unique_ptr( @@ -2322,6 +2372,10 @@ CleanupPadInst 
*Context::createCleanupPadInst(llvm::CleanupPadInst *I) { auto NewPtr = std::unique_ptr(new CleanupPadInst(I, *this)); return cast(registerValue(std::move(NewPtr))); } +CatchReturnInst *Context::createCatchReturnInst(llvm::CatchReturnInst *I) { + auto NewPtr = std::unique_ptr(new CatchReturnInst(I, *this)); + return cast(registerValue(std::move(NewPtr))); +} GetElementPtrInst * Context::createGetElementPtrInst(llvm::GetElementPtrInst *I) { auto NewPtr = diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 28894397a60d6f..76ca64caeeeb07 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -1957,6 +1957,71 @@ define void @foo() { #endif // NDEBUG } +TEST_F(SandboxIRTest, CatchReturnInst) { + parseIR(C, R"IR( +define void @foo() { +dispatch: + %cs = catchswitch within none [label %catch] unwind to caller +catch: + %catchpad = catchpad within %cs [ptr @foo] + catchret from %catchpad to label %continue +continue: + ret void +catch2: + %catchpad2 = catchpad within %cs [ptr @foo] + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + BasicBlock *LLVMCatch = getBasicBlockByName(LLVMF, "catch"); + auto LLVMIt = LLVMCatch->begin(); + [[maybe_unused]] auto *LLVMCP = cast(&*LLVMIt++); + auto *LLVMCR = cast(&*LLVMIt++); + + sandboxir::Context Ctx(C); + [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF); + auto *Catch = cast(Ctx.getValue(LLVMCatch)); + auto *Catch2 = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "catch2"))); + auto It = Catch->begin(); + [[maybe_unused]] auto *CP = cast(&*It++); + auto *CR = cast(&*It++); + auto *CP2 = cast(&*Catch2->begin()); + + // Check getCatchPad(). + EXPECT_EQ(CR->getCatchPad(), Ctx.getValue(LLVMCR->getCatchPad())); + // Check setCatchPad(). 
+ auto *OrigCP = CR->getCatchPad(); + auto *NewCP = CP2; + EXPECT_NE(NewCP, OrigCP); + CR->setCatchPad(NewCP); + EXPECT_EQ(CR->getCatchPad(), NewCP); + CR->setCatchPad(OrigCP); + EXPECT_EQ(CR->getCatchPad(), OrigCP); + // Check getSuccessor(). + EXPECT_EQ(CR->getSuccessor(), Ctx.getValue(LLVMCR->getSuccessor())); + // Check setSuccessor(). + auto *OrigSucc = CR->getSuccessor(); + auto *NewSucc = Catch; + EXPECT_NE(NewSucc, OrigSucc); + CR->setSuccessor(NewSucc); + EXPECT_EQ(CR->getSuccessor(), NewSucc); + CR->setSuccessor(OrigSucc); + EXPECT_EQ(CR->getSuccessor(), OrigSucc); + // Check getNumSuccessors(). + EXPECT_EQ(CR->getNumSuccessors(), LLVMCR->getNumSuccessors()); + // Check getCatchSwitchParentPad(). + EXPECT_EQ(CR->getCatchSwitchParentPad(), + Ctx.getValue(LLVMCR->getCatchSwitchParentPad())); + // Check create(). + auto *CRI = + cast(sandboxir::CatchReturnInst::create( + CP, Catch, CP->getIterator(), Catch, Ctx)); + EXPECT_EQ(CRI->getNextNode(), CP); + EXPECT_EQ(CRI->getCatchPad(), CP); + EXPECT_EQ(CRI->getSuccessor(), Catch); +} + TEST_F(SandboxIRTest, GetElementPtrInstruction) { parseIR(C, R"IR( define void @foo(ptr %ptr, <2 x ptr> %ptrs) { diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp index c2faf60a57f3b8..6614ab7fa248e1 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -713,6 +713,56 @@ define void @foo(i32 %cond0, i32 %cond1) { EXPECT_EQ(*HIt++, Handler1); } +TEST_F(TrackerTest, CatchReturnInstSetters) { + parseIR(C, R"IR( +define void @foo() { +dispatch: + %cs = catchswitch within none [label %catch] unwind to caller +catch: + %catchpad = catchpad within %cs [ptr @foo] + catchret from %catchpad to label %continue +continue: + ret void +catch2: + %catchpad2 = catchpad within %cs [ptr @foo] + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + BasicBlock *LLVMCatch = getBasicBlockByName(LLVMF, "catch"); + auto LLVMIt = 
LLVMCatch->begin(); + [[maybe_unused]] auto *LLVMCP = cast(&*LLVMIt++); + + sandboxir::Context Ctx(C); + [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF); + auto *Catch = cast(Ctx.getValue(LLVMCatch)); + auto *Catch2 = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "catch2"))); + auto It = Catch->begin(); + [[maybe_unused]] auto *CP = cast(&*It++); + auto *CR = cast(&*It++); + auto *CP2 = cast(&*Catch2->begin()); + + // Check setCatchPad(). + auto *OrigCP = CR->getCatchPad(); + auto *NewCP = CP2; + EXPECT_NE(NewCP, OrigCP); + Ctx.save(); + CR->setCatchPad(NewCP); + EXPECT_EQ(CR->getCatchPad(), NewCP); + Ctx.revert(); + EXPECT_EQ(CR->getCatchPad(), OrigCP); + // Check setSuccessor(). + auto *OrigSucc = CR->getSuccessor(); + auto *NewSucc = Catch; + EXPECT_NE(NewSucc, OrigSucc); + Ctx.save(); + CR->setSuccessor(NewSucc); + EXPECT_EQ(CR->getSuccessor(), NewSucc); + Ctx.revert(); + EXPECT_EQ(CR->getSuccessor(), OrigSucc); +} + TEST_F(TrackerTest, SwitchInstSetters) { parseIR(C, R"IR( define void @foo(i32 %cond0, i32 %cond1) { From 1df15042bdda3817827e39c772525a4a24c1adbe Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 22 Aug 2024 16:38:19 -0700 Subject: [PATCH 273/426] Revert "[clang] Merge lifetimebound and GSL code paths for lifetime analysis (#104906)" (#105752) Revert as it breaks libc++ tests, see #104906. This reverts commit c368a720a0b40bb8fe4aff3971fe9a7009c85aa6. --- clang/docs/ReleaseNotes.rst | 2 - clang/lib/Sema/CheckExprLifetime.cpp | 246 +++++++++++------- .../Sema/warn-lifetime-analysis-nocfg.cpp | 13 - 3 files changed, 149 insertions(+), 112 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 34f6680c85e87e..04cc4cc22964a8 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -239,8 +239,6 @@ Improvements to Clang's diagnostics - Clang now diagnoses when the result of a [[nodiscard]] function is discarded after being cast in C. Fixes #GH104391. 
-- Don't emit duplicated dangling diagnostics. (#GH93386). - - Improved diagnostic when trying to befriend a concept. (#GH45182). Improvements to Clang's time-trace diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 7e23c08cc79ffb..7389046eaddde1 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -237,11 +237,13 @@ static bool pathContainsInit(IndirectLocalPath &Path) { static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *Init, LocalVisitor Visit, - bool RevisitSubinits); + bool RevisitSubinits, + bool EnableLifetimeWarnings); static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, Expr *Init, ReferenceKind RK, - LocalVisitor Visit); + LocalVisitor Visit, + bool EnableLifetimeWarnings); template static bool isRecordWithAttr(QualType Type) { if (auto *RD = Type->getAsCXXRecordDecl()) @@ -324,6 +326,66 @@ static bool shouldTrackFirstArgument(const FunctionDecl *FD) { return false; } +static void handleGslAnnotatedTypes(IndirectLocalPath &Path, Expr *Call, + LocalVisitor Visit) { + auto VisitPointerArg = [&](const Decl *D, Expr *Arg, bool Value) { + // We are not interested in the temporary base objects of gsl Pointers: + // Temp().ptr; // Here ptr might not dangle. + if (isa(Arg->IgnoreImpCasts())) + return; + // Once we initialized a value with a reference, it can no longer dangle. + if (!Value) { + for (const IndirectLocalPathEntry &PE : llvm::reverse(Path)) { + if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit) + continue; + if (PE.Kind == IndirectLocalPathEntry::GslPointerInit || + PE.Kind == IndirectLocalPathEntry::GslPointerAssignment) + return; + break; + } + } + Path.push_back({Value ? 
IndirectLocalPathEntry::GslPointerInit + : IndirectLocalPathEntry::GslReferenceInit, + Arg, D}); + if (Arg->isGLValue()) + visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, + Visit, + /*EnableLifetimeWarnings=*/true); + else + visitLocalsRetainedByInitializer(Path, Arg, Visit, true, + /*EnableLifetimeWarnings=*/true); + Path.pop_back(); + }; + + if (auto *MCE = dyn_cast(Call)) { + const auto *MD = cast_or_null(MCE->getDirectCallee()); + if (MD && shouldTrackImplicitObjectArg(MD)) + VisitPointerArg(MD, MCE->getImplicitObjectArgument(), + !MD->getReturnType()->isReferenceType()); + return; + } else if (auto *OCE = dyn_cast(Call)) { + FunctionDecl *Callee = OCE->getDirectCallee(); + if (Callee && Callee->isCXXInstanceMember() && + shouldTrackImplicitObjectArg(cast(Callee))) + VisitPointerArg(Callee, OCE->getArg(0), + !Callee->getReturnType()->isReferenceType()); + return; + } else if (auto *CE = dyn_cast(Call)) { + FunctionDecl *Callee = CE->getDirectCallee(); + if (Callee && shouldTrackFirstArgument(Callee)) + VisitPointerArg(Callee, CE->getArg(0), + !Callee->getReturnType()->isReferenceType()); + return; + } + + if (auto *CCE = dyn_cast(Call)) { + const auto *Ctor = CCE->getConstructor(); + const CXXRecordDecl *RD = Ctor->getParent(); + if (CCE->getNumArgs() > 0 && RD->hasAttr()) + VisitPointerArg(Ctor->getParamDecl(0), CCE->getArgs()[0], true); + } +} + static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { const TypeSourceInfo *TSI = FD->getTypeSourceInfo(); if (!TSI) @@ -361,9 +423,8 @@ static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { return false; } -// Visit lifetimebound or gsl-pointer arguments. 
-static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, - LocalVisitor Visit) { +static void visitLifetimeBoundArguments(IndirectLocalPath &Path, Expr *Call, + LocalVisitor Visit) { const FunctionDecl *Callee; ArrayRef Args; @@ -378,8 +439,6 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, if (!Callee) return; - bool EnableGSLAnalysis = !Callee->getASTContext().getDiagnostics().isIgnored( - diag::warn_dangling_lifetime_pointer, SourceLocation()); Expr *ObjectArg = nullptr; if (isa(Call) && Callee->isCXXInstanceMember()) { ObjectArg = Args[0]; @@ -392,35 +451,11 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, Path.push_back({IndirectLocalPathEntry::LifetimeBoundCall, Arg, D}); if (Arg->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, - Visit); + Visit, + /*EnableLifetimeWarnings=*/false); else - visitLocalsRetainedByInitializer(Path, Arg, Visit, true); - Path.pop_back(); - }; - auto VisitGSLPointerArg = [&](const Decl *D, Expr *Arg, bool Value) { - // We are not interested in the temporary base objects of gsl Pointers: - // Temp().ptr; // Here ptr might not dangle. - if (isa(Arg->IgnoreImpCasts())) - return; - // Once we initialized a value with a reference, it can no longer dangle. - if (!Value) { - for (const IndirectLocalPathEntry &PE : llvm::reverse(Path)) { - if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit) - continue; - if (PE.Kind == IndirectLocalPathEntry::GslPointerInit || - PE.Kind == IndirectLocalPathEntry::GslPointerAssignment) - return; - break; - } - } - Path.push_back({Value ? 
IndirectLocalPathEntry::GslPointerInit - : IndirectLocalPathEntry::GslReferenceInit, - Arg, D}); - if (Arg->isGLValue()) - visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, - Visit); - else - visitLocalsRetainedByInitializer(Path, Arg, Visit, true); + visitLocalsRetainedByInitializer(Path, Arg, Visit, true, + /*EnableLifetimeWarnings=*/false); Path.pop_back(); }; @@ -443,12 +478,6 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, CheckCoroObjArg = false; if (implicitObjectParamIsLifetimeBound(Callee) || CheckCoroObjArg) VisitLifetimeBoundArg(Callee, ObjectArg); - else if (EnableGSLAnalysis) { - if (auto *CME = dyn_cast(Callee); - CME && shouldTrackImplicitObjectArg(CME)) - VisitGSLPointerArg(Callee, ObjectArg, - !Callee->getReturnType()->isReferenceType()); - } } for (unsigned I = 0, @@ -456,17 +485,6 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, I != N; ++I) { if (CheckCoroCall || Callee->getParamDecl(I)->hasAttr()) VisitLifetimeBoundArg(Callee->getParamDecl(I), Args[I]); - else if (EnableGSLAnalysis && I == 0) { // GSL - if (shouldTrackFirstArgument(Callee)) { - VisitGSLPointerArg(Callee, Args[0], - !Callee->getReturnType()->isReferenceType()); - } else if (auto *CCE = dyn_cast(Call); - CCE && - CCE->getConstructor()->getParent()->hasAttr()) { - VisitGSLPointerArg(CCE->getConstructor()->getParamDecl(0), Args[0], - true); - } - } } } @@ -474,7 +492,8 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, /// glvalue expression \c Init. static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, Expr *Init, ReferenceKind RK, - LocalVisitor Visit) { + LocalVisitor Visit, + bool EnableLifetimeWarnings) { RevertToOldSizeRAII RAII(Path); // Walk past any constructs which we can lifetime-extend across. 
@@ -511,7 +530,8 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, else // We can't lifetime extend through this but we might still find some // retained temporaries. - return visitLocalsRetainedByInitializer(Path, Init, Visit, true); + return visitLocalsRetainedByInitializer(Path, Init, Visit, true, + EnableLifetimeWarnings); } // Step into CXXDefaultInitExprs so we can diagnose cases where a @@ -525,18 +545,23 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, if (auto *MTE = dyn_cast(Init)) { if (Visit(Path, Local(MTE), RK)) - visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), Visit, true); + visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), Visit, true, + EnableLifetimeWarnings); } if (auto *M = dyn_cast(Init)) { // Lifetime of a non-reference type field is same as base object. if (auto *F = dyn_cast(M->getMemberDecl()); F && !F->getType()->isReferenceType()) - visitLocalsRetainedByInitializer(Path, M->getBase(), Visit, true); + visitLocalsRetainedByInitializer(Path, M->getBase(), Visit, true, + EnableLifetimeWarnings); } - if (isa(Init)) - return visitFunctionCallArguments(Path, Init, Visit); + if (isa(Init)) { + if (EnableLifetimeWarnings) + handleGslAnnotatedTypes(Path, Init, Visit); + return visitLifetimeBoundArguments(Path, Init, Visit); + } switch (Init->getStmtClass()) { case Stmt::DeclRefExprClass: { @@ -555,7 +580,8 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, } else if (VD->getInit() && !isVarOnPath(Path, VD)) { Path.push_back({IndirectLocalPathEntry::VarInit, DRE, VD}); visitLocalsRetainedByReferenceBinding(Path, VD->getInit(), - RK_ReferenceBinding, Visit); + RK_ReferenceBinding, Visit, + EnableLifetimeWarnings); } } break; @@ -567,13 +593,15 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, // handling all sorts of rvalues passed to a unary operator. 
const UnaryOperator *U = cast(Init); if (U->getOpcode() == UO_Deref) - visitLocalsRetainedByInitializer(Path, U->getSubExpr(), Visit, true); + visitLocalsRetainedByInitializer(Path, U->getSubExpr(), Visit, true, + EnableLifetimeWarnings); break; } case Stmt::ArraySectionExprClass: { - visitLocalsRetainedByInitializer( - Path, cast(Init)->getBase(), Visit, true); + visitLocalsRetainedByInitializer(Path, + cast(Init)->getBase(), + Visit, true, EnableLifetimeWarnings); break; } @@ -581,9 +609,11 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, case Stmt::BinaryConditionalOperatorClass: { auto *C = cast(Init); if (!C->getTrueExpr()->getType()->isVoidType()) - visitLocalsRetainedByReferenceBinding(Path, C->getTrueExpr(), RK, Visit); + visitLocalsRetainedByReferenceBinding(Path, C->getTrueExpr(), RK, Visit, + EnableLifetimeWarnings); if (!C->getFalseExpr()->getType()->isVoidType()) - visitLocalsRetainedByReferenceBinding(Path, C->getFalseExpr(), RK, Visit); + visitLocalsRetainedByReferenceBinding(Path, C->getFalseExpr(), RK, Visit, + EnableLifetimeWarnings); break; } @@ -606,7 +636,8 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, /// the prvalue expression \c Init. 
static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *Init, LocalVisitor Visit, - bool RevisitSubinits) { + bool RevisitSubinits, + bool EnableLifetimeWarnings) { RevertToOldSizeRAII RAII(Path); Expr *Old; @@ -647,16 +678,18 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, if (VD && VD->getType().isConstQualified() && VD->getInit() && !isVarOnPath(Path, VD)) { Path.push_back({IndirectLocalPathEntry::VarInit, DRE, VD}); - visitLocalsRetainedByInitializer(Path, VD->getInit(), Visit, - true); + visitLocalsRetainedByInitializer( + Path, VD->getInit(), Visit, true, EnableLifetimeWarnings); } } else if (auto *MTE = dyn_cast(L)) { if (MTE->getType().isConstQualified()) visitLocalsRetainedByInitializer(Path, MTE->getSubExpr(), - Visit, true); + Visit, true, + EnableLifetimeWarnings); } return false; - }); + }, + EnableLifetimeWarnings); // We assume that objects can be retained by pointers cast to integers, // but not if the integer is cast to floating-point type or to _Complex. @@ -685,8 +718,9 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, // Model array-to-pointer decay as taking the address of the array // lvalue. Path.push_back({IndirectLocalPathEntry::AddressOf, CE}); - return visitLocalsRetainedByReferenceBinding( - Path, CE->getSubExpr(), RK_ReferenceBinding, Visit); + return visitLocalsRetainedByReferenceBinding(Path, CE->getSubExpr(), + RK_ReferenceBinding, Visit, + EnableLifetimeWarnings); default: return; @@ -701,7 +735,8 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, // lifetime of the array exactly like binding a reference to a temporary. 
if (auto *ILE = dyn_cast(Init)) return visitLocalsRetainedByReferenceBinding(Path, ILE->getSubExpr(), - RK_StdInitializerList, Visit); + RK_StdInitializerList, Visit, + EnableLifetimeWarnings); if (InitListExpr *ILE = dyn_cast(Init)) { // We already visited the elements of this initializer list while @@ -712,12 +747,14 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, if (ILE->isTransparent()) return visitLocalsRetainedByInitializer(Path, ILE->getInit(0), Visit, - RevisitSubinits); + RevisitSubinits, + EnableLifetimeWarnings); if (ILE->getType()->isArrayType()) { for (unsigned I = 0, N = ILE->getNumInits(); I != N; ++I) visitLocalsRetainedByInitializer(Path, ILE->getInit(I), Visit, - RevisitSubinits); + RevisitSubinits, + EnableLifetimeWarnings); return; } @@ -730,12 +767,14 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, if (RD->isUnion() && ILE->getInitializedFieldInUnion() && ILE->getInitializedFieldInUnion()->getType()->isReferenceType()) visitLocalsRetainedByReferenceBinding(Path, ILE->getInit(0), - RK_ReferenceBinding, Visit); + RK_ReferenceBinding, Visit, + EnableLifetimeWarnings); else { unsigned Index = 0; for (; Index < RD->getNumBases() && Index < ILE->getNumInits(); ++Index) visitLocalsRetainedByInitializer(Path, ILE->getInit(Index), Visit, - RevisitSubinits); + RevisitSubinits, + EnableLifetimeWarnings); for (const auto *I : RD->fields()) { if (Index >= ILE->getNumInits()) break; @@ -744,13 +783,14 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *SubInit = ILE->getInit(Index); if (I->getType()->isReferenceType()) visitLocalsRetainedByReferenceBinding(Path, SubInit, - RK_ReferenceBinding, Visit); + RK_ReferenceBinding, Visit, + EnableLifetimeWarnings); else // This might be either aggregate-initialization of a member or // initialization of a std::initializer_list object. Regardless, // we should recursively lifetime-extend that initializer. 
- visitLocalsRetainedByInitializer(Path, SubInit, Visit, - RevisitSubinits); + visitLocalsRetainedByInitializer( + Path, SubInit, Visit, RevisitSubinits, EnableLifetimeWarnings); ++Index; } } @@ -771,9 +811,10 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Path.push_back({IndirectLocalPathEntry::LambdaCaptureInit, E, &Cap}); if (E->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, E, RK_ReferenceBinding, - Visit); + Visit, EnableLifetimeWarnings); else - visitLocalsRetainedByInitializer(Path, E, Visit, true); + visitLocalsRetainedByInitializer(Path, E, Visit, true, + EnableLifetimeWarnings); if (Cap.capturesVariable()) Path.pop_back(); } @@ -787,14 +828,18 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Expr *Arg = MTE->getSubExpr(); Path.push_back({IndirectLocalPathEntry::TemporaryCopy, Arg, CCE->getConstructor()}); - visitLocalsRetainedByInitializer(Path, Arg, Visit, true); + visitLocalsRetainedByInitializer(Path, Arg, Visit, true, + /*EnableLifetimeWarnings*/ false); Path.pop_back(); } } } - if (isa(Init) || isa(Init)) - return visitFunctionCallArguments(Path, Init, Visit); + if (isa(Init) || isa(Init)) { + if (EnableLifetimeWarnings) + handleGslAnnotatedTypes(Path, Init, Visit); + return visitLifetimeBoundArguments(Path, Init, Visit); + } switch (Init->getStmtClass()) { case Stmt::UnaryOperatorClass: { @@ -810,7 +855,8 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, Path.push_back({IndirectLocalPathEntry::AddressOf, UO}); visitLocalsRetainedByReferenceBinding(Path, UO->getSubExpr(), - RK_ReferenceBinding, Visit); + RK_ReferenceBinding, Visit, + EnableLifetimeWarnings); } break; } @@ -823,9 +869,11 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, break; if (BO->getLHS()->getType()->isPointerType()) - visitLocalsRetainedByInitializer(Path, BO->getLHS(), Visit, true); + visitLocalsRetainedByInitializer(Path, BO->getLHS(), Visit, true, + 
EnableLifetimeWarnings); else if (BO->getRHS()->getType()->isPointerType()) - visitLocalsRetainedByInitializer(Path, BO->getRHS(), Visit, true); + visitLocalsRetainedByInitializer(Path, BO->getRHS(), Visit, true, + EnableLifetimeWarnings); break; } @@ -835,9 +883,11 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, // In C++, we can have a throw-expression operand, which has 'void' type // and isn't interesting from a lifetime perspective. if (!C->getTrueExpr()->getType()->isVoidType()) - visitLocalsRetainedByInitializer(Path, C->getTrueExpr(), Visit, true); + visitLocalsRetainedByInitializer(Path, C->getTrueExpr(), Visit, true, + EnableLifetimeWarnings); if (!C->getFalseExpr()->getType()->isVoidType()) - visitLocalsRetainedByInitializer(Path, C->getFalseExpr(), Visit, true); + visitLocalsRetainedByInitializer(Path, C->getFalseExpr(), Visit, true, + EnableLifetimeWarnings); break; } @@ -939,7 +989,8 @@ static void checkExprLifetimeImpl(Sema &SemaRef, const InitializedEntity *InitEntity, const InitializedEntity *ExtendingEntity, LifetimeKind LK, - const AssignedEntity *AEntity, Expr *Init) { + const AssignedEntity *AEntity, Expr *Init, + bool EnableLifetimeWarnings) { assert((AEntity && LK == LK_Assignment) || (InitEntity && LK != LK_Assignment)); // If this entity doesn't have an interesting lifetime, don't bother looking @@ -1233,20 +1284,19 @@ static void checkExprLifetimeImpl(Sema &SemaRef, }; llvm::SmallVector Path; - if (!SemaRef.getDiagnostics().isIgnored(diag::warn_dangling_lifetime_pointer, - SourceLocation()) && - LK == LK_Assignment && + if (EnableLifetimeWarnings && LK == LK_Assignment && isRecordWithAttr(AEntity->LHS->getType())) Path.push_back({IndirectLocalPathEntry::GslPointerAssignment, Init}); if (Init->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, Init, RK_ReferenceBinding, - TemporaryVisitor); + TemporaryVisitor, + EnableLifetimeWarnings); else visitLocalsRetainedByInitializer( Path, Init, TemporaryVisitor, // 
Don't revisit the sub inits for the intialization case. - /*RevisitSubinits=*/!InitEntity); + /*RevisitSubinits=*/!InitEntity, EnableLifetimeWarnings); } void checkExprLifetime(Sema &SemaRef, const InitializedEntity &Entity, @@ -1254,8 +1304,10 @@ void checkExprLifetime(Sema &SemaRef, const InitializedEntity &Entity, auto LTResult = getEntityLifetime(&Entity); LifetimeKind LK = LTResult.getInt(); const InitializedEntity *ExtendingEntity = LTResult.getPointer(); + bool EnableLifetimeWarnings = !SemaRef.getDiagnostics().isIgnored( + diag::warn_dangling_lifetime_pointer, SourceLocation()); checkExprLifetimeImpl(SemaRef, &Entity, ExtendingEntity, LK, - /*AEntity*/ nullptr, Init); + /*AEntity*/ nullptr, Init, EnableLifetimeWarnings); } void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity, @@ -1271,7 +1323,7 @@ void checkExprLifetime(Sema &SemaRef, const AssignedEntity &Entity, checkExprLifetimeImpl(SemaRef, /*InitEntity=*/nullptr, /*ExtendingEntity=*/nullptr, LK_Assignment, &Entity, - Init); + Init, EnableLifetimeWarnings); } } // namespace clang::sema diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp index 86ee90ed6df8dd..09dfb2b5d96a89 100644 --- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp +++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp @@ -479,16 +479,3 @@ void testForBug49342() { auto it = std::iter{} - 2; // Used to be false positive. } - -namespace GH93386 { -// verify no duplicated diagnostics are emitted. 
-struct [[gsl::Pointer]] S { - S(const std::vector& abc [[clang::lifetimebound]]); -}; - -S test(std::vector a) { - return S(a); // expected-warning {{address of stack memory associated with}} -} - -auto s = S(std::vector()); // expected-warning {{temporary whose address is used as value of local variable}} -} From ecfceb890d47e4c11804cdc2c38f905f691ef397 Mon Sep 17 00:00:00 2001 From: h-vetinari Date: Fri, 23 Aug 2024 10:51:25 +1100 Subject: [PATCH 274/426] [clang][NFC] order C++ standards in reverse in release notes (#104866) Noticed that the release notes currently have a weird order: C++17, C++14(!), C++20, C++23, C++2c. Reorder them in reverse chronological order, which also matches the [status page](https://clang.llvm.org/cxx_status.html). --- clang/docs/ReleaseNotes.rst | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 04cc4cc22964a8..93040c2eee2c0b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -107,19 +107,6 @@ C++ Language Changes constant expression. Supports the `V.xyzw` syntax and other tidbits as seen in OpenCL. Selecting multiple elements is left as a future work. -C++17 Feature Support -^^^^^^^^^^^^^^^^^^^^^ - -C++14 Feature Support -^^^^^^^^^^^^^^^^^^^^^ - -C++20 Feature Support -^^^^^^^^^^^^^^^^^^^^^ - -C++23 Feature Support -^^^^^^^^^^^^^^^^^^^^^ -- Removed the restriction to literal types in constexpr functions in C++23 mode. - C++2c Feature Support ^^^^^^^^^^^^^^^^^^^^^ @@ -131,6 +118,13 @@ C++2c Feature Support - Implemented `P2893R3 Variadic Friends `_ +C++23 Feature Support +^^^^^^^^^^^^^^^^^^^^^ +- Removed the restriction to literal types in constexpr functions in C++23 mode. 
+ +C++20 Feature Support +^^^^^^^^^^^^^^^^^^^^^ + Resolutions to C++ Defect Reports ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 25d976b45cb5b3d222d3a9cd94caa8a54031bbb7 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Thu, 22 Aug 2024 19:02:45 -0500 Subject: [PATCH 275/426] [ScalarizeMaskedMemIntr] Don't use a scalar mask on GPUs (#104842) ScalarizeMaskedMemIntr contains an optimization where the mask is bitcast into an iN and then bit-tests with powers of two are used to determine whether to load/store/... or not. However, on machines with branch divergence (mainly GPUs), this is a mis-optimization, since each i1 in the mask will be stored in a condition register - that is, each of these "i1"s is likely to be a word or two wide, making these bit operations counterproductive. Therefore, amend this pass to skip the optimization on targets that it pessimizes. Pre-commit tests #104645 --- .../Scalar/ScalarizeMaskedMemIntrin.cpp | 136 +++++++++++------- .../AMDGPU/expamd-masked-load.ll | 33 ++--- .../AMDGPU/expand-masked-gather.ll | 11 +- .../AMDGPU/expand-masked-scatter.ll | 11 +- .../AMDGPU/expand-masked-store.ll | 33 ++--- 5 files changed, 115 insertions(+), 109 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp index 9cb7bad94c20bc..63fcc1760ccafe 100644 --- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp +++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp @@ -69,10 +69,11 @@ class ScalarizeMaskedMemIntrinLegacyPass : public FunctionPass { static bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT, const TargetTransformInfo &TTI, const DataLayout &DL, - DomTreeUpdater *DTU); + bool HasBranchDivergence, DomTreeUpdater *DTU); static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, const TargetTransformInfo &TTI, - const DataLayout &DL, DomTreeUpdater *DTU); + const DataLayout &DL, bool HasBranchDivergence, + DomTreeUpdater *DTU); char 
ScalarizeMaskedMemIntrinLegacyPass::ID = 0; @@ -141,8 +142,9 @@ static unsigned adjustForEndian(const DataLayout &DL, unsigned VectorWidth, // %10 = extractelement <16 x i1> %mask, i32 2 // br i1 %10, label %cond.load4, label %else5 // -static void scalarizeMaskedLoad(const DataLayout &DL, CallInst *CI, - DomTreeUpdater *DTU, bool &ModifiedDT) { +static void scalarizeMaskedLoad(const DataLayout &DL, bool HasBranchDivergence, + CallInst *CI, DomTreeUpdater *DTU, + bool &ModifiedDT) { Value *Ptr = CI->getArgOperand(0); Value *Alignment = CI->getArgOperand(1); Value *Mask = CI->getArgOperand(2); @@ -221,11 +223,10 @@ static void scalarizeMaskedLoad(const DataLayout &DL, CallInst *CI, return; } // If the mask is not v1i1, use scalar bit test operations. This generates - // better results on X86 at least. - // Note: this produces worse code on AMDGPU, where the "i1" is implicitly SIMD - // - what's a good way to detect this? - Value *SclrMask; - if (VectorWidth != 1) { + // better results on X86 at least. However, don't do this on GPUs and other + // machines with divergence, as there each i1 needs a vector register. 
+ Value *SclrMask = nullptr; + if (VectorWidth != 1 && !HasBranchDivergence) { Type *SclrMaskTy = Builder.getIntNTy(VectorWidth); SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask"); } @@ -233,13 +234,15 @@ static void scalarizeMaskedLoad(const DataLayout &DL, CallInst *CI, for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // - // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] - // %mask_1 = and i16 %scalar_mask, i32 1 << Idx - // %cond = icmp ne i16 %mask_1, 0 - // br i1 %mask_1, label %cond.load, label %else + // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, + // %else ] %mask_1 = and i16 %scalar_mask, i32 1 << Idx %cond = icmp ne i16 + // %mask_1, 0 br i1 %mask_1, label %cond.load, label %else // + // On GPUs, use + // %cond = extrectelement %mask, Idx + // instead Value *Predicate; - if (VectorWidth != 1) { + if (SclrMask != nullptr) { Value *Mask = Builder.getInt(APInt::getOneBitSet( VectorWidth, adjustForEndian(DL, VectorWidth, Idx))); Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask), @@ -312,8 +315,9 @@ static void scalarizeMaskedLoad(const DataLayout &DL, CallInst *CI, // store i32 %6, i32* %7 // br label %else2 // . . . -static void scalarizeMaskedStore(const DataLayout &DL, CallInst *CI, - DomTreeUpdater *DTU, bool &ModifiedDT) { +static void scalarizeMaskedStore(const DataLayout &DL, bool HasBranchDivergence, + CallInst *CI, DomTreeUpdater *DTU, + bool &ModifiedDT) { Value *Src = CI->getArgOperand(0); Value *Ptr = CI->getArgOperand(1); Value *Alignment = CI->getArgOperand(2); @@ -378,10 +382,10 @@ static void scalarizeMaskedStore(const DataLayout &DL, CallInst *CI, } // If the mask is not v1i1, use scalar bit test operations. This generates - // better results on X86 at least. - - Value *SclrMask; - if (VectorWidth != 1) { + // better results on X86 at least. 
However, don't do this on GPUs or other + // machines with branch divergence, as there each i1 takes up a register. + Value *SclrMask = nullptr; + if (VectorWidth != 1 && !HasBranchDivergence) { Type *SclrMaskTy = Builder.getIntNTy(VectorWidth); SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask"); } @@ -393,8 +397,11 @@ static void scalarizeMaskedStore(const DataLayout &DL, CallInst *CI, // %cond = icmp ne i16 %mask_1, 0 // br i1 %mask_1, label %cond.store, label %else // + // On GPUs, use + // %cond = extrectelement %mask, Idx + // instead Value *Predicate; - if (VectorWidth != 1) { + if (SclrMask != nullptr) { Value *Mask = Builder.getInt(APInt::getOneBitSet( VectorWidth, adjustForEndian(DL, VectorWidth, Idx))); Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask), @@ -461,7 +468,8 @@ static void scalarizeMaskedStore(const DataLayout &DL, CallInst *CI, // . . . // %Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src // ret <16 x i32> %Result -static void scalarizeMaskedGather(const DataLayout &DL, CallInst *CI, +static void scalarizeMaskedGather(const DataLayout &DL, + bool HasBranchDivergence, CallInst *CI, DomTreeUpdater *DTU, bool &ModifiedDT) { Value *Ptrs = CI->getArgOperand(0); Value *Alignment = CI->getArgOperand(1); @@ -500,9 +508,10 @@ static void scalarizeMaskedGather(const DataLayout &DL, CallInst *CI, } // If the mask is not v1i1, use scalar bit test operations. This generates - // better results on X86 at least. - Value *SclrMask; - if (VectorWidth != 1) { + // better results on X86 at least. However, don't do this on GPUs or other + // machines with branch divergence, as there, each i1 takes up a register. 
+ Value *SclrMask = nullptr; + if (VectorWidth != 1 && !HasBranchDivergence) { Type *SclrMaskTy = Builder.getIntNTy(VectorWidth); SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask"); } @@ -514,9 +523,12 @@ static void scalarizeMaskedGather(const DataLayout &DL, CallInst *CI, // %cond = icmp ne i16 %mask_1, 0 // br i1 %Mask1, label %cond.load, label %else // + // On GPUs, use + // %cond = extrectelement %mask, Idx + // instead Value *Predicate; - if (VectorWidth != 1) { + if (SclrMask != nullptr) { Value *Mask = Builder.getInt(APInt::getOneBitSet( VectorWidth, adjustForEndian(DL, VectorWidth, Idx))); Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask), @@ -591,7 +603,8 @@ static void scalarizeMaskedGather(const DataLayout &DL, CallInst *CI, // store i32 %Elt1, i32* %Ptr1, align 4 // br label %else2 // . . . -static void scalarizeMaskedScatter(const DataLayout &DL, CallInst *CI, +static void scalarizeMaskedScatter(const DataLayout &DL, + bool HasBranchDivergence, CallInst *CI, DomTreeUpdater *DTU, bool &ModifiedDT) { Value *Src = CI->getArgOperand(0); Value *Ptrs = CI->getArgOperand(1); @@ -629,8 +642,8 @@ static void scalarizeMaskedScatter(const DataLayout &DL, CallInst *CI, // If the mask is not v1i1, use scalar bit test operations. This generates // better results on X86 at least. 
- Value *SclrMask; - if (VectorWidth != 1) { + Value *SclrMask = nullptr; + if (VectorWidth != 1 && !HasBranchDivergence) { Type *SclrMaskTy = Builder.getIntNTy(VectorWidth); SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask"); } @@ -642,8 +655,11 @@ static void scalarizeMaskedScatter(const DataLayout &DL, CallInst *CI, // %cond = icmp ne i16 %mask_1, 0 // br i1 %Mask1, label %cond.store, label %else // + // On GPUs, use + // %cond = extrectelement %mask, Idx + // instead Value *Predicate; - if (VectorWidth != 1) { + if (SclrMask != nullptr) { Value *Mask = Builder.getInt(APInt::getOneBitSet( VectorWidth, adjustForEndian(DL, VectorWidth, Idx))); Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask), @@ -681,7 +697,8 @@ static void scalarizeMaskedScatter(const DataLayout &DL, CallInst *CI, ModifiedDT = true; } -static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI, +static void scalarizeMaskedExpandLoad(const DataLayout &DL, + bool HasBranchDivergence, CallInst *CI, DomTreeUpdater *DTU, bool &ModifiedDT) { Value *Ptr = CI->getArgOperand(0); Value *Mask = CI->getArgOperand(1); @@ -738,9 +755,10 @@ static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI, } // If the mask is not v1i1, use scalar bit test operations. This generates - // better results on X86 at least. - Value *SclrMask; - if (VectorWidth != 1) { + // better results on X86 at least. However, don't do this on GPUs or other + // machines with branch divergence, as there, each i1 takes up a register. 
+ Value *SclrMask = nullptr; + if (VectorWidth != 1 && !HasBranchDivergence) { Type *SclrMaskTy = Builder.getIntNTy(VectorWidth); SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask"); } @@ -748,13 +766,16 @@ static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI, for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // - // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] - // %mask_1 = extractelement <16 x i1> %mask, i32 Idx - // br i1 %mask_1, label %cond.load, label %else + // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, + // %else ] %mask_1 = extractelement <16 x i1> %mask, i32 Idx br i1 %mask_1, + // label %cond.load, label %else // + // On GPUs, use + // %cond = extrectelement %mask, Idx + // instead Value *Predicate; - if (VectorWidth != 1) { + if (SclrMask != nullptr) { Value *Mask = Builder.getInt(APInt::getOneBitSet( VectorWidth, adjustForEndian(DL, VectorWidth, Idx))); Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask), @@ -813,7 +834,8 @@ static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI, ModifiedDT = true; } -static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI, +static void scalarizeMaskedCompressStore(const DataLayout &DL, + bool HasBranchDivergence, CallInst *CI, DomTreeUpdater *DTU, bool &ModifiedDT) { Value *Src = CI->getArgOperand(0); @@ -855,9 +877,10 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI, } // If the mask is not v1i1, use scalar bit test operations. This generates - // better results on X86 at least. - Value *SclrMask; - if (VectorWidth != 1) { + // better results on X86 at least. However, don't do this on GPUs or other + // machines with branch divergence, as there, each i1 takes up a register. 
+ Value *SclrMask = nullptr; + if (VectorWidth != 1 && !HasBranchDivergence) { Type *SclrMaskTy = Builder.getIntNTy(VectorWidth); SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask"); } @@ -868,8 +891,11 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI, // %mask_1 = extractelement <16 x i1> %mask, i32 Idx // br i1 %mask_1, label %cond.store, label %else // + // On GPUs, use + // %cond = extrectelement %mask, Idx + // instead Value *Predicate; - if (VectorWidth != 1) { + if (SclrMask != nullptr) { Value *Mask = Builder.getInt(APInt::getOneBitSet( VectorWidth, adjustForEndian(DL, VectorWidth, Idx))); Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask), @@ -993,12 +1019,13 @@ static bool runImpl(Function &F, const TargetTransformInfo &TTI, bool EverMadeChange = false; bool MadeChange = true; auto &DL = F.getDataLayout(); + bool HasBranchDivergence = TTI.hasBranchDivergence(&F); while (MadeChange) { MadeChange = false; for (BasicBlock &BB : llvm::make_early_inc_range(F)) { bool ModifiedDTOnIteration = false; MadeChange |= optimizeBlock(BB, ModifiedDTOnIteration, TTI, DL, - DTU ? &*DTU : nullptr); + HasBranchDivergence, DTU ? 
&*DTU : nullptr); // Restart BB iteration if the dominator tree of the Function was changed if (ModifiedDTOnIteration) @@ -1032,13 +1059,14 @@ ScalarizeMaskedMemIntrinPass::run(Function &F, FunctionAnalysisManager &AM) { static bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT, const TargetTransformInfo &TTI, const DataLayout &DL, - DomTreeUpdater *DTU) { + bool HasBranchDivergence, DomTreeUpdater *DTU) { bool MadeChange = false; BasicBlock::iterator CurInstIterator = BB.begin(); while (CurInstIterator != BB.end()) { if (CallInst *CI = dyn_cast(&*CurInstIterator++)) - MadeChange |= optimizeCallInst(CI, ModifiedDT, TTI, DL, DTU); + MadeChange |= + optimizeCallInst(CI, ModifiedDT, TTI, DL, HasBranchDivergence, DTU); if (ModifiedDT) return true; } @@ -1048,7 +1076,8 @@ static bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT, static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, const TargetTransformInfo &TTI, - const DataLayout &DL, DomTreeUpdater *DTU) { + const DataLayout &DL, bool HasBranchDivergence, + DomTreeUpdater *DTU) { IntrinsicInst *II = dyn_cast(CI); if (II) { // The scalarization code below does not work for scalable vectors. 
@@ -1071,14 +1100,14 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, CI->getType(), cast(CI->getArgOperand(1))->getAlignValue())) return false; - scalarizeMaskedLoad(DL, CI, DTU, ModifiedDT); + scalarizeMaskedLoad(DL, HasBranchDivergence, CI, DTU, ModifiedDT); return true; case Intrinsic::masked_store: if (TTI.isLegalMaskedStore( CI->getArgOperand(0)->getType(), cast(CI->getArgOperand(2))->getAlignValue())) return false; - scalarizeMaskedStore(DL, CI, DTU, ModifiedDT); + scalarizeMaskedStore(DL, HasBranchDivergence, CI, DTU, ModifiedDT); return true; case Intrinsic::masked_gather: { MaybeAlign MA = @@ -1089,7 +1118,7 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, if (TTI.isLegalMaskedGather(LoadTy, Alignment) && !TTI.forceScalarizeMaskedGather(cast(LoadTy), Alignment)) return false; - scalarizeMaskedGather(DL, CI, DTU, ModifiedDT); + scalarizeMaskedGather(DL, HasBranchDivergence, CI, DTU, ModifiedDT); return true; } case Intrinsic::masked_scatter: { @@ -1102,7 +1131,7 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, !TTI.forceScalarizeMaskedScatter(cast(StoreTy), Alignment)) return false; - scalarizeMaskedScatter(DL, CI, DTU, ModifiedDT); + scalarizeMaskedScatter(DL, HasBranchDivergence, CI, DTU, ModifiedDT); return true; } case Intrinsic::masked_expandload: @@ -1110,14 +1139,15 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, CI->getType(), CI->getAttributes().getParamAttrs(0).getAlignment().valueOrOne())) return false; - scalarizeMaskedExpandLoad(DL, CI, DTU, ModifiedDT); + scalarizeMaskedExpandLoad(DL, HasBranchDivergence, CI, DTU, ModifiedDT); return true; case Intrinsic::masked_compressstore: if (TTI.isLegalMaskedCompressStore( CI->getArgOperand(0)->getType(), CI->getAttributes().getParamAttrs(1).getAlignment().valueOrOne())) return false; - scalarizeMaskedCompressStore(DL, CI, DTU, ModifiedDT); + scalarizeMaskedCompressStore(DL, HasBranchDivergence, CI, DTU, + ModifiedDT); return true; } } diff 
--git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expamd-masked-load.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expamd-masked-load.ll index 35e5bcde4c0dbd..faee9f95ebdac0 100644 --- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expamd-masked-load.ll +++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expamd-masked-load.ll @@ -8,10 +8,8 @@ define <2 x i32> @scalarize_v2i32(ptr %p, <2 x i1> %mask, <2 x i32> %passthru) { ; CHECK-LABEL: define <2 x i32> @scalarize_v2i32( ; CHECK-SAME: ptr [[P:%.*]], <2 x i1> [[MASK:%.*]], <2 x i32> [[PASSTHRU:%.*]]) { -; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK]] to i2 -; CHECK-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label %[[COND_LOAD:.*]], label %[[ELSE:.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[MASK]], i64 0 +; CHECK-NEXT: br i1 [[TMP1]], label %[[COND_LOAD:.*]], label %[[ELSE:.*]] ; CHECK: [[COND_LOAD]]: ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -19,9 +17,8 @@ define <2 x i32> @scalarize_v2i32(ptr %p, <2 x i1> %mask, <2 x i32> %passthru) { ; CHECK-NEXT: br label %[[ELSE]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i32> [ [[TMP5]], %[[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ] -; CHECK-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0 -; CHECK-NEXT: br i1 [[TMP7]], label %[[COND_LOAD1:.*]], label %[[ELSE2:.*]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[MASK]], i64 1 +; CHECK-NEXT: br i1 [[TMP6]], label %[[COND_LOAD1:.*]], label %[[ELSE2:.*]] ; CHECK: [[COND_LOAD1]]: ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4 @@ -58,10 +55,8 @@ define <2 x i32> @scalarize_v2i32_splat_mask(ptr %p, i1 %mask, <2 x 
i32> %passth define <2 x half> @scalarize_v2f16(ptr %p, <2 x i1> %mask, <2 x half> %passthru) { ; CHECK-LABEL: define <2 x half> @scalarize_v2f16( ; CHECK-SAME: ptr [[P:%.*]], <2 x i1> [[MASK:%.*]], <2 x half> [[PASSTHRU:%.*]]) { -; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK]] to i2 -; CHECK-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label %[[COND_LOAD:.*]], label %[[ELSE:.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[MASK]], i64 0 +; CHECK-NEXT: br i1 [[TMP1]], label %[[COND_LOAD:.*]], label %[[ELSE:.*]] ; CHECK: [[COND_LOAD]]: ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds half, ptr [[P]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr [[TMP3]], align 2 @@ -69,9 +64,8 @@ define <2 x half> @scalarize_v2f16(ptr %p, <2 x i1> %mask, <2 x half> %passthru) ; CHECK-NEXT: br label %[[ELSE]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x half> [ [[TMP5]], %[[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ] -; CHECK-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0 -; CHECK-NEXT: br i1 [[TMP7]], label %[[COND_LOAD1:.*]], label %[[ELSE2:.*]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[MASK]], i64 1 +; CHECK-NEXT: br i1 [[TMP6]], label %[[COND_LOAD1:.*]], label %[[ELSE2:.*]] ; CHECK: [[COND_LOAD1]]: ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds half, ptr [[P]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[TMP8]], align 2 @@ -88,10 +82,8 @@ define <2 x half> @scalarize_v2f16(ptr %p, <2 x i1> %mask, <2 x half> %passthru) define <2 x i32> @scalarize_v2i32_p3(ptr addrspace(3) %p, <2 x i1> %mask, <2 x i32> %passthru) { ; CHECK-LABEL: define <2 x i32> @scalarize_v2i32_p3( ; CHECK-SAME: ptr addrspace(3) [[P:%.*]], <2 x i1> [[MASK:%.*]], <2 x i32> [[PASSTHRU:%.*]]) { -; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK]] to i2 -; CHECK-NEXT: [[TMP1:%.*]] = and i2 
[[SCALAR_MASK]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label %[[COND_LOAD:.*]], label %[[ELSE:.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[MASK]], i64 0 +; CHECK-NEXT: br i1 [[TMP1]], label %[[COND_LOAD:.*]], label %[[ELSE:.*]] ; CHECK: [[COND_LOAD]]: ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) [[TMP3]], align 4 @@ -99,9 +91,8 @@ define <2 x i32> @scalarize_v2i32_p3(ptr addrspace(3) %p, <2 x i1> %mask, <2 x i ; CHECK-NEXT: br label %[[ELSE]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i32> [ [[TMP5]], %[[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ] -; CHECK-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0 -; CHECK-NEXT: br i1 [[TMP7]], label %[[COND_LOAD1:.*]], label %[[ELSE2:.*]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[MASK]], i64 1 +; CHECK-NEXT: br i1 [[TMP6]], label %[[COND_LOAD1:.*]], label %[[ELSE2:.*]] ; CHECK: [[COND_LOAD1]]: ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(3) [[TMP8]], align 4 diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expand-masked-gather.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expand-masked-gather.ll index 94d0e2943d9366..8c4408bfa527b7 100644 --- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expand-masked-gather.ll +++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expand-masked-gather.ll @@ -8,10 +8,8 @@ define <2 x i32> @scalarize_v2i32(<2 x ptr> %p, <2 x i1> %mask, <2 x i32> %passthru) { ; CHECK-LABEL: define <2 x i32> @scalarize_v2i32( ; CHECK-SAME: <2 x ptr> [[P:%.*]], <2 x i1> [[MASK:%.*]], <2 x i32> [[PASSTHRU:%.*]]) { -; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK]] to i2 -; CHECK-NEXT: [[TMP1:%.*]] = and i2 
[[SCALAR_MASK]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label %[[COND_LOAD:.*]], label %[[ELSE:.*]] +; CHECK-NEXT: [[MASK0:%.*]] = extractelement <2 x i1> [[MASK]], i64 0 +; CHECK-NEXT: br i1 [[MASK0]], label %[[COND_LOAD:.*]], label %[[ELSE:.*]] ; CHECK: [[COND_LOAD]]: ; CHECK-NEXT: [[PTR0:%.*]] = extractelement <2 x ptr> [[P]], i64 0 ; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[PTR0]], align 8 @@ -19,9 +17,8 @@ define <2 x i32> @scalarize_v2i32(<2 x ptr> %p, <2 x i1> %mask, <2 x i32> %passt ; CHECK-NEXT: br label %[[ELSE]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i32> [ [[RES0]], %[[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ] -; CHECK-NEXT: [[TMP3:%.*]] = and i2 [[SCALAR_MASK]], -2 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i2 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[TMP4]], label %[[COND_LOAD1:.*]], label %[[ELSE2:.*]] +; CHECK-NEXT: [[MASK1:%.*]] = extractelement <2 x i1> [[MASK]], i64 1 +; CHECK-NEXT: br i1 [[MASK1]], label %[[COND_LOAD1:.*]], label %[[ELSE2:.*]] ; CHECK: [[COND_LOAD1]]: ; CHECK-NEXT: [[PTR1:%.*]] = extractelement <2 x ptr> [[P]], i64 1 ; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[PTR1]], align 8 diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expand-masked-scatter.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expand-masked-scatter.ll index 45debf35d06e4f..448ee18e9b4dec 100644 --- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expand-masked-scatter.ll +++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expand-masked-scatter.ll @@ -8,19 +8,16 @@ define void @scalarize_v2i32(<2 x ptr> %p, <2 x i1> %mask, <2 x i32> %value) { ; CHECK-LABEL: define void @scalarize_v2i32( ; CHECK-SAME: <2 x ptr> [[P:%.*]], <2 x i1> [[MASK:%.*]], <2 x i32> [[VALUE:%.*]]) { -; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK]] to i2 -; CHECK-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0 -; 
CHECK-NEXT: br i1 [[TMP2]], label %[[COND_STORE:.*]], label %[[ELSE:.*]] +; CHECK-NEXT: [[MASK0:%.*]] = extractelement <2 x i1> [[MASK]], i64 0 +; CHECK-NEXT: br i1 [[MASK0]], label %[[COND_STORE:.*]], label %[[ELSE:.*]] ; CHECK: [[COND_STORE]]: ; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x i32> [[VALUE]], i64 0 ; CHECK-NEXT: [[PTR0:%.*]] = extractelement <2 x ptr> [[P]], i64 0 ; CHECK-NEXT: store i32 [[ELT0]], ptr [[PTR0]], align 8 ; CHECK-NEXT: br label %[[ELSE]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[TMP3:%.*]] = and i2 [[SCALAR_MASK]], -2 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i2 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[TMP4]], label %[[COND_STORE1:.*]], label %[[ELSE2:.*]] +; CHECK-NEXT: [[MASK1:%.*]] = extractelement <2 x i1> [[MASK]], i64 1 +; CHECK-NEXT: br i1 [[MASK1]], label %[[COND_STORE1:.*]], label %[[ELSE2:.*]] ; CHECK: [[COND_STORE1]]: ; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x i32> [[VALUE]], i64 1 ; CHECK-NEXT: [[PTR1:%.*]] = extractelement <2 x ptr> [[P]], i64 1 diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expand-masked-store.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expand-masked-store.ll index 1efd008b77e1c0..2eb86f20374d87 100644 --- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expand-masked-store.ll +++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AMDGPU/expand-masked-store.ll @@ -8,19 +8,16 @@ define void @scalarize_v2i32(ptr %p, <2 x i1> %mask, <2 x i32> %data) { ; CHECK-LABEL: define void @scalarize_v2i32( ; CHECK-SAME: ptr [[P:%.*]], <2 x i1> [[MASK:%.*]], <2 x i32> [[DATA:%.*]]) { -; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK]] to i2 -; CHECK-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label %[[COND_STORE:.*]], label %[[ELSE:.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[MASK]], i64 0 +; CHECK-NEXT: br i1 [[TMP1]], label %[[COND_STORE:.*]], label %[[ELSE:.*]] ; CHECK: 
[[COND_STORE]]: ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[DATA]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 0 ; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 ; CHECK-NEXT: br label %[[ELSE]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[TMP5:%.*]] = and i2 [[SCALAR_MASK]], -2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i2 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[TMP6]], label %[[COND_STORE1:.*]], label %[[ELSE2:.*]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[MASK]], i64 1 +; CHECK-NEXT: br i1 [[TMP5]], label %[[COND_STORE1:.*]], label %[[ELSE2:.*]] ; CHECK: [[COND_STORE1]]: ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[DATA]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1 @@ -55,19 +52,16 @@ define void @scalarize_v2i32_splat_mask(ptr %p, <2 x i32> %data, i1 %mask) { define void @scalarize_v2f16(ptr %p, <2 x i1> %mask, <2 x half> %data) { ; CHECK-LABEL: define void @scalarize_v2f16( ; CHECK-SAME: ptr [[P:%.*]], <2 x i1> [[MASK:%.*]], <2 x half> [[DATA:%.*]]) { -; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK]] to i2 -; CHECK-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label %[[COND_STORE:.*]], label %[[ELSE:.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[MASK]], i64 0 +; CHECK-NEXT: br i1 [[TMP1]], label %[[COND_STORE:.*]], label %[[ELSE:.*]] ; CHECK: [[COND_STORE]]: ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x half> [[DATA]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds half, ptr [[P]], i32 0 ; CHECK-NEXT: store half [[TMP3]], ptr [[TMP4]], align 2 ; CHECK-NEXT: br label %[[ELSE]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[TMP5:%.*]] = and i2 [[SCALAR_MASK]], -2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i2 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[TMP6]], label %[[COND_STORE1:.*]], label %[[ELSE2:.*]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[MASK]], 
i64 1 +; CHECK-NEXT: br i1 [[TMP5]], label %[[COND_STORE1:.*]], label %[[ELSE2:.*]] ; CHECK: [[COND_STORE1]]: ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x half> [[DATA]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds half, ptr [[P]], i32 1 @@ -83,19 +77,16 @@ define void @scalarize_v2f16(ptr %p, <2 x i1> %mask, <2 x half> %data) { define void @scalarize_v2i32_p3(ptr addrspace(3) %p, <2 x i1> %mask, <2 x i32> %data) { ; CHECK-LABEL: define void @scalarize_v2i32_p3( ; CHECK-SAME: ptr addrspace(3) [[P:%.*]], <2 x i1> [[MASK:%.*]], <2 x i32> [[DATA:%.*]]) { -; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK]] to i2 -; CHECK-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label %[[COND_STORE:.*]], label %[[ELSE:.*]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[MASK]], i64 0 +; CHECK-NEXT: br i1 [[TMP1]], label %[[COND_STORE:.*]], label %[[ELSE:.*]] ; CHECK: [[COND_STORE]]: ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[DATA]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P]], i32 0 ; CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(3) [[TMP4]], align 4 ; CHECK-NEXT: br label %[[ELSE]] ; CHECK: [[ELSE]]: -; CHECK-NEXT: [[TMP5:%.*]] = and i2 [[SCALAR_MASK]], -2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i2 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[TMP6]], label %[[COND_STORE1:.*]], label %[[ELSE2:.*]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[MASK]], i64 1 +; CHECK-NEXT: br i1 [[TMP5]], label %[[COND_STORE1:.*]], label %[[ELSE2:.*]] ; CHECK: [[COND_STORE1]]: ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[DATA]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P]], i32 1 From 08e5a1de8227512d4774a534b91cb2353cef6284 Mon Sep 17 00:00:00 2001 From: Jeff Niu Date: Thu, 22 Aug 2024 20:23:44 -0400 Subject: [PATCH 276/426] [llvm][NVPTX] Fix quadratic runtime in 
ProxyRegErasure (#105730) This pass performs RAUW by walking the machine function for each RAUW operation. For large functions, this runtime in this pass starts to blow up. Linearize the pass by batching the RAUW ops at once. --- .../lib/Target/NVPTX/NVPTXProxyRegErasure.cpp | 60 +++++++++---------- 1 file changed, 28 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp b/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp index 258ae97a20d582..f3a3362addb0ea 100644 --- a/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp @@ -34,7 +34,6 @@ void initializeNVPTXProxyRegErasurePass(PassRegistry &); namespace { struct NVPTXProxyRegErasure : public MachineFunctionPass { -public: static char ID; NVPTXProxyRegErasure() : MachineFunctionPass(ID) { initializeNVPTXProxyRegErasurePass(*PassRegistry::getPassRegistry()); @@ -49,23 +48,22 @@ struct NVPTXProxyRegErasure : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); } - -private: - void replaceMachineInstructionUsage(MachineFunction &MF, MachineInstr &MI); - - void replaceRegisterUsage(MachineInstr &Instr, MachineOperand &From, - MachineOperand &To); }; } // namespace char NVPTXProxyRegErasure::ID = 0; -INITIALIZE_PASS(NVPTXProxyRegErasure, "nvptx-proxyreg-erasure", "NVPTX ProxyReg Erasure", false, false) +INITIALIZE_PASS(NVPTXProxyRegErasure, "nvptx-proxyreg-erasure", + "NVPTX ProxyReg Erasure", false, false) bool NVPTXProxyRegErasure::runOnMachineFunction(MachineFunction &MF) { SmallVector RemoveList; + // ProxyReg instructions forward a register as another: `%dst = mov.iN %src`. + // Bulk RAUW the `%dst` registers in two passes over the machine function. 
+ DenseMap RAUWBatch; + for (auto &BB : MF) { for (auto &MI : BB) { switch (MI.getOpcode()) { @@ -74,44 +72,42 @@ bool NVPTXProxyRegErasure::runOnMachineFunction(MachineFunction &MF) { case NVPTX::ProxyRegI32: case NVPTX::ProxyRegI64: case NVPTX::ProxyRegF32: - case NVPTX::ProxyRegF64: - replaceMachineInstructionUsage(MF, MI); + case NVPTX::ProxyRegF64: { + auto &InOp = *MI.uses().begin(); + auto &OutOp = *MI.defs().begin(); + assert(InOp.isReg() && "ProxyReg input should be a register."); + assert(OutOp.isReg() && "ProxyReg output should be a register."); RemoveList.push_back(&MI); + RAUWBatch.try_emplace(OutOp.getReg(), InOp.getReg()); break; } + } } } + // If there were no proxy instructions, exit early. + if (RemoveList.empty()) + return false; + + // Erase the proxy instructions first. for (auto *MI : RemoveList) { MI->eraseFromParent(); } - return !RemoveList.empty(); -} - -void NVPTXProxyRegErasure::replaceMachineInstructionUsage(MachineFunction &MF, - MachineInstr &MI) { - auto &InOp = *MI.uses().begin(); - auto &OutOp = *MI.defs().begin(); - - assert(InOp.isReg() && "ProxyReg input operand should be a register."); - assert(OutOp.isReg() && "ProxyReg output operand should be a register."); - + // Now go replace the registers. 
for (auto &BB : MF) { - for (auto &I : BB) { - replaceRegisterUsage(I, OutOp, InOp); + for (auto &MI : BB) { + for (auto &Op : MI.uses()) { + if (!Op.isReg()) + continue; + auto it = RAUWBatch.find(Op.getReg()); + if (it != RAUWBatch.end()) + Op.setReg(it->second); + } } } -} -void NVPTXProxyRegErasure::replaceRegisterUsage(MachineInstr &Instr, - MachineOperand &From, - MachineOperand &To) { - for (auto &Op : Instr.uses()) { - if (Op.isReg() && Op.getReg() == From.getReg()) { - Op.setReg(To.getReg()); - } - } + return true; } MachineFunctionPass *llvm::createNVPTXProxyRegErasurePass() { From be8ee098c4b45522eb4836ee0034469208c85c74 Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Thu, 22 Aug 2024 18:27:59 -0700 Subject: [PATCH 277/426] [bazel] Move lldb-dap cc_binary to lldb/BUILD.bazel (#105733) On linux lldb-dap uses the location of the lldb-dap binary to search for lldb-server. Previously these were produced in different directories corresponding to the BUILD file paths. It's not ideal that the BUILD file location matters for the binary at runtime but it doesn't hurt to have this tool here too like the others. 
--- .../llvm-project-overlay/lldb/BUILD.bazel | 48 ++++++++++++++ .../lldb/tools/lldb-dap/BUILD.bazel | 62 ------------------- 2 files changed, 48 insertions(+), 62 deletions(-) delete mode 100644 utils/bazel/llvm-project-overlay/lldb/tools/lldb-dap/BUILD.bazel diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel index ee7ea7458b4488..3ed4f552290da4 100644 --- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel @@ -1015,3 +1015,51 @@ cc_binary( "//conditions:default": [], }), ) + +expand_template( + name = "lldb-dap-plist", + out = "lldb-dap-Info.plist", + substitutions = { + "${LLDB_VERSION}": PACKAGE_VERSION, + }, + template = "tools/lldb-dap/lldb-dap-Info.plist.in", +) + +gentbl_cc_library( + name = "lldb_dap_opts_gen", + strip_include_prefix = "tools/lldb-dap", + tbl_outs = [( + ["-gen-opt-parser-defs"], + "tools/lldb-dap/Options.inc", + )], + tblgen = "//llvm:llvm-tblgen", + td_file = "tools/lldb-dap/Options.td", + deps = ["//llvm:OptParserTdFiles"], +) + +cc_binary( + name = "lldb-dap", + srcs = glob([ + "tools/lldb-dap/*.cpp", + "tools/lldb-dap/*.h", + ]), + additional_linker_inputs = [ + ":lldb-dap-plist", + ], + linkopts = select({ + "@platforms//os:macos": [ + "-Wl,-sectcreate,__TEXT,__info_plist,$(location :lldb-dap-plist)", + ], + "//conditions:default": [], + }), + deps = [ + ":lldb_dap_opts_gen", + "//lldb:APIHeaders", + "//lldb:Headers", + "//lldb:Host", + "//lldb:liblldb.wrapper", + "//llvm:Option", + "//llvm:Support", + "//llvm:config", + ], +) diff --git a/utils/bazel/llvm-project-overlay/lldb/tools/lldb-dap/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/tools/lldb-dap/BUILD.bazel deleted file mode 100644 index 6d3f19098fb452..00000000000000 --- a/utils/bazel/llvm-project-overlay/lldb/tools/lldb-dap/BUILD.bazel +++ /dev/null @@ -1,62 +0,0 @@ -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
-# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -load("@bazel_skylib//rules:expand_template.bzl", "expand_template") -load("//:vars.bzl", "PACKAGE_VERSION") -load("//mlir:tblgen.bzl", "gentbl_cc_library") - -package( - default_visibility = ["//visibility:public"], - features = ["layering_check"], -) - -licenses(["notice"]) - -expand_template( - name = "plist", - out = "lldb-dap-Info.plist", - substitutions = { - "${LLDB_VERSION}": PACKAGE_VERSION, - }, - template = "lldb-dap-Info.plist.in", -) - -gentbl_cc_library( - name = "lldb_dap_opts_gen", - strip_include_prefix = ".", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "Options.inc", - )], - tblgen = "//llvm:llvm-tblgen", - td_file = "Options.td", - deps = ["//llvm:OptParserTdFiles"], -) - -cc_binary( - name = "lldb-dap", - srcs = glob([ - "*.cpp", - "*.h", - ]), - additional_linker_inputs = [ - ":plist", - ], - linkopts = select({ - "@platforms//os:macos": [ - "-Wl,-sectcreate,__TEXT,__info_plist,$(location :plist)", - ], - "//conditions:default": [], - }), - deps = [ - ":lldb_dap_opts_gen", - "//lldb:APIHeaders", - "//lldb:Headers", - "//lldb:Host", - "//lldb:liblldb.wrapper", - "//llvm:Option", - "//llvm:Support", - "//llvm:config", - ], -) From f06563a5c0d239a6b98f74db522681613254ad08 Mon Sep 17 00:00:00 2001 From: Yun-Fly Date: Fri, 23 Aug 2024 10:07:17 +0800 Subject: [PATCH 278/426] [mlir][tensor] Add consumer fusion for `tensor.pack` op. (#103715) Add missing `getIterationDomainTileFromOperandTile` and `getTiledImplementationFromOperandTile` to `tensor.pack` and enable fusing it as a consumer. NOTE that, it only expects perfect tiling scenario without padding semantic currently. 
--- .../Tensor/IR/TensorTilingInterfaceImpl.cpp | 114 ++++++++++++++++++ .../tile-and-fuse-consumer.mlir | 59 +++++++++ 2 files changed, 173 insertions(+) diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp index 361340a4e62f2d..dec678de6d1c27 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp @@ -246,6 +246,120 @@ struct PackOpTiling return failure(); return tilingResult.value(); } + + /// Method to return the position of iteration domain tile computed by the + /// tiled operation. In current `tensor.pack` context, the `resultOffsets` and + /// `resultSizes` only cover outer dimensions. + LogicalResult getIterationDomainTileFromOperandTile( + Operation *op, OpBuilder &b, unsigned operandNumber, + ArrayRef offsets, ArrayRef sizes, + SmallVectorImpl &resultOffsets, + SmallVectorImpl &resultSizes) const { + if (operandNumber != 0) + return failure(); + + auto packOp = cast(op); + // It is not trivial to infer dest tile from source tile if `packOp` has + // padding semantic. + if (packOp.getPaddingValue()) + return failure(); + + Location loc = packOp.getLoc(); + + SmallVector outerDimOffsets, outerDimSizes; + DenseMap dimAndTileMapping = + packOp.getDimAndTileMapping(); + for (auto dim : packOp.getOuterDimsPerm()) { + if (dimAndTileMapping.count(dim)) { + FailureOr cstSize = + ValueBoundsConstraintSet::computeConstantBound( + presburger::BoundType::UB, sizes[dim], + /*stopCondition=*/nullptr, /*closedUB=*/true); + std::optional cstInnerSize = + getConstantIntValue(dimAndTileMapping[dim]); + // Currently fusing `packOp` as consumer only expects perfect tiling + // scenario because even if without padding semantic, the `packOp` may + // also yield incomplete tiles. E.g. 
tensor<30xf32> -> tensor<5x6xf32>, + // where the `tileSize` from operand of `packOp` is 5, which is not + // exactly divided by `innerTile`(=6) of `packOp`. As the result: + // 1. the first slice is extracted from (0) to (4) and inserted into + // (0,0)~(0,4) at first row. + // 2. the second slice is extracted from (5) to (9) and SHOULD BE + // respectively inserted into two rows with different length, including + // first row: (0,5) and second row (1,0)~(1,3). It is hard to coordinate + // them, thus adding below constraint to bypass them temporarily. In + // another word, we can only support tiling with consumer if the tile + // size for the producer is a multiple of the inner tile size for the + // packed dimensions at this moment. + if (failed(cstSize) || !cstInnerSize || *cstSize % *cstInnerSize != 0) { + return failure(); + } + + using AV = affine::AffineValueExpr; + affine::AffineBuilder ab(b, loc); + AffineExpr dim0, sym; + bindDims(b.getContext(), dim0); + bindSymbols(b.getContext(), sym); + auto avOffset = AV(dim0).bind(offsets[dim]); + auto avSize = AV(dim0).bind(sizes[dim]); + auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]); + outerDimOffsets.push_back(ab.floor(avOffset, avTileSize)); + outerDimSizes.push_back(ab.ceil(avSize, avTileSize)); + } else { + outerDimOffsets.push_back(offsets[dim]); + outerDimSizes.push_back(sizes[dim]); + } + } + + resultOffsets = outerDimOffsets; + resultSizes = outerDimSizes; + return success(); + } + + /// Method to return the tiled implementation of tensor.pack as a consumer. 
+ FailureOr getTiledImplementationFromOperandTile( + Operation *op, OpBuilder &b, unsigned operandNumber, + ArrayRef offsets, ArrayRef sizes) const { + if (operandNumber != 0) + return failure(); + + auto packOp = cast(op); + Location loc = packOp.getLoc(); + + int64_t inputRank = packOp.getSourceRank(); + auto oneAttr = b.getI64IntegerAttr(1); + SmallVector strides(inputRank, oneAttr); + + SmallVector tiledOperands; + tiledOperands.push_back(b.create(loc, packOp.getSource(), + offsets, sizes, strides)); + + SmallVector outerDimOffsets, outerDimSizes; + if (failed(getIterationDomainTileFromOperandTile( + op, b, /*operandNumber=*/0, offsets, sizes, outerDimOffsets, + outerDimSizes))) + return failure(); + + SmallVector outputOffsets, outputSizes; + if (failed(getResultTilePosition(op, b, 0, outerDimOffsets, outerDimSizes, + outputOffsets, outputSizes))) + return failure(); + + strides.append(packOp.getDestRank() - inputRank, oneAttr); + auto extractSlice = b.create( + loc, packOp.getDest(), outputOffsets, outputSizes, strides); + tiledOperands.push_back(extractSlice); + + assert(!packOp.getPaddingValue() && "Expect no padding semantic"); + for (auto tile : packOp.getInnerTiles()) + tiledOperands.push_back(tile); + + Operation *tiledPackOp = b.create( + loc, TypeRange{extractSlice.getType()}, tiledOperands, op->getAttrs()); + + return TilingResult{{tiledPackOp}, + SmallVector(tiledPackOp->getResults())}; + } }; struct UnpackTileDimInfo { diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir index 400b558e37fcda..741dfbfb1cd5c2 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir @@ -315,3 +315,62 @@ module attributes {transform.with_named_sequence} { // CHECK: } // CHECK: } // CHECK: return %[[FINAL_RESULT]]#1 : + +// ----- + +#map = affine_map<(d0, d1) -> (d0, d1)> +module { + 
func.func @fuse_pack_consumer_into_scf_forall(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<4x32x16xf32> { + %c4 = arith.constant 4 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %1 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) { + %extracted_slice = tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) { + ^bb0(%in: f32, %in_16: f32, %out: f32): + %13 = arith.mulf %in, %in_16 : f32 + %14 = arith.addf %out, %13 : f32 + linalg.yield %14 : f32 + } -> tensor<32x32xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %3 into %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> + } + } + %output = tensor.empty() : tensor<4x32x16xf32> + %pack = tensor.pack %1 outer_dims_perm = [0, 1] inner_dims_pos = [0] inner_tiles = [16] into %output : tensor<64x32xf32> -> tensor<4x32x16xf32> + return %pack : tensor<4x32x16xf32> + } +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { + %slice_op = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %a, %b = transform.test.fuse_consumer %slice_op + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} +// CHECK: #[[PACK_RESULT_MAP:.*]] = affine_map<(d0) -> (d0 floordiv 16)> +// CHECK: func.func @fuse_pack_consumer_into_scf_forall( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<32x32xf32> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<32x32xf32> +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<64x32xf32>) +// CHECK: %[[OUT_INIT:.*]] = tensor.empty() 
: tensor<4x32x16xf32> +// CHECK: %[[FINAL_RESULT:.*]]:2 = scf.forall (%[[IV1:.*]], %[[IV2:.*]]) in (2, 2) +// CHECK-SAME: shared_outs(%[[FIRST_OUT_ARG:.*]] = %[[ARG2]], %[[PACK_OUT_ARG:.*]] = %[[OUT_INIT]]) +// CHECK-SAME: { +// CHECK: %[[GENERIC_OUT_SLICE:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] +// CHECK: %[[GENERIC_OUT:.*]] = linalg.generic +// CHECK-SAME: outs(%[[GENERIC_OUT_SLICE]] : +// CHECK: %[[PACK_RESULT_OFFSET:.*]] = affine.apply #[[PACK_RESULT_MAP]](%[[IV1]]) +// CHECK: %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], %[[IV2]], 0] [2, 32, 16] [1, 1, 1] +// CHECK: %[[TILED_PACK_OUT:.*]] = tensor.pack %[[GENERIC_OUT]] +// CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0] inner_tiles = [16] +// CHECK-SAME: into %[[TILED_PACK_DEST]] +// CHECK: scf.forall.in_parallel { +// CHECK: tensor.parallel_insert_slice %[[TILED_PACK_OUT]] into %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], %[[IV2]], 0] [2, 32, 16] [1, 1, 1] +// CHECK: tensor.parallel_insert_slice %[[GENERIC_OUT]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] +// CHECK: } +// CHECK: } +// CHECK: return %[[FINAL_RESULT]]#1 : From 381405fafe9d48d29c777e7680902d0943834859 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 22 Aug 2024 19:16:59 -0700 Subject: [PATCH 279/426] [NFC][TableGen] Emit more readable builtin string table (#105445) - Add `EmitStringLiteralDef` to StringToOffsetTable class to emit more readable string table. - Use that in `EmitIntrinsicToBuiltinMap`. 
--- .../llvm/TableGen/StringToOffsetTable.h | 26 +++++++++++++++++++ llvm/utils/TableGen/IntrinsicEmitter.cpp | 4 +-- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/TableGen/StringToOffsetTable.h b/llvm/include/llvm/TableGen/StringToOffsetTable.h index 7fb9d02d77c704..f2a20f06ae007f 100644 --- a/llvm/include/llvm/TableGen/StringToOffsetTable.h +++ b/llvm/include/llvm/TableGen/StringToOffsetTable.h @@ -12,6 +12,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -52,6 +53,31 @@ class StringToOffsetTable { return II->second; } + // Emit the string using string literal concatenation, for better readability + // and searchability. + void EmitStringLiteralDef(raw_ostream &OS, const Twine &Decl, + const Twine &Indent = " ") const { + OS << formatv(R"( +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Woverlength-strings" +#endif +{0}{1} = )", + Indent, Decl); + + for (StringRef Str : split(AggregateString, '\0')) { + OS << "\n" << Indent << " \""; + OS.write_escaped(Str); + OS << "\\0\""; + } + OS << R"(; +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +)"; + } + + // Emit the string as one single string. void EmitString(raw_ostream &O) { // Escape the string. 
SmallString<256> Str; diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp index 5d972157828784..8e536c99f627f5 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp @@ -669,9 +669,7 @@ Intrinsic::getIntrinsicFor{1}Builtin(StringRef TargetPrefix, } if (!Table.empty()) { - OS << " static constexpr char BuiltinNames[] = {\n"; - Table.EmitCharArray(OS); - OS << " };\n\n"; + Table.EmitStringLiteralDef(OS, "static constexpr char BuiltinNames[]"); OS << R"( struct BuiltinEntry { From 987ffc31f8813f8b4157f5191dcff63a7c4db161 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Fri, 23 Aug 2024 11:03:40 +0900 Subject: [PATCH 280/426] [AMDGPU] Refactor code for GETPC bundle updates in hazards (NFCI) As suggested in review for PR #100067. Refactor code for S_GETPC_B64 bundle updates for use with multiple hazard mitigations. --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 50 +++++++++++++------ 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index a6b7264405ade1..2c1071c5433058 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -2851,6 +2851,38 @@ bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { return false; } +// Adjust global offsets for instructions bundled with S_GETPC_B64 after +// insertion of a new instruction. +static void updateGetPCBundle(MachineInstr *NewMI) { + if (!NewMI->isBundled()) + return; + + // Find start of bundle. + auto I = NewMI->getIterator(); + while (I->isBundledWithPred()) + I--; + if (I->isBundle()) + I++; + + // Bail if this is not an S_GETPC bundle. + if (I->getOpcode() != AMDGPU::S_GETPC_B64) + return; + + // Update offsets of any references in the bundle. 
+ const unsigned NewBytes = 4; + assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + "Unexpected instruction insertion in bundle"); + auto NextMI = std::next(NewMI->getIterator()); + auto End = NewMI->getParent()->end(); + while (NextMI != End && NextMI->isBundledWithPred()) { + for (auto &Operand : NextMI->operands()) { + if (Operand.isGlobal()) + Operand.setOffset(Operand.getOffset() + NewBytes); + } + NextMI++; + } +} + bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { if (!ST.hasVALUMaskWriteHazard()) return false; @@ -2968,22 +3000,12 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { auto NextMI = std::next(MI->getIterator()); // Add s_waitcnt_depctr sa_sdst(0) after SALU write. - BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), - TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); // SALU write may be s_getpc in a bundle. - if (MI->getOpcode() == AMDGPU::S_GETPC_B64) { - // Update offsets of any references in the bundle. - while (NextMI != MI->getParent()->end() && - NextMI->isBundledWithPred()) { - for (auto &Operand : NextMI->operands()) { - if (Operand.isGlobal()) - Operand.setOffset(Operand.getOffset() + 4); - } - NextMI++; - } - } + updateGetPCBundle(NewMI); return true; } From 714033a6bf3a81b1350f969ddd83bcd9fbb703e8 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Thu, 22 Aug 2024 20:02:48 -0700 Subject: [PATCH 281/426] [clang-format] Don't insert a space between :: and * (#105043) Also, don't insert a space after ::* for method pointers. See https://github.com/llvm/llvm-project/pull/86253#issuecomment-2298404887. Fixes #100841. 
--- clang/lib/Format/TokenAnnotator.cpp | 16 +++++---- clang/unittests/Format/FormatTest.cpp | 36 +++++++++---------- clang/unittests/Format/QualifierFixerTest.cpp | 36 +++++++++---------- 3 files changed, 45 insertions(+), 43 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 9d4204655b8ed6..0d5741ed76f7cb 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4478,10 +4478,8 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, } if (Left.is(tok::colon)) return Left.isNot(TT_ObjCMethodExpr); - if (Left.is(tok::coloncolon)) { - return Right.is(tok::star) && Right.is(TT_PointerOrReference) && - Style.PointerAlignment != FormatStyle::PAS_Left; - } + if (Left.is(tok::coloncolon)) + return false; if (Left.is(tok::less) || Right.isOneOf(tok::greater, tok::less)) { if (Style.Language == FormatStyle::LK_TextProto || (Style.Language == FormatStyle::LK_Proto && @@ -4591,8 +4589,14 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, if (!BeforeLeft) return false; if (BeforeLeft->is(tok::coloncolon)) { - return Left.is(tok::star) && - Style.PointerAlignment != FormatStyle::PAS_Right; + if (Left.isNot(tok::star)) + return false; + assert(Style.PointerAlignment != FormatStyle::PAS_Right); + if (!Right.startsSequence(tok::identifier, tok::r_paren)) + return true; + assert(Right.Next); + const auto *LParen = Right.Next->MatchingParen; + return !LParen || LParen->isNot(TT_FunctionTypeLParen); } return !BeforeLeft->isOneOf(tok::l_paren, tok::l_square); } diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 794ccab3704534..e895f16465491a 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -3646,8 +3646,8 @@ TEST_F(FormatTest, FormatsClasses) { " : public aaaaaaaaaaaaaaaaaaa {};"); verifyFormat("template \n" - "struct Aaaaaaaaaaaaaaaaa\n" - " : Aaaaaaaaaaaaaaaaa 
{};"); + "struct Aaaaaaaaaaaaaaaaa\n" + " : Aaaaaaaaaaaaaaaaa {};"); verifyFormat("class ::A::B {};"); } @@ -11166,10 +11166,10 @@ TEST_F(FormatTest, UnderstandsBinaryOperators) { } TEST_F(FormatTest, UnderstandsPointersToMembers) { - verifyFormat("int A:: *x;"); - verifyFormat("int (S:: *func)(void *);"); - verifyFormat("void f() { int (S:: *func)(void *); }"); - verifyFormat("typedef bool *(Class:: *Member)() const;"); + verifyFormat("int A::*x;"); + verifyFormat("int (S::*func)(void *);"); + verifyFormat("void f() { int (S::*func)(void *); }"); + verifyFormat("typedef bool *(Class::*Member)() const;"); verifyFormat("void f() {\n" " (a->*f)();\n" " a->*x;\n" @@ -11187,16 +11187,16 @@ TEST_F(FormatTest, UnderstandsPointersToMembers) { FormatStyle Style = getLLVMStyle(); EXPECT_EQ(Style.PointerAlignment, FormatStyle::PAS_Right); - verifyFormat("typedef bool *(Class:: *Member)() const;", Style); - verifyFormat("void f(int A:: *p) { int A:: *v = &A::B; }", Style); + verifyFormat("typedef bool *(Class::*Member)() const;", Style); + verifyFormat("void f(int A::*p) { int A::*v = &A::B; }", Style); Style.PointerAlignment = FormatStyle::PAS_Left; - verifyFormat("typedef bool* (Class::* Member)() const;", Style); + verifyFormat("typedef bool* (Class::*Member)() const;", Style); verifyFormat("void f(int A::* p) { int A::* v = &A::B; }", Style); Style.PointerAlignment = FormatStyle::PAS_Middle; - verifyFormat("typedef bool * (Class:: * Member)() const;", Style); - verifyFormat("void f(int A:: * p) { int A:: * v = &A::B; }", Style); + verifyFormat("typedef bool * (Class::*Member)() const;", Style); + verifyFormat("void f(int A::* p) { int A::* v = &A::B; }", Style); } TEST_F(FormatTest, UnderstandsUnaryOperators) { @@ -12539,7 +12539,7 @@ TEST_F(FormatTest, FormatsFunctionTypes) { verifyFormat("int (*func)(void *);"); verifyFormat("void f() { int (*func)(void *); }"); verifyFormat("template \n" - "using Callback = void (CallbackClass:: *)(SomeObject *Data);"); + "using 
MyCallback = void (CallbackClass::*)(SomeObject *Data);"); verifyGoogleFormat("A;"); verifyGoogleFormat("void* (*a)(int);"); @@ -19462,13 +19462,13 @@ TEST_F(FormatTest, AlignConsecutiveDeclarations) { "int bbbbbbb = 0;", Alignment); // http://llvm.org/PR68079 - verifyFormat("using Fn = int (A:: *)();\n" - "using RFn = int (A:: *)() &;\n" - "using RRFn = int (A:: *)() &&;", + verifyFormat("using Fn = int (A::*)();\n" + "using RFn = int (A::*)() &;\n" + "using RRFn = int (A::*)() &&;", Alignment); - verifyFormat("using Fn = int (A:: *)();\n" - "using RFn = int *(A:: *)() &;\n" - "using RRFn = double (A:: *)() &&;", + verifyFormat("using Fn = int (A::*)();\n" + "using RFn = int *(A::*)() &;\n" + "using RRFn = double (A::*)() &&;", Alignment); // PAS_Right diff --git a/clang/unittests/Format/QualifierFixerTest.cpp b/clang/unittests/Format/QualifierFixerTest.cpp index 3a5f63e5de65b4..f9255c6e4c7088 100644 --- a/clang/unittests/Format/QualifierFixerTest.cpp +++ b/clang/unittests/Format/QualifierFixerTest.cpp @@ -305,7 +305,7 @@ TEST_F(QualifierFixerTest, RightQualifier) { verifyFormat("Foo inline static const;", "Foo inline const static;", Style); verifyFormat("Foo inline static const;", Style); - verifyFormat("Foo::Bar const volatile A:: *;", + verifyFormat("Foo::Bar const volatile A::*;", "volatile const Foo::Bar A::*;", Style); @@ -523,15 +523,14 @@ TEST_F(QualifierFixerTest, RightQualifier) { verifyFormat("const INTPTR a;", Style); // Pointers to members - verifyFormat("int S:: *a;", Style); - verifyFormat("int const S:: *a;", "const int S:: *a;", Style); - verifyFormat("int const S:: *const a;", "const int S::* const a;", Style); - verifyFormat("int A:: *const A:: *p1;", Style); - verifyFormat("float (C:: *p)(int);", Style); - verifyFormat("float (C:: *const p)(int);", Style); - verifyFormat("float (C:: *p)(int) const;", Style); - verifyFormat("float const (C:: *p)(int);", "const float (C::*p)(int);", - Style); + verifyFormat("int S::*a;", Style); + 
verifyFormat("int const S::*a;", "const int S::*a;", Style); + verifyFormat("int const S::*const a;", "const int S::* const a;", Style); + verifyFormat("int A::*const A::*p1;", Style); + verifyFormat("float (C::*p)(int);", Style); + verifyFormat("float (C::*const p)(int);", Style); + verifyFormat("float (C::*p)(int) const;", Style); + verifyFormat("float const (C::*p)(int);", "const float (C::*p)(int);", Style); } TEST_F(QualifierFixerTest, LeftQualifier) { @@ -831,15 +830,14 @@ TEST_F(QualifierFixerTest, LeftQualifier) { verifyFormat("INTPTR const a;", Style); // Pointers to members - verifyFormat("int S:: *a;", Style); - verifyFormat("const int S:: *a;", "int const S:: *a;", Style); - verifyFormat("const int S:: *const a;", "int const S::* const a;", Style); - verifyFormat("int A:: *const A:: *p1;", Style); - verifyFormat("float (C:: *p)(int);", Style); - verifyFormat("float (C:: *const p)(int);", Style); - verifyFormat("float (C:: *p)(int) const;", Style); - verifyFormat("const float (C:: *p)(int);", "float const (C::*p)(int);", - Style); + verifyFormat("int S::*a;", Style); + verifyFormat("const int S::*a;", "int const S::*a;", Style); + verifyFormat("const int S::*const a;", "int const S::*const a;", Style); + verifyFormat("int A::*const A::*p1;", Style); + verifyFormat("float (C::*p)(int);", Style); + verifyFormat("float (C::*const p)(int);", Style); + verifyFormat("float (C::*p)(int) const;", Style); + verifyFormat("const float (C::*p)(int);", "float const (C::*p)(int);", Style); } TEST_F(QualifierFixerTest, ConstVolatileQualifiersOrder) { From 151945151c3d29b3a6b3a630cf36942cab07fef9 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 22 Aug 2024 20:03:08 -0700 Subject: [PATCH 282/426] Revert "[Vectorize] Fix warnings" (#105771) Triggers assert in compiler https://lab.llvm.org/buildbot/#/builders/51/builds/2836 ``` Instructions.cpp:1700: llvm::ShuffleVectorInst::ShuffleVectorInst(Value *, Value *, ArrayRef, const Twine &, InsertPosition): Assertion 
`isValidOperands(V1, V2, Mask) && "Invalid shuffle vector instruction operands!"' failed. ``` This reverts commit a625435d3ef4c7bbfceb44498b9b5a2cbbed838b. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e8ab6839d9fa87..afaef6f9da9872 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9297,7 +9297,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) if (CommonMask[Idx] != PoisonMaskElem) CommonMask[Idx] = Idx; - for (const auto &[E, Idx] : SubVectors) { + for (const auto [E, Idx] : SubVectors) { Cost += ::getShuffleCost( TTI, TTI::SK_InsertSubvector, FixedVectorType::get(ScalarTy, CommonMask.size()), std::nullopt, @@ -12455,7 +12455,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) if (CommonMask[Idx] != PoisonMaskElem) CommonMask[Idx] = Idx; - for (const auto &[E, Idx] : SubVectors) { + for (const auto [E, Idx] : SubVectors) { Vec = Builder.CreateInsertVector( Vec->getType(), Vec, E->VectorizedValue, Builder.getInt64(Idx)); if (!CommonMask.empty()) { @@ -12636,7 +12636,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, E->ReuseShuffleIndices.end()); SmallVector GatheredScalars(E->Scalars.begin(), E->Scalars.end()); // Clear values, to be replaced by insertvector instructions. 
- for (const auto &[EIdx, Idx] : E->CombinedEntriesWithIndices) + for (const auto [EIdx, Idx] : E->CombinedEntriesWithIndices) for_each(MutableArrayRef(GatheredScalars) .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()), [&](Value *&V) { V = PoisonValue::get(V->getType()); }); @@ -13073,7 +13073,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) { - for (const auto &[EIdx, _] : E->CombinedEntriesWithIndices) + for (const auto [EIdx, _] : E->CombinedEntriesWithIndices) (void)vectorizeTree(VectorizableTree[EIdx].get(), /*PostponedPHIs=*/false); return processBuildVector(E, ScalarTy, Builder, *this); From 62da359ce7a7ae09e6afa96227eb556be54aabb1 Mon Sep 17 00:00:00 2001 From: bwlodarcz Date: Fri, 23 Aug 2024 05:27:36 +0200 Subject: [PATCH 283/426] [SPIRV] Emitting DebugSource, DebugCompileUnit (#97558) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit introduces emission of DebugSource, DebugCompileUnit from NonSemantic.Shader.DebugInfo.100 and required OpString with filename. NonSemantic.Shader.DebugInfo.100 is divided, following DWARF into two main concepts – emitting DIE and Line. In DWARF .debug_abbriev and .debug_info sections are responsible for emitting tree with information (DEIs) about e.g. types, compilation unit. Corresponding to that in NonSemantic.Shader.DebugInfo.100 have instructions like DebugSource, DebugCompileUnit etc. which preforms same role in SPIR-V file. The difference is in fact that in SPIR-V there are no sections but logical layout which forces order of the instruction emission. The NonSemantic.Shader.DebugInfo.100 requires for this type of global information to be emitted after OpTypeXXX and OpConstantXXX instructions. 
One of the goals was to minimize changes and interaction with SPIRVModuleAnalysis as possible which current commit achieves by emitting it’s instructions directly into MachineFunction. The possibility of duplicates are mitigated by guard inside pass which emits the global information only once in one function. By that method duplicates don’t have chance to be emitted. From that point, adding new debug global instructions should be straightforward. --- llvm/docs/SPIRVUsage.rst | 8 +- llvm/lib/Target/SPIRV/CMakeLists.txt | 1 + llvm/lib/Target/SPIRV/SPIRV.h | 2 + llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp | 8 +- llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp | 3 +- .../Target/SPIRV/SPIRVEmitNonSemanticDI.cpp | 188 ++++++++++++++++++ llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 23 ++- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h | 3 +- llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp | 12 ++ .../SPIRV/debug-info/basic-global-di.ll | 46 +++++ 10 files changed, 287 insertions(+), 7 deletions(-) create mode 100644 llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp create mode 100644 llvm/test/CodeGen/SPIRV/debug-info/basic-global-di.ll diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst index 70865b95cb3932..0f0b21fb237703 100644 --- a/llvm/docs/SPIRVUsage.rst +++ b/llvm/docs/SPIRVUsage.rst @@ -33,7 +33,11 @@ Static Compiler Commands Command: `llc -O1 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_arbitrary_precision_integers input.ll -o output.spvt` Description: Compiles an LLVM IL file to SPIR-V with (`-O1`) optimizations, targeting a 64-bit architecture. It enables the SPV_INTEL_arbitrary_precision_integers extension. -3. **SPIR-V Binary Generation** +3. **Compilation with experimental NonSemantic.Shader.DebugInfo.100 support** + Command: `llc --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info input.ll -o output.spvt` + Description: Compiles an LLVM IL file to SPIR-V with additional NonSemantic.Shader.DebugInfo.100 instructions. 
It enables the required SPV_KHR_non_semantic_info extension. + +4. **SPIR-V Binary Generation** Command: `llc -O0 -mtriple=spirv64-unknown-unknown -filetype=obj input.ll -o output.spvt` Description: Generates a SPIR-V object file (`output.spvt`) from an LLVM module, targeting a 64-bit SPIR-V architecture with no optimizations. @@ -181,6 +185,8 @@ list of supported SPIR-V extensions, sorted alphabetically by their extension na - Adds a new instruction that enables rotating values across invocations within a subgroup. * - ``SPV_KHR_uniform_group_instructions`` - Allows support for additional group operations within uniform control flow. + * - ``SPV_KHR_non_semantic_info`` + - Adds the ability to declare extended instruction sets that have no semantic impact and can be safely removed from a module. To enable multiple extensions, list them separated by spaces. For example, to enable support for atomic operations on floating-point numbers and arbitrary precision integers, use: diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt index 14647e92f5d088..5f8aea5fc8d84d 100644 --- a/llvm/lib/Target/SPIRV/CMakeLists.txt +++ b/llvm/lib/Target/SPIRV/CMakeLists.txt @@ -40,6 +40,7 @@ add_llvm_target(SPIRVCodeGen SPIRVSubtarget.cpp SPIRVTargetMachine.cpp SPIRVUtils.cpp + SPIRVEmitNonSemanticDI.cpp LINK_COMPONENTS Analysis diff --git a/llvm/lib/Target/SPIRV/SPIRV.h b/llvm/lib/Target/SPIRV/SPIRV.h index e597a1dc8dc06c..6c35a467f53bef 100644 --- a/llvm/lib/Target/SPIRV/SPIRV.h +++ b/llvm/lib/Target/SPIRV/SPIRV.h @@ -26,6 +26,7 @@ FunctionPass *createSPIRVRegularizerPass(); FunctionPass *createSPIRVPreLegalizerPass(); FunctionPass *createSPIRVPostLegalizerPass(); ModulePass *createSPIRVEmitIntrinsicsPass(SPIRVTargetMachine *TM); +MachineFunctionPass *createSPIRVEmitNonSemanticDIPass(SPIRVTargetMachine *TM); InstructionSelector * createSPIRVInstructionSelector(const SPIRVTargetMachine &TM, const SPIRVSubtarget &Subtarget, @@ -36,6 +37,7 @@ void 
initializeSPIRVConvergenceRegionAnalysisWrapperPassPass(PassRegistry &); void initializeSPIRVPreLegalizerPass(PassRegistry &); void initializeSPIRVPostLegalizerPass(PassRegistry &); void initializeSPIRVEmitIntrinsicsPass(PassRegistry &); +void initializeSPIRVEmitNonSemanticDIPass(PassRegistry &); } // namespace llvm #endif // LLVM_LIB_TARGET_SPIRV_SPIRV_H diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 7fe8e11aaa4209..55b41627802096 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -274,6 +274,8 @@ void SPIRVAsmPrinter::outputDebugSourceAndStrings(const Module &M) { addStringImm(Str.first(), Inst); outputMCInst(Inst); } + // Output OpString. + outputModuleSection(SPIRV::MB_DebugStrings); // Output OpSource. MCInst Inst; Inst.setOpcode(SPIRV::OpSource); @@ -589,9 +591,11 @@ void SPIRVAsmPrinter::outputModuleSections() { // the first section to allow use of: OpLine and OpNoLine debug information; // non-semantic instructions with OpExtInst. outputModuleSection(SPIRV::MB_TypeConstVars); - // 10. All function declarations (functions without a body). + // 10. All global NonSemantic.Shader.DebugInfo.100 instructions. + outputModuleSection(SPIRV::MB_NonSemanticGlobalDI); + // 11. All function declarations (functions without a body). outputExtFuncDecls(); - // 11. All function definitions (functions with a body). + // 12. All function definitions (functions with a body). // This is done in regular function output. 
} diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index c7c244cfa89770..90a9ab1d33ced4 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -68,7 +68,8 @@ static const std::map SPIRV::Extension::Extension::SPV_KHR_shader_clock}, {"SPV_KHR_cooperative_matrix", SPIRV::Extension::Extension::SPV_KHR_cooperative_matrix}, -}; + {"SPV_KHR_non_semantic_info", + SPIRV::Extension::Extension::SPV_KHR_non_semantic_info}}; bool SPIRVExtensionsParser::parse(cl::Option &O, llvm::StringRef ArgName, llvm::StringRef ArgValue, diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp new file mode 100644 index 00000000000000..cc506356e39043 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp @@ -0,0 +1,188 @@ +#include "MCTargetDesc/SPIRVBaseInfo.h" +#include "MCTargetDesc/SPIRVMCTargetDesc.h" +#include "SPIRVGlobalRegistry.h" +#include "SPIRVRegisterInfo.h" +#include "SPIRVTargetMachine.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Metadata.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Path.h" + +#define DEBUG_TYPE "spirv-nonsemantic-debug-info" + +namespace llvm { +struct SPIRVEmitNonSemanticDI : public MachineFunctionPass { + static char ID; + SPIRVTargetMachine *TM; + SPIRVEmitNonSemanticDI(SPIRVTargetMachine *TM); + SPIRVEmitNonSemanticDI(); + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + bool IsGlobalDIEmitted = false; + 
bool emitGlobalDI(MachineFunction &MF); +}; + +void initializeSPIRVEmitNonSemanticDIPass(PassRegistry &); + +FunctionPass *createSPIRVEmitNonSemanticDIPass(SPIRVTargetMachine *TM) { + return new SPIRVEmitNonSemanticDI(TM); +} +} // namespace llvm + +using namespace llvm; + +INITIALIZE_PASS(SPIRVEmitNonSemanticDI, DEBUG_TYPE, + "SPIRV NonSemantic.Shader.DebugInfo.100 emitter", false, false) + +char SPIRVEmitNonSemanticDI::ID = 0; + +SPIRVEmitNonSemanticDI::SPIRVEmitNonSemanticDI(SPIRVTargetMachine *TM) + : MachineFunctionPass(ID), TM(TM) { + initializeSPIRVEmitNonSemanticDIPass(*PassRegistry::getPassRegistry()); +} + +SPIRVEmitNonSemanticDI::SPIRVEmitNonSemanticDI() : MachineFunctionPass(ID) { + initializeSPIRVEmitNonSemanticDIPass(*PassRegistry::getPassRegistry()); +} + +bool SPIRVEmitNonSemanticDI::emitGlobalDI(MachineFunction &MF) { + // If this MachineFunction doesn't have any BB repeat procedure + // for the next + if (MF.begin() == MF.end()) { + IsGlobalDIEmitted = false; + return false; + } + + // Required variables to get from metadata search + LLVMContext *Context; + SmallString<128> FilePath; + unsigned SourceLanguage = 0; + int64_t DwarfVersion = 0; + int64_t DebugInfoVersion = 0; + + // Searching through the Module metadata to find nescessary + // information like DwarfVersion or SourceLanguage + { + const MachineModuleInfo &MMI = + getAnalysis().getMMI(); + const Module *M = MMI.getModule(); + Context = &M->getContext(); + const NamedMDNode *DbgCu = M->getNamedMetadata("llvm.dbg.cu"); + if (!DbgCu) + return false; + for (const auto *Op : DbgCu->operands()) { + if (const auto *CompileUnit = dyn_cast(Op)) { + DIFile *File = CompileUnit->getFile(); + sys::path::append(FilePath, File->getDirectory(), File->getFilename()); + SourceLanguage = CompileUnit->getSourceLanguage(); + break; + } + } + const NamedMDNode *ModuleFlags = M->getNamedMetadata("llvm.module.flags"); + for (const auto *Op : ModuleFlags->operands()) { + const MDOperand &MaybeStrOp = 
Op->getOperand(1); + if (MaybeStrOp.equalsStr("Dwarf Version")) + DwarfVersion = + cast( + cast(Op->getOperand(2))->getValue()) + ->getSExtValue(); + else if (MaybeStrOp.equalsStr("Debug Info Version")) + DebugInfoVersion = + cast( + cast(Op->getOperand(2))->getValue()) + ->getSExtValue(); + } + } + // NonSemantic.Shader.DebugInfo.100 global DI instruction emitting + { + // Required LLVM variables for emitting logic + const SPIRVInstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); + const SPIRVRegisterInfo *TRI = TM->getSubtargetImpl()->getRegisterInfo(); + const RegisterBankInfo *RBI = TM->getSubtargetImpl()->getRegBankInfo(); + SPIRVGlobalRegistry *GR = TM->getSubtargetImpl()->getSPIRVGlobalRegistry(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineBasicBlock &MBB = *MF.begin(); + + // To correct placement of a OpLabel instruction during SPIRVAsmPrinter + // emission all new instructions needs to be placed after OpFunction + // and before first terminator + MachineIRBuilder MIRBuilder(MBB, MBB.getFirstTerminator()); + + // Emit OpString with FilePath which is required by DebugSource + const Register StrReg = MRI.createVirtualRegister(&SPIRV::IDRegClass); + MRI.setType(StrReg, LLT::scalar(32)); + MachineInstrBuilder MIB = MIRBuilder.buildInstr(SPIRV::OpString); + MIB.addDef(StrReg); + addStringImm(FilePath, MIB); + + const SPIRVType *VoidTy = + GR->getOrCreateSPIRVType(Type::getVoidTy(*Context), MIRBuilder); + + // Emit DebugSource which is required by DebugCompilationUnit + const Register DebugSourceResIdReg = + MRI.createVirtualRegister(&SPIRV::IDRegClass); + MRI.setType(DebugSourceResIdReg, LLT::scalar(32)); + MIB = MIRBuilder.buildInstr(SPIRV::OpExtInst) + .addDef(DebugSourceResIdReg) + .addUse(GR->getSPIRVTypeID(VoidTy)) + .addImm(static_cast( + SPIRV::InstructionSet::NonSemantic_Shader_DebugInfo_100)) + .addImm(SPIRV::NonSemanticExtInst::DebugSource) + .addUse(StrReg); + MIB.constrainAllUses(*TII, *TRI, *RBI); + GR->assignSPIRVTypeToVReg(VoidTy, 
DebugSourceResIdReg, MF); + + const SPIRVType *I32Ty = + GR->getOrCreateSPIRVType(Type::getInt32Ty(*Context), MIRBuilder); + + // Convert DwarfVersion, DebugInfo and SourceLanguage integers to OpConstant + // instructions required by DebugCompilationUnit + const Register DwarfVersionReg = + GR->buildConstantInt(DwarfVersion, MIRBuilder, I32Ty, false); + const Register DebugInfoVersionReg = + GR->buildConstantInt(DebugInfoVersion, MIRBuilder, I32Ty, false); + const Register SourceLanguageReg = + GR->buildConstantInt(SourceLanguage, MIRBuilder, I32Ty, false); + + // Emit DebugCompilationUnit + const Register DebugCompUnitResIdReg = + MRI.createVirtualRegister(&SPIRV::IDRegClass); + MRI.setType(DebugCompUnitResIdReg, LLT::scalar(32)); + MIB = MIRBuilder.buildInstr(SPIRV::OpExtInst) + .addDef(DebugCompUnitResIdReg) + .addUse(GR->getSPIRVTypeID(VoidTy)) + .addImm(static_cast( + SPIRV::InstructionSet::NonSemantic_Shader_DebugInfo_100)) + .addImm(SPIRV::NonSemanticExtInst::DebugCompilationUnit) + .addUse(DebugInfoVersionReg) + .addUse(DwarfVersionReg) + .addUse(DebugSourceResIdReg) + .addUse(SourceLanguageReg); + MIB.constrainAllUses(*TII, *TRI, *RBI); + GR->assignSPIRVTypeToVReg(VoidTy, DebugCompUnitResIdReg, MF); + } + return true; +} + +bool SPIRVEmitNonSemanticDI::runOnMachineFunction(MachineFunction &MF) { + bool Res = false; + // emitGlobalDI needs to be executed only once to avoid + // emitting duplicates + if (!IsGlobalDIEmitted) { + IsGlobalDIEmitted = true; + Res = emitGlobalDI(MF); + } + return Res; +} diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index ac0aa682ea4beb..a2fcfc636e3684 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -21,7 +21,6 @@ #include "SPIRVSubtarget.h" #include "SPIRVTargetMachine.h" #include "SPIRVUtils.h" -#include "TargetInfo/SPIRVTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include 
"llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -427,7 +426,19 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { if (MAI.getSkipEmission(&MI)) continue; const unsigned OpCode = MI.getOpcode(); - if (OpCode == SPIRV::OpName || OpCode == SPIRV::OpMemberName) { + if (OpCode == SPIRV::OpString) { + collectOtherInstr(MI, MAI, SPIRV::MB_DebugStrings, IS); + } else if (OpCode == SPIRV::OpExtInst) { + MachineOperand Ins = MI.getOperand(3); + namespace NS = SPIRV::NonSemanticExtInst; + static constexpr int64_t GlobalNonSemanticDITy[] = { + NS::DebugSource, NS::DebugCompilationUnit}; + bool IsGlobalDI = false; + for (unsigned Idx = 0; Idx < std::size(GlobalNonSemanticDITy); ++Idx) + IsGlobalDI |= Ins.getImm() == GlobalNonSemanticDITy[Idx]; + if (IsGlobalDI) + collectOtherInstr(MI, MAI, SPIRV::MB_NonSemanticGlobalDI, IS); + } else if (OpCode == SPIRV::OpName || OpCode == SPIRV::OpMemberName) { collectOtherInstr(MI, MAI, SPIRV::MB_DebugNames, IS); } else if (OpCode == SPIRV::OpEntryPoint) { collectOtherInstr(MI, MAI, SPIRV::MB_EntryPoints, IS); @@ -899,6 +910,14 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::Float16Buffer); break; } + case SPIRV::OpExtInst: { + if (MI.getOperand(2).getImm() == + static_cast( + SPIRV::InstructionSet::NonSemantic_Shader_DebugInfo_100)) { + Reqs.addExtension(SPIRV::Extension::SPV_KHR_non_semantic_info); + } + break; + } case SPIRV::OpBitReverse: case SPIRV::OpBitFieldInsert: case SPIRV::OpBitFieldSExtract: diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h index 79226d6d93efb2..024728c347e8a8 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h @@ -20,7 +20,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringMap.h" namespace llvm { class SPIRVSubtarget; @@ -34,9 +33,11 @@ enum 
ModuleSectionType { MB_EntryPoints, // All OpEntryPoint instructions (if any). // MB_ExecutionModes, MB_DebugSourceAndStrings, MB_DebugNames, // All OpName and OpMemberName intrs. + MB_DebugStrings, // All OpString intrs. MB_DebugModuleProcessed, // All OpModuleProcessed instructions. MB_Annotations, // OpDecorate, OpMemberDecorate etc. MB_TypeConstVars, // OpTypeXXX, OpConstantXXX, and global OpVariables. + MB_NonSemanticGlobalDI, // OpExtInst with e.g. DebugSource, DebugTypeBasic. MB_ExtFuncDecls, // OpFunction etc. to declare for external funcs. NUM_MODULE_SECTIONS // Total number of sections requiring basic blocks. }; diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 52fc6f33b4ef14..48a2ce89bad390 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -115,6 +115,7 @@ class SPIRVPassConfig : public TargetPassConfig { void addOptimizedRegAlloc() override {} void addPostRegAlloc() override; + void addPreEmitPass() override; private: const SPIRVTargetMachine &TM; @@ -208,6 +209,17 @@ bool SPIRVPassConfig::addRegBankSelect() { return false; } +static cl::opt SPVEnableNonSemanticDI( + "spv-emit-nonsemantic-debug-info", + cl::desc("Emit SPIR-V NonSemantic.Shader.DebugInfo.100 instructions"), + cl::Optional, cl::init(false)); + +void SPIRVPassConfig::addPreEmitPass() { + if (SPVEnableNonSemanticDI) { + addPass(createSPIRVEmitNonSemanticDIPass(&getTM())); + } +} + namespace { // A custom subclass of InstructionSelect, which is mostly the same except from // not requiring RegBankSelect to occur previously. 
diff --git a/llvm/test/CodeGen/SPIRV/debug-info/basic-global-di.ll b/llvm/test/CodeGen/SPIRV/debug-info/basic-global-di.ll new file mode 100644 index 00000000000000..336b7db324c3d4 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/debug-info/basic-global-di.ll @@ -0,0 +1,46 @@ +; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-unknown-unknown %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR +; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_non_semantic_info %s -o - | FileCheck %s --check-prefix=CHECK-OPTION +; RUN: %if spirv-tools %{ llc --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-MIR-DAG: [[type_void:%[0-9]+\:type]] = OpTypeVoid +; CHECK-MIR-DAG: [[type_i64:%[0-9]+\:type\(s64\)]] = OpTypeInt 32, 0 +; CHECK-MIR-DAG: [[dwarf_version:%[0-9]+\:iid\(s32\)]] = OpConstantI [[type_i64]], 5 +; CHECK-MIR-DAG: [[source_language:%[0-9]+\:iid\(s32\)]] = OpConstantI [[type_i64]], 3 +; CHECK-MIR-DAG: [[debug_info_version:%[0-9]+\:iid\(s32\)]] = OpConstantI [[type_i64]], 21 +; CHECK-MIR-DAG: [[filename_str:%[0-9]+\:id\(s32\)]] = OpString 1094795567, 1094795585, 792805697, 1111638594, 1111638594, 1128481583, 1128481603, 1697596227, 1886216568, 1663985004, 0 +; CHECK-MIR-DAG: [[debug_source:%[0-9]+\:id\(s32\)]] = OpExtInst [[type_void]], 3, 35, [[filename_str]] +; CHECK-MIR-DAG: [[debug_compilation_unit:%[0-9]+\:id\(s32\)]] = OpExtInst [[type_void]], 3, 1, [[source_language]], [[dwarf_version]], [[debug_source]], [[debug_info_version]] + +; CHECK-SPIRV: [[ext_inst_non_semantic:%[0-9]+]] = OpExtInstImport "NonSemantic.Shader.DebugInfo.100" +; 
CHECK-SPIRV: [[filename_str:%[0-9]+]] = OpString "/AAAAAAAAAA/BBBBBBBB/CCCCCCCCC/example.c" +; CHECK-SPIRV-DAG: [[type_void:%[0-9]+]] = OpTypeVoid +; CHECK-SPIRV-DAG: [[type_i32:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-SPIRV-DAG: [[dwarf_version:%[0-9]+]] = OpConstant [[type_i32]] 5 +; CHECK-SPIRV-DAG: [[debug_info_version:%[0-9]+]] = OpConstant [[type_i32]] 21 +; CHECK-SPIRV-DAG: [[source_language:%[0-9]+]] = OpConstant [[type_i32]] 3 +; CHECK-SPIRV: [[debug_source:%[0-9]+]] = OpExtInst [[type_void]] [[ext_inst_non_semantic]] DebugSource [[filename_str]] +; CHECK-SPIRV: [[debug_compiation_unit:%[0-9]+]] = OpExtInst [[type_void]] [[ext_inst_non_semantic]] DebugCompilationUnit [[source_language]] [[dwarf_version]] [[debug_source]] [[debug_info_version]] + +; CHECK-OPTION-NOT: OpExtInstImport "NonSemantic.Shader.DebugInfo.100" +; CHECK-OPTION-NOT: OpString "/AAAAAAAAAA/BBBBBBBB/CCCCCCCCC/example.c" + +define spir_func void @foo() { +entry: + ret void +} + +define spir_func void @bar() { +entry: + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5} + +!0 = distinct !DICompileUnit(language: DW_LANG_OpenCL, file: !1, producer: "clang version XX.X.XXXX (FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "example.c", directory: "/AAAAAAAAAA/BBBBBBBB/CCCCCCCCC", checksumkind: CSK_MD5, checksum: "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 7, !"frame-pointer", i32 2} From e15abb798282e4151f546eef14be4906f428eb46 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 23 Aug 2024 11:31:06 +1000 Subject: [PATCH 284/426] [ORC] Add an identifier-override argument to loadRelocatableObject and friends. 
API clients may want to use things other than paths as the buffer identifiers. No testcase -- I haven't thought of a good way to expose this via the regression testing tools. rdar://133536831 --- .../Orc/LoadRelocatableObject.h | 5 ++- llvm/include/llvm/ExecutionEngine/Orc/MachO.h | 8 ++-- .../Orc/LoadRelocatableObject.cpp | 23 +++++++++--- llvm/lib/ExecutionEngine/Orc/MachO.cpp | 37 +++++++++++++++++-- 4 files changed, 59 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/LoadRelocatableObject.h b/llvm/include/llvm/ExecutionEngine/Orc/LoadRelocatableObject.h index a6a2f41dcff7d6..c79c3e316e3828 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/LoadRelocatableObject.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/LoadRelocatableObject.h @@ -27,8 +27,9 @@ namespace orc { // Load an object file compatible with the given triple (if given) from the // given path. May return a file slice if the path contains a universal binary. -Expected> loadRelocatableObject(StringRef Path, - const Triple &TT); +Expected> loadRelocatableObject( + StringRef Path, const Triple &TT, + std::optional IdentifierOverride = std::nullopt); } // End namespace orc } // End namespace llvm diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachO.h b/llvm/include/llvm/ExecutionEngine/Orc/MachO.h index 58c04a9121c87b..fdaa2f73cda6a3 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachO.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachO.h @@ -38,14 +38,16 @@ checkMachORelocatableObject(std::unique_ptr Obj, const Triple &TT, /// Load a relocatable object compatible with TT from Path. /// If Path is a universal binary, this function will return a buffer for the /// slice compatible with Triple (if one is present). 
-Expected> -loadMachORelocatableObject(StringRef Path, const Triple &TT); +Expected> loadMachORelocatableObject( + StringRef Path, const Triple &TT, + std::optional IdentifierOverride = std::nullopt); /// Load a compatible relocatable object (if available) from a MachO universal /// binary. Expected> loadMachORelocatableObjectFromUniversalBinary( - StringRef UBPath, std::unique_ptr UBBuf, const Triple &TT); + StringRef UBPath, std::unique_ptr UBBuf, const Triple &TT, + std::optional IdentifierOverride = std::nullopt); /// Utility for identifying the file-slice compatible with TT in a universal /// binary. diff --git a/llvm/lib/ExecutionEngine/Orc/LoadRelocatableObject.cpp b/llvm/lib/ExecutionEngine/Orc/LoadRelocatableObject.cpp index 5fde1b5fb4da92..0a32a768313fd8 100644 --- a/llvm/lib/ExecutionEngine/Orc/LoadRelocatableObject.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LoadRelocatableObject.cpp @@ -9,6 +9,7 @@ #include "llvm/ExecutionEngine/Orc/LoadRelocatableObject.h" #include "llvm/BinaryFormat/Magic.h" #include "llvm/ExecutionEngine/Orc/MachO.h" +#include "llvm/Support/FileSystem.h" #define DEBUG_TYPE "orc" @@ -29,10 +30,22 @@ checkELFRelocatableObject(std::unique_ptr Obj, const Triple &TT) { } Expected> -loadRelocatableObject(StringRef Path, const Triple &TT) { - auto Buf = MemoryBuffer::getFile(Path); +loadRelocatableObject(StringRef Path, const Triple &TT, + std::optional IdentifierOverride) { + if (!IdentifierOverride) + IdentifierOverride = Path; + + Expected FDOrErr = + sys::fs::openNativeFileForRead(Path, sys::fs::OF_None); + if (!FDOrErr) + return createFileError(Path, FDOrErr.takeError()); + sys::fs::file_t FD = *FDOrErr; + auto Buf = + MemoryBuffer::getOpenFile(FD, *IdentifierOverride, /*FileSize=*/-1); + sys::fs::closeFile(FD); if (!Buf) - return createFileError(Path, Buf.getError()); + return make_error( + StringRef("Could not load object at path ") + Path, Buf.getError()); std::optional RequireFormat; if (TT.getObjectFormat() != 
Triple::UnknownObjectFormat) @@ -53,8 +66,8 @@ loadRelocatableObject(StringRef Path, const Triple &TT) { break; case file_magic::macho_universal_binary: if (!RequireFormat || *RequireFormat == Triple::MachO) - return loadMachORelocatableObjectFromUniversalBinary(Path, - std::move(*Buf), TT); + return loadMachORelocatableObjectFromUniversalBinary( + Path, std::move(*Buf), TT, IdentifierOverride); break; default: break; diff --git a/llvm/lib/ExecutionEngine/Orc/MachO.cpp b/llvm/lib/ExecutionEngine/Orc/MachO.cpp index fe5a1dbbb9f8a1..8fc262220bf892 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachO.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachO.cpp @@ -10,6 +10,7 @@ #include "llvm/BinaryFormat/MachO.h" #include "llvm/Object/MachOUniversal.h" +#include "llvm/Support/FileSystem.h" #define DEBUG_TYPE "orc" @@ -85,14 +86,27 @@ checkMachORelocatableObject(std::unique_ptr Obj, const Triple &TT, } Expected> -loadMachORelocatableObject(StringRef Path, const Triple &TT) { +loadMachORelocatableObject(StringRef Path, const Triple &TT, + std::optional IdentifierOverride) { assert((TT.getObjectFormat() == Triple::UnknownObjectFormat || TT.getObjectFormat() == Triple::MachO) && "TT must specify MachO or Unknown object format"); - auto Buf = MemoryBuffer::getFile(Path); + if (!IdentifierOverride) + IdentifierOverride = Path; + + Expected FDOrErr = + sys::fs::openNativeFileForRead(Path, sys::fs::OF_None); + if (!FDOrErr) + return createFileError(Path, FDOrErr.takeError()); + sys::fs::file_t FD = *FDOrErr; + auto Buf = + MemoryBuffer::getOpenFile(FD, *IdentifierOverride, /*FileSize=*/-1); + sys::fs::closeFile(FD); if (!Buf) - return createFileError(Path, Buf.getError()); + return make_error( + StringRef("Could not load MachO object at path ") + Path, + Buf.getError()); switch (identify_magic((*Buf)->getBuffer())) { case file_magic::macho_object: @@ -110,7 +124,8 @@ loadMachORelocatableObject(StringRef Path, const Triple &TT) { Expected> loadMachORelocatableObjectFromUniversalBinary( - 
StringRef UBPath, std::unique_ptr UBBuf, const Triple &TT) { + StringRef UBPath, std::unique_ptr UBBuf, const Triple &TT, + std::optional IdentifierOverride) { auto UniversalBin = object::MachOUniversalBinary::create(UBBuf->getMemBufferRef()); @@ -121,6 +136,20 @@ loadMachORelocatableObjectFromUniversalBinary( if (!SliceRange) return SliceRange.takeError(); + Expected FDOrErr = + sys::fs::openNativeFileForRead(UBPath, sys::fs::OF_None); + if (!FDOrErr) + return createFileError(UBPath, FDOrErr.takeError()); + sys::fs::file_t FD = *FDOrErr; + auto Buf = MemoryBuffer::getOpenFileSlice( + FD, *IdentifierOverride, SliceRange->second, SliceRange->first); + sys::fs::closeFile(FD); + if (!Buf) + return make_error( + "Could not load " + TT.getArchName() + + " slice of MachO universal binary at path " + UBPath, + Buf.getError()); + auto ObjBuf = errorOrToExpected(MemoryBuffer::getFileSlice( UBPath, SliceRange->second, SliceRange->first, false)); if (!ObjBuf) From 351f4a5593f1ef507708ec5eeca165b20add3340 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 22 Aug 2024 21:14:12 -0700 Subject: [PATCH 285/426] Reland "[Vectorize] Fix warnings"" (#105772) Revert was wrong, The bot is still broken https://lab.llvm.org/buildbot/#/builders/51/builds/2838 Reverts llvm/llvm-project#105771 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index afaef6f9da9872..e8ab6839d9fa87 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9297,7 +9297,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) if (CommonMask[Idx] != PoisonMaskElem) CommonMask[Idx] = Idx; - for (const auto [E, Idx] : SubVectors) { + for (const auto &[E, Idx] : SubVectors) { Cost += ::getShuffleCost( TTI, 
TTI::SK_InsertSubvector, FixedVectorType::get(ScalarTy, CommonMask.size()), std::nullopt, @@ -12455,7 +12455,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) if (CommonMask[Idx] != PoisonMaskElem) CommonMask[Idx] = Idx; - for (const auto [E, Idx] : SubVectors) { + for (const auto &[E, Idx] : SubVectors) { Vec = Builder.CreateInsertVector( Vec->getType(), Vec, E->VectorizedValue, Builder.getInt64(Idx)); if (!CommonMask.empty()) { @@ -12636,7 +12636,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, E->ReuseShuffleIndices.end()); SmallVector GatheredScalars(E->Scalars.begin(), E->Scalars.end()); // Clear values, to be replaced by insertvector instructions. - for (const auto [EIdx, Idx] : E->CombinedEntriesWithIndices) + for (const auto &[EIdx, Idx] : E->CombinedEntriesWithIndices) for_each(MutableArrayRef(GatheredScalars) .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()), [&](Value *&V) { V = PoisonValue::get(V->getType()); }); @@ -13073,7 +13073,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) { - for (const auto [EIdx, _] : E->CombinedEntriesWithIndices) + for (const auto &[EIdx, _] : E->CombinedEntriesWithIndices) (void)vectorizeTree(VectorizableTree[EIdx].get(), /*PostponedPHIs=*/false); return processBuildVector(E, ScalarTy, Builder, *this); From fdaaa878443285e47a2cbc1b641ac04e2efa7881 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 22 Aug 2024 21:27:53 -0700 Subject: [PATCH 286/426] [Scalar] Remove an unused variable (#105767) The last use was removed by: commit 89fe570958f8b82df9a9c3b4c251ecba9753272a Author: Philip Reames Date: Tue May 12 23:39:23 2015 +0000 --- llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp 
b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp index 77d67a2ce0f380..bf86be0dd387f0 100644 --- a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -342,7 +342,6 @@ bool PlaceSafepointsPass::runImpl(Function &F, const TargetLibraryInfo &TLI) { // The split loop structure here is so that we only need to recalculate // the dominator tree once. Alternatively, we could just keep it up to // date and use a more natural merged loop. - SetVector SplitBackedges; for (BasicBlock *Header : Headers) { BasicBlock *NewBB = SplitEdge(Term->getParent(), Header, &DT); PollsNeeded.push_back(NewBB->getTerminator()); From 7c3237d778572931ff097e81df43d0bce9d1d4f8 Mon Sep 17 00:00:00 2001 From: Tom Date: Fri, 23 Aug 2024 05:42:22 +0100 Subject: [PATCH 287/426] [clang-format] Change BinPackParameters to enum and add AlwaysOnePerLine (#101882) Related issues that have requested this feature: #51833 #23796 #53190 Partially solves - this issue requests is for both arguments and parameters --- clang/docs/ClangFormatStyleOptions.rst | 51 +++++-- clang/include/clang/Format/Format.h | 50 ++++--- clang/lib/Format/ContinuationIndenter.cpp | 27 +--- clang/lib/Format/Format.cpp | 19 ++- clang/lib/Format/FormatToken.cpp | 17 +++ clang/lib/Format/FormatToken.h | 3 + clang/lib/Format/TokenAnnotator.cpp | 8 ++ clang/unittests/Format/ConfigParseTest.cpp | 14 +- clang/unittests/Format/FormatTest.cpp | 131 +++++++++++++++--- clang/unittests/Format/FormatTestComments.cpp | 35 ++++- clang/unittests/Format/FormatTestObjC.cpp | 4 +- 11 files changed, 275 insertions(+), 84 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index c79a635d86a6ef..a427d7cd40fcdd 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -1617,7 +1617,7 @@ the configuration (without a prefix: ``Auto``). 
**AllowAllParametersOfDeclarationOnNextLine** (``Boolean``) :versionbadge:`clang-format 3.3` :ref:`¶ ` If the function declaration doesn't fit on a line, allow putting all parameters of a function declaration onto - the next line even if ``BinPackParameters`` is ``false``. + the next line even if ``BinPackParameters`` is ``OnePerLine``. .. code-block:: c++ @@ -2067,20 +2067,41 @@ the configuration (without a prefix: ``Auto``). .. _BinPackParameters: -**BinPackParameters** (``Boolean``) :versionbadge:`clang-format 3.7` :ref:`¶ ` - If ``false``, a function declaration's or function definition's - parameters will either all be on the same line or will have one line each. +**BinPackParameters** (``BinPackParametersStyle``) :versionbadge:`clang-format 3.7` :ref:`¶ ` + The bin pack parameters style to use. - .. code-block:: c++ + Possible values: + + * ``BPPS_BinPack`` (in configuration: ``BinPack``) + Bin-pack parameters. + + .. code-block:: c++ + + void f(int a, int bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb, + int ccccccccccccccccccccccccccccccccccccccccccc); + + * ``BPPS_OnePerLine`` (in configuration: ``OnePerLine``) + Put all parameters on the current line if they fit. + Otherwise, put each one on its own line. + + .. code-block:: c++ + + void f(int a, int b, int c); + + void f(int a, + int b, + int ccccccccccccccccccccccccccccccccccccc); + + * ``BPPS_AlwaysOnePerLine`` (in configuration: ``AlwaysOnePerLine``) + Always put each parameter on its own line. + + .. code-block:: c++ + + void f(int a, + int b, + int c); - true: - void f(int aaaaaaaaaaaaaaaaaaaa, int aaaaaaaaaaaaaaaaaaaa, - int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa) {} - false: - void f(int aaaaaaaaaaaaaaaaaaaa, - int aaaaaaaaaaaaaaaaaaaa, - int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa) {} .. _BitFieldColonSpacing: @@ -4817,7 +4838,7 @@ the configuration (without a prefix: ``Auto``). items into as few lines as possible when they go over ``ColumnLimit``. 
If ``Auto`` (the default), delegates to the value in - ``BinPackParameters``. If that is ``true``, bin-packs Objective-C + ``BinPackParameters``. If that is ``BinPack``, bin-packs Objective-C protocol conformance list items into as few lines as possible whenever they go over ``ColumnLimit``. @@ -4831,13 +4852,13 @@ the configuration (without a prefix: ``Auto``). .. code-block:: objc - Always (or Auto, if BinPackParameters=true): + Always (or Auto, if BinPackParameters==BinPack): @interface ccccccccccccc () < ccccccccccccc, ccccccccccccc, ccccccccccccc, ccccccccccccc> { } - Never (or Auto, if BinPackParameters=false): + Never (or Auto, if BinPackParameters!=BinPack): @interface ddddddddddddd () < ddddddddddddd, ddddddddddddd, diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index 2af1d4065c3cc1..d8b62c7652a0f6 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -659,7 +659,7 @@ struct FormatStyle { /// If the function declaration doesn't fit on a line, /// allow putting all parameters of a function declaration onto - /// the next line even if ``BinPackParameters`` is ``false``. + /// the next line even if ``BinPackParameters`` is ``OnePerLine``. /// \code /// true: /// void myFunction( @@ -1192,20 +1192,36 @@ struct FormatStyle { /// \version 3.7 bool BinPackArguments; - /// If ``false``, a function declaration's or function definition's - /// parameters will either all be on the same line or will have one line each. - /// \code - /// true: - /// void f(int aaaaaaaaaaaaaaaaaaaa, int aaaaaaaaaaaaaaaaaaaa, - /// int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa) {} - /// - /// false: - /// void f(int aaaaaaaaaaaaaaaaaaaa, - /// int aaaaaaaaaaaaaaaaaaaa, - /// int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa) {} - /// \endcode + /// Different way to try to fit all parameters on a line. + enum BinPackParametersStyle : int8_t { + /// Bin-pack parameters. 
+ /// \code + /// void f(int a, int bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb, + /// int ccccccccccccccccccccccccccccccccccccccccccc); + /// \endcode + BPPS_BinPack, + /// Put all parameters on the current line if they fit. + /// Otherwise, put each one on its own line. + /// \code + /// void f(int a, int b, int c); + /// + /// void f(int a, + /// int b, + /// int ccccccccccccccccccccccccccccccccccccc); + /// \endcode + BPPS_OnePerLine, + /// Always put each parameter on its own line. + /// \code + /// void f(int a, + /// int b, + /// int c); + /// \endcode + BPPS_AlwaysOnePerLine, + }; + + /// The bin pack parameters style to use. /// \version 3.7 - bool BinPackParameters; + BinPackParametersStyle BinPackParameters; /// Styles for adding spacing around ``:`` in bitfield definitions. enum BitFieldColonSpacingStyle : int8_t { @@ -3414,7 +3430,7 @@ struct FormatStyle { /// items into as few lines as possible when they go over ``ColumnLimit``. /// /// If ``Auto`` (the default), delegates to the value in - /// ``BinPackParameters``. If that is ``true``, bin-packs Objective-C + /// ``BinPackParameters``. If that is ``BinPack``, bin-packs Objective-C /// protocol conformance list items into as few lines as possible /// whenever they go over ``ColumnLimit``. /// @@ -3426,13 +3442,13 @@ struct FormatStyle { /// onto individual lines whenever they go over ``ColumnLimit``. 
/// /// \code{.objc} - /// Always (or Auto, if BinPackParameters=true): + /// Always (or Auto, if BinPackParameters==BinPack): /// @interface ccccccccccccc () < /// ccccccccccccc, ccccccccccccc, /// ccccccccccccc, ccccccccccccc> { /// } /// - /// Never (or Auto, if BinPackParameters=false): + /// Never (or Auto, if BinPackParameters!=BinPack): /// @interface ddddddddddddd () < /// ddddddddddddd, /// ddddddddddddd, diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 46dafad65863dc..4fcb776db45b58 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -128,25 +128,6 @@ static bool startsSegmentOfBuilderTypeCall(const FormatToken &Tok) { return Tok.isMemberAccess() && Tok.Previous && Tok.Previous->closesScope(); } -// Returns \c true if \c Current starts a new parameter. -static bool startsNextParameter(const FormatToken &Current, - const FormatStyle &Style) { - assert(Current.Previous); - const auto &Previous = *Current.Previous; - if (Current.is(TT_CtorInitializerComma) && - Style.BreakConstructorInitializers == FormatStyle::BCIS_BeforeComma) { - return true; - } - if (Style.Language == FormatStyle::LK_Proto && Current.is(TT_SelectorName)) - return true; - return Previous.is(tok::comma) && !Current.isTrailingComment() && - ((Previous.isNot(TT_CtorInitializerComma) || - Style.BreakConstructorInitializers != - FormatStyle::BCIS_BeforeComma) && - (Previous.isNot(TT_InheritanceComma) || - Style.BreakInheritanceList != FormatStyle::BILS_BeforeComma)); -} - // Returns \c true if \c Token in an alignable binary operator static bool isAlignableBinaryOperator(const FormatToken &Token) { // No need to align binary operators that only have two operands. 
@@ -437,7 +418,8 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { // sets BreakBeforeParameter to avoid bin packing and this creates a // completely unnecessary line break after a template type that isn't // line-wrapped. - (Previous.NestingLevel == 1 || Style.BinPackParameters)) || + (Previous.NestingLevel == 1 || + Style.BinPackParameters == FormatStyle::BPPS_BinPack)) || (Style.BreakBeforeTernaryOperators && Current.is(TT_ConditionalExpr) && Previous.isNot(tok::question)) || (!Style.BreakBeforeTernaryOperators && @@ -1951,11 +1933,12 @@ void ContinuationIndenter::moveStatePastScopeOpener(LineState &State, // for backwards compatibility. bool ObjCBinPackProtocolList = (Style.ObjCBinPackProtocolList == FormatStyle::BPS_Auto && - Style.BinPackParameters) || + Style.BinPackParameters == FormatStyle::BPPS_BinPack) || Style.ObjCBinPackProtocolList == FormatStyle::BPS_Always; bool BinPackDeclaration = - (State.Line->Type != LT_ObjCDecl && Style.BinPackParameters) || + (State.Line->Type != LT_ObjCDecl && + Style.BinPackParameters == FormatStyle::BPPS_BinPack) || (State.Line->Type == LT_ObjCDecl && ObjCBinPackProtocolList); bool GenericSelection = diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 97fac41cdd3008..d2463b892fbb96 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -134,6 +134,19 @@ template <> struct ScalarEnumerationTraits { } }; +template <> +struct ScalarEnumerationTraits { + static void enumeration(IO &IO, FormatStyle::BinPackParametersStyle &Value) { + IO.enumCase(Value, "BinPack", FormatStyle::BPPS_BinPack); + IO.enumCase(Value, "OnePerLine", FormatStyle::BPPS_OnePerLine); + IO.enumCase(Value, "AlwaysOnePerLine", FormatStyle::BPPS_AlwaysOnePerLine); + + // For backward compatibility. 
+ IO.enumCase(Value, "true", FormatStyle::BPPS_BinPack); + IO.enumCase(Value, "false", FormatStyle::BPPS_OnePerLine); + } +}; + template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, FormatStyle::BinPackStyle &Value) { IO.enumCase(Value, "Auto", FormatStyle::BPS_Auto); @@ -1461,7 +1474,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.AlwaysBreakBeforeMultilineStrings = false; LLVMStyle.AttributeMacros.push_back("__capability"); LLVMStyle.BinPackArguments = true; - LLVMStyle.BinPackParameters = true; + LLVMStyle.BinPackParameters = FormatStyle::BPPS_BinPack; LLVMStyle.BitFieldColonSpacing = FormatStyle::BFCS_Both; LLVMStyle.BracedInitializerIndentWidth = std::nullopt; LLVMStyle.BraceWrapping = {/*AfterCaseLabel=*/false, @@ -1836,7 +1849,7 @@ FormatStyle getChromiumStyle(FormatStyle::LanguageKind Language) { ChromiumStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Inline; ChromiumStyle.AllowShortIfStatementsOnASingleLine = FormatStyle::SIS_Never; ChromiumStyle.AllowShortLoopsOnASingleLine = false; - ChromiumStyle.BinPackParameters = false; + ChromiumStyle.BinPackParameters = FormatStyle::BPPS_OnePerLine; ChromiumStyle.DerivePointerAlignment = false; if (Language == FormatStyle::LK_ObjC) ChromiumStyle.ColumnLimit = 80; @@ -1851,7 +1864,7 @@ FormatStyle getMozillaStyle() { MozillaStyle.AlwaysBreakAfterDefinitionReturnType = FormatStyle::DRTBS_TopLevel; MozillaStyle.BinPackArguments = false; - MozillaStyle.BinPackParameters = false; + MozillaStyle.BinPackParameters = FormatStyle::BPPS_OnePerLine; MozillaStyle.BreakAfterReturnType = FormatStyle::RTBS_TopLevel; MozillaStyle.BreakBeforeBraces = FormatStyle::BS_Mozilla; MozillaStyle.BreakConstructorInitializers = FormatStyle::BCIS_BeforeComma; diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp index 85bec71ffbbc8a..963e8f87793fa0 100644 --- a/clang/lib/Format/FormatToken.cpp +++ b/clang/lib/Format/FormatToken.cpp @@ -322,5 +322,22 @@ 
CommaSeparatedList::getColumnFormat(unsigned RemainingCharacters) const { return BestFormat; } +bool startsNextParameter(const FormatToken &Current, const FormatStyle &Style) { + assert(Current.Previous); + const auto &Previous = *Current.Previous; + if (Current.is(TT_CtorInitializerComma) && + Style.BreakConstructorInitializers == FormatStyle::BCIS_BeforeComma) { + return true; + } + if (Style.Language == FormatStyle::LK_Proto && Current.is(TT_SelectorName)) + return true; + return Previous.is(tok::comma) && !Current.isTrailingComment() && + ((Previous.isNot(TT_CtorInitializerComma) || + Style.BreakConstructorInitializers != + FormatStyle::BCIS_BeforeComma) && + (Previous.isNot(TT_InheritanceComma) || + Style.BreakInheritanceList != FormatStyle::BILS_BeforeComma)); +} + } // namespace format } // namespace clang diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index abcedb66b57cc6..2d386d99aa841a 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -1978,6 +1978,9 @@ inline bool continuesLineComment(const FormatToken &FormatTok, FormatTok.OriginalColumn >= MinContinueColumn; } +// Returns \c true if \c Current starts a new parameter. +bool startsNextParameter(const FormatToken &Current, const FormatStyle &Style); + } // namespace format } // namespace clang diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 0d5741ed76f7cb..f8bf8d9570d9a8 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -5479,6 +5479,14 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, return true; } + // Ignores the first parameter as this will be handled separately by + // BreakFunctionDefinitionParameters or AlignAfterOpenBracket. 
+ if (Style.BinPackParameters == FormatStyle::BPPS_AlwaysOnePerLine && + Line.MightBeFunctionDecl && !Left.opensScope() && + startsNextParameter(Right, Style)) { + return true; + } + const auto *BeforeLeft = Left.Previous; const auto *AfterRight = Right.Next; diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 2ee0df99353ff5..b8bdfaaa74e10e 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -160,7 +160,6 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { CHECK_PARSE_BOOL(AllowShortEnumsOnASingleLine); CHECK_PARSE_BOOL(AllowShortLoopsOnASingleLine); CHECK_PARSE_BOOL(BinPackArguments); - CHECK_PARSE_BOOL(BinPackParameters); CHECK_PARSE_BOOL(BreakAdjacentStringLiterals); CHECK_PARSE_BOOL(BreakAfterJavaFieldAnnotations); CHECK_PARSE_BOOL(BreakBeforeTernaryOperators); @@ -436,6 +435,19 @@ TEST(ConfigParseTest, ParsesConfiguration) { CHECK_PARSE("BreakBeforeInheritanceComma: true", BreakInheritanceList, FormatStyle::BILS_BeforeComma); + Style.BinPackParameters = FormatStyle::BPPS_OnePerLine; + CHECK_PARSE("BinPackParameters: BinPack", BinPackParameters, + FormatStyle::BPPS_BinPack); + CHECK_PARSE("BinPackParameters: OnePerLine", BinPackParameters, + FormatStyle::BPPS_OnePerLine); + CHECK_PARSE("BinPackParameters: AlwaysOnePerLine", BinPackParameters, + FormatStyle::BPPS_AlwaysOnePerLine); + // For backward compatibility. 
+ CHECK_PARSE("BinPackParameters: true", BinPackParameters, + FormatStyle::BPPS_BinPack); + CHECK_PARSE("BinPackParameters: false", BinPackParameters, + FormatStyle::BPPS_OnePerLine); + Style.PackConstructorInitializers = FormatStyle::PCIS_BinPack; CHECK_PARSE("PackConstructorInitializers: Never", PackConstructorInitializers, FormatStyle::PCIS_Never); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index e895f16465491a..779109976a4f77 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -2338,7 +2338,7 @@ TEST_F(FormatTest, FormatsForLoop) { "for (const Foo &baz = in.value(); !baz.at_end(); ++baz) {\n}"); FormatStyle NoBinPacking = getLLVMStyle(); - NoBinPacking.BinPackParameters = false; + NoBinPacking.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat("for (int aaaaaaaaaaa = 1;\n" " aaaaaaaaaaa <= aaaaaaaaaaaaaaaaaaaaaa(aaaaaaaaaaaaaaaa,\n" " aaaaaaaaaaaaaaaa,\n" @@ -7165,7 +7165,7 @@ TEST_F(FormatTest, LineBreakingInBinaryExpressions) { "}"); FormatStyle OnePerLine = getLLVMStyle(); - OnePerLine.BinPackParameters = false; + OnePerLine.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat( "if (aaaaaaaaaaaaaaaaaaaaaaaaaaaa || aaaaaaaaaaaaaaaaaaaaaaaaaaaa ||\n" " aaaaaaaaaaaaaaaaaaaaaaaaaaaa || aaaaaaaaaaaaaaaaaaaaaaaaaaaa ||\n" @@ -7319,7 +7319,7 @@ TEST_F(FormatTest, ExpressionIndentationBreakingBeforeOperators) { Style = getLLVMStyleWithColumns(20); Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; - Style.BinPackParameters = false; + Style.BinPackParameters = FormatStyle::BPPS_OnePerLine; Style.BreakBeforeBinaryOperators = FormatStyle::BOS_NonAssignment; Style.ContinuationIndentWidth = 2; verifyFormat("struct Foo {\n" @@ -7694,7 +7694,7 @@ TEST_F(FormatTest, ConstructorInitializers) { " : aaaaa(aaaaaaaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaaaaaaa,\n" " aaaaaaaaaaaaaaaaaaaaaa) {}", OnePerLine); - OnePerLine.BinPackParameters = false; + 
OnePerLine.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat( "Constructor()\n" " : aaaaaaaaaaaaaaaaaaaaaaaa(\n" @@ -7718,7 +7718,7 @@ TEST_F(FormatTest, ConstructorInitializers) { TEST_F(FormatTest, AllowAllConstructorInitializersOnNextLine) { FormatStyle Style = getLLVMStyleWithColumns(60); Style.BreakConstructorInitializers = FormatStyle::BCIS_BeforeComma; - Style.BinPackParameters = false; + Style.BinPackParameters = FormatStyle::BPPS_OnePerLine; for (int i = 0; i < 4; ++i) { // Test all combinations of parameters that should not have an effect. @@ -7954,7 +7954,7 @@ TEST_F(FormatTest, AllowAllArgumentsOnNextLine) { } // This parameter should not affect declarations. - Style.BinPackParameters = false; + Style.BinPackParameters = FormatStyle::BPPS_OnePerLine; Style.AllowAllArgumentsOnNextLine = false; Style.AllowAllParametersOfDeclarationOnNextLine = true; verifyFormat("void FunctionCallWithReallyLongName(\n" @@ -8049,7 +8049,7 @@ TEST_F(FormatTest, BreakFunctionDefinitionParameters) { // Test the style where all parameters are on their own lines. Style.AllowAllParametersOfDeclarationOnNextLine = false; - Style.BinPackParameters = false; + Style.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat("void functionDecl(paramA, paramB, paramC);\n" "void emptyFunctionDefinition() {}\n" "void functionDefinition(\n" @@ -8244,7 +8244,7 @@ TEST_F(FormatTest, BreakConstructorInitializersAfterColon) { " aaaaa(aaaaaaaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaaaaaaa,\n" " aaaaaaaaaaaaaaaaaaaaaa) {}", OnePerLine); - OnePerLine.BinPackParameters = false; + OnePerLine.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat("Constructor() :\n" " aaaaaaaaaaaaaaaaaaaaaaaa(\n" " aaaaaaaaaaa().aaa(),\n" @@ -8409,7 +8409,7 @@ TEST_F(FormatTest, MemoizationTests) { // This test takes VERY long when memoization is broken. 
FormatStyle OnePerLine = getLLVMStyle(); OnePerLine.PackConstructorInitializers = FormatStyle::PCIS_NextLine; - OnePerLine.BinPackParameters = false; + OnePerLine.BinPackParameters = FormatStyle::BPPS_OnePerLine; std::string input = "Constructor()\n" " : aaaa(a,\n"; for (unsigned i = 0, e = 80; i != e; ++i) @@ -8830,7 +8830,7 @@ TEST_F(FormatTest, BreaksDesireably) { TEST_F(FormatTest, FormatsDeclarationsOnePerLine) { FormatStyle NoBinPacking = getGoogleStyle(); - NoBinPacking.BinPackParameters = false; + NoBinPacking.BinPackParameters = FormatStyle::BPPS_OnePerLine; NoBinPacking.BinPackArguments = true; verifyFormat("void f() {\n" " f(aaaaaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaaaaaa,\n" @@ -8862,7 +8862,7 @@ TEST_F(FormatTest, FormatsDeclarationsOnePerLine) { TEST_F(FormatTest, FormatsOneParameterPerLineIfNecessary) { FormatStyle NoBinPacking = getGoogleStyle(); - NoBinPacking.BinPackParameters = false; + NoBinPacking.BinPackParameters = FormatStyle::BPPS_OnePerLine; NoBinPacking.BinPackArguments = false; verifyFormat("f(aaaaaaaaaaaaaaaaaaaa,\n" " aaaaaaaaaaaaaaaaaaaa,\n" @@ -8925,6 +8925,97 @@ TEST_F(FormatTest, FormatsOneParameterPerLineIfNecessary) { NoBinPacking); } +TEST_F(FormatTest, FormatsDeclarationBreakAlways) { + FormatStyle BreakAlways = getGoogleStyle(); + BreakAlways.BinPackParameters = FormatStyle::BPPS_AlwaysOnePerLine; + verifyFormat("void f(int a,\n" + " int b);", + BreakAlways); + verifyFormat("void f(int aaaaaaaaaaaaaaaaaaaaaaaaaa,\n" + " int bbbbbbbbbbbbbbbbbbbbbbbbb,\n" + " int cccccccccccccccccccccccc);", + BreakAlways); + + // Ensure AlignAfterOpenBracket interacts correctly with BinPackParameters set + // to BPPS_AlwaysOnePerLine. 
+ BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + verifyFormat( + "void someLongFunctionName(\n" + " int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n" + " int b);", + BreakAlways); + BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent; + verifyFormat( + "void someLongFunctionName(\n" + " int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n" + " int b\n" + ");", + BreakAlways); +} + +TEST_F(FormatTest, FormatsDefinitionBreakAlways) { + FormatStyle BreakAlways = getGoogleStyle(); + BreakAlways.BinPackParameters = FormatStyle::BPPS_AlwaysOnePerLine; + verifyFormat("void f(int a,\n" + " int b) {\n" + " f(a, b);\n" + "}", + BreakAlways); + + // Ensure BinPackArguments interact correctly when BinPackParameters is set to + // BPPS_AlwaysOnePerLine. + verifyFormat("void f(int aaaaaaaaaaaaaaaaaaaaaaaaaa,\n" + " int bbbbbbbbbbbbbbbbbbbbbbbbb,\n" + " int cccccccccccccccccccccccc) {\n" + " f(aaaaaaaaaaaaaaaaaaaaaaaaaa, bbbbbbbbbbbbbbbbbbbbbbbbb,\n" + " cccccccccccccccccccccccc);\n" + "}", + BreakAlways); + BreakAlways.BinPackArguments = false; + verifyFormat("void f(int aaaaaaaaaaaaaaaaaaaaaaaaaa,\n" + " int bbbbbbbbbbbbbbbbbbbbbbbbb,\n" + " int cccccccccccccccccccccccc) {\n" + " f(aaaaaaaaaaaaaaaaaaaaaaaaaa,\n" + " bbbbbbbbbbbbbbbbbbbbbbbbb,\n" + " cccccccccccccccccccccccc);\n" + "}", + BreakAlways); + + // Ensure BreakFunctionDefinitionParameters interacts correctly when + // BinPackParameters is set to BPPS_AlwaysOnePerLine. + BreakAlways.BreakFunctionDefinitionParameters = true; + verifyFormat("void f(\n" + " int a,\n" + " int b) {\n" + " f(a, b);\n" + "}", + BreakAlways); + BreakAlways.BreakFunctionDefinitionParameters = false; + + // Ensure AlignAfterOpenBracket interacts correctly with BinPackParameters set + // to BPPS_AlwaysOnePerLine. 
+ BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; + verifyFormat( + "void someLongFunctionName(\n" + " int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n" + " int b) {\n" + " someLongFunctionName(\n" + " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa, b);\n" + "}", + BreakAlways); + BreakAlways.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent; + verifyFormat( + "void someLongFunctionName(\n" + " int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,\n" + " int b\n" + ") {\n" + " someLongFunctionName(\n" + " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa, b\n" + " );\n" + "}", + BreakAlways); +} + TEST_F(FormatTest, AdaptiveOnePerLineFormatting) { FormatStyle Style = getLLVMStyleWithColumns(15); Style.ExperimentalAutoDetectBinPacking = true; @@ -9256,7 +9347,7 @@ TEST_F(FormatTest, AlignsAfterOpenBracket) { Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; Style.BinPackArguments = false; - Style.BinPackParameters = false; + Style.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n" " aaaaaaaaaaa aaaaaaaa,\n" " aaaaaaaaa aaaaaaa,\n" @@ -9295,7 +9386,7 @@ TEST_F(FormatTest, AlignsAfterOpenBracket) { Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent; Style.BinPackArguments = false; - Style.BinPackParameters = false; + Style.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n" " aaaaaaaaaaa aaaaaaaa,\n" " aaaaaaaaa aaaaaaa,\n" @@ -10706,7 +10797,7 @@ TEST_F(FormatTest, WrapsAtFunctionCallsIfNecessary) { " .a();"); FormatStyle NoBinPacking = getLLVMStyle(); - NoBinPacking.BinPackParameters = false; + NoBinPacking.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat("aaaaaaaaaaaaaaaaaaaaaa(aaaaaaaaaaaaaaaaaaaaa)\n" " .aaaaaaaaaaaaaaaaa(aaaaaaaaaaaaaaaaaaaaa)\n" " .aaaaaaaaaaaaaaaaaaa(aaaaaaaaaaaaaaaaaaa,\n" @@ -13618,7 +13709,7 @@ TEST_F(FormatTest, 
HandlesIncludeDirectives) { TEST_F(FormatTest, IncompleteParameterLists) { FormatStyle NoBinPacking = getLLVMStyle(); - NoBinPacking.BinPackParameters = false; + NoBinPacking.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat("void aaaaaaaaaaaaaaaaaa(int level,\n" " double *min_x,\n" " double *max_x,\n" @@ -14284,7 +14375,7 @@ TEST_F(FormatTest, FormatsBracedListsInColumnLayout) { " [](const Input &i) -> Output { return " "Output{1, 2}; });"); FormatStyle NoBinPacking = getLLVMStyle(); - NoBinPacking.BinPackParameters = false; + NoBinPacking.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat("waarudo::unit desk = {\n" " .s = \"desk\", .p = p, .b = [] { return w::r{3, 10, 1, 1, " "1, 1} * w::m; }};", @@ -19789,7 +19880,7 @@ TEST_F(FormatTest, AlignConsecutiveDeclarations) { Alignment.AlignConsecutiveAssignments.Enabled = false; Alignment.ColumnLimit = 30; - Alignment.BinPackParameters = false; + Alignment.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat("void foo(float a,\n" " float b,\n" " int c,\n" @@ -19803,7 +19894,7 @@ TEST_F(FormatTest, AlignConsecutiveDeclarations) { " uint32_t *c,\n" " bool d) {}", Alignment); - Alignment.BinPackParameters = true; + Alignment.BinPackParameters = FormatStyle::BPPS_BinPack; Alignment.ColumnLimit = 80; // Bug 33507 @@ -23229,7 +23320,7 @@ TEST_F(FormatTest, FormatsLambdas) { " LambdaBodyMustBeBreak);\n" "};", LLVMWithBeforeLambdaBody); - LLVMWithBeforeLambdaBody.BinPackParameters = false; + LLVMWithBeforeLambdaBody.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat("FctAllOnSameLine_SLS_All([]() { return S; }, Fst, Second);", LLVMWithBeforeLambdaBody); verifyFormat( @@ -26709,7 +26800,7 @@ TEST_F(FormatTest, AlignAfterOpenBracketBlockIndent) { Medium, Style); Style.BinPackArguments = false; - Style.BinPackParameters = false; + Style.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat(Short, Style); diff --git a/clang/unittests/Format/FormatTestComments.cpp 
b/clang/unittests/Format/FormatTestComments.cpp index 8f84d59cbb2e2c..4eea14282c3256 100644 --- a/clang/unittests/Format/FormatTestComments.cpp +++ b/clang/unittests/Format/FormatTestComments.cpp @@ -362,6 +362,19 @@ TEST_F(FormatTestComments, KeepsParameterWithTrailingCommentsOnTheirOwnLine) { format("aaaaaaaaaa(aaaa(aaaa,\n" "aaaa), //\n" "aaaa, bbbbb);")); + + FormatStyle BreakAlways = getLLVMStyle(); + BreakAlways.BinPackParameters = FormatStyle::BPPS_AlwaysOnePerLine; + verifyFormat("int SomeFunction(a,\n" + " b, // comment\n" + " c,\n" + " d);", + BreakAlways); + verifyFormat("int SomeFunction(a,\n" + " b,\n" + " // comment\n" + " c);", + BreakAlways); } TEST_F(FormatTestComments, RemovesTrailingWhitespaceOfComments) { @@ -403,13 +416,27 @@ TEST_F(FormatTestComments, UnderstandsBlockComments) { verifyFormat("f(/* aaaaaaaaaaaaaaaaaa = */\n" " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa);"); - FormatStyle NoBinPacking = getLLVMStyle(); - NoBinPacking.BinPackParameters = false; + verifyFormat( + "int aaaaaaaaaaaaa(/* 1st */ int bbbbbbbbbb, /* 2nd */ int ccccccccccc,\n" + " /* 3rd */ int dddddddddddd);"); + + auto Style = getLLVMStyle(); + Style.BinPackParameters = FormatStyle::BPPS_OnePerLine; verifyFormat("aaaaaaaa(/* parameter 1 */ aaaaaa,\n" " /* parameter 2 */ aaaaaa,\n" " /* parameter 3 */ aaaaaa,\n" " /* parameter 4 */ aaaaaa);", - NoBinPacking); + Style); + verifyFormat("int a(/* 1st */ int b, /* 2nd */ int c);", Style); + verifyFormat("int aaaaaaaaaaaaa(/* 1st */ int bbbbbbbbbb,\n" + " /* 2nd */ int ccccccccccc,\n" + " /* 3rd */ int dddddddddddd);", + Style); + + Style.BinPackParameters = FormatStyle::BPPS_AlwaysOnePerLine; + verifyFormat("int a(/* 1st */ int b,\n" + " /* 2nd */ int c);", + Style); // Aligning block comments in macros. 
verifyGoogleFormat("#define A \\\n" @@ -2449,7 +2476,7 @@ TEST_F(FormatTestComments, BlockComments) { getLLVMStyleWithColumns(50))); FormatStyle NoBinPacking = getLLVMStyle(); - NoBinPacking.BinPackParameters = false; + NoBinPacking.BinPackParameters = FormatStyle::BPPS_OnePerLine; EXPECT_EQ("someFunction(1, /* comment 1 */\n" " 2, /* comment 2 */\n" " 3, /* comment 3 */\n" diff --git a/clang/unittests/Format/FormatTestObjC.cpp b/clang/unittests/Format/FormatTestObjC.cpp index d2c3459e0f846d..9b6f0c396d4dbf 100644 --- a/clang/unittests/Format/FormatTestObjC.cpp +++ b/clang/unittests/Format/FormatTestObjC.cpp @@ -377,7 +377,7 @@ TEST_F(FormatTestObjC, FormatObjCInterface) { " ddddddddddddd> {\n" "}"); - Style.BinPackParameters = false; + Style.BinPackParameters = FormatStyle::BPPS_OnePerLine; Style.ObjCBinPackProtocolList = FormatStyle::BPS_Auto; verifyFormat("@interface eeeeeeeeeeeee () <\n" " eeeeeeeeeeeee,\n" @@ -411,7 +411,7 @@ TEST_F(FormatTestObjC, FormatObjCInterface) { "+ (id)init;\n" "@end"); Style.ColumnLimit = 40; - // BinPackParameters should be true by default. + // BinPackParameters should be BPPS_BinPack by default. verifyFormat("void eeeeeeee(int eeeee, int eeeee,\n" " int eeeee, int eeeee);"); // ObjCBinPackProtocolList should be BPS_Never by default. From 3563907969843cb5d97995fb02177ee578e33aa2 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 22 Aug 2024 21:56:01 -0700 Subject: [PATCH 288/426] [LTO] Turn ImportMapTy into a proper class (NFC) (#105748) This patch turns type alias ImportMapTy into a proper class to provide a more intuitive interface like: ImportList.addDefinition(...) as opposed to: FunctionImporter::addDefinition(ImportList, ...) Also, this patch requires all non-const accesses to go through addDefinition, maybeAddDeclaration, and addGUID while providing const accesses via: const ImportMapTyImpl &getImportMap() const { return ImportMap; } I realize ImportMapTy may not be the best name as a class (maybe OK as a type alias). 
I am not renaming ImportMapTy in this patch at least because there are 47 mentions of ImportMapTy under llvm/. --- .../llvm/Transforms/IPO/FunctionImport.h | 82 +++++++++++-------- llvm/lib/LTO/LTO.cpp | 9 +- llvm/lib/LTO/LTOBackend.cpp | 3 +- llvm/lib/Transforms/IPO/FunctionImport.cpp | 60 +++++++------- llvm/tools/llvm-link/llvm-link.cpp | 5 +- 5 files changed, 84 insertions(+), 75 deletions(-) diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h index 5ab8c6d130b60a..0c8380db74314f 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h @@ -96,13 +96,54 @@ class FunctionImporter { std::tuple>>; - /// The map contains an entry for every module to import from, the key being - /// the module identifier to pass to the ModuleLoader. The value is the set of - /// functions to import. The module identifier strings must be owned - /// elsewhere, typically by the in-memory ModuleSummaryIndex the importing - /// decisions are made from (the module path for each summary is owned by the - /// index's module path string table). - using ImportMapTy = DenseMap; + /// The map maintains the list of imports. Conceptually, it is a collection + /// of tuples of the form: + /// + /// (The name of the source module, GUID, Definition/Declaration) + /// + /// The name of the source module is the module identifier to pass to the + /// ModuleLoader. The module identifier strings must be owned elsewhere, + /// typically by the in-memory ModuleSummaryIndex the importing decisions are + /// made from (the module path for each summary is owned by the index's module + /// path string table). + class ImportMapTy { + public: + using ImportMapTyImpl = DenseMap; + + enum class AddDefinitionStatus { + // No change was made to the list of imports or whether each import should + // be imported as a declaration or definition. 
+ NoChange, + // Successfully added the given GUID to be imported as a definition. There + // was no existing entry with the same GUID as a declaration. + Inserted, + // An existing with the given GUID was changed to a definition. + ChangedToDefinition, + }; + + // Add the given GUID to ImportList as a definition. If the same GUID has + // been added as a declaration previously, that entry is overridden. + AddDefinitionStatus addDefinition(StringRef FromModule, + GlobalValue::GUID GUID); + + // Add the given GUID to ImportList as a declaration. If the same GUID has + // been added as a definition previously, that entry takes precedence, and + // no change is made. + void maybeAddDeclaration(StringRef FromModule, GlobalValue::GUID GUID); + + void addGUID(StringRef FromModule, GlobalValue::GUID GUID, + GlobalValueSummary::ImportKind ImportKind) { + if (ImportKind == GlobalValueSummary::Definition) + addDefinition(FromModule, GUID); + else + maybeAddDeclaration(FromModule, GUID); + } + + const ImportMapTyImpl &getImportMap() const { return ImportMap; } + + private: + ImportMapTyImpl ImportMap; + }; /// The set contains an entry for every global value that the module exports. /// Depending on the user context, this container is allowed to contain @@ -122,33 +163,6 @@ class FunctionImporter { /// Import functions in Module \p M based on the supplied import list. Expected importFunctions(Module &M, const ImportMapTy &ImportList); - enum class AddDefinitionStatus { - NoChange, - Inserted, - ChangedToDefinition, - }; - - // Add the given GUID to ImportList as a definition. If the same GUID has - // been added as a declaration previously, that entry is overridden. - static AddDefinitionStatus addDefinition(ImportMapTy &ImportList, - StringRef FromModule, - GlobalValue::GUID GUID); - - // Add the given GUID to ImportList as a declaration. If the same GUID has - // been added as a definition previously, that entry takes precedence, and no - // change is made. 
- static void maybeAddDeclaration(ImportMapTy &ImportList, StringRef FromModule, - GlobalValue::GUID GUID); - - static void addGUID(ImportMapTy &ImportList, StringRef FromModule, - GlobalValue::GUID GUID, - GlobalValueSummary::ImportKind ImportKind) { - if (ImportKind == GlobalValueSummary::Definition) - addDefinition(ImportList, FromModule, GUID); - else - maybeAddDeclaration(ImportList, FromModule, GUID); - } - private: /// The summaries index used to trigger importing. const ModuleSummaryIndex &Index; diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index e5545860c329d4..ee0193344d5ac9 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -177,7 +177,8 @@ std::string llvm::computeLTOCacheKey( // Include the hash for every module we import functions from. The set of // imported symbols for each module may affect code generation and is // sensitive to link order, so include that as well. - using ImportMapIteratorTy = FunctionImporter::ImportMapTy::const_iterator; + using ImportMapIteratorTy = + FunctionImporter::ImportMapTy::ImportMapTyImpl::const_iterator; struct ImportModule { ImportMapIteratorTy ModIt; const ModuleSummaryIndex::ModuleInfo *ModInfo; @@ -191,10 +192,10 @@ std::string llvm::computeLTOCacheKey( }; std::vector ImportModulesVector; - ImportModulesVector.reserve(ImportList.size()); + ImportModulesVector.reserve(ImportList.getImportMap().size()); - for (ImportMapIteratorTy It = ImportList.begin(); It != ImportList.end(); - ++It) { + for (ImportMapIteratorTy It = ImportList.getImportMap().begin(); + It != ImportList.getImportMap().end(); ++It) { ImportModulesVector.push_back({It, Index.getModule(It->getFirst())}); } // Order using module hash, to be both independent of module name and diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index ae46d946ae06a6..4e58cd369c3ac9 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -726,8 +726,7 @@ bool lto::initImportList(const Module &M, if 
(Summary->modulePath() == M.getModuleIdentifier()) continue; // Add an entry to provoke importing by thinBackend. - FunctionImporter::addGUID(ImportList, Summary->modulePath(), GUID, - Summary->importType()); + ImportList.addGUID(Summary->modulePath(), GUID, Summary->importType()); } } return true; diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 55803670071d16..b26c15b665b590 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -334,11 +334,11 @@ using EdgeInfo = std::tuple; } // anonymous namespace -FunctionImporter::AddDefinitionStatus -FunctionImporter::addDefinition(ImportMapTy &ImportList, StringRef FromModule, - GlobalValue::GUID GUID) { +FunctionImporter::ImportMapTy::AddDefinitionStatus +FunctionImporter::ImportMapTy::addDefinition(StringRef FromModule, + GlobalValue::GUID GUID) { auto [It, Inserted] = - ImportList[FromModule].try_emplace(GUID, GlobalValueSummary::Definition); + ImportMap[FromModule].try_emplace(GUID, GlobalValueSummary::Definition); if (Inserted) return AddDefinitionStatus::Inserted; if (It->second == GlobalValueSummary::Definition) @@ -347,10 +347,9 @@ FunctionImporter::addDefinition(ImportMapTy &ImportList, StringRef FromModule, return AddDefinitionStatus::ChangedToDefinition; } -void FunctionImporter::maybeAddDeclaration(ImportMapTy &ImportList, - StringRef FromModule, - GlobalValue::GUID GUID) { - ImportList[FromModule].try_emplace(GUID, GlobalValueSummary::Declaration); +void FunctionImporter::ImportMapTy::maybeAddDeclaration( + StringRef FromModule, GlobalValue::GUID GUID) { + ImportMap[FromModule].try_emplace(GUID, GlobalValueSummary::Declaration); } /// Import globals referenced by a function or other globals that are being @@ -411,9 +410,8 @@ class GlobalsImporter final { // If there isn't an entry for GUID, insert pair. // Otherwise, definition should take precedence over declaration. 
- if (FunctionImporter::addDefinition( - ImportList, RefSummary->modulePath(), VI.getGUID()) != - FunctionImporter::AddDefinitionStatus::Inserted) + if (ImportList.addDefinition(RefSummary->modulePath(), VI.getGUID()) != + FunctionImporter::ImportMapTy::AddDefinitionStatus::Inserted) break; // Only update stat and exports if we haven't already imported this @@ -600,8 +598,7 @@ class WorkloadImportsManager : public ModuleImportsManager { LLVM_DEBUG(dbgs() << "[Workload][Including]" << VI.name() << " from " << ExportingModule << " : " << Function::getGUID(VI.name()) << "\n"); - FunctionImporter::addDefinition(ImportList, ExportingModule, - VI.getGUID()); + ImportList.addDefinition(ExportingModule, VI.getGUID()); GVI.onImportingSummary(*GVS); if (ExportLists) (*ExportLists)[ExportingModule].insert(VI); @@ -899,8 +896,7 @@ static void computeImportForFunction( // Note `ExportLists` only keeps track of exports due to imported // definitions. - FunctionImporter::maybeAddDeclaration(ImportList, DeclSourceModule, - VI.getGUID()); + ImportList.maybeAddDeclaration(DeclSourceModule, VI.getGUID()); } // Update with new larger threshold if this was a retry (otherwise // we would have already inserted with NewThreshold above). Also @@ -949,9 +945,8 @@ static void computeImportForFunction( // Try emplace the definition entry, and update stats based on insertion // status. - if (FunctionImporter::addDefinition(ImportList, ExportModulePath, - VI.getGUID()) != - FunctionImporter::AddDefinitionStatus::NoChange) { + if (ImportList.addDefinition(ExportModulePath, VI.getGUID()) != + FunctionImporter::ImportMapTy::AddDefinitionStatus::NoChange) { NumImportedFunctionsThinLink++; if (IsHotCallsite) NumImportedHotFunctionsThinLink++; @@ -1084,7 +1079,7 @@ numGlobalVarSummaries(const ModuleSummaryIndex &Index, // the number of defined function summaries as output parameter. 
static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index, - FunctionImporter::FunctionsToImportTy &ImportMap, + const FunctionImporter::FunctionsToImportTy &ImportMap, unsigned &DefinedFS) { unsigned NumGVS = 0; DefinedFS = 0; @@ -1105,9 +1100,9 @@ static bool checkVariableImport( DenseMap &ExportLists) { DenseSet FlattenedImports; - for (auto &ImportPerModule : ImportLists) - for (auto &ExportPerModule : ImportPerModule.second) - for (auto &[GUID, Type] : ExportPerModule.second) + for (const auto &ImportPerModule : ImportLists) + for (const auto &ExportPerModule : ImportPerModule.second.getImportMap()) + for (const auto &[GUID, Type] : ExportPerModule.second) FlattenedImports.insert(GUID); // Checks that all GUIDs of read/writeonly vars we see in export lists @@ -1217,9 +1212,10 @@ void llvm::ComputeCrossModuleImport( unsigned NumGVS = numGlobalVarSummaries(Index, Exports); LLVM_DEBUG(dbgs() << "* Module " << ModName << " exports " << Exports.size() - NumGVS << " functions and " << NumGVS - << " vars. Imports from " << ModuleImports.second.size() + << " vars. 
Imports from " + << ModuleImports.second.getImportMap().size() << " modules.\n"); - for (auto &Src : ModuleImports.second) { + for (const auto &Src : ModuleImports.second.getImportMap()) { auto SrcModName = Src.first; unsigned DefinedFS = 0; unsigned NumGVSPerMod = @@ -1240,8 +1236,8 @@ static void dumpImportListForModule(const ModuleSummaryIndex &Index, StringRef ModulePath, FunctionImporter::ImportMapTy &ImportList) { LLVM_DEBUG(dbgs() << "* Module " << ModulePath << " imports from " - << ImportList.size() << " modules.\n"); - for (auto &Src : ImportList) { + << ImportList.getImportMap().size() << " modules.\n"); + for (const auto &Src : ImportList.getImportMap()) { auto SrcModName = Src.first; unsigned DefinedFS = 0; unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second, DefinedFS); @@ -1306,8 +1302,7 @@ static void ComputeCrossModuleImportForModuleFromIndexForTest( if (Summary->modulePath() == ModulePath) continue; // Add an entry to provoke importing by thinBackend. - FunctionImporter::addGUID(ImportList, Summary->modulePath(), GUID, - Summary->importType()); + ImportList.addGUID(Summary->modulePath(), GUID, Summary->importType()); } #ifndef NDEBUG dumpImportListForModule(Index, ModulePath, ImportList); @@ -1496,7 +1491,7 @@ void llvm::gatherImportedSummariesForModule( ModuleToSummariesForIndex[std::string(ModulePath)] = ModuleToDefinedGVSummaries.lookup(ModulePath); // Include summaries for imports. 
- for (const auto &ILI : ImportList) { + for (const auto &ILI : ImportList.getImportMap()) { auto &SummariesForIndex = ModuleToSummariesForIndex[std::string(ILI.first)]; const auto &DefinedGVSummaries = @@ -1777,7 +1772,7 @@ Expected FunctionImporter::importFunctions( IRMover Mover(DestModule); // Do the actual import of functions now, one Module at a time std::set ModuleNameOrderedList; - for (const auto &FunctionsToImportPerModule : ImportList) { + for (const auto &FunctionsToImportPerModule : ImportList.getImportMap()) { ModuleNameOrderedList.insert(FunctionsToImportPerModule.first); } @@ -1792,8 +1787,9 @@ Expected FunctionImporter::importFunctions( for (const auto &Name : ModuleNameOrderedList) { // Get the module for the import - const auto &FunctionsToImportPerModule = ImportList.find(Name); - assert(FunctionsToImportPerModule != ImportList.end()); + const auto &FunctionsToImportPerModule = + ImportList.getImportMap().find(Name); + assert(FunctionsToImportPerModule != ImportList.getImportMap().end()); Expected> SrcModuleOrErr = ModuleLoader(Name); if (!SrcModuleOrErr) return SrcModuleOrErr.takeError(); diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp index ef6f85d38fede6..54d38a6ddd6f10 100644 --- a/llvm/tools/llvm-link/llvm-link.cpp +++ b/llvm/tools/llvm-link/llvm-link.cpp @@ -381,9 +381,8 @@ static bool importFunctions(const char *argv0, Module &DestModule) { // definition, so make the import type definition directly. // FIXME: A follow-up patch should add test coverage for import declaration // in `llvm-link` CLI (e.g., by introducing a new command line option). 
- FunctionImporter::addDefinition( - ImportList, FileNameStringCache.insert(FileName).first->getKey(), - F->getGUID()); + ImportList.addDefinition( + FileNameStringCache.insert(FileName).first->getKey(), F->getGUID()); } auto CachedModuleLoader = [&](StringRef Identifier) { return ModuleLoaderCache.takeModule(std::string(Identifier)); From 96b3166602cbe3dc1240bc3189cf1581273928a2 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 22 Aug 2024 22:21:20 -0700 Subject: [PATCH 289/426] Revert "[SLP]Improve/fix subvectors in gather/buildvector nodes handling" (#105780) with "[Vectorize] Fix warnings" It introduced compiler crashes, see #104144. This reverts commit 69332bb8995aef60d830406de12cb79a50390261 and 351f4a5593f1ef507708ec5eeca165b20add3340. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 328 +++++++++------- .../PhaseOrdering/AArch64/slpordering.ll | 74 ++-- .../SLPVectorizer/AArch64/getelementptr.ll | 11 +- .../SLPVectorizer/AArch64/loadorder.ll | 192 ++++----- .../AArch64/multiple_reduction.ll | 365 +++++++++++------- .../AArch64/scalarization-overhead.ll | 62 +-- .../AArch64/shuffle-vectors-mask-size.ll | 7 +- .../SLPVectorizer/AArch64/tsc-s116.ll | 8 +- .../vectorizable-selects-uniform-cmps.ll | 32 +- .../RISCV/combined-loads-stored.ll | 7 +- .../SLPVectorizer/RISCV/reductions.ll | 48 +-- .../SLPVectorizer/SystemZ/pr34619.ll | 11 +- .../Transforms/SLPVectorizer/X86/addsub.ll | 18 +- .../X86/extract-many-users-buildvector.ll | 43 ++- .../X86/extract-scalar-from-undef.ll | 27 +- .../X86/gather-node-same-as-vect-but-order.ll | 13 +- .../SLPVectorizer/X86/horizontal-minmax.ll | 16 +- .../SLPVectorizer/X86/inst_size_bug.ll | 18 +- .../SLPVectorizer/X86/landing_pad.ll | 19 +- llvm/test/Transforms/SLPVectorizer/X86/phi.ll | 36 +- .../SLPVectorizer/X86/reduction-logical.ll | 17 +- .../X86/remark-partial-loads-vectorize.ll | 16 +- .../X86/scatter-vectorize-reused-pointer.ll | 26 +- .../X86/schedule_budget_debug_info.ll | 40 +- 
.../SLPVectorizer/X86/split-load8_2-unord.ll | 39 +- .../Transforms/SLPVectorizer/X86/tiny-tree.ll | 5 +- .../X86/vect-gather-same-nodes.ll | 6 +- 27 files changed, 785 insertions(+), 699 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e8ab6839d9fa87..d7763a022f3b6e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3094,10 +3094,6 @@ class BoUpSLP { /// The index of this treeEntry in VectorizableTree. int Idx = -1; - /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from - /// other nodes as a series of insertvector instructions. - SmallVector, 0> CombinedEntriesWithIndices; - private: /// The operands of each instruction in each lane Operands[op_index][lane]. /// Note: This helps avoid the replication of the code that performs the @@ -3398,9 +3394,7 @@ class BoUpSLP { if (!isConstant(V)) { auto *I = dyn_cast(V); AllConstsOrCasts &= I && I->getType()->isIntegerTy(); - if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE || - !UserTreeIdx.UserTE->isGather()) - ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); + ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); } if (AllConstsOrCasts) CastMaxMinBWSizes = @@ -8355,49 +8349,8 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef Ptrs, void BoUpSLP::transformNodes() { constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - // The tree may grow here, so iterate over nodes, built before. 
- for (unsigned Idx : seq(VectorizableTree.size())) { - TreeEntry &E = *VectorizableTree[Idx]; - if (E.isGather()) { - ArrayRef VL = E.Scalars; - const unsigned Sz = getVectorElementSize(VL.front()); - unsigned MinVF = getMinVF(2 * Sz); - if (VL.size() <= 2 || - (E.getOpcode() && - (E.isAltShuffle() || E.getOpcode() != Instruction::Load))) - continue; - // Try to find vectorizable sequences and transform them into a series of - // insertvector instructions. - unsigned StartIdx = 0; - unsigned End = VL.size(); - for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) { - for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) { - ArrayRef Slice = VL.slice(Cnt, VF); - InstructionsState S = getSameOpcode(Slice, *TLI); - if (!S.getOpcode() || S.isAltShuffle() || - (S.getOpcode() != Instruction::Load && - any_of(Slice, [&](Value *V) { - return !areAllUsersVectorized(cast(V), - UserIgnoreList); - }))) - continue; - if (!getTreeEntry(Slice.front()) && !getTreeEntry(Slice.back())) { - unsigned PrevSize = VectorizableTree.size(); - buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX)); - if (PrevSize + 1 == VectorizableTree.size() && - VectorizableTree[PrevSize]->isGather()) { - VectorizableTree.pop_back(); - continue; - } - E.CombinedEntriesWithIndices.emplace_back(PrevSize, Cnt); - if (StartIdx == Cnt) - StartIdx = Cnt + VF; - if (End == Cnt + VF) - End = Cnt; - } - } - } - } + for (std::unique_ptr &TE : VectorizableTree) { + TreeEntry &E = *TE; switch (E.getOpcode()) { case Instruction::Load: { // No need to reorder masked gather loads, just reorder the scalar @@ -8520,7 +8473,175 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { auto *VecTy = getWidenedType(ScalarTy, VL.size()); InstructionCost GatherCost = 0; SmallVector Gathers(VL); - if (!Root && isSplat(VL)) { + // Improve gather cost for gather of loads, if we can group some of the + // loads into vector loads. 
+ InstructionsState S = getSameOpcode(VL, *R.TLI); + const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy); + unsigned MinVF = R.getMinVF(2 * Sz); + if (VL.size() > 2 && + ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) || + (InVectors.empty() && + any_of(seq(0, VL.size() / MinVF), + [&](unsigned Idx) { + ArrayRef SubVL = VL.slice(Idx * MinVF, MinVF); + InstructionsState S = getSameOpcode(SubVL, *R.TLI); + return S.getOpcode() == Instruction::Load && + !S.isAltShuffle(); + }))) && + !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) && + !isSplat(Gathers)) { + InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy); + SetVector VectorizedLoads; + SmallVector> VectorizedStarts; + SmallVector ScatterVectorized; + unsigned StartIdx = 0; + unsigned VF = VL.size() / 2; + for (; VF >= MinVF; VF /= 2) { + for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; + Cnt += VF) { + ArrayRef Slice = VL.slice(Cnt, VF); + if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) { + InstructionsState SliceS = getSameOpcode(Slice, *R.TLI); + if (SliceS.getOpcode() != Instruction::Load || + SliceS.isAltShuffle()) + continue; + } + if (!VectorizedLoads.count(Slice.front()) && + !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { + SmallVector PointerOps; + OrdersType CurrentOrder; + LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(), + CurrentOrder, PointerOps); + switch (LS) { + case LoadsState::Vectorize: + case LoadsState::ScatterVectorize: + case LoadsState::StridedVectorize: + // Mark the vectorized loads so that we don't vectorize them + // again. + // TODO: better handling of loads with reorders. 
+ if (((LS == LoadsState::Vectorize || + LS == LoadsState::StridedVectorize) && + CurrentOrder.empty()) || + (LS == LoadsState::StridedVectorize && + isReverseOrder(CurrentOrder))) + VectorizedStarts.emplace_back(Cnt, LS); + else + ScatterVectorized.push_back(Cnt); + VectorizedLoads.insert(Slice.begin(), Slice.end()); + // If we vectorized initial block, no need to try to vectorize + // it again. + if (Cnt == StartIdx) + StartIdx += VF; + break; + case LoadsState::Gather: + break; + } + } + } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= VL.size()) + break; + // Found vectorizable parts - exit. + if (!VectorizedLoads.empty()) + break; + } + if (!VectorizedLoads.empty()) { + unsigned NumParts = TTI.getNumberOfParts(VecTy); + bool NeedInsertSubvectorAnalysis = + !NumParts || (VL.size() / VF) > NumParts; + // Get the cost for gathered loads. + for (unsigned I = 0, End = VL.size(); I < End; I += VF) { + if (VectorizedLoads.contains(VL[I])) + continue; + GatherCost += + getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root); + } + // Exclude potentially vectorized loads from list of gathered + // scalars. + Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType())); + // The cost for vectorized loads. + InstructionCost ScalarsCost = 0; + for (Value *V : VectorizedLoads) { + auto *LI = cast(V); + ScalarsCost += + TTI.getMemoryOpCost(Instruction::Load, LI->getType(), + LI->getAlign(), LI->getPointerAddressSpace(), + CostKind, TTI::OperandValueInfo(), LI); + } + auto *LoadTy = getWidenedType(VL.front()->getType(), VF); + for (const std::pair &P : VectorizedStarts) { + auto *LI = cast(VL[P.first]); + Align Alignment = LI->getAlign(); + GatherCost += + P.second == LoadsState::Vectorize + ? 
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, + LI->getPointerAddressSpace(), CostKind, + TTI::OperandValueInfo(), LI) + : TTI.getStridedMemoryOpCost( + Instruction::Load, LoadTy, LI->getPointerOperand(), + /*VariableMask=*/false, Alignment, CostKind, LI); + // Add external uses costs. + for (auto [Idx, V] : enumerate(VL.slice( + P.first, std::min(VL.size() - P.first, VF)))) + if (!R.areAllUsersVectorized(cast(V))) + GatherCost += TTI.getVectorInstrCost(Instruction::ExtractElement, + LoadTy, CostKind, Idx); + // Estimate GEP cost. + SmallVector PointerOps(VF); + for (auto [I, V] : enumerate(VL.slice(P.first, VF))) + PointerOps[I] = cast(V)->getPointerOperand(); + auto [ScalarGEPCost, VectorGEPCost] = + getGEPCosts(TTI, PointerOps, LI->getPointerOperand(), + Instruction::Load, CostKind, LI->getType(), LoadTy); + GatherCost += VectorGEPCost - ScalarGEPCost; + } + for (unsigned P : ScatterVectorized) { + auto *LI0 = cast(VL[P]); + ArrayRef Slice = VL.slice(P, VF); + Align CommonAlignment = computeCommonAlignment(Slice); + GatherCost += TTI.getGatherScatterOpCost( + Instruction::Load, LoadTy, LI0->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, CostKind, LI0); + // Estimate GEP cost. + SmallVector PointerOps(VF); + for (auto [I, V] : enumerate(Slice)) + PointerOps[I] = cast(V)->getPointerOperand(); + OrdersType Order; + if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE, + Order)) { + // TODO: improve checks if GEPs can be vectorized. 
+ Value *Ptr0 = PointerOps.front(); + Type *ScalarTy = Ptr0->getType(); + auto *VecTy = getWidenedType(ScalarTy, VF); + auto [ScalarGEPCost, VectorGEPCost] = + getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr, + CostKind, ScalarTy, VecTy); + GatherCost += VectorGEPCost - ScalarGEPCost; + if (!Order.empty()) { + SmallVector Mask; + inversePermutation(Order, Mask); + GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, + VecTy, Mask, CostKind); + } + } else { + GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true, + PointerOps.front()->getType()); + } + } + if (NeedInsertSubvectorAnalysis) { + // Add the cost for the subvectors insert. + SmallVector ShuffleMask(VL.size()); + for (unsigned I = VF, E = VL.size(); I < E; I += VF) { + for (unsigned Idx : seq(0, E)) + ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx; + GatherCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, + ShuffleMask, CostKind, I, LoadTy); + } + } + GatherCost -= ScalarsCost; + } + GatherCost = std::min(BaseCost, GatherCost); + } else if (!Root && isSplat(VL)) { // Found the broadcasting of the single scalar, calculate the cost as // the broadcast. const auto *It = find_if_not(VL, IsaPred); @@ -9268,9 +9389,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { InstructionCost createFreeze(InstructionCost Cost) { return Cost; } /// Finalize emission of the shuffles. 
InstructionCost - finalize(ArrayRef ExtMask, - ArrayRef> SubVectors, - unsigned VF = 0, + finalize(ArrayRef ExtMask, unsigned VF = 0, function_ref &)> Action = {}) { IsFinalized = true; if (Action) { @@ -9288,29 +9407,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { Action(V, CommonMask); InVectors.front() = V; } - if (!SubVectors.empty()) { - const PointerUnion &Vec = InVectors.front(); - if (InVectors.size() == 2) - Cost += createShuffle(Vec, InVectors.back(), CommonMask); - else - Cost += createShuffle(Vec, nullptr, CommonMask); - for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) - if (CommonMask[Idx] != PoisonMaskElem) - CommonMask[Idx] = Idx; - for (const auto &[E, Idx] : SubVectors) { - Cost += ::getShuffleCost( - TTI, TTI::SK_InsertSubvector, - FixedVectorType::get(ScalarTy, CommonMask.size()), std::nullopt, - CostKind, Idx, - FixedVectorType::get(ScalarTy, E->getVectorFactor())); - if (!CommonMask.empty()) { - std::iota(std::next(CommonMask.begin(), Idx), - std::next(CommonMask.begin(), Idx + E->getVectorFactor()), - Idx); - } - } - } - ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true); if (CommonMask.empty()) { assert(InVectors.size() == 1 && "Expected only one vector with no mask"); @@ -12408,9 +12504,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { /// \param Action the action (if any) to be performed before final applying of /// the \p ExtMask mask. 
Value * - finalize(ArrayRef ExtMask, - ArrayRef> SubVectors, - unsigned VF = 0, + finalize(ArrayRef ExtMask, unsigned VF = 0, function_ref &)> Action = {}) { IsFinalized = true; SmallVector NewExtMask(ExtMask); @@ -12444,29 +12538,6 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { Action(Vec, CommonMask); InVectors.front() = Vec; } - if (!SubVectors.empty()) { - Value *Vec = InVectors.front(); - if (InVectors.size() == 2) { - Vec = createShuffle(Vec, InVectors.back(), CommonMask); - InVectors.pop_back(); - } else { - Vec = createShuffle(Vec, nullptr, CommonMask); - } - for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) - if (CommonMask[Idx] != PoisonMaskElem) - CommonMask[Idx] = Idx; - for (const auto &[E, Idx] : SubVectors) { - Vec = Builder.CreateInsertVector( - Vec->getType(), Vec, E->VectorizedValue, Builder.getInt64(Idx)); - if (!CommonMask.empty()) { - std::iota(std::next(CommonMask.begin(), Idx), - std::next(CommonMask.begin(), Idx + E->getVectorFactor()), - Idx); - } - } - InVectors.front() = Vec; - } - if (!ExtMask.empty()) { if (CommonMask.empty()) { CommonMask.assign(ExtMask.begin(), ExtMask.end()); @@ -12545,14 +12616,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, : ScalarTy, Builder, *this); ShuffleBuilder.add(V, Mask); - SmallVector> SubVectors( - E->CombinedEntriesWithIndices.size()); - transform(E->CombinedEntriesWithIndices, SubVectors.begin(), - [&](const auto &P) { - return std::make_pair(VectorizableTree[P.first].get(), - P.second); - }); - return ShuffleBuilder.finalize(std::nullopt, SubVectors); + return ShuffleBuilder.finalize(std::nullopt); }; Value *V = vectorizeTree(VE, PostponedPHIs); if (VF * getNumElements(VL[0]->getType()) != @@ -12635,17 +12699,6 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, SmallVector ReuseShuffleIndices(E->ReuseShuffleIndices.begin(), E->ReuseShuffleIndices.end()); SmallVector GatheredScalars(E->Scalars.begin(), 
E->Scalars.end()); - // Clear values, to be replaced by insertvector instructions. - for (const auto &[EIdx, Idx] : E->CombinedEntriesWithIndices) - for_each(MutableArrayRef(GatheredScalars) - .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()), - [&](Value *&V) { V = PoisonValue::get(V->getType()); }); - SmallVector> SubVectors( - E->CombinedEntriesWithIndices.size()); - transform(E->CombinedEntriesWithIndices, SubVectors.begin(), - [&](const auto &P) { - return std::make_pair(VectorizableTree[P.first].get(), P.second); - }); // Build a mask out of the reorder indices and reorder scalars per this // mask. SmallVector ReorderMask; @@ -12783,7 +12836,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } } ShuffleBuilder.add(*FrontTE, Mask); - Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors); + Res = ShuffleBuilder.finalize(E->getCommonMask()); return Res; } if (!Resized) { @@ -13040,10 +13093,10 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, (IsSingleShuffle && ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) && isa(V)); })) - Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); else Res = ShuffleBuilder.finalize( - E->ReuseShuffleIndices, SubVectors, E->Scalars.size(), + E->ReuseShuffleIndices, E->Scalars.size(), [&](Value *&Vec, SmallVectorImpl &Mask) { TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false); Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec); @@ -13054,7 +13107,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true); Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size()); ShuffleBuilder.add(BV, ReuseMask); - Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } else { // Gather all constants. 
SmallVector Mask(GatheredScalars.size(), PoisonMaskElem); @@ -13064,7 +13117,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } Value *BV = ShuffleBuilder.gather(GatheredScalars); ShuffleBuilder.add(BV, Mask); - Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } if (NeedFreeze) @@ -13073,8 +13126,6 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) { - for (const auto &[EIdx, _] : E->CombinedEntriesWithIndices) - (void)vectorizeTree(VectorizableTree[EIdx].get(), /*PostponedPHIs=*/false); return processBuildVector(E, ScalarTy, Builder, *this); } @@ -13126,13 +13177,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } else { ShuffleBuilder.addOrdered(V, E->ReorderIndices); } - SmallVector> SubVectors( - E->CombinedEntriesWithIndices.size()); - transform( - E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) { - return std::make_pair(VectorizableTree[P.first].get(), P.second); - }); - return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); + return ShuffleBuilder.finalize(E->ReuseShuffleIndices); }; assert(!E->isGather() && "Unhandled state"); @@ -14535,7 +14580,7 @@ Value *BoUpSLP::vectorizeTree( ShuffleBuilder.add(V1, CombinedMask1); if (V2) ShuffleBuilder.add(V2, CombinedMask2); - return ShuffleBuilder.finalize(std::nullopt, std::nullopt); + return ShuffleBuilder.finalize(std::nullopt); }; auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef Mask, @@ -14673,14 +14718,7 @@ Value *BoUpSLP::vectorizeTree( // Clear up reduction references, if any. 
if (UserIgnoreList) { for (Instruction *I : RemovedInsts) { - const TreeEntry *IE = getTreeEntry(I); - if (IE->Idx != 0 && - !(VectorizableTree.front()->isGather() && isa(I) && - !IE->UserTreeIndices.empty() && - any_of(IE->UserTreeIndices, [&](const EdgeInfo &EI) { - return EI.UserTE == VectorizableTree.front().get() && - EI.EdgeIdx == UINT_MAX; - }))) + if (getTreeEntry(I)->Idx != 0) continue; SmallVector LogicalOpSelects; I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) { diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll index 2121775224098e..22511c018dca2d 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll @@ -18,62 +18,62 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[IP2]] to i64 ; CHECK-NEXT: [[RRRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 4 -; CHECK-NEXT: [[RDD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[RDD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[RRRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 4 -; CHECK-NEXT: [[RRRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 4 -; CHECK-NEXT: [[RDD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[RDD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[RRRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 4 -; CHECK-NEXT: [[RRRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 4 -; CHECK-NEXT: [[RDD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[RDD_PTR64_2:%.*]] = getelementptr 
inbounds i8, ptr [[RDD_PTR64_1]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[RRRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_2]], i64 4 -; CHECK-NEXT: [[RRRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_2]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[RDD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[RDD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[RRRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 4 +; CHECK-NEXT: [[RRRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[RDD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[RDD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[RRRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 4 +; CHECK-NEXT: [[RRRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 4 ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], 
align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[RDD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[RDD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[RRRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_2]], i64 4 +; CHECK-NEXT: [[RRRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_2]], i64 4 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], 
<4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], 
<16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32> ; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], -; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]] +; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]] ; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll index 91c8db14a45aa1..c1cef6ff3d10b4 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ -169,12 +169,11 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[T11]] to i64 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]] ; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i64 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T12]], i64 3 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP11]], <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP14]], [[SUM_032]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T10]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[T12]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP13]], [[SUM_032]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll index 5b878108af59af..d79aed89b0be73 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -340,12 +340,12 @@ entry: define i16 @reduce_blockstrided4(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, i32 noundef %stride) { ; CHECK-LABEL: 
@reduce_blockstrided4( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X:%.*]], align 2 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]] ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y:%.*]], align 2 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]] ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]] @@ -416,31 +416,31 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: 
[[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x 
i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = mul nuw nsw <16 x i32> [[TMP11]], [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = mul nuw nsw <16 x i32> [[TMP13]], [[TMP19]] ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP20]]) ; CHECK-NEXT: ret i32 [[TMP21]] ; @@ -677,63 +677,63 @@ entry: define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, ptr nocapture noundef writeonly %z, i32 noundef %stride) { ; CHECK-LABEL: @store_blockstrided3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i64 8 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[STRIDE:%.*]], 1 ; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1 ; CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]] -; CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 
[[MUL]], 2 -; CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[ADD18]] to i64 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM19]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[MUL]], 1 +; CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64 +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM15]] ; CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3 ; CHECK-NEXT: [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64 ; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM23]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX24]], align 4 -; CHECK-NEXT: [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1 -; CHECK-NEXT: [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64 -; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]] -; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[ADD30:%.*]] = add nsw i32 [[MUL21]], 2 +; CHECK-NEXT: [[IDXPROM31:%.*]] = sext i32 [[ADD30]] to i64 +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM31]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]] -; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM19]] -; 
CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX56]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]] ; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]] -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4 -; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]] +; CHECK-NEXT: [[ARRAYIDX68:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM31]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX68]], align 4 ; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i8, ptr [[Z:%.*]], i64 4 -; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]] +; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP6]], [[TMP1]] ; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 24 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[X]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[Y]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP5]], [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP7]], [[TMP2]] ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> -; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 28 -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[MUL81:%.*]] = mul nsw i32 [[TMP8]], [[TMP3]] +; CHECK-NEXT: [[ARRAYIDX82:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 32 +; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr 
[[ARRAYIDX16]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]] -; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]] -; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 44 -; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 36 -; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4 +; CHECK-NEXT: [[ARRAYIDX90:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 40 +; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX60]], align 4 ; CHECK-NEXT: store i32 [[MUL73]], ptr [[Z]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4 -; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX84]], align 4 -; CHECK-NEXT: store i32 [[MUL85]], ptr [[ARRAYIDX76]], align 4 -; CHECK-NEXT: store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4 +; CHECK-NEXT: store i32 [[MUL81]], ptr [[ARRAYIDX82]], align 4 +; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX76]], align 4 ; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[ARRAYIDX92]], align 4 +; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[ARRAYIDX90]], align 4 +; CHECK-NEXT: [[MUL91:%.*]] = mul nsw i32 [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 36 +; CHECK-NEXT: store i32 [[MUL91]], ptr [[ARRAYIDX92]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -833,12 +833,12 @@ entry: define void @store_blockstrided4(ptr nocapture 
noundef readonly %x, ptr nocapture noundef readonly %y, i32 noundef %stride, ptr %dst0) { ; CHECK-LABEL: @store_blockstrided4( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X:%.*]], align 2 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]] ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y:%.*]], align 2 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]] ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]] @@ -1203,62 +1203,62 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 -; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 
[[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4 ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], 
align 1 ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> 
[[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], 
<4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32> ; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], -; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]] +; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]] ; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]] ; CHECK-NEXT: [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll index 07411cacb36268..d89d6286703605 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll @@ -14,161 +14,232 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) { ; CHECK-LABEL: @straight( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST:%.*]] to i64 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[P]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2 +; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2 +; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2 +; CHECK-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2 +; CHECK-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2 +; CHECK-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], 
i64 [[IDX_EXT]] ; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2 +; CHECK-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> poison, <8 x i16> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP8]], <8 x i16> [[TMP1]], i64 8) -; CHECK-NEXT: [[TMP10:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP9]], <8 x i16> [[TMP2]], i64 16) -; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP10]], <8 x i16> [[TMP3]], i64 24) -; CHECK-NEXT: [[TMP12:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP11]], <8 x i16> [[TMP4]], i64 32) -; CHECK-NEXT: [[TMP13:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP12]], <8 x i16> [[TMP5]], i64 40) -; CHECK-NEXT: [[TMP14:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP13]], <8 x i16> [[TMP6]], i64 48) -; CHECK-NEXT: [[TMP15:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP14]], <8 x i16> [[TMP7]], i64 56) -; CHECK-NEXT: [[TMP16:%.*]] = zext <64 x i16> [[TMP15]] to <64 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <64 x i32> [[TMP16]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <64 x i32> [[TMP16]], i32 1 -; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[TMP17]], [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = mul nuw nsw <64 x i32> [[TMP16]], [[TMP16]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <64 x i32> [[TMP16]], i32 2 -; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <64 x i32> [[TMP16]], i32 3 -; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i32> 
[[TMP16]], i32 4 -; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <64 x i32> [[TMP16]], i32 5 -; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP23]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <64 x i32> [[TMP16]], i32 6 -; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP24]] -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <64 x i32> [[TMP16]], i32 7 -; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP25]] -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <64 x i32> [[TMP16]], i32 8 -; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP26]] -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <64 x i32> [[TMP16]], i32 9 -; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP27]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <64 x i32> [[TMP16]], i32 10 -; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP28]] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <64 x i32> [[TMP16]], i32 11 -; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP29]] -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <64 x i32> [[TMP16]], i32 12 -; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP30]] -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <64 x i32> [[TMP16]], i32 13 -; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP31]] -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <64 x i32> [[TMP16]], i32 14 -; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP32]] -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <64 x i32> [[TMP16]], i32 15 -; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP33]] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <64 x i32> [[TMP16]], i32 16 -; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP34]] -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <64 x i32> [[TMP16]], i32 17 -; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP35]] -; CHECK-NEXT: 
[[TMP36:%.*]] = extractelement <64 x i32> [[TMP16]], i32 18 -; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP36]] -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <64 x i32> [[TMP16]], i32 19 -; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP37]] -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <64 x i32> [[TMP16]], i32 20 -; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP38]] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <64 x i32> [[TMP16]], i32 21 -; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP39]] -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <64 x i32> [[TMP16]], i32 22 -; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP40]] -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <64 x i32> [[TMP16]], i32 23 -; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP41]] -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <64 x i32> [[TMP16]], i32 24 -; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP42]] -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <64 x i32> [[TMP16]], i32 25 -; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP43]] -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <64 x i32> [[TMP16]], i32 26 -; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP44]] -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <64 x i32> [[TMP16]], i32 27 -; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP45]] -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <64 x i32> [[TMP16]], i32 28 -; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP46]] -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <64 x i32> [[TMP16]], i32 29 -; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP47]] -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <64 x i32> [[TMP16]], i32 30 -; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP48]] -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <64 x i32> [[TMP16]], i32 31 -; CHECK-NEXT: 
[[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP49]] -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <64 x i32> [[TMP16]], i32 32 -; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP50]] -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <64 x i32> [[TMP16]], i32 33 -; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP51]] -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <64 x i32> [[TMP16]], i32 34 -; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP52]] -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <64 x i32> [[TMP16]], i32 35 -; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP53]] -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <64 x i32> [[TMP16]], i32 36 -; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP54]] -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <64 x i32> [[TMP16]], i32 37 -; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP55]] -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <64 x i32> [[TMP16]], i32 38 -; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP56]] -; CHECK-NEXT: [[TMP57:%.*]] = extractelement <64 x i32> [[TMP16]], i32 39 -; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP57]] -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <64 x i32> [[TMP16]], i32 40 -; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP58]] -; CHECK-NEXT: [[TMP59:%.*]] = extractelement <64 x i32> [[TMP16]], i32 41 -; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP59]] -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <64 x i32> [[TMP16]], i32 42 -; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP60]] -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <64 x i32> [[TMP16]], i32 43 -; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP61]] -; CHECK-NEXT: [[TMP62:%.*]] = extractelement <64 x i32> [[TMP16]], i32 44 -; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP62]] -; CHECK-NEXT: [[TMP63:%.*]] = 
extractelement <64 x i32> [[TMP16]], i32 45 -; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP63]] -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <64 x i32> [[TMP16]], i32 46 -; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP64]] -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <64 x i32> [[TMP16]], i32 47 -; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP65]] -; CHECK-NEXT: [[TMP66:%.*]] = extractelement <64 x i32> [[TMP16]], i32 48 -; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP66]] -; CHECK-NEXT: [[TMP67:%.*]] = extractelement <64 x i32> [[TMP16]], i32 49 -; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP67]] -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <64 x i32> [[TMP16]], i32 50 -; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP68]] -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <64 x i32> [[TMP16]], i32 51 -; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP69]] -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <64 x i32> [[TMP16]], i32 52 -; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP70]] -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <64 x i32> [[TMP16]], i32 53 -; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP71]] -; CHECK-NEXT: [[TMP72:%.*]] = extractelement <64 x i32> [[TMP16]], i32 54 -; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP72]] -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <64 x i32> [[TMP16]], i32 55 -; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP73]] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <64 x i32> [[TMP16]], i32 56 -; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP74]] -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <64 x i32> [[TMP16]], i32 57 -; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP75]] -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <64 x i32> [[TMP16]], i32 58 -; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw 
i32 [[ADD_1_7]], [[TMP76]] -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <64 x i32> [[TMP16]], i32 59 -; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP77]] -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <64 x i32> [[TMP16]], i32 60 -; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP78]] -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <64 x i32> [[TMP16]], i32 61 -; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP79]] -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <64 x i32> [[TMP16]], i32 62 -; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP80]] -; CHECK-NEXT: [[TMP81:%.*]] = extractelement <64 x i32> [[TMP16]], i32 63 -; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP81]] -; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP19]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 +; CHECK-NEXT: [[CONV_7_7:%.*]] = zext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <64 x i16> [[TMP9]], <64 x i16> [[TMP10]], <64 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <64 x i16> [[TMP11]], <64 x i16> [[TMP12]], <64 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <64 x i16> [[TMP13]], <64 x i16> [[TMP14]], <64 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <64 x i16> [[TMP15]], <64 x i16> [[TMP16]], <64 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <64 x 
i16> [[TMP17]], <64 x i16> [[TMP18]], <64 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <64 x i16> [[TMP19]], <64 x i16> [[TMP20]], <64 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <64 x i16> [[TMP21]], <64 x i16> [[TMP22]], <64 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = zext <64 x i16> [[TMP23]] to <64 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 +; CHECK-NEXT: [[CONV_6_7:%.*]] = zext i16 [[TMP25]] to i32 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 +; CHECK-NEXT: [[CONV_5_7:%.*]] = zext i16 [[TMP26]] to i32 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 +; CHECK-NEXT: [[CONV_4_7:%.*]] = zext i16 [[TMP27]] to i32 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 +; CHECK-NEXT: [[CONV_3_7:%.*]] = zext i16 [[TMP28]] to i32 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 +; CHECK-NEXT: [[CONV_2_7:%.*]] = zext i16 [[TMP29]] to i32 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 +; CHECK-NEXT: [[CONV_1_7:%.*]] = zext i16 [[TMP30]] to i32 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 +; CHECK-NEXT: [[CONV_764:%.*]] = zext i16 [[TMP31]] to i32 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +; CHECK-NEXT: [[CONV_7_6:%.*]] = zext i16 [[TMP32]] to i32 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +; CHECK-NEXT: [[CONV_6_6:%.*]] = zext i16 [[TMP33]] to i32 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +; CHECK-NEXT: [[CONV_5_6:%.*]] = zext i16 [[TMP34]] to i32 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +; CHECK-NEXT: [[CONV_4_6:%.*]] = zext i16 [[TMP35]] to i32 +; CHECK-NEXT: [[TMP36:%.*]] = 
extractelement <8 x i16> [[TMP6]], i32 3 +; CHECK-NEXT: [[CONV_3_6:%.*]] = zext i16 [[TMP36]] to i32 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 +; CHECK-NEXT: [[CONV_2_6:%.*]] = zext i16 [[TMP37]] to i32 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 +; CHECK-NEXT: [[CONV_1_6:%.*]] = zext i16 [[TMP38]] to i32 +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[CONV_660:%.*]] = zext i16 [[TMP39]] to i32 +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; CHECK-NEXT: [[CONV_7_5:%.*]] = zext i16 [[TMP40]] to i32 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 +; CHECK-NEXT: [[CONV_6_5:%.*]] = zext i16 [[TMP41]] to i32 +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 +; CHECK-NEXT: [[CONV_5_5:%.*]] = zext i16 [[TMP42]] to i32 +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 +; CHECK-NEXT: [[CONV_4_5:%.*]] = zext i16 [[TMP43]] to i32 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 +; CHECK-NEXT: [[CONV_3_5:%.*]] = zext i16 [[TMP44]] to i32 +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: [[CONV_2_5:%.*]] = zext i16 [[TMP45]] to i32 +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 +; CHECK-NEXT: [[CONV_1_5:%.*]] = zext i16 [[TMP46]] to i32 +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 +; CHECK-NEXT: [[CONV_556:%.*]] = zext i16 [[TMP47]] to i32 +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +; CHECK-NEXT: [[CONV_7_4:%.*]] = zext i16 [[TMP48]] to i32 +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 +; CHECK-NEXT: [[CONV_6_4:%.*]] = zext i16 [[TMP49]] to i32 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +; CHECK-NEXT: [[CONV_5_4:%.*]] = zext i16 [[TMP50]] to i32 +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <8 x 
i16> [[TMP4]], i32 4 +; CHECK-NEXT: [[CONV_4_4:%.*]] = zext i16 [[TMP51]] to i32 +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[CONV_3_4:%.*]] = zext i16 [[TMP52]] to i32 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[CONV_2_4:%.*]] = zext i16 [[TMP53]] to i32 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 +; CHECK-NEXT: [[CONV_1_4:%.*]] = zext i16 [[TMP54]] to i32 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 +; CHECK-NEXT: [[CONV_452:%.*]] = zext i16 [[TMP55]] to i32 +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; CHECK-NEXT: [[CONV_7_3:%.*]] = zext i16 [[TMP56]] to i32 +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; CHECK-NEXT: [[CONV_6_3:%.*]] = zext i16 [[TMP57]] to i32 +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; CHECK-NEXT: [[CONV_5_3:%.*]] = zext i16 [[TMP58]] to i32 +; CHECK-NEXT: [[TMP59:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; CHECK-NEXT: [[CONV_4_3:%.*]] = zext i16 [[TMP59]] to i32 +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; CHECK-NEXT: [[CONV_3_3:%.*]] = zext i16 [[TMP60]] to i32 +; CHECK-NEXT: [[TMP61:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; CHECK-NEXT: [[CONV_2_3:%.*]] = zext i16 [[TMP61]] to i32 +; CHECK-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; CHECK-NEXT: [[CONV_1_3:%.*]] = zext i16 [[TMP62]] to i32 +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[CONV_348:%.*]] = zext i16 [[TMP63]] to i32 +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; CHECK-NEXT: [[CONV_7_2:%.*]] = zext i16 [[TMP64]] to i32 +; CHECK-NEXT: [[TMP65:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; CHECK-NEXT: [[CONV_6_2:%.*]] = zext i16 [[TMP65]] to i32 +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP2]], i32 
5 +; CHECK-NEXT: [[CONV_5_2:%.*]] = zext i16 [[TMP66]] to i32 +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +; CHECK-NEXT: [[CONV_4_2:%.*]] = zext i16 [[TMP67]] to i32 +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; CHECK-NEXT: [[CONV_3_2:%.*]] = zext i16 [[TMP68]] to i32 +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; CHECK-NEXT: [[CONV_2_2:%.*]] = zext i16 [[TMP69]] to i32 +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; CHECK-NEXT: [[CONV_1_2:%.*]] = zext i16 [[TMP70]] to i32 +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; CHECK-NEXT: [[CONV_244:%.*]] = zext i16 [[TMP71]] to i32 +; CHECK-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; CHECK-NEXT: [[CONV_7_1:%.*]] = zext i16 [[TMP72]] to i32 +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; CHECK-NEXT: [[CONV_6_1:%.*]] = zext i16 [[TMP73]] to i32 +; CHECK-NEXT: [[TMP74:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; CHECK-NEXT: [[CONV_5_1:%.*]] = zext i16 [[TMP74]] to i32 +; CHECK-NEXT: [[TMP75:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; CHECK-NEXT: [[CONV_4_1:%.*]] = zext i16 [[TMP75]] to i32 +; CHECK-NEXT: [[TMP76:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; CHECK-NEXT: [[CONV_3_1:%.*]] = zext i16 [[TMP76]] to i32 +; CHECK-NEXT: [[TMP77:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; CHECK-NEXT: [[CONV_2_1:%.*]] = zext i16 [[TMP77]] to i32 +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; CHECK-NEXT: [[CONV_1_1:%.*]] = zext i16 [[TMP78]] to i32 +; CHECK-NEXT: [[TMP79:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; CHECK-NEXT: [[CONV_140:%.*]] = zext i16 [[TMP79]] to i32 +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7 +; CHECK-NEXT: [[CONV_7:%.*]] = zext i16 [[TMP80]] to i32 +; CHECK-NEXT: [[TMP81:%.*]] = extractelement <8 x i16> [[TMP0]], i32 6 +; CHECK-NEXT: 
[[CONV_6:%.*]] = zext i16 [[TMP81]] to i32 +; CHECK-NEXT: [[TMP82:%.*]] = extractelement <8 x i16> [[TMP0]], i32 5 +; CHECK-NEXT: [[CONV_5:%.*]] = zext i16 [[TMP82]] to i32 +; CHECK-NEXT: [[TMP83:%.*]] = extractelement <8 x i16> [[TMP0]], i32 4 +; CHECK-NEXT: [[CONV_4:%.*]] = zext i16 [[TMP83]] to i32 +; CHECK-NEXT: [[TMP84:%.*]] = extractelement <8 x i16> [[TMP0]], i32 3 +; CHECK-NEXT: [[CONV_3:%.*]] = zext i16 [[TMP84]] to i32 +; CHECK-NEXT: [[TMP85:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP85]] to i32 +; CHECK-NEXT: [[TMP86:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0 +; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP86]] to i32 +; CHECK-NEXT: [[TMP87:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i16 [[TMP87]] to i32 +; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]] +; CHECK-NEXT: [[TMP88:%.*]] = mul nuw nsw <64 x i32> [[TMP24]], [[TMP24]] +; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]] +; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]] +; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV_4]] +; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]] +; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]] +; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]] +; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[CONV_140]] +; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]] +; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]] +; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]] +; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]] +; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]] +; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]] +; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]] +; CHECK-NEXT: 
[[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[CONV_244]] +; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]] +; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]] +; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]] +; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]] +; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]] +; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[CONV_6_2]] +; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]] +; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[CONV_348]] +; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]] +; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]] +; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]] +; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]] +; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]] +; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]] +; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]] +; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[CONV_452]] +; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]] +; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]] +; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]] +; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]] +; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]] +; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]] +; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]] +; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[CONV_556]] +; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]] +; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], 
[[CONV_2_5]] +; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]] +; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[CONV_4_5]] +; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]] +; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]] +; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]] +; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[CONV_660]] +; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[CONV_1_6]] +; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]] +; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]] +; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]] +; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]] +; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]] +; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]] +; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[CONV_764]] +; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[CONV_1_7]] +; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[CONV_2_7]] +; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[CONV_3_7]] +; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[CONV_4_7]] +; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[CONV_5_7]] +; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[CONV_6_7]] +; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[CONV_7_7]] +; CHECK-NEXT: [[TMP89:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP88]]) ; CHECK-NEXT: [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64 -; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[TMP82]] to i64 +; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[TMP89]] to i64 ; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32 ; CHECK-NEXT: [[ADD17:%.*]] = or i64 [[SHL]], [[CONV15]] ; CHECK-NEXT: ret i64 [[ADD17]] 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll index 8093285ad8717c..6f6b66255a4340 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll @@ -3,63 +3,39 @@ ; Test case reported on D134605 where the vectorization was causing a slowdown due to an underestimation in the cost of the extractions. -; NOTE: cost of shuffle <4 x float>, <4 x float>, <2 x i32> is 12! - define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]] -; CHECK-NEXT: [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00 -; CHECK-NEXT: [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[ARG]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> , [[TMP2]] ; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 -; CHECK-NEXT: [[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00 -; CHECK-NEXT: [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00 -; CHECK-NEXT: [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00 -; CHECK-NEXT: [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 -; CHECK-NEXT: [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[VAL12]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float 0.000000e+00, i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fadd 
fast <4 x float> [[TMP5]], ; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]] ; CHECK: bb18: -; CHECK-NEXT: [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ] -; CHECK-NEXT: [[VAL20:%.*]] = phi float [ [[VAL15]], [[BB]] ] -; CHECK-NEXT: [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ] -; CHECK-NEXT: [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ] -; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00 -; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00 +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP6]], [[BB:%.*]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP6]], i32 2 +; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP8]], 2.000000e+00 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i32 3 +; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP9]], 3.000000e+00 ; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]] ; CHECK: bb25: -; CHECK-NEXT: [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ] -; CHECK-NEXT: [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ] -; CHECK-NEXT: [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ] -; CHECK-NEXT: [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x float> [ [[TMP7]], [[BB18]] ] ; CHECK-NEXT: br label [[BB30:%.*]] ; CHECK: bb30: ; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] ; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1 -; CHECK-NEXT: [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float -; CHECK-NEXT: [[VAL35:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 1 -; CHECK-NEXT: [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1 -; CHECK-NEXT: [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float -; CHECK-NEXT: [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2 -; CHECK-NEXT: [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1 -; 
CHECK-NEXT: [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float -; CHECK-NEXT: [[VAL41:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 3 -; CHECK-NEXT: [[VAL42:%.*]] = load i8, ptr [[VAL41]], align 1 -; CHECK-NEXT: [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float -; CHECK-NEXT: [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]] -; CHECK-NEXT: [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]] -; CHECK-NEXT: [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]] -; CHECK-NEXT: [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]] -; CHECK-NEXT: [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]] -; CHECK-NEXT: [[VAL49:%.*]] = fmul fast float [[VAL45]], [[VAL27]] -; CHECK-NEXT: [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]] -; CHECK-NEXT: [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]] -; CHECK-NEXT: [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]] -; CHECK-NEXT: [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]] -; CHECK-NEXT: [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]] +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = uitofp <4 x i8> [[TMP11]] to <4 x float> +; CHECK-NEXT: [[TMP13:%.*]] = fsub fast <4 x float> [[TMP12]], [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x float> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP14]]) ; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]]) -; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]]) +; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP15]]) ; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]]) ; CHECK-NEXT: br i1 [[ARG8:%.*]], label [[BB30]], label [[BB57]] ; CHECK: bb57: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll 
b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll index 4f881823746228..e39cd8aaa111b1 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll @@ -7,13 +7,16 @@ define void @p(double %0) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> , double [[TMP0]], i32 3 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[MUL16_150_1_I:%.*]] = fmul double 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> zeroinitializer, [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP14]], double [[MUL16_150_1_I]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> , <2 x double> [[TMP7]], i64 2) +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP9]], <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[TMP8]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x double> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fptosi <4 x double> [[TMP12]] to <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index ff1d6253bec928..95aa40f664c0ce 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll 
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -25,11 +25,11 @@ define void @s116_modified(ptr %a) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP7]], [[TMP9]] ; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[A]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index f04c359b432b5e..b59659ca75eb24 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -241,9 +241,12 @@ entry: define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-LABEL: @select_uniform_ugt_16xi8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i8 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, 
ptr [[PTR:%.*]], align 1 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 8 ; CHECK-NEXT: [[L_8:%.*]] = load i8, ptr [[GEP_8]], align 1 ; CHECK-NEXT: [[CMP_8:%.*]] = icmp ugt i8 [[L_8]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 +; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 9 ; CHECK-NEXT: [[L_9:%.*]] = load i8, ptr [[GEP_9]], align 1 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 10 @@ -251,28 +254,19 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 11 ; CHECK-NEXT: [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1 ; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12 -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 -; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[L_9]], i32 9 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L_10]], i32 10 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L_11]], i32 11 -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP2]], i64 12) -; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: 
[[TMP11:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[L_9]], i32 9 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[L_10]], i32 10 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[L_11]], i32 11 -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP16]], <16 x i8> [[TMP18]] -; CHECK-NEXT: store <16 x i8> [[TMP19]], ptr [[PTR]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt <16 x i8> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP10]], <16 x i8> [[TMP9]], <16 x i8> [[TMP12]] +; CHECK-NEXT: store <16 x i8> [[TMP13]], ptr [[PTR]], align 2 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll index cd79250e8fb6be..94a55c435c8c39 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll @@ -4,11 +4,12 @@ define void @test(ptr noalias %p, ptr %p1) { ; 
CHECK-LABEL: define void @test( ; CHECK-SAME: ptr noalias [[P:%.*]], ptr [[P1:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[P]], align 2 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[GEP2]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> poison, <2 x i16> [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> [[TMP3]], <2 x i16> [[TMP2]], i64 2) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i32> ; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[P1]], align 2 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll index 151b91184bf428..ff3d2c4c59394c 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll @@ -1013,20 +1013,22 @@ declare i32 @llvm.abs.i32(i32, i1) define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) { ; CHECK-LABEL: @stride_sum_abs_diff( -; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[STRIDE:%.*]] -; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 [[STRIDE]] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[STRIDE:%.*]] +; CHECK-NEXT: 
[[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[STRIDE]] ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP5]], <2 x i32> [[TMP3]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP7]], <2 x i32> [[TMP4]], i64 2) -; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true) -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) -; CHECK-NEXT: ret i32 [[TMP11]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP11]], i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) +; CHECK-NEXT: ret i32 [[TMP13]] ; %x.0 = load i32, ptr %p %y.0 = load i32, ptr %q @@ -1066,11 +1068,12 @@ define i32 @reduce_sum_2arrays_a(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = 
load <4 x i8>, ptr [[P:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Q:%.*]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4) -; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) -; CHECK-NEXT: ret i32 [[TMP5]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) +; CHECK-NEXT: ret i32 [[TMP6]] ; entry: %x.0 = load i8, ptr %p, align 1 @@ -1114,11 +1117,12 @@ define i32 @reduce_sum_2arrays_b(ptr noalias noundef %x, ptr noalias %y) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[X:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4) -; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) -; CHECK-NEXT: ret i32 [[TMP5]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> +; CHECK-NEXT: 
[[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) +; CHECK-NEXT: ret i32 [[TMP6]] ; entry: %0 = load i8, ptr %x, align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll index 413aedefe9b6ad..0fcbead65d0d66 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll @@ -13,11 +13,12 @@ define void @foo() local_unnamed_addr { ; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], ptr @dct_luma, i64 0, i64 3, i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 2), align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP2]], <2 x i32> [[TMP1]], i64 2) -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ADD277]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = ashr <4 x i32> [[TMP5]], -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[ARRAYIDX372]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[ADD277]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = ashr <4 x i32> [[TMP6]], +; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[ARRAYIDX372]], align 4 ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll index 96b498ced7d0f8..f7bd2431a76054 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -392,14 +392,16 @@ define void @vec_shuff_reorder() #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fb, align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP7]], <2 x float> [[TMP4]], i64 2) -; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP11]], ptr @fc, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fsub <4 x float> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> 
[[TMP11]], <4 x float> [[TMP12]], <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP13]], ptr @fc, align 4 ; CHECK-NEXT: ret void ; %1 = load float, ptr @fb, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll index 87b1302e4cecf4..3b03ca13ea65d0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll @@ -6,25 +6,30 @@ define i1 @test(float %0, double %1) { ; CHECK-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> , <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP10]], i64 0) -; CHECK-NEXT: [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP11]], i64 0) -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4) -; CHECK-NEXT: [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], 
<8 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float> -; CHECK-NEXT: [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]]) -; CHECK-NEXT: ret i1 [[TMP22]] +; CHECK-NEXT: [[TMP5:%.*]] = fpext float 0.000000e+00 to double +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x double> , double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP5]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = fmul <4 x double> [[TMP9]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP14]], <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP16]], <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = fsub <8 x double> [[TMP15]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = fmul <8 x double> [[TMP15]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x double> [[TMP20]], <8 x double> [[TMP21]], <8 x i32> +; 
CHECK-NEXT: [[TMP23:%.*]] = fptrunc <8 x double> [[TMP22]] to <8 x float> +; CHECK-NEXT: [[TMP24:%.*]] = fmul <8 x float> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = fcmp oeq <8 x float> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = freeze <8 x i1> [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP26]]) +; CHECK-NEXT: ret i1 [[TMP27]] ; %3 = fpext float %0 to double %4 = fpext float 0.000000e+00 to double diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll index 6ff03acf85cdfd..d326c855a10912 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll @@ -4,19 +4,20 @@ define i64 @foo(i32 %tmp7) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7:%.*]], 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 undef, i32 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 0, i32 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[TMP8]], i32 3 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP3]], <2 x i32> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP10]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[TMP7:%.*]], i32 2 +; 
CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = sub i32 undef, 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[TMP24]], i32 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 0, i32 5 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP24]], i32 6 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP77:%.*]] = sub nsw <8 x i32> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP77]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP11]], 0 ; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64 ; CHECK-NEXT: ret i64 [[TMP64]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll index 757d0b1708b6fb..d80d7b5ecd4e76 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll @@ -8,18 +8,19 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) { ; CHECK-SAME: ptr [[I7:%.*]], i32 [[TMP0:%.*]], i1 [[TOBOOL62_NOT:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[RC21:%.*]] = alloca [0 x [0 x %struct.rect]], i32 0, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[RC21]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] 
to <2 x float> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[Y0:%.*]] = getelementptr i8, ptr [[RC21]], i64 8 -; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[Y0]], align 4 +; CHECK-NEXT: [[X1:%.*]] = getelementptr i8, ptr [[RC21]], i64 4 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr [[X1]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[I7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[RC21]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 0) -; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index b0d9fea43a0e6c..fa022ad69af791 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -1016,13 +1016,15 @@ define i32 @maxi8_wrong_parent(i32) { ; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16 ; THRESH-NEXT: br 
label [[PP:%.*]] ; THRESH: pp: -; THRESH-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 -; THRESH-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16 -; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP4]], i64 4) -; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 0) -; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 2) -; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP7]]) -; THRESH-NEXT: ret i32 [[TMP8]] +; THRESH-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 +; THRESH-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 +; THRESH-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; THRESH-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; THRESH-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> +; THRESH-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> +; THRESH-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> +; THRESH-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP9]]) +; THRESH-NEXT: ret i32 [[TMP10]] ; %2 = load i32, ptr @arr, align 16 %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll index 54c950a0785020..6c4572593027d6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll @@ -4,20 +4,14 @@ define void @inst_size(ptr %a, <2 x i64> %b) { ; CHECK-LABEL: @inst_size( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMPL1:%.*]] = load i64, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[PTR2]], align 4 -; CHECK-NEXT: [[PTR4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 3 -; CHECK-NEXT: [[TMPL4:%.*]] = load i64, ptr [[PTR4]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[B:%.*]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMPL1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP2]], <2 x i64> [[TMP0]], i64 2) -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP3]] -; CHECK-NEXT: [[T45:%.*]] = icmp sgt i64 0, [[TMPL4]] +; CHECK-NEXT: [[VAL:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[T41:%.*]] = icmp sgt i64 0, [[VAL]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP1]] ; CHECK-NEXT: br label [[BLOCK:%.*]] ; CHECK: block: -; CHECK-NEXT: [[PHI5:%.*]] = phi i1 [ [[T45]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x i1> [ [[TMP4]], [[ENTRY]] ] +; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[T41]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll index 813c5e7418b30e..47b42bc8f32a7d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll @@ -10,10 +10,10 @@ define void @foo() personality ptr @bar { ; CHECK: bb2.loopexit: ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: 
[[TMP0:%.*]] = phi <4 x i32> [ [[TMP8:%.*]], [[BB9:%.*]] ], [ poison, [[BB2_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[TMP7:%.*]], [[BB9:%.*]] ], [ poison, [[BB2_LOOPEXIT:%.*]] ] ; CHECK-NEXT: ret void ; CHECK: bb3: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP4:%.*]], [[BB6:%.*]] ], [ poison, [[BB1:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], [[BB6:%.*]] ], [ poison, [[BB1:%.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 0, i32 0, i32 poison) [ "deopt"() ] ; CHECK-NEXT: to label [[BB4:%.*]] unwind label [[BB10:%.*]] ; CHECK: bb4: @@ -21,30 +21,29 @@ define void @foo() personality ptr @bar { ; CHECK: bb5: ; CHECK-NEXT: br label [[BB7:%.*]] ; CHECK: bb6: -; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ , [[BB8:%.*]] ] -; CHECK-NEXT: [[TMP4]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3]] = phi <2 x i32> [ , [[BB8:%.*]] ] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb7: ; CHECK-NEXT: [[LOCAL_5_84111:%.*]] = phi i32 [ poison, [[BB8]] ], [ poison, [[BB5]] ] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_5_84111]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 poison, i32 poison, i32 poison) [ "deopt"() ] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_5_84111]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 poison, i32 poison, i32 poison) [ "deopt"() ] ; CHECK-NEXT: to label [[BB8]] unwind label [[BB12:%.*]] ; CHECK: bb8: ; CHECK-NEXT: br i1 poison, label [[BB7]], label [[BB6]] ; CHECK: bb9: ; CHECK-NEXT: [[INDVARS_IV528799:%.*]] = phi i64 [ poison, [[BB10]] ], [ poison, [[BB12]] ] -; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ [[TMP9:%.*]], [[BB10]] ], [ [[TMP10:%.*]], [[BB12]] ] -; CHECK-NEXT: [[TMP8]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP7]], i64 2) +; 
CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP8:%.*]], [[BB10]] ], [ [[TMP9:%.*]], [[BB12]] ] +; CHECK-NEXT: [[TMP7]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb10: -; CHECK-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP1]], [[BB3]] ] +; CHECK-NEXT: [[TMP8]] = phi <2 x i32> [ [[TMP1]], [[BB3]] ] ; CHECK-NEXT: [[LANDING_PAD68:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup ; CHECK-NEXT: br label [[BB9]] ; CHECK: bb11: ; CHECK-NEXT: ret void ; CHECK: bb12: -; CHECK-NEXT: [[TMP10]] = phi <2 x i32> [ [[TMP5]], [[BB7]] ] +; CHECK-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP4]], [[BB7]] ] ; CHECK-NEXT: [[LANDING_PAD149:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup ; CHECK-NEXT: br label [[BB9]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll index 7201583f3450e0..96151e0bd6c418 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -144,8 +144,8 @@ define float @foo3(ptr nocapture readonly %A) #0 { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP14:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00 ; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] @@ -154,25 +154,23 @@ define float @foo3(ptr nocapture readonly %A) #0 { ; CHECK-NEXT: 
[[TMP7:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 -; CHECK-NEXT: [[TMP9]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 2) -; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x float> [[TMP12]], -; CHECK-NEXT: [[TMP14]] = fadd <4 x float> [[TMP3]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP15]], 121 +; CHECK-NEXT: [[TMP8]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP10]], +; CHECK-NEXT: [[TMP12]] = fadd <4 x float> [[TMP3]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP13]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP14]], i32 0 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP14]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP14]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x 
float> [[TMP14]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP19]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP12]], i32 0 +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP12]], i32 1 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP12]], i32 2 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP12]], i32 3 +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP17]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index 12389f4a3dbf4a..865d8178667167 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -390,15 +390,14 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) { define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP2]], <4 x i32> [[X]], i64 0) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <8 x i32> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult <8 x i32> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = freeze <8 x i1> [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP8]]) -; CHECK-NEXT: ret i1 [[TMP9]] +; CHECK-NEXT: [[TMP2:%.*]] = 
shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]]) +; CHECK-NEXT: ret i1 [[TMP8]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll index 8aaa71ef47a8c9..7de2cde45525ae 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll @@ -10,7 +10,16 @@ ; YAML-NEXT: - String: 'SLP vectorized with cost ' ; YAML-NEXT: - Cost: '-4' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '5' +; YAML-NEXT: - TreeSize: '4' +; YAML-LABEL: --- !Passed +; YAML-NEXT: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Function: test +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '-2' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '2' define <4 x float> @test(ptr %x, float %v, float %a) { ; CHECK-LABEL: define <4 x float> @test( @@ -19,8 +28,9 @@ define <4 x float> @test(ptr %x, float %v, float %a) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[A]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[V]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> 
[[TMP4]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP1]], i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP3]], [[TMP7]] ; CHECK-NEXT: ret <4 x float> [[TMP8]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll index c01c44ff03c153..dadf5992ba288d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -5,25 +5,23 @@ define void @test(i1 %c, ptr %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: -; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP4]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP5]], <2 x i64> [[TMP2]], i64 2) +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG:%.*]], align 8 +; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = 
shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <4 x i32> ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 ; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP10]], i64 0) -; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP11]], <2 x i64> [[TMP8]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <4 x i32> ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i64> [ [[TMP5]], [[IF]] ], [ [[TMP10]], [[ELSE]] ] ; CHECK-NEXT: ret void ; br i1 %c, label %if, label %else diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll index 207b2d45c335e0..d45054b6bebce7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll @@ 
-14,21 +14,7 @@ declare void @unknown() define void @test(ptr %a, ptr %b, ptr %c, ptr %d) { ; VECTOR_DBG-LABEL: @test( ; VECTOR_DBG-NEXT: entry: -; VECTOR_DBG-NEXT: [[L0:%.*]] = load float, ptr [[A:%.*]], align 4 -; VECTOR_DBG-NEXT: [[A1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 -; VECTOR_DBG-NEXT: [[L1:%.*]] = load float, ptr [[A1]], align 4 -; VECTOR_DBG-NEXT: [[A2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3:![0-9]+]], !DIExpression(), [[META5:![0-9]+]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: [[B1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 -; VECTOR_DBG-NEXT: [[B2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 -; VECTOR_DBG-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A2]], align 4 +; VECTOR_DBG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() @@ -57,22 +43,22 @@ define void @test(ptr %a, ptr %b, ptr %c, ptr %d) { ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() -; VECTOR_DBG-NEXT: store float [[L0]], ptr [[B]], align 4 -; VECTOR_DBG-NEXT: store float [[L1]], ptr [[B1]], align 4 -; VECTOR_DBG-NEXT: store <2 x float> [[TMP0]], ptr [[B2]], align 4 +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3:![0-9]+]], !DIExpression(), [[META5:![0-9]+]]) +; VECTOR_DBG-NEXT: 
#dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: store <4 x float> [[TMP0]], ptr [[B:%.*]], align 4 ; VECTOR_DBG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 ; VECTOR_DBG-NEXT: store <4 x float> [[TMP1]], ptr [[D:%.*]], align 4 ; VECTOR_DBG-NEXT: ret void ; ; VECTOR_NODBG-LABEL: @test( ; VECTOR_NODBG-NEXT: entry: -; VECTOR_NODBG-NEXT: [[L0:%.*]] = load float, ptr [[A:%.*]], align 4 -; VECTOR_NODBG-NEXT: [[A1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 -; VECTOR_NODBG-NEXT: [[L1:%.*]] = load float, ptr [[A1]], align 4 -; VECTOR_NODBG-NEXT: [[A2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 -; VECTOR_NODBG-NEXT: [[B1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 -; VECTOR_NODBG-NEXT: [[B2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 -; VECTOR_NODBG-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A2]], align 4 +; VECTOR_NODBG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() @@ -101,9 +87,7 @@ define void @test(ptr %a, ptr %b, ptr %c, ptr %d) { ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() -; VECTOR_NODBG-NEXT: store float [[L0]], ptr [[B]], align 4 -; VECTOR_NODBG-NEXT: store float [[L1]], ptr [[B1]], align 4 -; VECTOR_NODBG-NEXT: store <2 x float> [[TMP0]], ptr [[B2]], align 4 +; VECTOR_NODBG-NEXT: store <4 x float> 
[[TMP0]], ptr [[B:%.*]], align 4 ; VECTOR_NODBG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 ; VECTOR_NODBG-NEXT: store <4 x float> [[TMP1]], ptr [[D:%.*]], align 4 ; VECTOR_NODBG-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 6ca1f8119c1cf0..6825f43b5a9eb4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -14,21 +14,22 @@ define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr { ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4 ; CHECK-NEXT: [[I7:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14 -; CHECK-NEXT: [[I13:%.*]] = load i32, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[I9:%.*]] = load i32, ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 13 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX34]], align 4 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 5 ; CHECK-NEXT: [[I15:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX27]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[I1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[I7]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = 
insertelement <8 x i32> [[TMP6]], i32 [[I13]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[I15]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP8]], <2 x i32> [[TMP1]], i64 4) -; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP2]] -; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[P]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[I9]], i32 4 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[I15]], i32 7 +; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[TMP10]], [[TMP2]] +; CHECK-NEXT: store <8 x i32> [[TMP11]], ptr [[P]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -105,10 +106,11 @@ define dso_local void @test_unordered_splits(ptr nocapture %p) local_unnamed_add ; CHECK-NEXT: [[G20:%.*]] = getelementptr inbounds [16 x i32], ptr [[P2]], i32 0, i64 12 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[G10]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[G20]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP2]], <4 x i32> [[TMP1]], i64 4) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> 
+; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -163,11 +165,14 @@ define dso_local void @test_cost_splits(ptr nocapture %p) local_unnamed_addr { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[G12]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[G20]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[G22]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> poison, <2 x i32> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP4]], <2 x i32> [[TMP1]], i64 2) -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 4) -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 6) -; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> +; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll index 3eabed5882e58b..eb3d395f4c6a6f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -160,8 +160,9 @@ 
define void @tiny_tree_not_fully_vectorizable2(ptr noalias nocapture %dst, ptr n ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP4]], <2 x float> [[TMP2]], i64 2) -; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[DST_ADDR_022]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP6]], ptr [[DST_ADDR_022]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, ptr [[SRC_ADDR_021]], i64 [[I_023]] ; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, ptr [[DST_ADDR_022]], i64 [[I_023]] ; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll index e1b091cc6fcda7..6ac6884ca5377f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll @@ -8,14 +8,14 @@ define void @test(ptr %a, ptr %b) { ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr [4 x float], ptr [[B:%.*]], i64 0, i64 3 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX120]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr 
[[A:%.*]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP6]], <2 x float> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 2 ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP5]], zeroinitializer From 84aa02d3fa1f1f614c4f3c144ec118b2f05ae6b0 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 23 Aug 2024 06:52:09 +0100 Subject: [PATCH 290/426] [memref] Handle edge case in subview of full static size fold (#105635) It is possible to have a subview with a fully static size and a type that matches the source type, but a dynamic offset that may be different. However, currently the memref dialect folds: ```mlir func.func @subview_of_static_full_size( %arg0: memref<16x4xf32, strided<[4, 1], offset: ?>>, %idx: index) -> memref<16x4xf32, strided<[4, 1], offset: ?>> { %0 = memref.subview %arg0[%idx, 0][16, 4][1, 1] : memref<16x4xf32, strided<[4, 1], offset: ?>> to memref<16x4xf32, strided<[4, 1], offset: ?>> return %0 : memref<16x4xf32, strided<[4, 1], offset: ?>> } ``` To: ```mlir func.func @subview_of_static_full_size( %arg0: memref<16x4xf32, strided<[4, 1], offset: ?>>, %arg1: index) -> memref<16x4xf32, strided<[4, 1], offset: ?>> { return %arg0 : memref<16x4xf32, strided<[4, 1], offset: ?>> } ``` Which drops the dynamic offset from the `subview` op. 
--- mlir/include/mlir/IR/BuiltinAttributes.td | 4 ++++ mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 15 +++++++++------ mlir/lib/IR/BuiltinAttributes.cpp | 7 +++++++ mlir/test/Dialect/MemRef/canonicalize.mlir | 13 +++++++++++++ 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/mlir/include/mlir/IR/BuiltinAttributes.td b/mlir/include/mlir/IR/BuiltinAttributes.td index d9295936ee97bd..f0d41754001400 100644 --- a/mlir/include/mlir/IR/BuiltinAttributes.td +++ b/mlir/include/mlir/IR/BuiltinAttributes.td @@ -1012,6 +1012,10 @@ def StridedLayoutAttr : Builtin_Attr<"StridedLayout", "strided_layout", let extraClassDeclaration = [{ /// Print the attribute to the given output stream. void print(raw_ostream &os) const; + + /// Returns true if this layout is static, i.e. the strides and offset all + /// have a known value > 0. + bool hasStaticLayout() const; }]; } diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index 150049e5c5effe..9c021d3613f1c8 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -3279,11 +3279,14 @@ void SubViewOp::getCanonicalizationPatterns(RewritePatternSet &results, } OpFoldResult SubViewOp::fold(FoldAdaptor adaptor) { - auto resultShapedType = llvm::cast(getResult().getType()); - auto sourceShapedType = llvm::cast(getSource().getType()); - - if (resultShapedType.hasStaticShape() && - resultShapedType == sourceShapedType) { + MemRefType sourceMemrefType = getSource().getType(); + MemRefType resultMemrefType = getResult().getType(); + auto resultLayout = + dyn_cast_if_present(resultMemrefType.getLayout()); + + if (resultMemrefType == sourceMemrefType && + resultMemrefType.hasStaticShape() && + (!resultLayout || resultLayout.hasStaticLayout())) { return getViewSource(); } @@ -3301,7 +3304,7 @@ OpFoldResult SubViewOp::fold(FoldAdaptor adaptor) { strides, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 1); }); bool allSizesSame = 
llvm::equal(sizes, srcSizes); if (allOffsetsZero && allStridesOne && allSizesSame && - resultShapedType == sourceShapedType) + resultMemrefType == sourceMemrefType) return getViewSource(); } diff --git a/mlir/lib/IR/BuiltinAttributes.cpp b/mlir/lib/IR/BuiltinAttributes.cpp index 89b1ed67f5d067..8861a940336133 100644 --- a/mlir/lib/IR/BuiltinAttributes.cpp +++ b/mlir/lib/IR/BuiltinAttributes.cpp @@ -229,6 +229,13 @@ void StridedLayoutAttr::print(llvm::raw_ostream &os) const { os << ">"; } +/// Returns true if this layout is static, i.e. the strides and offset all have +/// a known value > 0. +bool StridedLayoutAttr::hasStaticLayout() const { + return !ShapedType::isDynamic(getOffset()) && + !ShapedType::isDynamicShape(getStrides()); +} + /// Returns the strided layout as an affine map. AffineMap StridedLayoutAttr::getAffineMap() const { return makeStridedLinearLayoutMap(getStrides(), getOffset(), getContext()); diff --git a/mlir/test/Dialect/MemRef/canonicalize.mlir b/mlir/test/Dialect/MemRef/canonicalize.mlir index b15af9baca7dc7..02110bc2892d05 100644 --- a/mlir/test/Dialect/MemRef/canonicalize.mlir +++ b/mlir/test/Dialect/MemRef/canonicalize.mlir @@ -70,6 +70,19 @@ func.func @subview_of_static_full_size(%arg0 : memref<4x6x16x32xi8>) -> memref<4 // ----- +// CHECK-LABEL: func @negative_subview_of_static_full_size +// CHECK-SAME: %[[ARG0:.+]]: memref<16x4xf32, strided<[4, 1], offset: ?>> +// CHECK-SAME: %[[IDX:.+]]: index +// CHECK: %[[S:.+]] = memref.subview %[[ARG0]][%[[IDX]], 0] [16, 4] [1, 1] +// CHECK-SAME: to memref<16x4xf32, strided<[4, 1], offset: ?>> +// CHECK: return %[[S]] : memref<16x4xf32, strided<[4, 1], offset: ?>> +func.func @negative_subview_of_static_full_size(%arg0: memref<16x4xf32, strided<[4, 1], offset: ?>>, %idx: index) -> memref<16x4xf32, strided<[4, 1], offset: ?>> { + %0 = memref.subview %arg0[%idx, 0][16, 4][1, 1] : memref<16x4xf32, strided<[4, 1], offset: ?>> to memref<16x4xf32, strided<[4, 1], offset: ?>> + return %0 : memref<16x4xf32, 
strided<[4, 1], offset: ?>> +} + +// ----- + func.func @subview_canonicalize(%arg0 : memref, %arg1 : index, %arg2 : index) -> memref> { From 59721f2326988ece58fab183971f79b71f751b83 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 23 Aug 2024 00:05:20 -0700 Subject: [PATCH 291/426] [MIPS] Optimize sortRelocs for o32 The o32 ABI specifies: > Each relocation type of R_MIPS_HI16 must have an associated R_MIPS_LO16 entry immediately following it in the list of relocations. [...] the addend AHL is computed as (AHI << 16) + (short)ALO In practice, the high-part and low-part relocations may not be adjacent in assembly files, requiring the assembler to reorder relocations. http://reviews.llvm.org/D19718 performed the reordering, but did not optimize for the common case where a %lo immediately follows its matching %hi. The quadratic time complexity could make sections with many relocations very slow to process. This patch implements the fast path, simplifies the code, and makes the behavior more similar to GNU assembler (for the .rel.mips_hilo_8b test). We also remove `OriginalSymbol`, removing overhead for other targets. Fix #104562 Pull Request: https://github.com/llvm/llvm-project/pull/104723 --- llvm/include/llvm/MC/MCELFObjectWriter.h | 8 +- llvm/lib/MC/ELFObjectWriter.cpp | 4 +- .../Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 127 ++++++------------ llvm/test/MC/Mips/sort-relocation-table.s | 6 +- 4 files changed, 49 insertions(+), 96 deletions(-) diff --git a/llvm/include/llvm/MC/MCELFObjectWriter.h b/llvm/include/llvm/MC/MCELFObjectWriter.h index d17fc931d1561c..b09e3bbffad329 100644 --- a/llvm/include/llvm/MC/MCELFObjectWriter.h +++ b/llvm/include/llvm/MC/MCELFObjectWriter.h @@ -37,16 +37,14 @@ struct ELFRelocationEntry { const MCSymbolELF *Symbol; // The symbol to relocate with. unsigned Type; // The type of the relocation. uint64_t Addend; // The addend to use. - const MCSymbolELF *OriginalSymbol; // The original value of Symbol if we changed it. 
ELFRelocationEntry(uint64_t Offset, const MCSymbolELF *Symbol, unsigned Type, - uint64_t Addend, const MCSymbolELF *OriginalSymbol) - : Offset(Offset), Symbol(Symbol), Type(Type), Addend(Addend), - OriginalSymbol(OriginalSymbol) {} + uint64_t Addend) + : Offset(Offset), Symbol(Symbol), Type(Type), Addend(Addend) {} void print(raw_ostream &Out) const { Out << "Off=" << Offset << ", Sym=" << Symbol << ", Type=" << Type - << ", Addend=" << Addend << ", OriginalSymbol=" << OriginalSymbol; + << ", Addend=" << Addend; } LLVM_DUMP_METHOD void dump() const { print(errs()); } diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index 35d0a2eb52dfc7..62a4b5347f8299 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -1453,7 +1453,7 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm, SecA ? cast(SecA->getBeginSymbol()) : nullptr; if (SectionSymbol) SectionSymbol->setUsedInReloc(); - ELFRelocationEntry Rec(FixupOffset, SectionSymbol, Type, Addend, SymA); + ELFRelocationEntry Rec(FixupOffset, SectionSymbol, Type, Addend); Relocations[&FixupSection].push_back(Rec); return; } @@ -1468,7 +1468,7 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm, else RenamedSymA->setUsedInReloc(); } - ELFRelocationEntry Rec(FixupOffset, RenamedSymA, Type, Addend, SymA); + ELFRelocationEntry Rec(FixupOffset, RenamedSymA, Type, Addend); Relocations[&FixupSection].push_back(Rec); } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 19b6bf0da22a60..faf9772ab75756 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -40,11 +40,6 @@ struct MipsRelocationEntry { bool Matched = false; ///< Is this relocation part of a match. 
MipsRelocationEntry(const ELFRelocationEntry &R) : R(R) {} - - void print(raw_ostream &Out) const { - R.print(Out); - Out << ", Matched=" << Matched; - } }; class MipsELFObjectWriter : public MCELFObjectTargetWriter { @@ -134,8 +129,7 @@ static unsigned getMatchingLoType(const ELFRelocationEntry &Reloc) { if (Type == ELF::R_MIPS16_HI16) return ELF::R_MIPS16_LO16; - if (Reloc.OriginalSymbol && - Reloc.OriginalSymbol->getBinding() != ELF::STB_LOCAL) + if (Reloc.Symbol && Reloc.Symbol->getBinding() != ELF::STB_LOCAL) return ELF::R_MIPS_NONE; if (Type == ELF::R_MIPS_GOT16) @@ -148,43 +142,11 @@ static unsigned getMatchingLoType(const ELFRelocationEntry &Reloc) { return ELF::R_MIPS_NONE; } -/// Determine whether a relocation (X) matches the one given in R. -/// -/// A relocation matches if: -/// - It's type matches that of a corresponding low part. This is provided in -/// MatchingType for efficiency. -/// - It's based on the same symbol. -/// - It's offset of greater or equal to that of the one given in R. -/// It should be noted that this rule assumes the programmer does not use -/// offsets that exceed the alignment of the symbol. The carry-bit will be -/// incorrect if this is not true. -/// -/// A matching relocation is unbeatable if: -/// - It is not already involved in a match. -/// - It's offset is exactly that of the one given in R. -static FindBestPredicateResult isMatchingReloc(const MipsRelocationEntry &X, - const ELFRelocationEntry &R, - unsigned MatchingType) { - if (X.R.Type == MatchingType && X.R.OriginalSymbol == R.OriginalSymbol) { - if (!X.Matched && X.R.Addend == R.Addend) - return FindBest_PerfectMatch; - else if (X.R.Addend >= R.Addend) - return FindBest_Match; - } - return FindBest_NoMatch; -} - -/// Determine whether Candidate or PreviousBest is the better match. -/// The return value is true if Candidate is the better match. -/// -/// A matching relocation is a better match if: -/// - It has a smaller addend. 
-/// - It is not already involved in a match. -static bool compareMatchingRelocs(const MipsRelocationEntry &Candidate, - const MipsRelocationEntry &PreviousBest) { - if (Candidate.R.Addend != PreviousBest.R.Addend) - return Candidate.R.Addend < PreviousBest.R.Addend; - return PreviousBest.Matched && !Candidate.Matched; +// Determine whether a relocation X is a low-part and matches the high-part R +// perfectly by symbol and addend. +static bool isMatchingReloc(unsigned MatchingType, const ELFRelocationEntry &R, + const ELFRelocationEntry &X) { + return X.Type == MatchingType && X.Symbol == R.Symbol && X.Addend == R.Addend; } MipsELFObjectWriter::MipsELFObjectWriter(uint8_t OSABI, @@ -413,58 +375,51 @@ void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm, if (hasRelocationAddend()) return; - if (Relocs.size() < 2) - return; - // Sort relocations by the address they are applied to. llvm::sort(Relocs, [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) { return A.Offset < B.Offset; }); + // Place relocations in a list for reorder convenience. Hi16 contains the + // iterators of high-part relocations. std::list Sorted; - std::list Remainder; - - // Separate the movable relocations (AHL relocations using the high bits) from - // the immobile relocations (everything else). This does not preserve high/low - // matches that already existed in the input. 
- copy_if_else(Relocs.begin(), Relocs.end(), std::back_inserter(Remainder), - std::back_inserter(Sorted), [](const ELFRelocationEntry &Reloc) { - return getMatchingLoType(Reloc) != ELF::R_MIPS_NONE; - }); + SmallVector::iterator, 0> Hi16; + for (auto &R : Relocs) { + Sorted.push_back(R); + if (getMatchingLoType(R) != ELF::R_MIPS_NONE) + Hi16.push_back(std::prev(Sorted.end())); + } - for (auto &R : Remainder) { + for (auto I : Hi16) { + auto &R = I->R; unsigned MatchingType = getMatchingLoType(R); - assert(MatchingType != ELF::R_MIPS_NONE && - "Wrong list for reloc that doesn't need a match"); - - // Find the best matching relocation for the current high part. - // See isMatchingReloc for a description of a matching relocation and - // compareMatchingRelocs for a description of what 'best' means. - auto InsertionPoint = - find_best(Sorted.begin(), Sorted.end(), - [&R, &MatchingType](const MipsRelocationEntry &X) { - return isMatchingReloc(X, R, MatchingType); - }, - compareMatchingRelocs); - - // If we matched then insert the high part in front of the match and mark - // both relocations as being involved in a match. We only mark the high - // part for cosmetic reasons in the debug output. + // If the next relocation is a perfect match, continue; + if (std::next(I) != Sorted.end() && + isMatchingReloc(MatchingType, R, std::next(I)->R)) + continue; + // Otherwise, find the best matching low-part relocation with the following + // criteria. It must have the same symbol and its addend is no lower than + // that of the current high-part. // - // If we failed to find a match then the high part is orphaned. This is not - // permitted since the relocation cannot be evaluated without knowing the - // carry-in. We can sometimes handle this using a matching low part that is - // already used in a match but we already cover that case in - // isMatchingReloc and compareMatchingRelocs. For the remaining cases we - // should insert the high part at the end of the list. 
This will cause the - // linker to fail but the alternative is to cause the linker to bind the - // high part to a semi-matching low part and silently calculate the wrong - // value. Unfortunately we have no means to warn the user that we did this - // so leave it up to the linker to complain about it. - if (InsertionPoint != Sorted.end()) - InsertionPoint->Matched = true; - Sorted.insert(InsertionPoint, R)->Matched = true; + // (1) %lo with a smaller offset is preferred. + // (2) %lo with the same offset that is unmatched is preferred. + // (3) later %lo is preferred. + auto Best = Sorted.end(); + for (auto J = Sorted.begin(); J != Sorted.end(); ++J) { + auto &R1 = J->R; + if (R1.Type == MatchingType && R.Symbol == R1.Symbol && + R.Addend <= R1.Addend && + (Best == Sorted.end() || R1.Addend < Best->R.Addend || + (!Best->Matched && R1.Addend == Best->R.Addend))) + Best = J; + } + if (Best != Sorted.end() && R.Addend == Best->R.Addend) + Best->Matched = true; + + // Move the high-part before the low-part, or if not found, the end of the + // list. The unmatched high-part will lead to a linker warning/error. 
+ Sorted.splice(Best, Sorted, I); } assert(Relocs.size() == Sorted.size() && "Some relocs were not consumed"); diff --git a/llvm/test/MC/Mips/sort-relocation-table.s b/llvm/test/MC/Mips/sort-relocation-table.s index cc951956fd24a0..7d126ba9f049d8 100644 --- a/llvm/test/MC/Mips/sort-relocation-table.s +++ b/llvm/test/MC/Mips/sort-relocation-table.s @@ -150,8 +150,8 @@ lui $2, %hi(sym1) # CHECK-LABEL: Section ({{[0-9]+}}) .rel.mips_hilo_8b { -# CHECK-NEXT: 0x8 R_MIPS_HI16 sym1 # CHECK-NEXT: 0x0 R_MIPS_LO16 sym1 +# CHECK-NEXT: 0x8 R_MIPS_HI16 sym1 # CHECK-NEXT: 0x4 R_MIPS_LO16 sym1 # CHECK-NEXT: } @@ -331,8 +331,8 @@ lui $2, %got(local1) # CHECK-LABEL: Section ({{[0-9]+}}) .rel.mips_gotlo_8b { -# CHECK-NEXT: 0x8 R_MIPS_GOT16 .text # CHECK-NEXT: 0x0 R_MIPS_LO16 .text +# CHECK-NEXT: 0x8 R_MIPS_GOT16 .text # CHECK-NEXT: 0x4 R_MIPS_LO16 .text # CHECK-NEXT: } @@ -372,9 +372,9 @@ # CHECK-LABEL: Section ({{[0-9]+}}) .rel.mips_gotlo_10 { # CHECK-NEXT: 0x0 R_MIPS_GOT16 .text # CHECK-NEXT: 0x4 R_MIPS_LO16 .text +# CHECK-NEXT: 0x8 R_MIPS_GOT16 .text # CHECK-NEXT: 0xC R_MIPS_GOT16 .text # CHECK-NEXT: 0x10 R_MIPS_LO16 .text -# CHECK-NEXT: 0x8 R_MIPS_GOT16 .text # CHECK-NEXT: } # Finally, do test 2 for R_MIPS_GOT16 on external symbols to prove they are From a69ba0a5f911ebdfd59b399e82ded8143e89e6cd Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 23 Aug 2024 09:16:22 +0200 Subject: [PATCH 292/426] [clang][bytecode][NFC] Get rid of const_casts in Move fns (#105698) --- clang/lib/AST/ByteCode/Descriptor.cpp | 23 ++++++++++------------- clang/lib/AST/ByteCode/Descriptor.h | 2 +- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp index 47b8885c9ae379..170203fe818775 100644 --- a/clang/lib/AST/ByteCode/Descriptor.cpp +++ b/clang/lib/AST/ByteCode/Descriptor.cpp @@ -31,10 +31,9 @@ static void dtorTy(Block *, std::byte *Ptr, const Descriptor *) { } template -static void moveTy(Block *, const 
std::byte *Src, std::byte *Dst, +static void moveTy(Block *, std::byte *Src, std::byte *Dst, const Descriptor *) { - // FIXME: Get rid of the const_cast. - auto *SrcPtr = reinterpret_cast(const_cast(Src)); + auto *SrcPtr = reinterpret_cast(Src); auto *DstPtr = reinterpret_cast(Dst); new (DstPtr) T(std::move(*SrcPtr)); } @@ -63,11 +62,9 @@ static void dtorArrayTy(Block *, std::byte *Ptr, const Descriptor *D) { } template -static void moveArrayTy(Block *, const std::byte *Src, std::byte *Dst, +static void moveArrayTy(Block *, std::byte *Src, std::byte *Dst, const Descriptor *D) { - // FIXME: Get rid of the const_cast. - InitMapPtr &SrcIMP = - *reinterpret_cast(const_cast(Src)); + InitMapPtr &SrcIMP = *reinterpret_cast(Src); if (SrcIMP) { // We only ever invoke the moveFunc when moving block contents to a // DeadBlock. DeadBlocks don't need InitMaps, so we destroy them here. @@ -76,7 +73,7 @@ static void moveArrayTy(Block *, const std::byte *Src, std::byte *Dst, Src += sizeof(InitMapPtr); Dst += sizeof(InitMapPtr); for (unsigned I = 0, NE = D->getNumElems(); I < NE; ++I) { - auto *SrcPtr = &reinterpret_cast(const_cast(Src))[I]; + auto *SrcPtr = &reinterpret_cast(Src)[I]; auto *DstPtr = &reinterpret_cast(Dst)[I]; new (DstPtr) T(std::move(*SrcPtr)); } @@ -126,7 +123,7 @@ static void dtorArrayDesc(Block *B, std::byte *Ptr, const Descriptor *D) { } } -static void moveArrayDesc(Block *B, const std::byte *Src, std::byte *Dst, +static void moveArrayDesc(Block *B, std::byte *Src, std::byte *Dst, const Descriptor *D) { const unsigned NumElems = D->getNumElems(); const unsigned ElemSize = @@ -134,11 +131,11 @@ static void moveArrayDesc(Block *B, const std::byte *Src, std::byte *Dst, unsigned ElemOffset = 0; for (unsigned I = 0; I < NumElems; ++I, ElemOffset += ElemSize) { - const auto *SrcPtr = Src + ElemOffset; + auto *SrcPtr = Src + ElemOffset; auto *DstPtr = Dst + ElemOffset; - const auto *SrcDesc = reinterpret_cast(SrcPtr); - const auto *SrcElemLoc = 
reinterpret_cast(SrcDesc + 1); + auto *SrcDesc = reinterpret_cast(SrcPtr); + auto *SrcElemLoc = reinterpret_cast(SrcDesc + 1); auto *DstDesc = reinterpret_cast(DstPtr); auto *DstElemLoc = reinterpret_cast(DstDesc + 1); @@ -233,7 +230,7 @@ static void dtorRecord(Block *B, std::byte *Ptr, const Descriptor *D) { destroyBase(B, Ptr, F.Desc, F.Offset); } -static void moveRecord(Block *B, const std::byte *Src, std::byte *Dst, +static void moveRecord(Block *B, std::byte *Src, std::byte *Dst, const Descriptor *D) { assert(D); assert(D->ElemRecord); diff --git a/clang/lib/AST/ByteCode/Descriptor.h b/clang/lib/AST/ByteCode/Descriptor.h index 41899c3bd6831a..82f90430f7f4e5 100644 --- a/clang/lib/AST/ByteCode/Descriptor.h +++ b/clang/lib/AST/ByteCode/Descriptor.h @@ -44,7 +44,7 @@ using BlockDtorFn = void (*)(Block *Storage, std::byte *FieldPtr, /// blocks are persisted: the move function copies all inline descriptors and /// non-trivial fields, as existing pointers might need to reference those /// descriptors. Data is not copied since it cannot be legally read. -using BlockMoveFn = void (*)(Block *Storage, const std::byte *SrcFieldPtr, +using BlockMoveFn = void (*)(Block *Storage, std::byte *SrcFieldPtr, std::byte *DstFieldPtr, const Descriptor *FieldDesc); From e5f196e4e7e3aec5c19adeacb7191ed0a099ea9a Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Fri, 23 Aug 2024 15:35:53 +0800 Subject: [PATCH 293/426] [NFCI] [C++20] [Modules] Relax the case for duplicated declaration in multiple module units for explicit specialization Relax the case for duplicated declaration in multiple module units for explicit specialization and refactor the implementation of checkMultipleDefinitionInNamedModules a little bit. This is intended to not affect any end users since it only relaxes the condition to emit an error. 
--- clang/lib/Serialization/ASTReaderDecl.cpp | 27 +++++++++++------------ 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index ef160228933c59..a6da103ed9799e 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -3692,12 +3692,6 @@ static void inheritDefaultTemplateArguments(ASTContext &Context, // the program is ill-formed; static void checkMultipleDefinitionInNamedModules(ASTReader &Reader, Decl *D, Decl *Previous) { - Module *M = Previous->getOwningModule(); - - // We only care about the case in named modules. - if (!M || !M->isNamedModule()) - return; - // If it is previous implcitly introduced, it is not meaningful to // diagnose it. if (Previous->isImplicit()) @@ -3714,16 +3708,21 @@ static void checkMultipleDefinitionInNamedModules(ASTReader &Reader, Decl *D, // FIXME: Maybe this shows the implicit instantiations may have incorrect // module owner ships. But given we've finished the compilation of a module, // how can we add new entities to that module? - if (auto *VTSD = dyn_cast(Previous); - VTSD && !VTSD->isExplicitSpecialization()) + if (isa(Previous)) return; - if (auto *CTSD = dyn_cast(Previous); - CTSD && !CTSD->isExplicitSpecialization()) + if (isa(Previous)) + return; + if (auto *Func = dyn_cast(Previous); + Func && Func->getTemplateSpecializationInfo()) + return; + + Module *M = Previous->getOwningModule(); + if (!M) + return; + + // We only forbids merging decls within named modules. + if (!M->isNamedModule()) return; - if (auto *Func = dyn_cast(Previous)) - if (auto *FTSI = Func->getTemplateSpecializationInfo(); - FTSI && !FTSI->isExplicitSpecialization()) - return; // It is fine if they are in the same module. 
if (Reader.getContext().isInSameModule(M, D->getOwningModule())) From 39986f0b4d797e4ad3c12607f2b4abe2322b82bb Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Fri, 23 Aug 2024 15:43:39 +0800 Subject: [PATCH 294/426] [NFCI] [Serialization] Use demoteThisDefinitionToDeclaration instead of setCompleteDefinition(false) for CXXRecordDecl When we merge the definition for CXXRecordDecl, we would use setCompleteDefinition(false) to mark the merged definition. But this was not the correct/good interface. We can't know that the merged definition was a definition then. And actually, we provided an interface for this: demoteThisDefinitionToDeclaration. So this patch tries to use the correct API. This was found in the downstream developing. This is not strictly NFC but it is intended to be NFC for every end users. --- clang/lib/Serialization/ASTReaderDecl.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index a6da103ed9799e..4d9d024796716e 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -2064,7 +2064,7 @@ void ASTDeclMerger::MergeDefinitionData( Reader.MergedDeclContexts.insert(std::make_pair(MergeDD.Definition, DD.Definition)); Reader.PendingDefinitions.erase(MergeDD.Definition); - MergeDD.Definition->setCompleteDefinition(false); + MergeDD.Definition->demoteThisDefinitionToDeclaration(); Reader.mergeDefinitionVisibility(DD.Definition, MergeDD.Definition); assert(!Reader.Lookups.contains(MergeDD.Definition) && "already loaded pending lookups for merged definition"); @@ -2175,6 +2175,9 @@ void ASTDeclReader::ReadCXXRecordDefinition(CXXRecordDecl *D, bool Update, D->DefinitionData = Canon->DefinitionData; ReadCXXDefinitionData(*DD, D, LambdaContext, IndexInLambdaContext); + // Mark this declaration as being a definition. 
+ D->setCompleteDefinition(true); + // We might already have a different definition for this record. This can // happen either because we're reading an update record, or because we've // already done some merging. Either way, just merge into it. @@ -2183,9 +2186,6 @@ void ASTDeclReader::ReadCXXRecordDefinition(CXXRecordDecl *D, bool Update, return; } - // Mark this declaration as being a definition. - D->setCompleteDefinition(true); - // If this is not the first declaration or is an update record, we can have // other redeclarations already. Make a note that we need to propagate the // DefinitionData pointer onto them. From 85b6aac7c25f9d2a976a76045ace1e7afebb5965 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 23 Aug 2024 16:06:00 +0800 Subject: [PATCH 295/426] [ConstraintElim] Fix miscompilation caused by PR97974 (#105790) Fixes https://github.com/llvm/llvm-project/issues/105785. --- .../Scalar/ConstraintElimination.cpp | 2 +- .../ConstraintElimination/pr105785.ll | 46 +++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/ConstraintElimination/pr105785.ll diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 84ccf06d16d5e8..6565aed4bc390c 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -1464,7 +1464,7 @@ static bool checkAndReplaceCmp(CmpIntrinsic *I, ConstraintInfo &Info, ToRemove.push_back(I); return true; } - if (checkCondition(ICmpInst::ICMP_EQ, LHS, RHS, I, Info)) { + if (checkCondition(ICmpInst::ICMP_EQ, LHS, RHS, I, Info).value_or(false)) { I->replaceAllUsesWith(ConstantInt::get(I->getType(), 0)); ToRemove.push_back(I); return true; diff --git a/llvm/test/Transforms/ConstraintElimination/pr105785.ll b/llvm/test/Transforms/ConstraintElimination/pr105785.ll new file mode 100644 index 00000000000000..6c340a11dd2e2c --- /dev/null +++ 
b/llvm/test/Transforms/ConstraintElimination/pr105785.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=constraint-elimination -S %s | FileCheck %s + +define void @pr105785(ptr %p) { +; CHECK-LABEL: define void @pr105785( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[FOR_IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 1, %[[FOR_COND1:.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[FOR_IND]], 0 +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_COND1]], label %[[FOR_END6:.*]] +; CHECK: [[FOR_COND1]]: +; CHECK-NEXT: [[FOR_IND2:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY3:.*]] ], [ 0, %[[FOR_COND]] ] +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[FOR_IND2]], 3 +; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY3]], label %[[FOR_COND]] +; CHECK: [[FOR_BODY3]]: +; CHECK-NEXT: [[SCMP:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[FOR_IND]], i32 1) +; CHECK-NEXT: store i32 [[SCMP]], ptr [[P]], align 4 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[FOR_IND2]], 1 +; CHECK-NEXT: br label %[[FOR_COND1]] +; CHECK: [[FOR_END6]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.cond + +for.cond: ; preds = %for.cond1, %entry + %for.ind = phi i32 [ 0, %entry ], [ 1, %for.cond1 ] + %cmp = icmp eq i32 %for.ind, 0 + br i1 %cmp, label %for.cond1, label %for.end6 + +for.cond1: ; preds = %for.cond, %for.body3 + %for.ind2 = phi i32 [ %inc, %for.body3 ], [ 0, %for.cond ] + %cmp2 = icmp ult i32 %for.ind2, 3 + br i1 %cmp2, label %for.body3, label %for.cond + +for.body3: ; preds = %for.cond1 + %scmp = call i32 @llvm.scmp.i32.i32(i32 %for.ind, i32 1) + store i32 %scmp, ptr %p, align 4 + %inc = add nuw nsw i32 %for.ind2, 1 + br label %for.cond1 + +for.end6: + ret void +} From 28133d9159e814160fa622de6ffdcf36dd25f9d7 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 22 Aug 2024 15:55:28 +0100 Subject: [PATCH 296/426] [AArch64] Add 
Add/Sub/Mul test coverage for GISel. NFC --- llvm/test/CodeGen/AArch64/add.ll | 501 ++++++++++++++++++++++++++ llvm/test/CodeGen/AArch64/mul.ll | 600 +++++++++++++++++++++++++++++++ llvm/test/CodeGen/AArch64/sub.ll | 501 ++++++++++++++++++++++++++ 3 files changed, 1602 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/add.ll create mode 100644 llvm/test/CodeGen/AArch64/mul.ll create mode 100644 llvm/test/CodeGen/AArch64/sub.ll diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll new file mode 100644 index 00000000000000..39d1933f0e7b97 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -0,0 +1,501 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for v2i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i128 + +define i8 @i8(i8 %a, i8 %b) { +; CHECK-LABEL: i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = add i8 %a, %b + ret i8 %s +} + +define i16 @i16(i16 %a, i16 %b) { +; CHECK-LABEL: i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = add i16 %a, %b + ret i16 %s +} + +define i32 @i32(i32 %a, i32 %b) { +; CHECK-LABEL: i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = add i32 %a, %b + ret i32 %s +} + +define i64 @i64(i64 %a, i64 %b) { +; CHECK-LABEL: i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add x0, x0, x1 +; CHECK-NEXT: ret +entry: + %s = add i64 %a, %b + ret i64 %s +} + +define i128 
@i128(i128 %a, i128 %b) { +; CHECK-LABEL: i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adds x0, x0, x2 +; CHECK-NEXT: adc x1, x1, x3 +; CHECK-NEXT: ret +entry: + %s = add i128 %a, %b + ret i128 %s +} + +define void @v2i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: v2i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-SD-NEXT: add x8, x0, #1 +; CHECK-SD-NEXT: add x9, x1, #1 +; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr b0, [x0] +; CHECK-GI-NEXT: ldr b1, [x0, #1] +; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ldr b3, [x1, #1] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] +; CHECK-GI-NEXT: add v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: str b0, [x0] +; CHECK-GI-NEXT: str b1, [x0, #1] +; CHECK-GI-NEXT: ret +entry: + %d = load <2 x i8>, ptr %p1 + %e = load <2 x i8>, ptr %p2 + %s = add <2 x i8> %d, %e + store <2 x i8> %s, ptr %p1 + ret void +} + +define void @v3i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: v3i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b +; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: umov w8, v0.h[2] +; CHECK-SD-NEXT: str s1, [sp, #12] +; CHECK-SD-NEXT: ldrh w9, [sp, #12] +; CHECK-SD-NEXT: strb w8, [x0, #2] +; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3i8: +; CHECK-GI: // %bb.0: // %entry 
+; CHECK-GI-NEXT: ldrb w8, [x0] +; CHECK-GI-NEXT: ldrb w9, [x0, #1] +; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w11, [x1, #1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: fmov s2, w10 +; CHECK-GI-NEXT: fmov s3, w11 +; CHECK-GI-NEXT: ldrb w8, [x0, #2] +; CHECK-GI-NEXT: ldrb w9, [x1, #2] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] +; CHECK-GI-NEXT: add v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov h2, v0.h[2] +; CHECK-GI-NEXT: str b0, [x0] +; CHECK-GI-NEXT: str b1, [x0, #1] +; CHECK-GI-NEXT: str b2, [x0, #2] +; CHECK-GI-NEXT: ret +entry: + %d = load <3 x i8>, ptr %p1 + %e = load <3 x i8>, ptr %p2 + %s = add <3 x i8> %d, %e + store <3 x i8> %s, ptr %p1 + ret void +} + +define void @v4i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: v4i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v4i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr w8, [x0] +; CHECK-GI-NEXT: ldr w9, [x1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v0.b[3] +; CHECK-GI-NEXT: mov b6, v1.b[3] +; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v1.b[2] +; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] +; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] +; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: mov h1, v0.h[1] 
+; CHECK-GI-NEXT: mov h2, v0.h[2] +; CHECK-GI-NEXT: mov h3, v0.h[3] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: str w8, [x0] +; CHECK-GI-NEXT: ret +entry: + %d = load <4 x i8>, ptr %p1 + %e = load <4 x i8>, ptr %p2 + %s = add <4 x i8> %d, %e + store <4 x i8> %s, ptr %p1 + ret void +} + +define <8 x i8> @v8i8(<8 x i8> %d, <8 x i8> %e) { +; CHECK-LABEL: v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %s = add <8 x i8> %d, %e + ret <8 x i8> %s +} + +define <16 x i8> @v16i8(<16 x i8> %d, <16 x i8> %e) { +; CHECK-LABEL: v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = add <16 x i8> %d, %e + ret <16 x i8> %s +} + +define <32 x i8> @v32i8(<32 x i8> %d, <32 x i8> %e) { +; CHECK-SD-LABEL: v32i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: add v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v32i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: add v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %s = add <32 x i8> %d, %e + ret <32 x i8> %s +} + +define void @v2i16(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: v2i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] +; CHECK-SD-NEXT: add x8, x0, #2 +; CHECK-SD-NEXT: add x9, x1, #2 +; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: strh w8, [x0, #2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] 
+; CHECK-GI-NEXT: ldr h2, [x1] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: str h1, [x0, #2] +; CHECK-GI-NEXT: ret +entry: + %d = load <2 x i16>, ptr %p1 + %e = load <2 x i16>, ptr %p2 + %s = add <2 x i16> %d, %e + store <2 x i16> %s, ptr %p1 + ret void +} + +define void @v3i16(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: v3i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr d0, [x0] +; CHECK-SD-NEXT: ldr d1, [x1] +; CHECK-SD-NEXT: add x8, x0, #4 +; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: ldr h2, [x1] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x0, #4 +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: ldr h1, [x0, #4] +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h3, [x1, #4] +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] +; CHECK-GI-NEXT: add v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ret +entry: + %d = load <3 x i16>, ptr %p1 + %e = load <3 x i16>, ptr %p2 + %s = add <3 x i16> %d, %e + store <3 x i16> %s, ptr %p1 + ret void +} + +define <4 x i16> @v4i16(<4 x i16> %d, <4 x i16> %e) { +; CHECK-LABEL: v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret +entry: + %s = add <4 x i16> %d, %e + ret <4 x i16> %s +} + +define <8 x i16> @v8i16(<8 x i16> %d, <8 x i16> %e) { +; CHECK-LABEL: v8i16: +; CHECK: // %bb.0: // 
%entry +; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret +entry: + %s = add <8 x i16> %d, %e + ret <8 x i16> %s +} + +define <16 x i16> @v16i16(<16 x i16> %d, <16 x i16> %e) { +; CHECK-SD-LABEL: v16i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v1.8h, v1.8h, v3.8h +; CHECK-SD-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v16i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: add v1.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: ret +entry: + %s = add <16 x i16> %d, %e + ret <16 x i16> %s +} + +define <2 x i32> @v2i32(<2 x i32> %d, <2 x i32> %e) { +; CHECK-LABEL: v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret +entry: + %s = add <2 x i32> %d, %e + ret <2 x i32> %s +} + +define <3 x i32> @v3i32(<3 x i32> %d, <3 x i32> %e) { +; CHECK-LABEL: v3i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +entry: + %s = add <3 x i32> %d, %e + ret <3 x i32> %s +} + +define <4 x i32> @v4i32(<4 x i32> %d, <4 x i32> %e) { +; CHECK-LABEL: v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +entry: + %s = add <4 x i32> %d, %e + ret <4 x i32> %s +} + +define <8 x i32> @v8i32(<8 x i32> %d, <8 x i32> %e) { +; CHECK-SD-LABEL: v8i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v8i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: ret +entry: + %s = add <8 x i32> %d, %e + ret <8 x i32> %s +} + +define <2 x i64> @v2i64(<2 x i64> %d, <2 x i64> %e) { +; CHECK-LABEL: v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret +entry: + %s = add <2 x i64> %d, %e + ret <2 x i64> %s +} + +define <3 x i64> @v3i64(<3 x i64> %d, <3 x i64> %e) { +; 
CHECK-SD-LABEL: v3i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add d0, d0, d3 +; CHECK-SD-NEXT: add d1, d1, d4 +; CHECK-SD-NEXT: add d2, d2, d5 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4 +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: fmov x9, d5 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: add x8, x8, x9 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = add <3 x i64> %d, %e + ret <3 x i64> %s +} + +define <4 x i64> @v4i64(<4 x i64> %d, <4 x i64> %e) { +; CHECK-SD-LABEL: v4i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v1.2d, v1.2d, v3.2d +; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v4i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: add v1.2d, v1.2d, v3.2d +; CHECK-GI-NEXT: ret +entry: + %s = add <4 x i64> %d, %e + ret <4 x i64> %s +} + +define <2 x i128> @v2i128(<2 x i128> %d, <2 x i128> %e) { +; CHECK-LABEL: v2i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adds x0, x0, x4 +; CHECK-NEXT: adc x1, x1, x5 +; CHECK-NEXT: adds x2, x2, x6 +; CHECK-NEXT: adc x3, x3, x7 +; CHECK-NEXT: ret +entry: + %s = add <2 x i128> %d, %e + ret <2 x i128> %s +} + +define <3 x i128> @v3i128(<3 x i128> %d, <3 x i128> %e) { +; CHECK-LABEL: v3i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x8, x9, [sp] +; CHECK-NEXT: adds x0, x0, x6 +; CHECK-NEXT: ldp x10, x11, [sp, #16] +; CHECK-NEXT: adc x1, x1, x7 +; CHECK-NEXT: adds x2, x2, x8 +; CHECK-NEXT: adc x3, x3, x9 +; CHECK-NEXT: adds x4, x4, x10 +; CHECK-NEXT: adc x5, x5, 
x11 +; CHECK-NEXT: ret +entry: + %s = add <3 x i128> %d, %e + ret <3 x i128> %s +} + +define <4 x i128> @v4i128(<4 x i128> %d, <4 x i128> %e) { +; CHECK-LABEL: v4i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x8, x9, [sp] +; CHECK-NEXT: ldp x11, x10, [sp, #16] +; CHECK-NEXT: ldp x13, x12, [sp, #32] +; CHECK-NEXT: adds x0, x0, x8 +; CHECK-NEXT: adc x1, x1, x9 +; CHECK-NEXT: ldp x8, x9, [sp, #48] +; CHECK-NEXT: adds x2, x2, x11 +; CHECK-NEXT: adc x3, x3, x10 +; CHECK-NEXT: adds x4, x4, x13 +; CHECK-NEXT: adc x5, x5, x12 +; CHECK-NEXT: adds x6, x6, x8 +; CHECK-NEXT: adc x7, x7, x9 +; CHECK-NEXT: ret +entry: + %s = add <4 x i128> %d, %e + ret <4 x i128> %s +} diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll new file mode 100644 index 00000000000000..d2804329f1e255 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -0,0 +1,600 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for v2i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i128 + +define i8 @i8(i8 %a, i8 %b) { +; CHECK-LABEL: i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = mul i8 %a, %b + ret i8 %s +} + +define i16 @i16(i16 %a, i16 %b) { +; CHECK-LABEL: i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = mul i16 %a, %b + ret i16 %s +} + +define i32 @i32(i32 %a, i32 %b) { +; CHECK-LABEL: i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = 
mul i32 %a, %b + ret i32 %s +} + +define i64 @i64(i64 %a, i64 %b) { +; CHECK-LABEL: i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul x0, x0, x1 +; CHECK-NEXT: ret +entry: + %s = mul i64 %a, %b + ret i64 %s +} + +define i128 @i128(i128 %a, i128 %b) { +; CHECK-SD-LABEL: i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: umulh x8, x0, x2 +; CHECK-SD-NEXT: madd x8, x0, x3, x8 +; CHECK-SD-NEXT: mul x0, x0, x2 +; CHECK-SD-NEXT: madd x1, x1, x2, x8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mul x9, x0, x3 +; CHECK-GI-NEXT: mul x8, x0, x2 +; CHECK-GI-NEXT: umulh x10, x0, x2 +; CHECK-GI-NEXT: madd x9, x1, x2, x9 +; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: add x1, x9, x10 +; CHECK-GI-NEXT: ret +entry: + %s = mul i128 %a, %b + ret i128 %s +} + +define void @v2i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: v2i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-SD-NEXT: add x8, x0, #1 +; CHECK-SD-NEXT: add x9, x1, #1 +; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr b0, [x0] +; CHECK-GI-NEXT: ldr b1, [x0, #1] +; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ldr b3, [x1, #1] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] +; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: str b0, [x0] +; CHECK-GI-NEXT: str b1, [x0, #1] +; CHECK-GI-NEXT: ret +entry: + %d = load <2 x i8>, ptr %p1 + %e = load <2 x i8>, ptr %p2 + %s = mul <2 x i8> %d, %e + store <2 x i8> %s, ptr %p1 + ret void +} + +define void @v3i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: v3i8: +; CHECK-SD: // %bb.0: // 
%entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b +; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: umov w8, v0.h[2] +; CHECK-SD-NEXT: str s1, [sp, #12] +; CHECK-SD-NEXT: ldrh w9, [sp, #12] +; CHECK-SD-NEXT: strb w8, [x0, #2] +; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldrb w8, [x0] +; CHECK-GI-NEXT: ldrb w9, [x0, #1] +; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w11, [x1, #1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: fmov s2, w10 +; CHECK-GI-NEXT: fmov s3, w11 +; CHECK-GI-NEXT: ldrb w8, [x0, #2] +; CHECK-GI-NEXT: ldrb w9, [x1, #2] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] +; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov h2, v0.h[2] +; CHECK-GI-NEXT: str b0, [x0] +; CHECK-GI-NEXT: str b1, [x0, #1] +; CHECK-GI-NEXT: str b2, [x0, #2] +; CHECK-GI-NEXT: ret +entry: + %d = load <3 x i8>, ptr %p1 + %e = load <3 x i8>, ptr %p2 + %s = mul <3 x i8> %d, %e + store <3 x i8> %s, ptr %p1 + ret void +} + +define void @v4i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: v4i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: umull v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v4i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr w8, [x0] +; CHECK-GI-NEXT: ldr w9, [x1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, 
w9 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v0.b[3] +; CHECK-GI-NEXT: mov b6, v1.b[3] +; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v1.b[2] +; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] +; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] +; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov h2, v0.h[2] +; CHECK-GI-NEXT: mov h3, v0.h[3] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: str w8, [x0] +; CHECK-GI-NEXT: ret +entry: + %d = load <4 x i8>, ptr %p1 + %e = load <4 x i8>, ptr %p2 + %s = mul <4 x i8> %d, %e + store <4 x i8> %s, ptr %p1 + ret void +} + +define <8 x i8> @v8i8(<8 x i8> %d, <8 x i8> %e) { +; CHECK-LABEL: v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %s = mul <8 x i8> %d, %e + ret <8 x i8> %s +} + +define <16 x i8> @v16i8(<16 x i8> %d, <16 x i8> %e) { +; CHECK-LABEL: v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = mul <16 x i8> %d, %e + ret <16 x i8> %s +} + +define <32 x i8> @v32i8(<32 x i8> %d, <32 x i8> %e) { +; CHECK-SD-LABEL: v32i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mul v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: mul v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v32i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mul v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: mul v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %s = mul <32 x i8> %d, %e + ret <32 x i8> %s +} + +define void @v2i16(ptr %p1, ptr %p2) { +; 
CHECK-SD-LABEL: v2i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] +; CHECK-SD-NEXT: add x8, x0, #2 +; CHECK-SD-NEXT: add x9, x1, #2 +; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: strh w8, [x0, #2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h2, [x1] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: str h1, [x0, #2] +; CHECK-GI-NEXT: ret +entry: + %d = load <2 x i16>, ptr %p1 + %e = load <2 x i16>, ptr %p2 + %s = mul <2 x i16> %d, %e + store <2 x i16> %s, ptr %p1 + ret void +} + +define void @v3i16(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: v3i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr d0, [x0] +; CHECK-SD-NEXT: ldr d1, [x1] +; CHECK-SD-NEXT: add x8, x0, #4 +; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: ldr h2, [x1] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x0, #4 +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: ldr h1, [x0, #4] +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h3, [x1, #4] +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] +; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h +; 
CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ret +entry: + %d = load <3 x i16>, ptr %p1 + %e = load <3 x i16>, ptr %p2 + %s = mul <3 x i16> %d, %e + store <3 x i16> %s, ptr %p1 + ret void +} + +define <4 x i16> @v4i16(<4 x i16> %d, <4 x i16> %e) { +; CHECK-LABEL: v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret +entry: + %s = mul <4 x i16> %d, %e + ret <4 x i16> %s +} + +define <8 x i16> @v8i16(<8 x i16> %d, <8 x i16> %e) { +; CHECK-LABEL: v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret +entry: + %s = mul <8 x i16> %d, %e + ret <8 x i16> %s +} + +define <16 x i16> @v16i16(<16 x i16> %d, <16 x i16> %e) { +; CHECK-SD-LABEL: v16i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mul v1.8h, v1.8h, v3.8h +; CHECK-SD-NEXT: mul v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v16i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: mul v1.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: ret +entry: + %s = mul <16 x i16> %d, %e + ret <16 x i16> %s +} + +define <2 x i32> @v2i32(<2 x i32> %d, <2 x i32> %e) { +; CHECK-LABEL: v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret +entry: + %s = mul <2 x i32> %d, %e + ret <2 x i32> %s +} + +define <3 x i32> @v3i32(<3 x i32> %d, <3 x i32> %e) { +; CHECK-LABEL: v3i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +entry: + %s = mul <3 x i32> %d, %e + ret <3 x i32> %s +} + +define <4 x i32> @v4i32(<4 x i32> %d, <4 x i32> %e) { +; CHECK-LABEL: v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +entry: + %s = mul <4 x i32> %d, %e + ret <4 x i32> %s +} + +define <8 x i32> @v8i32(<8 x i32> %d, <8 x i32> %e) { +; CHECK-SD-LABEL: v8i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mul 
v1.4s, v1.4s, v3.4s +; CHECK-SD-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v8i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: mul v1.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: ret +entry: + %s = mul <8 x i32> %d, %e + ret <8 x i32> %s +} + +define <2 x i64> @v2i64(<2 x i64> %d, <2 x i64> %e) { +; CHECK-SD-LABEL: v2i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov x10, d1 +; CHECK-SD-NEXT: fmov x11, d0 +; CHECK-SD-NEXT: mov x8, v1.d[1] +; CHECK-SD-NEXT: mov x9, v0.d[1] +; CHECK-SD-NEXT: mul x10, x11, x10 +; CHECK-SD-NEXT: mul x8, x9, x8 +; CHECK-SD-NEXT: fmov d0, x10 +; CHECK-SD-NEXT: mov v0.d[1], x8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov x10, d0 +; CHECK-GI-NEXT: fmov x11, d1 +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov x9, v1.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: fmov d0, x10 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret +entry: + %s = mul <2 x i64> %d, %e + ret <2 x i64> %s +} + +define <3 x i64> @v3i64(<3 x i64> %d, <3 x i64> %e) { +; CHECK-SD-LABEL: v3i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov x8, d3 +; CHECK-SD-NEXT: fmov x9, d0 +; CHECK-SD-NEXT: // kill: def $d4 killed $d4 def $q4 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d5 killed $d5 def $q5 +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: fmov x10, d1 +; CHECK-SD-NEXT: fmov x11, d2 +; CHECK-SD-NEXT: mul x8, x9, x8 +; CHECK-SD-NEXT: fmov x9, d4 +; CHECK-SD-NEXT: mul x9, x10, x9 +; CHECK-SD-NEXT: fmov x10, d5 +; CHECK-SD-NEXT: fmov d0, x8 +; CHECK-SD-NEXT: mul x10, x11, x10 +; CHECK-SD-NEXT: fmov d1, x9 +; CHECK-SD-NEXT: fmov d2, x10 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3i64: +; CHECK-GI: 
// %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: fmov x10, d0 +; CHECK-GI-NEXT: fmov x11, d3 +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov x9, v3.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: fmov x9, d5 +; CHECK-GI-NEXT: fmov d0, x10 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: ret +entry: + %s = mul <3 x i64> %d, %e + ret <3 x i64> %s +} + +define <4 x i64> @v4i64(<4 x i64> %d, <4 x i64> %e) { +; CHECK-SD-LABEL: v4i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov x8, d2 +; CHECK-SD-NEXT: fmov x9, d0 +; CHECK-SD-NEXT: fmov x12, d1 +; CHECK-SD-NEXT: mov x10, v2.d[1] +; CHECK-SD-NEXT: mov x11, v0.d[1] +; CHECK-SD-NEXT: mov x13, v3.d[1] +; CHECK-SD-NEXT: mov x14, v1.d[1] +; CHECK-SD-NEXT: mul x8, x9, x8 +; CHECK-SD-NEXT: fmov x9, d3 +; CHECK-SD-NEXT: mul x10, x11, x10 +; CHECK-SD-NEXT: mul x9, x12, x9 +; CHECK-SD-NEXT: fmov d0, x8 +; CHECK-SD-NEXT: mul x11, x14, x13 +; CHECK-SD-NEXT: mov v0.d[1], x10 +; CHECK-SD-NEXT: fmov d1, x9 +; CHECK-SD-NEXT: mov v1.d[1], x11 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v4i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d2 +; CHECK-GI-NEXT: fmov x12, d3 +; CHECK-GI-NEXT: mov x10, v0.d[1] +; CHECK-GI-NEXT: mov x11, v2.d[1] +; CHECK-GI-NEXT: mov x13, v1.d[1] +; CHECK-GI-NEXT: mov x14, v3.d[1] +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: mul x9, x9, x12 +; CHECK-GI-NEXT: fmov d0, x8 +; 
CHECK-GI-NEXT: mul x11, x13, x14 +; CHECK-GI-NEXT: mov v0.d[1], x10 +; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: mov v1.d[1], x11 +; CHECK-GI-NEXT: ret +entry: + %s = mul <4 x i64> %d, %e + ret <4 x i64> %s +} + +define <2 x i128> @v2i128(<2 x i128> %d, <2 x i128> %e) { +; CHECK-LABEL: v2i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umulh x8, x2, x6 +; CHECK-NEXT: umulh x9, x0, x4 +; CHECK-NEXT: madd x8, x2, x7, x8 +; CHECK-NEXT: madd x9, x0, x5, x9 +; CHECK-NEXT: madd x3, x3, x6, x8 +; CHECK-NEXT: madd x1, x1, x4, x9 +; CHECK-NEXT: mul x0, x0, x4 +; CHECK-NEXT: mul x2, x2, x6 +; CHECK-NEXT: ret +entry: + %s = mul <2 x i128> %d, %e + ret <2 x i128> %s +} + +define <3 x i128> @v3i128(<3 x i128> %d, <3 x i128> %e) { +; CHECK-LABEL: v3i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umulh x9, x0, x6 +; CHECK-NEXT: ldp x8, x10, [sp] +; CHECK-NEXT: madd x9, x0, x7, x9 +; CHECK-NEXT: umulh x11, x2, x8 +; CHECK-NEXT: madd x1, x1, x6, x9 +; CHECK-NEXT: ldp x9, x12, [sp, #16] +; CHECK-NEXT: madd x10, x2, x10, x11 +; CHECK-NEXT: umulh x13, x4, x9 +; CHECK-NEXT: madd x3, x3, x8, x10 +; CHECK-NEXT: madd x11, x4, x12, x13 +; CHECK-NEXT: mul x0, x0, x6 +; CHECK-NEXT: madd x5, x5, x9, x11 +; CHECK-NEXT: mul x2, x2, x8 +; CHECK-NEXT: mul x4, x4, x9 +; CHECK-NEXT: ret +entry: + %s = mul <3 x i128> %d, %e + ret <3 x i128> %s +} + +define <4 x i128> @v4i128(<4 x i128> %d, <4 x i128> %e) { +; CHECK-LABEL: v4i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x8, x9, [sp] +; CHECK-NEXT: ldp x11, x12, [sp, #16] +; CHECK-NEXT: umulh x10, x0, x8 +; CHECK-NEXT: umulh x13, x2, x11 +; CHECK-NEXT: madd x9, x0, x9, x10 +; CHECK-NEXT: madd x10, x2, x12, x13 +; CHECK-NEXT: ldp x13, x14, [sp, #48] +; CHECK-NEXT: madd x1, x1, x8, x9 +; CHECK-NEXT: madd x3, x3, x11, x10 +; CHECK-NEXT: ldp x9, x10, [sp, #32] +; CHECK-NEXT: umulh x15, x6, x13 +; CHECK-NEXT: umulh x12, x4, x9 +; CHECK-NEXT: mul x0, x0, x8 +; CHECK-NEXT: madd x10, x4, x10, x12 +; CHECK-NEXT: madd x12, x6, x14, x15 +; 
CHECK-NEXT: madd x5, x5, x9, x10 +; CHECK-NEXT: madd x7, x7, x13, x12 +; CHECK-NEXT: mul x2, x2, x11 +; CHECK-NEXT: mul x4, x4, x9 +; CHECK-NEXT: mul x6, x6, x13 +; CHECK-NEXT: ret +entry: + %s = mul <4 x i128> %d, %e + ret <4 x i128> %s +} diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll new file mode 100644 index 00000000000000..0f18ed1006fac5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -0,0 +1,501 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for v2i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i128 + +define i8 @i8(i8 %a, i8 %b) { +; CHECK-LABEL: i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = sub i8 %a, %b + ret i8 %s +} + +define i16 @i16(i16 %a, i16 %b) { +; CHECK-LABEL: i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = sub i16 %a, %b + ret i16 %s +} + +define i32 @i32(i32 %a, i32 %b) { +; CHECK-LABEL: i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = sub i32 %a, %b + ret i32 %s +} + +define i64 @i64(i64 %a, i64 %b) { +; CHECK-LABEL: i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub x0, x0, x1 +; CHECK-NEXT: ret +entry: + %s = sub i64 %a, %b + ret i64 %s +} + +define i128 @i128(i128 %a, i128 %b) { +; CHECK-LABEL: i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x0, x0, x2 +; CHECK-NEXT: sbc x1, x1, x3 +; CHECK-NEXT: ret +entry: + %s = sub i128 %a, %b + ret i128 %s 
+} + +define void @v2i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: v2i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-SD-NEXT: add x8, x0, #1 +; CHECK-SD-NEXT: add x9, x1, #1 +; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr b0, [x0] +; CHECK-GI-NEXT: ldr b1, [x0, #1] +; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ldr b3, [x1, #1] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] +; CHECK-GI-NEXT: sub v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: str b0, [x0] +; CHECK-GI-NEXT: str b1, [x0, #1] +; CHECK-GI-NEXT: ret +entry: + %d = load <2 x i8>, ptr %p1 + %e = load <2 x i8>, ptr %p2 + %s = sub <2 x i8> %d, %e + store <2 x i8> %s, ptr %p1 + ret void +} + +define void @v3i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: v3i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b +; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: umov w8, v0.h[2] +; CHECK-SD-NEXT: str s1, [sp, #12] +; CHECK-SD-NEXT: ldrh w9, [sp, #12] +; CHECK-SD-NEXT: strb w8, [x0, #2] +; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldrb w8, [x0] +; CHECK-GI-NEXT: ldrb w9, [x0, #1] +; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w11, [x1, #1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; 
CHECK-GI-NEXT: fmov s2, w10 +; CHECK-GI-NEXT: fmov s3, w11 +; CHECK-GI-NEXT: ldrb w8, [x0, #2] +; CHECK-GI-NEXT: ldrb w9, [x1, #2] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] +; CHECK-GI-NEXT: sub v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov h2, v0.h[2] +; CHECK-GI-NEXT: str b0, [x0] +; CHECK-GI-NEXT: str b1, [x0, #1] +; CHECK-GI-NEXT: str b2, [x0, #2] +; CHECK-GI-NEXT: ret +entry: + %d = load <3 x i8>, ptr %p1 + %e = load <3 x i8>, ptr %p2 + %s = sub <3 x i8> %d, %e + store <3 x i8> %s, ptr %p1 + ret void +} + +define void @v4i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: v4i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v4i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr w8, [x0] +; CHECK-GI-NEXT: ldr w9, [x1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v0.b[3] +; CHECK-GI-NEXT: mov b6, v1.b[3] +; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v1.b[2] +; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] +; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] +; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: sub v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov h2, v0.h[2] +; CHECK-GI-NEXT: mov h3, v0.h[3] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] +; 
CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: str w8, [x0] +; CHECK-GI-NEXT: ret +entry: + %d = load <4 x i8>, ptr %p1 + %e = load <4 x i8>, ptr %p2 + %s = sub <4 x i8> %d, %e + store <4 x i8> %s, ptr %p1 + ret void +} + +define <8 x i8> @v8i8(<8 x i8> %d, <8 x i8> %e) { +; CHECK-LABEL: v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %s = sub <8 x i8> %d, %e + ret <8 x i8> %s +} + +define <16 x i8> @v16i8(<16 x i8> %d, <16 x i8> %e) { +; CHECK-LABEL: v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = sub <16 x i8> %d, %e + ret <16 x i8> %s +} + +define <32 x i8> @v32i8(<32 x i8> %d, <32 x i8> %e) { +; CHECK-SD-LABEL: v32i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: sub v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v32i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: sub v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %s = sub <32 x i8> %d, %e + ret <32 x i8> %s +} + +define void @v2i16(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: v2i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] +; CHECK-SD-NEXT: add x8, x0, #2 +; CHECK-SD-NEXT: add x9, x1, #2 +; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: strh w8, [x0, #2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h2, [x1] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; 
CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: str h1, [x0, #2] +; CHECK-GI-NEXT: ret +entry: + %d = load <2 x i16>, ptr %p1 + %e = load <2 x i16>, ptr %p2 + %s = sub <2 x i16> %d, %e + store <2 x i16> %s, ptr %p1 + ret void +} + +define void @v3i16(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: v3i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr d0, [x0] +; CHECK-SD-NEXT: ldr d1, [x1] +; CHECK-SD-NEXT: add x8, x0, #4 +; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: ldr h2, [x1] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x0, #4 +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: ldr h1, [x0, #4] +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h3, [x1, #4] +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] +; CHECK-GI-NEXT: sub v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ret +entry: + %d = load <3 x i16>, ptr %p1 + %e = load <3 x i16>, ptr %p2 + %s = sub <3 x i16> %d, %e + store <3 x i16> %s, ptr %p1 + ret void +} + +define <4 x i16> @v4i16(<4 x i16> %d, <4 x i16> %e) { +; CHECK-LABEL: v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret +entry: + %s = sub <4 x i16> %d, %e + ret <4 x i16> %s +} + +define <8 x i16> @v8i16(<8 x i16> %d, <8 x i16> %e) { +; CHECK-LABEL: v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret +entry: + %s = sub <8 x i16> %d, %e + ret <8 x i16> %s +} + +define <16 x i16> @v16i16(<16 x i16> %d, <16 x i16> %e) { +; 
CHECK-SD-LABEL: v16i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub v1.8h, v1.8h, v3.8h +; CHECK-SD-NEXT: sub v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v16i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: sub v1.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: ret +entry: + %s = sub <16 x i16> %d, %e + ret <16 x i16> %s +} + +define <2 x i32> @v2i32(<2 x i32> %d, <2 x i32> %e) { +; CHECK-LABEL: v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret +entry: + %s = sub <2 x i32> %d, %e + ret <2 x i32> %s +} + +define <3 x i32> @v3i32(<3 x i32> %d, <3 x i32> %e) { +; CHECK-LABEL: v3i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +entry: + %s = sub <3 x i32> %d, %e + ret <3 x i32> %s +} + +define <4 x i32> @v4i32(<4 x i32> %d, <4 x i32> %e) { +; CHECK-LABEL: v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +entry: + %s = sub <4 x i32> %d, %e + ret <4 x i32> %s +} + +define <8 x i32> @v8i32(<8 x i32> %d, <8 x i32> %e) { +; CHECK-SD-LABEL: v8i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub v1.4s, v1.4s, v3.4s +; CHECK-SD-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v8i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: sub v1.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: ret +entry: + %s = sub <8 x i32> %d, %e + ret <8 x i32> %s +} + +define <2 x i64> @v2i64(<2 x i64> %d, <2 x i64> %e) { +; CHECK-LABEL: v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret +entry: + %s = sub <2 x i64> %d, %e + ret <2 x i64> %s +} + +define <3 x i64> @v3i64(<3 x i64> %d, <3 x i64> %e) { +; CHECK-SD-LABEL: v3i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub d0, d0, d3 +; CHECK-SD-NEXT: sub d1, d1, d4 +; CHECK-SD-NEXT: sub d2, d2, d5 +; CHECK-SD-NEXT: ret +; +; 
CHECK-GI-LABEL: v3i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4 +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: fmov x9, d5 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: sub x8, x8, x9 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v3.2d +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = sub <3 x i64> %d, %e + ret <3 x i64> %s +} + +define <4 x i64> @v4i64(<4 x i64> %d, <4 x i64> %e) { +; CHECK-SD-LABEL: v4i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub v1.2d, v1.2d, v3.2d +; CHECK-SD-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v4i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: sub v1.2d, v1.2d, v3.2d +; CHECK-GI-NEXT: ret +entry: + %s = sub <4 x i64> %d, %e + ret <4 x i64> %s +} + +define <2 x i128> @v2i128(<2 x i128> %d, <2 x i128> %e) { +; CHECK-LABEL: v2i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x0, x0, x4 +; CHECK-NEXT: sbc x1, x1, x5 +; CHECK-NEXT: subs x2, x2, x6 +; CHECK-NEXT: sbc x3, x3, x7 +; CHECK-NEXT: ret +entry: + %s = sub <2 x i128> %d, %e + ret <2 x i128> %s +} + +define <3 x i128> @v3i128(<3 x i128> %d, <3 x i128> %e) { +; CHECK-LABEL: v3i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x8, x9, [sp] +; CHECK-NEXT: subs x0, x0, x6 +; CHECK-NEXT: ldp x10, x11, [sp, #16] +; CHECK-NEXT: sbc x1, x1, x7 +; CHECK-NEXT: subs x2, x2, x8 +; CHECK-NEXT: sbc x3, x3, x9 +; CHECK-NEXT: subs x4, x4, x10 +; CHECK-NEXT: sbc x5, x5, x11 +; CHECK-NEXT: ret +entry: + %s = sub <3 x i128> %d, %e + ret <3 x i128> %s +} + +define <4 x i128> @v4i128(<4 x i128> %d, <4 x i128> %e) { +; CHECK-LABEL: v4i128: +; CHECK: // 
%bb.0: // %entry +; CHECK-NEXT: ldp x8, x9, [sp] +; CHECK-NEXT: ldp x11, x10, [sp, #16] +; CHECK-NEXT: ldp x13, x12, [sp, #32] +; CHECK-NEXT: subs x0, x0, x8 +; CHECK-NEXT: sbc x1, x1, x9 +; CHECK-NEXT: ldp x8, x9, [sp, #48] +; CHECK-NEXT: subs x2, x2, x11 +; CHECK-NEXT: sbc x3, x3, x10 +; CHECK-NEXT: subs x4, x4, x13 +; CHECK-NEXT: sbc x5, x5, x12 +; CHECK-NEXT: subs x6, x6, x8 +; CHECK-NEXT: sbc x7, x7, x9 +; CHECK-NEXT: ret +entry: + %s = sub <4 x i128> %d, %e + ret <4 x i128> %s +} From f53bfa39a7dae444650a9c0e16d52301a733f5fc Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 23 Aug 2024 09:19:25 +0100 Subject: [PATCH 297/426] [AArch64] Add And/Or/XOr test coverage for GISel. NFC --- llvm/test/CodeGen/AArch64/andorxor.ll | 1522 +++++++++++++++++++++++++ 1 file changed, 1522 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/andorxor.ll diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll new file mode 100644 index 00000000000000..efa4be707ceda9 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -0,0 +1,1522 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for and_v2i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for or_v2i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for xor_v2i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for and_v3i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for or_v3i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for xor_v3i128 +; CHECK-GI-NEXT: warning: Instruction 
selection used fallback path for and_v4i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for or_v4i128 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for xor_v4i128 + +define i8 @and_i8(i8 %a, i8 %b) { +; CHECK-LABEL: and_i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = and i8 %a, %b + ret i8 %s +} + +define i8 @or_i8(i8 %a, i8 %b) { +; CHECK-LABEL: or_i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = or i8 %a, %b + ret i8 %s +} + +define i8 @xor_i8(i8 %a, i8 %b) { +; CHECK-LABEL: xor_i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = xor i8 %a, %b + ret i8 %s +} + +define i16 @and_i16(i16 %a, i16 %b) { +; CHECK-LABEL: and_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = and i16 %a, %b + ret i16 %s +} + +define i16 @or_i16(i16 %a, i16 %b) { +; CHECK-LABEL: or_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = or i16 %a, %b + ret i16 %s +} + +define i16 @xor_i16(i16 %a, i16 %b) { +; CHECK-LABEL: xor_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = xor i16 %a, %b + ret i16 %s +} + +define i32 @and_i32(i32 %a, i32 %b) { +; CHECK-LABEL: and_i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = and i32 %a, %b + ret i32 %s +} + +define i32 @or_i32(i32 %a, i32 %b) { +; CHECK-LABEL: or_i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = or i32 %a, %b + ret i32 %s +} + +define i32 @xor_i32(i32 %a, i32 %b) { +; CHECK-LABEL: xor_i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor w0, w0, w1 +; CHECK-NEXT: ret +entry: + %s = xor i32 %a, %b + ret i32 %s +} + +define i64 @and_i64(i64 %a, i64 %b) { +; CHECK-LABEL: and_i64: +; CHECK: // %bb.0: // %entry +; 
CHECK-NEXT: and x0, x0, x1 +; CHECK-NEXT: ret +entry: + %s = and i64 %a, %b + ret i64 %s +} + +define i64 @or_i64(i64 %a, i64 %b) { +; CHECK-LABEL: or_i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr x0, x0, x1 +; CHECK-NEXT: ret +entry: + %s = or i64 %a, %b + ret i64 %s +} + +define i64 @xor_i64(i64 %a, i64 %b) { +; CHECK-LABEL: xor_i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor x0, x0, x1 +; CHECK-NEXT: ret +entry: + %s = xor i64 %a, %b + ret i64 %s +} + +define i128 @and_i128(i128 %a, i128 %b) { +; CHECK-SD-LABEL: and_i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: and x1, x1, x3 +; CHECK-SD-NEXT: and x0, x0, x2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and x0, x0, x2 +; CHECK-GI-NEXT: and x1, x1, x3 +; CHECK-GI-NEXT: ret +entry: + %s = and i128 %a, %b + ret i128 %s +} + +define i128 @or_i128(i128 %a, i128 %b) { +; CHECK-SD-LABEL: or_i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: orr x1, x1, x3 +; CHECK-SD-NEXT: orr x0, x0, x2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: orr x0, x0, x2 +; CHECK-GI-NEXT: orr x1, x1, x3 +; CHECK-GI-NEXT: ret +entry: + %s = or i128 %a, %b + ret i128 %s +} + +define i128 @xor_i128(i128 %a, i128 %b) { +; CHECK-SD-LABEL: xor_i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: eor x1, x1, x3 +; CHECK-SD-NEXT: eor x0, x0, x2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: eor x0, x0, x2 +; CHECK-GI-NEXT: eor x1, x1, x3 +; CHECK-GI-NEXT: ret +entry: + %s = xor i128 %a, %b + ret i128 %s +} + +define void @and_v2i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: and_v2i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-SD-NEXT: add x8, x0, #1 +; CHECK-SD-NEXT: add x9, x1, #1 +; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; 
CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_v2i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr b0, [x0] +; CHECK-GI-NEXT: ldr b1, [x0, #1] +; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ldr b3, [x1, #1] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] +; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: str b0, [x0] +; CHECK-GI-NEXT: str b1, [x0, #1] +; CHECK-GI-NEXT: ret +entry: + %d = load <2 x i8>, ptr %p1 + %e = load <2 x i8>, ptr %p2 + %s = and <2 x i8> %d, %e + store <2 x i8> %s, ptr %p1 + ret void +} + +define void @or_v2i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: or_v2i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-SD-NEXT: add x8, x0, #1 +; CHECK-SD-NEXT: add x9, x1, #1 +; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_v2i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr b0, [x0] +; CHECK-GI-NEXT: ldr b1, [x0, #1] +; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ldr b3, [x1, #1] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: str b0, [x0] +; CHECK-GI-NEXT: str b1, [x0, #1] +; CHECK-GI-NEXT: ret +entry: + %d = load <2 x i8>, ptr %p1 + %e = load <2 x i8>, ptr %p2 + %s = or <2 x i8> %d, %e + store <2 x i8> %s, ptr %p1 + ret void +} + +define void @xor_v2i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: xor_v2i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: 
ld1 { v0.b }[0], [x0] +; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] +; CHECK-SD-NEXT: add x8, x0, #1 +; CHECK-SD-NEXT: add x9, x1, #1 +; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_v2i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr b0, [x0] +; CHECK-GI-NEXT: ldr b1, [x0, #1] +; CHECK-GI-NEXT: ldr b2, [x1] +; CHECK-GI-NEXT: ldr b3, [x1, #1] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: mov v2.s[1], v3.s[0] +; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: str b0, [x0] +; CHECK-GI-NEXT: str b1, [x0, #1] +; CHECK-GI-NEXT: ret +entry: + %d = load <2 x i8>, ptr %p1 + %e = load <2 x i8>, ptr %p2 + %s = xor <2 x i8> %d, %e + store <2 x i8> %s, ptr %p1 + ret void +} + +define void @and_v3i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: and_v3i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b +; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: umov w8, v0.h[2] +; CHECK-SD-NEXT: str s1, [sp, #12] +; CHECK-SD-NEXT: ldrh w9, [sp, #12] +; CHECK-SD-NEXT: strb w8, [x0, #2] +; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_v3i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldrb w8, [x0] +; CHECK-GI-NEXT: ldrb w9, [x0, #1] +; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w11, [x1, #1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: fmov s2, w10 +; CHECK-GI-NEXT: fmov s3, w11 +; CHECK-GI-NEXT: ldrb w8, [x0, #2] +; 
CHECK-GI-NEXT: ldrb w9, [x1, #2] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] +; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov h2, v0.h[2] +; CHECK-GI-NEXT: str b0, [x0] +; CHECK-GI-NEXT: str b1, [x0, #1] +; CHECK-GI-NEXT: str b2, [x0, #2] +; CHECK-GI-NEXT: ret +entry: + %d = load <3 x i8>, ptr %p1 + %e = load <3 x i8>, ptr %p2 + %s = and <3 x i8> %d, %e + store <3 x i8> %s, ptr %p1 + ret void +} + +define void @or_v3i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: or_v3i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b +; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: umov w8, v0.h[2] +; CHECK-SD-NEXT: str s1, [sp, #12] +; CHECK-SD-NEXT: ldrh w9, [sp, #12] +; CHECK-SD-NEXT: strb w8, [x0, #2] +; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_v3i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldrb w8, [x0] +; CHECK-GI-NEXT: ldrb w9, [x0, #1] +; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w11, [x1, #1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: fmov s2, w10 +; CHECK-GI-NEXT: fmov s3, w11 +; CHECK-GI-NEXT: ldrb w8, [x0, #2] +; CHECK-GI-NEXT: ldrb w9, [x1, #2] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov h2, v0.h[2] +; 
CHECK-GI-NEXT: str b0, [x0] +; CHECK-GI-NEXT: str b1, [x0, #1] +; CHECK-GI-NEXT: str b2, [x0, #2] +; CHECK-GI-NEXT: ret +entry: + %d = load <3 x i8>, ptr %p1 + %e = load <3 x i8>, ptr %p2 + %s = or <3 x i8> %d, %e + store <3 x i8> %s, ptr %p1 + ret void +} + +define void @xor_v3i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: xor_v3i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b +; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: umov w8, v0.h[2] +; CHECK-SD-NEXT: str s1, [sp, #12] +; CHECK-SD-NEXT: ldrh w9, [sp, #12] +; CHECK-SD-NEXT: strb w8, [x0, #2] +; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_v3i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldrb w8, [x0] +; CHECK-GI-NEXT: ldrb w9, [x0, #1] +; CHECK-GI-NEXT: ldrb w10, [x1] +; CHECK-GI-NEXT: ldrb w11, [x1, #1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: fmov s2, w10 +; CHECK-GI-NEXT: fmov s3, w11 +; CHECK-GI-NEXT: ldrb w8, [x0, #2] +; CHECK-GI-NEXT: ldrb w9, [x1, #2] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] +; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov h2, v0.h[2] +; CHECK-GI-NEXT: str b0, [x0] +; CHECK-GI-NEXT: str b1, [x0, #1] +; CHECK-GI-NEXT: str b2, [x0, #2] +; CHECK-GI-NEXT: ret +entry: + %d = load <3 x i8>, ptr %p1 + %e = load <3 x i8>, ptr %p2 + %s = xor <3 x i8> %d, %e + store <3 x i8> %s, ptr %p1 + ret void +} + +define void @and_v4i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: and_v4i8: +; CHECK-SD: // %bb.0: // %entry +; 
CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_v4i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr w8, [x0] +; CHECK-GI-NEXT: ldr w9, [x1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v0.b[3] +; CHECK-GI-NEXT: mov b6, v1.b[3] +; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v1.b[2] +; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] +; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] +; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov h2, v0.h[2] +; CHECK-GI-NEXT: mov h3, v0.h[3] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: str w8, [x0] +; CHECK-GI-NEXT: ret +entry: + %d = load <4 x i8>, ptr %p1 + %e = load <4 x i8>, ptr %p2 + %s = and <4 x i8> %d, %e + store <4 x i8> %s, ptr %p1 + ret void +} + +define void @or_v4i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: or_v4i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_v4i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr w8, [x0] +; 
CHECK-GI-NEXT: ldr w9, [x1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v0.b[3] +; CHECK-GI-NEXT: mov b6, v1.b[3] +; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v1.b[2] +; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] +; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] +; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov h2, v0.h[2] +; CHECK-GI-NEXT: mov h3, v0.h[3] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: str w8, [x0] +; CHECK-GI-NEXT: ret +entry: + %d = load <4 x i8>, ptr %p1 + %e = load <4 x i8>, ptr %p2 + %s = or <4 x i8> %d, %e + store <4 x i8> %s, ptr %p1 + ret void +} + +define void @xor_v4i8(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: xor_v4i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_v4i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr w8, [x0] +; CHECK-GI-NEXT: ldr w9, [x1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v0.b[3] +; CHECK-GI-NEXT: mov b6, v1.b[3] +; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] +; CHECK-GI-NEXT: mov b2, v1.b[2] +; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] +; CHECK-GI-NEXT: 
mov v0.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] +; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mov h1, v0.h[1] +; CHECK-GI-NEXT: mov h2, v0.h[2] +; CHECK-GI-NEXT: mov h3, v0.h[3] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[2], v2.h[0] +; CHECK-GI-NEXT: mov v0.h[3], v3.h[0] +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: str w8, [x0] +; CHECK-GI-NEXT: ret +entry: + %d = load <4 x i8>, ptr %p1 + %e = load <4 x i8>, ptr %p2 + %s = xor <4 x i8> %d, %e + store <4 x i8> %s, ptr %p1 + ret void +} + +define <8 x i8> @and_v8i8(<8 x i8> %d, <8 x i8> %e) { +; CHECK-LABEL: and_v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %s = and <8 x i8> %d, %e + ret <8 x i8> %s +} + +define <8 x i8> @or_v8i8(<8 x i8> %d, <8 x i8> %e) { +; CHECK-LABEL: or_v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %s = or <8 x i8> %d, %e + ret <8 x i8> %s +} + +define <8 x i8> @xor_v8i8(<8 x i8> %d, <8 x i8> %e) { +; CHECK-LABEL: xor_v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %s = xor <8 x i8> %d, %e + ret <8 x i8> %s +} + +define <16 x i8> @and_v16i8(<16 x i8> %d, <16 x i8> %e) { +; CHECK-LABEL: and_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = and <16 x i8> %d, %e + ret <16 x i8> %s +} + +define <16 x i8> @or_v16i8(<16 x i8> %d, <16 x i8> %e) { +; CHECK-LABEL: or_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = or <16 x i8> %d, %e + ret <16 x i8> %s +} + +define <16 x i8> @xor_v16i8(<16 x i8> %d, <16 x i8> %e) { +; CHECK-LABEL: xor_v16i8: +; CHECK: // %bb.0: // 
%entry +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = xor <16 x i8> %d, %e + ret <16 x i8> %s +} + +define <32 x i8> @and_v32i8(<32 x i8> %d, <32 x i8> %e) { +; CHECK-SD-LABEL: and_v32i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_v32i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %s = and <32 x i8> %d, %e + ret <32 x i8> %s +} + +define <32 x i8> @or_v32i8(<32 x i8> %d, <32 x i8> %e) { +; CHECK-SD-LABEL: or_v32i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_v32i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %s = or <32 x i8> %d, %e + ret <32 x i8> %s +} + +define <32 x i8> @xor_v32i8(<32 x i8> %d, <32 x i8> %e) { +; CHECK-SD-LABEL: xor_v32i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: eor v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: eor v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_v32i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: eor v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: eor v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %s = xor <32 x i8> %d, %e + ret <32 x i8> %s +} + +define void @and_v2i16(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: and_v2i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] +; CHECK-SD-NEXT: add x8, x0, #2 +; CHECK-SD-NEXT: add x9, x1, #2 +; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: 
strh w9, [x0] +; CHECK-SD-NEXT: strh w8, [x0, #2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_v2i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h2, [x1] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: str h1, [x0, #2] +; CHECK-GI-NEXT: ret +entry: + %d = load <2 x i16>, ptr %p1 + %e = load <2 x i16>, ptr %p2 + %s = and <2 x i16> %d, %e + store <2 x i16> %s, ptr %p1 + ret void +} + +define void @or_v2i16(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: or_v2i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] +; CHECK-SD-NEXT: add x8, x0, #2 +; CHECK-SD-NEXT: add x9, x1, #2 +; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: strh w8, [x0, #2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_v2i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h2, [x1] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: str h1, [x0, #2] +; CHECK-GI-NEXT: ret +entry: + %d = load <2 x i16>, ptr %p1 + %e = load <2 x i16>, ptr %p2 + %s = or <2 x i16> %d, %e + store <2 x i16> %s, ptr %p1 + ret void +} + +define void @xor_v2i16(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: 
xor_v2i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] +; CHECK-SD-NEXT: add x8, x0, #2 +; CHECK-SD-NEXT: add x9, x1, #2 +; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: strh w8, [x0, #2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_v2i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: ldr h2, [x1] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: str h1, [x0, #2] +; CHECK-GI-NEXT: ret +entry: + %d = load <2 x i16>, ptr %p1 + %e = load <2 x i16>, ptr %p2 + %s = xor <2 x i16> %d, %e + store <2 x i16> %s, ptr %p1 + ret void +} + +define void @and_v3i16(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: and_v3i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr x8, [x0] +; CHECK-SD-NEXT: ldr x9, [x1] +; CHECK-SD-NEXT: and x8, x8, x9 +; CHECK-SD-NEXT: fmov d0, x8 +; CHECK-SD-NEXT: str w8, [x0] +; CHECK-SD-NEXT: add x8, x0, #4 +; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_v3i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: ldr h2, [x1] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x0, #4 +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: ldr h1, [x0, #4] +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h3, [x1, #4] +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] +; CHECK-GI-NEXT: 
and v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ret +entry: + %d = load <3 x i16>, ptr %p1 + %e = load <3 x i16>, ptr %p2 + %s = and <3 x i16> %d, %e + store <3 x i16> %s, ptr %p1 + ret void +} + +define void @or_v3i16(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: or_v3i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr x8, [x0] +; CHECK-SD-NEXT: ldr x9, [x1] +; CHECK-SD-NEXT: orr x8, x8, x9 +; CHECK-SD-NEXT: fmov d0, x8 +; CHECK-SD-NEXT: str w8, [x0] +; CHECK-SD-NEXT: add x8, x0, #4 +; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_v3i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: ldr h2, [x1] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x0, #4 +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: ldr h1, [x0, #4] +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h3, [x1, #4] +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] +; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ret +entry: + %d = load <3 x i16>, ptr %p1 + %e = load <3 x i16>, ptr %p2 + %s = or <3 x i16> %d, %e + store <3 x i16> %s, ptr %p1 + ret void +} + +define void @xor_v3i16(ptr %p1, ptr %p2) { +; CHECK-SD-LABEL: xor_v3i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr x8, [x0] +; CHECK-SD-NEXT: ldr x9, [x1] +; CHECK-SD-NEXT: eor x8, x8, x9 +; CHECK-SD-NEXT: fmov d0, x8 +; CHECK-SD-NEXT: str w8, [x0] +; CHECK-SD-NEXT: add x8, x0, #4 +; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_v3i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: add x8, x0, #2 +; 
CHECK-GI-NEXT: ldr h2, [x1] +; CHECK-GI-NEXT: ldr h3, [x1, #2] +; CHECK-GI-NEXT: add x9, x0, #4 +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: ldr h1, [x0, #4] +; CHECK-GI-NEXT: mov v2.h[1], v3.h[0] +; CHECK-GI-NEXT: ldr h3, [x1, #4] +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v2.h[2], v3.h[0] +; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ret +entry: + %d = load <3 x i16>, ptr %p1 + %e = load <3 x i16>, ptr %p2 + %s = xor <3 x i16> %d, %e + store <3 x i16> %s, ptr %p1 + ret void +} + +define <4 x i16> @and_v4i16(<4 x i16> %d, <4 x i16> %e) { +; CHECK-LABEL: and_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %s = and <4 x i16> %d, %e + ret <4 x i16> %s +} + +define <4 x i16> @or_v4i16(<4 x i16> %d, <4 x i16> %e) { +; CHECK-LABEL: or_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %s = or <4 x i16> %d, %e + ret <4 x i16> %s +} + +define <4 x i16> @xor_v4i16(<4 x i16> %d, <4 x i16> %e) { +; CHECK-LABEL: xor_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %s = xor <4 x i16> %d, %e + ret <4 x i16> %s +} + +define <8 x i16> @and_v8i16(<8 x i16> %d, <8 x i16> %e) { +; CHECK-LABEL: and_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = and <8 x i16> %d, %e + ret <8 x i16> %s +} + +define <8 x i16> @or_v8i16(<8 x i16> %d, <8 x i16> %e) { +; CHECK-LABEL: or_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = or <8 x i16> %d, %e + ret <8 x i16> %s +} + +define <8 x i16> @xor_v8i16(<8 x i16> %d, <8 x i16> %e) { +; CHECK-LABEL: xor_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + 
%s = xor <8 x i16> %d, %e + ret <8 x i16> %s +} + +define <16 x i16> @and_v16i16(<16 x i16> %d, <16 x i16> %e) { +; CHECK-SD-LABEL: and_v16i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_v16i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %s = and <16 x i16> %d, %e + ret <16 x i16> %s +} + +define <16 x i16> @or_v16i16(<16 x i16> %d, <16 x i16> %e) { +; CHECK-SD-LABEL: or_v16i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_v16i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %s = or <16 x i16> %d, %e + ret <16 x i16> %s +} + +define <16 x i16> @xor_v16i16(<16 x i16> %d, <16 x i16> %e) { +; CHECK-SD-LABEL: xor_v16i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: eor v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: eor v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_v16i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: eor v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: eor v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %s = xor <16 x i16> %d, %e + ret <16 x i16> %s +} + +define <2 x i32> @and_v2i32(<2 x i32> %d, <2 x i32> %e) { +; CHECK-LABEL: and_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %s = and <2 x i32> %d, %e + ret <2 x i32> %s +} + +define <2 x i32> @or_v2i32(<2 x i32> %d, <2 x i32> %e) { +; CHECK-LABEL: or_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %s = or <2 x i32> %d, %e + ret <2 x i32> %s +} + +define <2 x i32> @xor_v2i32(<2 x i32> %d, <2 x 
i32> %e) { +; CHECK-LABEL: xor_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %s = xor <2 x i32> %d, %e + ret <2 x i32> %s +} + +define <3 x i32> @and_v3i32(<3 x i32> %d, <3 x i32> %e) { +; CHECK-LABEL: and_v3i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = and <3 x i32> %d, %e + ret <3 x i32> %s +} + +define <3 x i32> @or_v3i32(<3 x i32> %d, <3 x i32> %e) { +; CHECK-LABEL: or_v3i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = or <3 x i32> %d, %e + ret <3 x i32> %s +} + +define <3 x i32> @xor_v3i32(<3 x i32> %d, <3 x i32> %e) { +; CHECK-LABEL: xor_v3i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = xor <3 x i32> %d, %e + ret <3 x i32> %s +} + +define <4 x i32> @and_v4i32(<4 x i32> %d, <4 x i32> %e) { +; CHECK-LABEL: and_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = and <4 x i32> %d, %e + ret <4 x i32> %s +} + +define <4 x i32> @or_v4i32(<4 x i32> %d, <4 x i32> %e) { +; CHECK-LABEL: or_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = or <4 x i32> %d, %e + ret <4 x i32> %s +} + +define <4 x i32> @xor_v4i32(<4 x i32> %d, <4 x i32> %e) { +; CHECK-LABEL: xor_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = xor <4 x i32> %d, %e + ret <4 x i32> %s +} + +define <8 x i32> @and_v8i32(<8 x i32> %d, <8 x i32> %e) { +; CHECK-SD-LABEL: and_v8i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_v8i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b +; 
CHECK-GI-NEXT: ret +entry: + %s = and <8 x i32> %d, %e + ret <8 x i32> %s +} + +define <8 x i32> @or_v8i32(<8 x i32> %d, <8 x i32> %e) { +; CHECK-SD-LABEL: or_v8i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_v8i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %s = or <8 x i32> %d, %e + ret <8 x i32> %s +} + +define <8 x i32> @xor_v8i32(<8 x i32> %d, <8 x i32> %e) { +; CHECK-SD-LABEL: xor_v8i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: eor v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: eor v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_v8i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: eor v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: eor v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %s = xor <8 x i32> %d, %e + ret <8 x i32> %s +} + +define <2 x i64> @and_v2i64(<2 x i64> %d, <2 x i64> %e) { +; CHECK-LABEL: and_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = and <2 x i64> %d, %e + ret <2 x i64> %s +} + +define <2 x i64> @or_v2i64(<2 x i64> %d, <2 x i64> %e) { +; CHECK-LABEL: or_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = or <2 x i64> %d, %e + ret <2 x i64> %s +} + +define <2 x i64> @xor_v2i64(<2 x i64> %d, <2 x i64> %e) { +; CHECK-LABEL: xor_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %s = xor <2 x i64> %d, %e + ret <2 x i64> %s +} + +define <3 x i64> @and_v3i64(<3 x i64> %d, <3 x i64> %e) { +; CHECK-SD-LABEL: and_v3i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: and v0.8b, v0.8b, v3.8b +; CHECK-SD-NEXT: and v1.8b, v1.8b, v4.8b +; CHECK-SD-NEXT: and v2.8b, v2.8b, v5.8b +; CHECK-SD-NEXT: ret +; 
+; CHECK-GI-LABEL: and_v3i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4 +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: fmov x9, d5 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: and x8, x8, x9 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = and <3 x i64> %d, %e + ret <3 x i64> %s +} + +define <3 x i64> @or_v3i64(<3 x i64> %d, <3 x i64> %e) { +; CHECK-SD-LABEL: or_v3i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: orr v0.8b, v0.8b, v3.8b +; CHECK-SD-NEXT: orr v1.8b, v1.8b, v4.8b +; CHECK-SD-NEXT: orr v2.8b, v2.8b, v5.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_v3i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4 +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: fmov x9, d5 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: orr x8, x8, x9 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = or <3 x i64> %d, %e + ret <3 x i64> %s +} + +define <3 x i64> @xor_v3i64(<3 x i64> %d, <3 x i64> %e) { +; CHECK-SD-LABEL: xor_v3i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: eor v0.8b, v0.8b, v3.8b +; CHECK-SD-NEXT: eor v1.8b, v1.8b, v4.8b +; CHECK-SD-NEXT: eor v2.8b, v2.8b, v5.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_v3i64: +; CHECK-GI: // %bb.0: // 
%entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: // kill: def $d4 killed $d4 def $q4 +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: fmov x9, d5 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: mov v3.d[1], v4.d[0] +; CHECK-GI-NEXT: eor x8, x8, x9 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: eor v0.16b, v0.16b, v3.16b +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = xor <3 x i64> %d, %e + ret <3 x i64> %s +} + +define <4 x i64> @and_v4i64(<4 x i64> %d, <4 x i64> %e) { +; CHECK-SD-LABEL: and_v4i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_v4i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %s = and <4 x i64> %d, %e + ret <4 x i64> %s +} + +define <4 x i64> @or_v4i64(<4 x i64> %d, <4 x i64> %e) { +; CHECK-SD-LABEL: or_v4i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_v4i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret +entry: + %s = or <4 x i64> %d, %e + ret <4 x i64> %s +} + +define <4 x i64> @xor_v4i64(<4 x i64> %d, <4 x i64> %e) { +; CHECK-SD-LABEL: xor_v4i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: eor v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: eor v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_v4i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: eor v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: eor v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret 
+entry: + %s = xor <4 x i64> %d, %e + ret <4 x i64> %s +} + +define <2 x i128> @and_v2i128(<2 x i128> %d, <2 x i128> %e) { +; CHECK-LABEL: and_v2i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and x2, x2, x6 +; CHECK-NEXT: and x0, x0, x4 +; CHECK-NEXT: and x1, x1, x5 +; CHECK-NEXT: and x3, x3, x7 +; CHECK-NEXT: ret +entry: + %s = and <2 x i128> %d, %e + ret <2 x i128> %s +} + +define <2 x i128> @or_v2i128(<2 x i128> %d, <2 x i128> %e) { +; CHECK-LABEL: or_v2i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr x2, x2, x6 +; CHECK-NEXT: orr x0, x0, x4 +; CHECK-NEXT: orr x1, x1, x5 +; CHECK-NEXT: orr x3, x3, x7 +; CHECK-NEXT: ret +entry: + %s = or <2 x i128> %d, %e + ret <2 x i128> %s +} + +define <2 x i128> @xor_v2i128(<2 x i128> %d, <2 x i128> %e) { +; CHECK-LABEL: xor_v2i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor x2, x2, x6 +; CHECK-NEXT: eor x0, x0, x4 +; CHECK-NEXT: eor x1, x1, x5 +; CHECK-NEXT: eor x3, x3, x7 +; CHECK-NEXT: ret +entry: + %s = xor <2 x i128> %d, %e + ret <2 x i128> %s +} + +define <3 x i128> @and_v3i128(<3 x i128> %d, <3 x i128> %e) { +; CHECK-LABEL: and_v3i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x8, x9, [sp] +; CHECK-NEXT: and x0, x0, x6 +; CHECK-NEXT: ldp x11, x10, [sp, #16] +; CHECK-NEXT: and x1, x1, x7 +; CHECK-NEXT: and x2, x2, x8 +; CHECK-NEXT: and x3, x3, x9 +; CHECK-NEXT: and x4, x4, x11 +; CHECK-NEXT: and x5, x5, x10 +; CHECK-NEXT: ret +entry: + %s = and <3 x i128> %d, %e + ret <3 x i128> %s +} + +define <3 x i128> @or_v3i128(<3 x i128> %d, <3 x i128> %e) { +; CHECK-LABEL: or_v3i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x8, x9, [sp] +; CHECK-NEXT: orr x0, x0, x6 +; CHECK-NEXT: ldp x11, x10, [sp, #16] +; CHECK-NEXT: orr x1, x1, x7 +; CHECK-NEXT: orr x2, x2, x8 +; CHECK-NEXT: orr x3, x3, x9 +; CHECK-NEXT: orr x4, x4, x11 +; CHECK-NEXT: orr x5, x5, x10 +; CHECK-NEXT: ret +entry: + %s = or <3 x i128> %d, %e + ret <3 x i128> %s +} + +define <3 x i128> @xor_v3i128(<3 x i128> %d, <3 x i128> %e) { +; 
CHECK-LABEL: xor_v3i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x8, x9, [sp] +; CHECK-NEXT: eor x0, x0, x6 +; CHECK-NEXT: ldp x11, x10, [sp, #16] +; CHECK-NEXT: eor x1, x1, x7 +; CHECK-NEXT: eor x2, x2, x8 +; CHECK-NEXT: eor x3, x3, x9 +; CHECK-NEXT: eor x4, x4, x11 +; CHECK-NEXT: eor x5, x5, x10 +; CHECK-NEXT: ret +entry: + %s = xor <3 x i128> %d, %e + ret <3 x i128> %s +} + +define <4 x i128> @and_v4i128(<4 x i128> %d, <4 x i128> %e) { +; CHECK-LABEL: and_v4i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x9, x8, [sp, #32] +; CHECK-NEXT: ldp x11, x10, [sp] +; CHECK-NEXT: ldp x13, x12, [sp, #16] +; CHECK-NEXT: ldp x15, x14, [sp, #48] +; CHECK-NEXT: and x4, x4, x9 +; CHECK-NEXT: and x0, x0, x11 +; CHECK-NEXT: and x1, x1, x10 +; CHECK-NEXT: and x5, x5, x8 +; CHECK-NEXT: and x2, x2, x13 +; CHECK-NEXT: and x3, x3, x12 +; CHECK-NEXT: and x6, x6, x15 +; CHECK-NEXT: and x7, x7, x14 +; CHECK-NEXT: ret +entry: + %s = and <4 x i128> %d, %e + ret <4 x i128> %s +} + +define <4 x i128> @or_v4i128(<4 x i128> %d, <4 x i128> %e) { +; CHECK-LABEL: or_v4i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x9, x8, [sp, #32] +; CHECK-NEXT: ldp x11, x10, [sp] +; CHECK-NEXT: ldp x13, x12, [sp, #16] +; CHECK-NEXT: ldp x15, x14, [sp, #48] +; CHECK-NEXT: orr x4, x4, x9 +; CHECK-NEXT: orr x0, x0, x11 +; CHECK-NEXT: orr x1, x1, x10 +; CHECK-NEXT: orr x5, x5, x8 +; CHECK-NEXT: orr x2, x2, x13 +; CHECK-NEXT: orr x3, x3, x12 +; CHECK-NEXT: orr x6, x6, x15 +; CHECK-NEXT: orr x7, x7, x14 +; CHECK-NEXT: ret +entry: + %s = or <4 x i128> %d, %e + ret <4 x i128> %s +} + +define <4 x i128> @xor_v4i128(<4 x i128> %d, <4 x i128> %e) { +; CHECK-LABEL: xor_v4i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x9, x8, [sp, #32] +; CHECK-NEXT: ldp x11, x10, [sp] +; CHECK-NEXT: ldp x13, x12, [sp, #16] +; CHECK-NEXT: ldp x15, x14, [sp, #48] +; CHECK-NEXT: eor x4, x4, x9 +; CHECK-NEXT: eor x0, x0, x11 +; CHECK-NEXT: eor x1, x1, x10 +; CHECK-NEXT: eor x5, x5, x8 +; CHECK-NEXT: eor x2, x2, 
x13 +; CHECK-NEXT: eor x3, x3, x12 +; CHECK-NEXT: eor x6, x6, x15 +; CHECK-NEXT: eor x7, x7, x14 +; CHECK-NEXT: ret +entry: + %s = xor <4 x i128> %d, %e + ret <4 x i128> %s +} From b02b5b7b598ff146f8d5ed529412236533429403 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 23 Aug 2024 09:59:19 +0100 Subject: [PATCH 298/426] [AMDGPU] Simplify use of hasMovrel and hasVGPRIndexMode (#105680) The generic subtarget has neither of these features. Rather than forcing HasMovrel on, it is simpler to expand dynamic vector indexing to a sequence of compare/select instructions. NFC for real subtargets. --- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 12 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 +- .../CodeGen/AMDGPU/indirect-addressing-si.ll | 1704 +++++++++++++++++ 3 files changed, 1712 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index b3872a6374261b..352994e541fc88 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -143,14 +143,8 @@ GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, if (LDSBankCount == 0) LDSBankCount = 32; - if (TT.getArch() == Triple::amdgcn) { - if (LocalMemorySize == 0) - LocalMemorySize = 32768; - - // Do something sensible for unspecified target. 
- if (!HasMovrel && !HasVGPRIndexMode) - HasMovrel = true; - } + if (TT.getArch() == Triple::amdgcn && LocalMemorySize == 0) + LocalMemorySize = 32768; AddressableLocalMemorySize = LocalMemorySize; @@ -366,7 +360,7 @@ bool GCNSubtarget::hasMadF16() const { } bool GCNSubtarget::useVGPRIndexMode() const { - return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode()); + return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode); } bool GCNSubtarget::useAA() const { return UseAA; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c954c0aa71f734..ecd4451c504727 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -13350,12 +13350,15 @@ bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize, // On some architectures (GFX9) movrel is not available and it's better // to expand. - if (!Subtarget->hasMovrel()) + if (Subtarget->useVGPRIndexMode()) return NumInsts <= 16; // If movrel is available, use it instead of expanding for vector of 8 // elements. 
- return NumInsts <= 15; + if (Subtarget->hasMovrel()) + return NumInsts <= 15; + + return true; } bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const { diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index c130eb04d02370..a33142fd0ab1f3 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GENERIC %s ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -O0 < %s | FileCheck -check-prefix=NOOPT %s ; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI-MOVREL %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI,VI-MOVREL %s @@ -8,6 +9,75 @@ ; Tests for indirect addressing on SI, which is implemented using dynamic ; indexing of vectors. define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { +; GENERIC-LABEL: extract_w_offset: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000 +; GENERIC-NEXT: v_mov_b32_e32 v1, 0x40a00000 +; GENERIC-NEXT: v_mov_b32_e32 v2, 0x40c00000 +; GENERIC-NEXT: v_mov_b32_e32 v3, 0x40e00000 +; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000 +; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41100000 +; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41200000 +; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41300000 +; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41400000 +; GENERIC-NEXT: v_mov_b32_e32 v9, 0x41500000 +; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41600000 +; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41700000 +; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41800000 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_add_i32 s6, s4, 1 +; 
GENERIC-NEXT: s_cmp_eq_u32 s6, 1 +; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v13, 1.0, 2.0, s[4:5] +; GENERIC-NEXT: s_cmp_lg_u32 s6, 2 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 3 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, 4.0, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 4 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 5 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 6 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 7 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 8 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 9 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 10 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 11 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 12 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 13 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 14 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 15 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc +; GENERIC-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: extract_w_offset: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -201,6 +271,65 @@ entry: ; XXX: Could do v_or_b32 directly define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %out, i32 %in, <16 x i32> %or.val) { +; GENERIC-LABEL: extract_w_offset_salu_use_vector: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dword s20, s[2:3], 0xb +; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_add_i32 s20, s20, 1 +; GENERIC-NEXT: s_or_b32 s2, s19, 16 +; GENERIC-NEXT: s_or_b32 s18, s18, 15 +; GENERIC-NEXT: s_or_b32 s17, s17, 14 +; GENERIC-NEXT: s_or_b32 s16, s16, 13 +; GENERIC-NEXT: s_or_b32 s15, s15, 12 +; GENERIC-NEXT: s_or_b32 s14, s14, 11 +; GENERIC-NEXT: s_or_b32 s13, s13, 10 +; GENERIC-NEXT: s_or_b32 s12, s12, 9 +; GENERIC-NEXT: s_or_b32 s11, s11, 8 +; GENERIC-NEXT: s_or_b32 s10, s10, 7 +; GENERIC-NEXT: s_or_b32 s9, s9, 6 +; GENERIC-NEXT: s_or_b32 s8, s8, 5 +; GENERIC-NEXT: s_or_b32 s7, s7, 4 +; GENERIC-NEXT: s_or_b32 s6, s6, 3 +; GENERIC-NEXT: s_or_b32 s4, s4, 1 +; GENERIC-NEXT: s_or_b32 s5, s5, 2 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 1 +; GENERIC-NEXT: s_cselect_b32 s4, s5, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 2 +; GENERIC-NEXT: s_cselect_b32 s4, s6, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 3 +; GENERIC-NEXT: s_cselect_b32 s4, s7, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 4 +; GENERIC-NEXT: s_cselect_b32 s4, s8, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 5 +; GENERIC-NEXT: s_cselect_b32 s4, s9, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 6 +; GENERIC-NEXT: s_cselect_b32 s4, s10, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 7 +; GENERIC-NEXT: s_cselect_b32 s4, s11, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 8 +; GENERIC-NEXT: s_cselect_b32 s4, s12, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 9 +; GENERIC-NEXT: s_cselect_b32 s4, s13, s4 +; 
GENERIC-NEXT: s_cmp_eq_u32 s20, 10 +; GENERIC-NEXT: s_cselect_b32 s4, s14, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 11 +; GENERIC-NEXT: s_cselect_b32 s4, s15, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 12 +; GENERIC-NEXT: s_cselect_b32 s4, s16, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 13 +; GENERIC-NEXT: s_cselect_b32 s4, s17, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 14 +; GENERIC-NEXT: s_cselect_b32 s4, s18, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 15 +; GENERIC-NEXT: s_cselect_b32 s4, s2, s4 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_mov_b32_e32 v0, s4 +; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: extract_w_offset_salu_use_vector: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -494,6 +623,74 @@ entry: } define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) { +; GENERIC-LABEL: extract_wo_offset: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dword s6, s[2:3], 0xb +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000 +; GENERIC-NEXT: v_mov_b32_e32 v1, 0x40a00000 +; GENERIC-NEXT: v_mov_b32_e32 v2, 0x40c00000 +; GENERIC-NEXT: v_mov_b32_e32 v3, 0x40e00000 +; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000 +; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41100000 +; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41200000 +; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41300000 +; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41400000 +; GENERIC-NEXT: v_mov_b32_e32 v9, 0x41500000 +; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41600000 +; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41700000 +; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41800000 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_cmp_eq_u32 s6, 1 +; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v13, 1.0, 2.0, s[4:5] +; GENERIC-NEXT: s_cmp_lg_u32 s6, 2 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 
v0, v0, v13, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 3 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, 4.0, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 4 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 5 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 6 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 7 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 8 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 9 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 10 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 11 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 12 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 13 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 14 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s6, 15 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc +; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: extract_wo_offset: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -679,6 +876,50 @@ entry: } define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) 
%out, i32 %offset) { +; GENERIC-LABEL: extract_neg_offset_sgpr: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_add_i32 s2, s4, 0xfffffe00 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 1 +; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GENERIC-NEXT: s_cmp_lg_u32 s2, 2 +; GENERIC-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GENERIC-NEXT: v_readfirstlane_b32 s4, v0 +; GENERIC-NEXT: s_cselect_b32 s4, s4, 2 +; GENERIC-NEXT: s_cmp_lg_u32 s2, 3 +; GENERIC-NEXT: s_cselect_b32 s4, s4, 3 +; GENERIC-NEXT: s_cmp_lg_u32 s2, 4 +; GENERIC-NEXT: s_cselect_b32 s4, s4, 5 +; GENERIC-NEXT: s_cmp_lg_u32 s2, 5 +; GENERIC-NEXT: s_cselect_b32 s4, s4, 6 +; GENERIC-NEXT: s_cmp_lg_u32 s2, 6 +; GENERIC-NEXT: s_cselect_b32 s4, s4, 7 +; GENERIC-NEXT: s_cmp_lg_u32 s2, 7 +; GENERIC-NEXT: s_cselect_b32 s4, s4, 8 +; GENERIC-NEXT: s_cmp_lg_u32 s2, 8 +; GENERIC-NEXT: s_cselect_b32 s4, s4, 9 +; GENERIC-NEXT: s_cmp_lg_u32 s2, 9 +; GENERIC-NEXT: s_cselect_b32 s4, s4, 10 +; GENERIC-NEXT: s_cmp_lg_u32 s2, 10 +; GENERIC-NEXT: s_cselect_b32 s4, s4, 11 +; GENERIC-NEXT: s_cmp_lg_u32 s2, 11 +; GENERIC-NEXT: s_cselect_b32 s4, s4, 12 +; GENERIC-NEXT: s_cmp_lg_u32 s2, 12 +; GENERIC-NEXT: s_cselect_b32 s4, s4, 13 +; GENERIC-NEXT: s_cmp_lg_u32 s2, 13 +; GENERIC-NEXT: s_cselect_b32 s4, s4, 14 +; GENERIC-NEXT: s_cmp_lg_u32 s2, 14 +; GENERIC-NEXT: s_cselect_b32 s4, s4, 15 +; GENERIC-NEXT: s_cmp_lg_u32 s2, 15 +; GENERIC-NEXT: s_cselect_b32 s4, s4, 16 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_mov_b32_e32 v0, s4 +; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: extract_neg_offset_sgpr: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -866,6 +1107,66 @@ entry: } define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, <16 x i32> %vec0, <16 x i32> 
%vec1, i32 %offset) { +; GENERIC-LABEL: extract_neg_offset_sgpr_loaded: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; GENERIC-NEXT: s_load_dword s20, s[2:3], 0x39 +; GENERIC-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x29 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_addk_i32 s20, 0xfe00 +; GENERIC-NEXT: s_or_b32 s2, s19, s51 +; GENERIC-NEXT: s_or_b32 s18, s18, s50 +; GENERIC-NEXT: s_or_b32 s17, s17, s49 +; GENERIC-NEXT: s_or_b32 s16, s16, s48 +; GENERIC-NEXT: s_or_b32 s15, s15, s47 +; GENERIC-NEXT: s_or_b32 s14, s14, s46 +; GENERIC-NEXT: s_or_b32 s13, s13, s45 +; GENERIC-NEXT: s_or_b32 s12, s12, s44 +; GENERIC-NEXT: s_or_b32 s11, s11, s43 +; GENERIC-NEXT: s_or_b32 s10, s10, s42 +; GENERIC-NEXT: s_or_b32 s9, s9, s41 +; GENERIC-NEXT: s_or_b32 s8, s8, s40 +; GENERIC-NEXT: s_or_b32 s7, s7, s39 +; GENERIC-NEXT: s_or_b32 s6, s6, s38 +; GENERIC-NEXT: s_or_b32 s4, s4, s36 +; GENERIC-NEXT: s_or_b32 s5, s5, s37 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 1 +; GENERIC-NEXT: s_cselect_b32 s4, s5, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 2 +; GENERIC-NEXT: s_cselect_b32 s4, s6, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 3 +; GENERIC-NEXT: s_cselect_b32 s4, s7, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 4 +; GENERIC-NEXT: s_cselect_b32 s4, s8, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 5 +; GENERIC-NEXT: s_cselect_b32 s4, s9, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 6 +; GENERIC-NEXT: s_cselect_b32 s4, s10, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 7 +; GENERIC-NEXT: s_cselect_b32 s4, s11, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 8 +; GENERIC-NEXT: s_cselect_b32 s4, s12, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 9 +; GENERIC-NEXT: s_cselect_b32 s4, s13, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 10 +; GENERIC-NEXT: s_cselect_b32 s4, s14, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 11 +; GENERIC-NEXT: s_cselect_b32 s4, s15, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 12 +; GENERIC-NEXT: 
s_cselect_b32 s4, s16, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 13 +; GENERIC-NEXT: s_cselect_b32 s4, s17, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 14 +; GENERIC-NEXT: s_cselect_b32 s4, s18, s4 +; GENERIC-NEXT: s_cmp_eq_u32 s20, 15 +; GENERIC-NEXT: s_cselect_b32 s4, s2, s4 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_mov_b32_e32 v0, s4 +; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: extract_neg_offset_sgpr_loaded: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -1161,6 +1462,46 @@ entry: } define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { +; GENERIC-LABEL: extract_neg_offset_vgpr: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_add_i32_e32 v0, vcc, 0xfffffe00, v0 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GENERIC-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 6, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 7, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 8, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 9, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 10, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 11, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 12, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, 
v0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 13, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 14, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 15, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, 16, v1, vcc +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: extract_neg_offset_vgpr: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 @@ -1458,6 +1799,18 @@ entry: ; undefined behavior, but shouldn't crash compiler define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GENERIC-LABEL: extract_undef_offset_sgpr: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GENERIC-NEXT: s_mov_b32 s7, 0xf000 +; GENERIC-NEXT: s_mov_b32 s6, -1 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_mov_b32 s4, s2 +; GENERIC-NEXT: s_mov_b32 s5, s3 +; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: extract_undef_offset_sgpr: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb @@ -1513,6 +1866,10 @@ entry: ; undefined behavior, but shouldn't crash compiler define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GENERIC-LABEL: insert_undef_offset_sgpr_vector_src: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: insert_undef_offset_sgpr_vector_src: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_endpgm @@ -1536,6 +1893,84 @@ entry: } define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { +; GENERIC-LABEL: insert_w_offset: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb +; 
GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000 +; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000 +; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000 +; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000 +; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000 +; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000 +; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000 +; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000 +; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000 +; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000 +; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41600000 +; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_add_i32 s4, s4, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 3 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 2 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 1 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 0 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 7 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 6 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 5 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 4 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 11 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc +; GENERIC-NEXT: 
s_cmp_eq_u32 s4, 10 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 9 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 8 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 15 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 14 +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 13 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 12 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: insert_w_offset: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -1813,6 +2248,85 @@ entry: } define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %out, i16 %in) { +; GENERIC-LABEL: insert_unsigned_base_plus_offset: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000 +; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000 +; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000 +; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000 +; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000 
+; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000 +; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000 +; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000 +; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000 +; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000 +; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41600000 +; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_and_b32 s4, s4, 0xffff +; GENERIC-NEXT: s_add_i32 s4, s4, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 3 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 2 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 1 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 0 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 7 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 6 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 5 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 4 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 11 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 10 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 9 +; 
GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 8 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 15 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 14 +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 13 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 12 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: insert_unsigned_base_plus_offset: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -2091,6 +2605,85 @@ entry: } define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, i16 %in) { +; GENERIC-LABEL: insert_signed_base_plus_offset: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000 +; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000 +; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000 +; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000 +; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000 +; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000 +; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000 +; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000 +; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000 +; 
GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000 +; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41600000 +; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_sext_i32_i16 s4, s4 +; GENERIC-NEXT: s_add_i32 s4, s4, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 3 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 2 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 1 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 0 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 7 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 6 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 5 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 4 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 11 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 10 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 9 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 8 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc +; GENERIC-NEXT: 
s_cmp_eq_u32 s4, 15 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 14 +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 13 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 12 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: insert_signed_base_plus_offset: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -2376,6 +2969,83 @@ entry: ; Make sure that TwoAddressInstructions keeps src0 as subregister sub0 ; of the tied implicit use and def of the super register. 
define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { +; GENERIC-LABEL: insert_wo_offset: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000 +; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000 +; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000 +; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000 +; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000 +; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000 +; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000 +; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000 +; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000 +; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000 +; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000 +; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000 +; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41600000 +; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_cmp_eq_u32 s4, 3 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 2 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 1 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 0 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 7 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 6 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 5 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 4 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; 
GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 11 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 10 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 9 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 8 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 15 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 14 +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 13 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 12 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: insert_wo_offset: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -2646,6 +3316,71 @@ entry: } define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %offset) { +; GENERIC-LABEL: insert_neg_offset_sgpr: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xd +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: s_waitcnt 
lgkmcnt(0) +; GENERIC-NEXT: s_add_i32 s6, s4, 0xfffffe00 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 0 +; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 3 +; GENERIC-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GENERIC-NEXT: s_cselect_b32 s4, 16, 3 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 2 +; GENERIC-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GENERIC-NEXT: s_cselect_b32 s5, 16, 2 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 1 +; GENERIC-NEXT: v_mov_b32_e32 v3, s4 +; GENERIC-NEXT: s_cselect_b32 s4, 16, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 7 +; GENERIC-NEXT: v_mov_b32_e32 v2, s5 +; GENERIC-NEXT: s_cselect_b32 s5, 16, 7 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 6 +; GENERIC-NEXT: v_mov_b32_e32 v1, s4 +; GENERIC-NEXT: s_cselect_b32 s4, 16, 6 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 5 +; GENERIC-NEXT: v_mov_b32_e32 v7, s5 +; GENERIC-NEXT: s_cselect_b32 s5, 16, 5 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 4 +; GENERIC-NEXT: v_mov_b32_e32 v6, s4 +; GENERIC-NEXT: s_cselect_b32 s4, 16, 4 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 11 +; GENERIC-NEXT: v_mov_b32_e32 v5, s5 +; GENERIC-NEXT: s_cselect_b32 s5, 16, 11 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 10 +; GENERIC-NEXT: v_mov_b32_e32 v4, s4 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GENERIC-NEXT: s_cselect_b32 s4, 16, 10 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 9 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_mov_b32_e32 v7, s5 +; GENERIC-NEXT: s_cselect_b32 s5, 16, 9 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 8 +; GENERIC-NEXT: v_mov_b32_e32 v6, s4 +; GENERIC-NEXT: s_cselect_b32 s4, 16, 8 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 15 +; GENERIC-NEXT: v_mov_b32_e32 v5, s5 +; GENERIC-NEXT: s_cselect_b32 s5, 16, 15 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 14 +; GENERIC-NEXT: v_mov_b32_e32 v4, s4 +; GENERIC-NEXT: s_cselect_b32 s4, 16, 14 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 13 +; GENERIC-NEXT: s_cselect_b32 s7, 16, 13 +; GENERIC-NEXT: s_cmp_eq_u32 s6, 12 +; GENERIC-NEXT: s_cselect_b32 s6, 16, 12 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 
+; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_mov_b32_e32 v7, s5 +; GENERIC-NEXT: v_mov_b32_e32 v6, s4 +; GENERIC-NEXT: v_mov_b32_e32 v5, s7 +; GENERIC-NEXT: v_mov_b32_e32 v4, s6 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: insert_neg_offset_sgpr: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb @@ -2930,6 +3665,71 @@ entry: ; The vector indexed into is originally loaded into an SGPR rather ; than built with a reg_sequence define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ptr addrspace(1) %out, <16 x i32> %vec, i32 %offset) { +; GENERIC-LABEL: insert_neg_offset_sgpr_loadreg: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0xb +; GENERIC-NEXT: s_load_dword s20, s[2:3], 0x29 +; GENERIC-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x19 +; GENERIC-NEXT: s_mov_b32 s19, 0xf000 +; GENERIC-NEXT: s_mov_b32 s18, -1 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_addk_i32 s20, 0xfe00 +; GENERIC-NEXT: s_cmp_lg_u32 s20, 3 +; GENERIC-NEXT: s_cselect_b32 s3, s3, 5 +; GENERIC-NEXT: s_cmp_lg_u32 s20, 2 +; GENERIC-NEXT: s_cselect_b32 s2, s2, 5 +; GENERIC-NEXT: s_cmp_lg_u32 s20, 1 +; GENERIC-NEXT: v_mov_b32_e32 v3, s3 +; GENERIC-NEXT: s_cselect_b32 s1, s1, 5 +; GENERIC-NEXT: s_cmp_lg_u32 s20, 0 +; GENERIC-NEXT: v_mov_b32_e32 v2, s2 +; GENERIC-NEXT: s_cselect_b32 s0, s0, 5 +; GENERIC-NEXT: s_cmp_lg_u32 s20, 7 +; GENERIC-NEXT: v_mov_b32_e32 v1, s1 +; GENERIC-NEXT: s_cselect_b32 s1, s7, 5 +; GENERIC-NEXT: s_cmp_lg_u32 s20, 6 +; GENERIC-NEXT: v_mov_b32_e32 v0, s0 +; GENERIC-NEXT: s_cselect_b32 s0, s6, 5 +; GENERIC-NEXT: s_cmp_lg_u32 s20, 5 +; GENERIC-NEXT: v_mov_b32_e32 v7, s1 +; GENERIC-NEXT: s_cselect_b32 s1, s5, 5 +; GENERIC-NEXT: s_cmp_lg_u32 s20, 4 +; GENERIC-NEXT: v_mov_b32_e32 v6, s0 +; GENERIC-NEXT: s_cselect_b32 s0, s4, 5 +; GENERIC-NEXT: 
s_cmp_lg_u32 s20, 11 +; GENERIC-NEXT: v_mov_b32_e32 v5, s1 +; GENERIC-NEXT: s_cselect_b32 s1, s11, 5 +; GENERIC-NEXT: s_cmp_lg_u32 s20, 10 +; GENERIC-NEXT: v_mov_b32_e32 v4, s0 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16 +; GENERIC-NEXT: s_cselect_b32 s0, s10, 5 +; GENERIC-NEXT: s_cmp_lg_u32 s20, 9 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_mov_b32_e32 v7, s1 +; GENERIC-NEXT: s_cselect_b32 s1, s9, 5 +; GENERIC-NEXT: s_cmp_lg_u32 s20, 8 +; GENERIC-NEXT: v_mov_b32_e32 v6, s0 +; GENERIC-NEXT: s_cselect_b32 s0, s8, 5 +; GENERIC-NEXT: s_cmp_lg_u32 s20, 15 +; GENERIC-NEXT: v_mov_b32_e32 v5, s1 +; GENERIC-NEXT: s_cselect_b32 s1, s15, 5 +; GENERIC-NEXT: s_cmp_lg_u32 s20, 14 +; GENERIC-NEXT: v_mov_b32_e32 v4, s0 +; GENERIC-NEXT: s_cselect_b32 s0, s14, 5 +; GENERIC-NEXT: s_cmp_lg_u32 s20, 13 +; GENERIC-NEXT: s_cselect_b32 s2, s13, 5 +; GENERIC-NEXT: s_cmp_lg_u32 s20, 12 +; GENERIC-NEXT: s_cselect_b32 s3, s12, 5 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:32 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_mov_b32_e32 v7, s1 +; GENERIC-NEXT: v_mov_b32_e32 v6, s0 +; GENERIC-NEXT: v_mov_b32_e32 v5, s2 +; GENERIC-NEXT: v_mov_b32_e32 v4, s3 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: insert_neg_offset_sgpr_loadreg: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb @@ -3170,6 +3970,51 @@ entry: } define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GENERIC-LABEL: insert_neg_offset_vgpr: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_add_i32_e32 v12, vcc, 0xfffffe00, v0 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 4, 33, vcc +; 
GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v2, 3, 33, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v1, 2, 33, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v0, 1, 33, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v7, 8, 33, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v6, 7, 33, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v5, 6, 33, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v4, 5, 33, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v11, 12, 33, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v10, 11, 33, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v9, 10, 33, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v14, 15, 33, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v13, 14, 33, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 +; GENERIC-NEXT: v_cndmask_b32_e64 v12, 13, 33, vcc +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: insert_neg_offset_vgpr: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 @@ -3611,6 +4456,52 @@ entry: } define amdgpu_kernel void 
@insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GENERIC-LABEL: insert_neg_inline_offset_vgpr: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_add_i32_e32 v12, vcc, -16, v0 +; GENERIC-NEXT: v_mov_b32_e32 v16, 0x1f4 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4, v16, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, 3, v16, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2, v16, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1, v16, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, 8, v16, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, 7, v16, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, 6, v16, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, 5, v16, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, 12, v16, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, 11, v16, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, 10, v16, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, 9, v16, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v15, 16, v16, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v14, 15, v16, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v13, 14, v16, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 +; GENERIC-NEXT: v_cndmask_b32_e32 v12, 13, v16, vcc +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: 
buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: insert_neg_inline_offset_vgpr: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 @@ -4057,6 +4948,98 @@ entry: ; When the block is split to insert the loop, make sure any other ; places that need to be expanded in the same block are also handled. define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) { +; GENERIC-LABEL: extract_vgpr_offset_multiple_in_block: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GENERIC-NEXT: s_mov_b32 s11, 0xf000 +; GENERIC-NEXT: s_mov_b32 s6, 0 +; GENERIC-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GENERIC-NEXT: v_mov_b32_e32 v2, 0 +; GENERIC-NEXT: s_mov_b32 s7, s11 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; GENERIC-NEXT: s_mov_b32 s10, -1 +; GENERIC-NEXT: ;;#ASMSTART +; GENERIC-NEXT: s_mov_b32 s4, 17 +; GENERIC-NEXT: ;;#ASMEND +; GENERIC-NEXT: v_add_i32_e32 v2, vcc, 1, v1 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GENERIC-NEXT: v_cndmask_b32_e64 v3, 7, 9, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v1 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 11, v3, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GENERIC-NEXT: v_cndmask_b32_e64 v4, 7, 9, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v1 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 13, v3, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, 11, v4, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v1 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 5, v3, vcc +; GENERIC-NEXT: 
v_cmp_ne_u32_e32 vcc, 3, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, 13, v4, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v1 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 6, v3, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v1 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 7, v3, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, 6, v4, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v1 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 8, v3, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, 7, v4, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v1 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 9, v3, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, 8, v4, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v1 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 10, v3, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, 9, v4, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v1 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 11, v3, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, 10, v4, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v1 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 12, v3, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, 11, v4, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v1 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 13, v3, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, 12, v4, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v1 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 14, v3, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, 13, v4, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v1 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 15, v3, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, 14, v4, vcc 
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v1 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, 16, v3, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 15, v4, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, 16, v3, vcc +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: buffer_store_dword v1, off, s[8:11], 0 +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GENERIC-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GENERIC-NEXT: s_cbranch_execz .LBB16_2 +; GENERIC-NEXT: ; %bb.1: ; %bb1 +; GENERIC-NEXT: v_mov_b32_e32 v0, s4 +; GENERIC-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: .LBB16_2: ; %bb2 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: extract_vgpr_offset_multiple_in_block: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 @@ -4717,6 +5700,122 @@ bb2: } define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) { +; GENERIC-LABEL: insert_vgpr_offset_multiple_in_block: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx2 s[24:25], s[2:3], 0xd +; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; GENERIC-NEXT: s_mov_b32 s23, 0xf000 +; GENERIC-NEXT: s_mov_b32 s26, 0 +; GENERIC-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GENERIC-NEXT: v_mov_b32_e32 v2, 0 +; GENERIC-NEXT: s_mov_b32 s27, s23 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: buffer_load_dword v2, v[1:2], s[24:27], 0 addr64 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 +; GENERIC-NEXT: s_mov_b32 s22, -1 +; GENERIC-NEXT: ;;#ASMSTART +; GENERIC-NEXT: v_mov_b32 v1, 62 +; GENERIC-NEXT: ;;#ASMEND +; GENERIC-NEXT: v_mov_b32_e32 v3, s16 +; GENERIC-NEXT: v_mov_b32_e32 v4, s17 +; GENERIC-NEXT: 
v_mov_b32_e32 v5, s18 +; GENERIC-NEXT: v_mov_b32_e32 v6, s19 +; GENERIC-NEXT: v_mov_b32_e32 v7, s12 +; GENERIC-NEXT: v_mov_b32_e32 v8, s13 +; GENERIC-NEXT: v_mov_b32_e32 v9, s14 +; GENERIC-NEXT: v_mov_b32_e32 v10, s15 +; GENERIC-NEXT: v_mov_b32_e32 v11, s8 +; GENERIC-NEXT: v_mov_b32_e32 v12, s9 +; GENERIC-NEXT: v_mov_b32_e32 v13, s10 +; GENERIC-NEXT: v_mov_b32_e32 v14, s11 +; GENERIC-NEXT: v_mov_b32_e32 v15, s4 +; GENERIC-NEXT: v_mov_b32_e32 v16, s5 +; GENERIC-NEXT: v_mov_b32_e32 v17, s6 +; GENERIC-NEXT: v_mov_b32_e32 v18, s7 +; GENERIC-NEXT: v_add_i32_e32 v19, vcc, 1, v2 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; 
GENERIC-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48 +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32 +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: 
buffer_store_dwordx4 v[6:9], off, s[20:23], 0 offset:16 +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: buffer_store_dwordx4 v[2:5], off, s[20:23], 0 +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GENERIC-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GENERIC-NEXT: s_cbranch_execz .LBB17_2 +; GENERIC-NEXT: ; %bb.1: ; %bb1 +; GENERIC-NEXT: buffer_store_dword v1, off, s[20:23], 0 +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: .LBB17_2: ; %bb2 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: insert_vgpr_offset_multiple_in_block: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_mov_b32 s28, SCRATCH_RSRC_DWORD0 @@ -5530,6 +6629,136 @@ bb2: ; Avoid inserting extra v_mov from copies within the vgpr indexing sequence. The ; gpr_idx mode switching sequence is expanded late for this reason. define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) { +; GENERIC-LABEL: insert_w_offset_multiple_in_block: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41500000 +; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41880000 +; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41600000 +; GENERIC-NEXT: v_mov_b32_e32 v2, 0x41700000 +; GENERIC-NEXT: v_mov_b32_e32 v3, 0x41800000 +; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41100000 +; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41200000 +; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41300000 +; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41400000 +; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000 +; GENERIC-NEXT: v_mov_b32_e32 v10, 0x40c00000 +; GENERIC-NEXT: v_mov_b32_e32 v11, 0x40e00000 +; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41000000 +; GENERIC-NEXT: v_mov_b32_e32 v15, 0x40400000 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_add_i32 s5, s4, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s5, 12 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: 
v_cndmask_b32_e32 v0, v0, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 13 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 14 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 15 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 8 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 9 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 10 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 11 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 4 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 5 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, v10, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 6 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 7 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v12, v12, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 0 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v13, 1.0, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 1 +; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v14, 2.0, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 2 +; GENERIC-NEXT: 
s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v15, v15, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 3 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v16, 4.0, v8, vcc +; GENERIC-NEXT: s_add_i32 s4, s4, 2 +; GENERIC-NEXT: s_cmp_lg_u32 s4, 3 +; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e32 v16, v8, v16, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 2 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v15, v8, v15, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 1 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v14, v8, v14, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 0 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v13, v8, v13, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 7 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 6 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, v8, v11, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 5 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 4 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 11 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 10 +; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:80 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 9 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 8 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GENERIC-NEXT: 
s_cmp_lg_u32 s4, 15 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 14 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 13 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 12 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: insert_w_offset_multiple_in_block: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -5998,6 +7227,40 @@ entry: ; Make sure we don't hit use of undefined register errors when expanding an ; extract with undef index. 
define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { +; GENERIC-LABEL: extract_adjacent_blocks: +; GENERIC: ; %bb.0: ; %bb +; GENERIC-NEXT: s_load_dword s0, s[2:3], 0x9 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_cmp_lg_u32 s0, 0 +; GENERIC-NEXT: s_cbranch_scc0 .LBB19_4 +; GENERIC-NEXT: ; %bb.1: ; %bb4 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: ;;#ASMSTART +; GENERIC-NEXT: ; reg use v[0:3] +; GENERIC-NEXT: ;;#ASMEND +; GENERIC-NEXT: s_mov_b64 vcc, exec +; GENERIC-NEXT: s_cbranch_execnz .LBB19_3 +; GENERIC-NEXT: .LBB19_2: ; %bb1 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: ;;#ASMSTART +; GENERIC-NEXT: ; reg use v[0:3] +; GENERIC-NEXT: ;;#ASMEND +; GENERIC-NEXT: .LBB19_3: ; %bb7 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: s_endpgm +; GENERIC-NEXT: .LBB19_4: +; GENERIC-NEXT: s_mov_b64 vcc, 0 +; GENERIC-NEXT: s_branch .LBB19_2 +; ; NOOPT-LABEL: extract_adjacent_blocks: ; NOOPT: ; %bb.0: ; %bb ; NOOPT-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 @@ -6210,6 +7473,40 @@ bb7: } define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { +; GENERIC-LABEL: insert_adjacent_blocks: +; GENERIC: ; %bb.0: ; %bb +; GENERIC-NEXT: s_load_dword s0, s[2:3], 0x9 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_cmp_lg_u32 s0, 0 +; GENERIC-NEXT: s_cbranch_scc0 .LBB20_4 +; GENERIC-NEXT: ; %bb.1: ; %bb4 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: ;;#ASMSTART +; GENERIC-NEXT: ; reg use 
v[0:3] +; GENERIC-NEXT: ;;#ASMEND +; GENERIC-NEXT: s_mov_b64 vcc, exec +; GENERIC-NEXT: s_cbranch_execnz .LBB20_3 +; GENERIC-NEXT: .LBB20_2: ; %bb1 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: ;;#ASMSTART +; GENERIC-NEXT: ; reg use v[0:3] +; GENERIC-NEXT: ;;#ASMEND +; GENERIC-NEXT: .LBB20_3: ; %bb7 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: s_endpgm +; GENERIC-NEXT: .LBB20_4: +; GENERIC-NEXT: s_mov_b64 vcc, 0 +; GENERIC-NEXT: s_branch .LBB20_2 +; ; NOOPT-LABEL: insert_adjacent_blocks: ; NOOPT: ; %bb.0: ; %bb ; NOOPT-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 @@ -6430,6 +7727,24 @@ bb7: ; FIXME: Should be able to fold zero input to movreld to inline imm? define amdgpu_kernel void @multi_same_block(i32 %arg) { +; GENERIC-LABEL: multi_same_block: +; GENERIC: ; %bb.0: ; %bb +; GENERIC-NEXT: s_load_dword s0, s[2:3], 0x9 +; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41900000 +; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41b0cccd +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_add_i32 s2, s0, -16 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 1 +; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v0, v0, 4.0, s[0:1] +; GENERIC-NEXT: s_cmp_eq_u32 s2, 5 +; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e64 v1, v1, -4.0, s[0:1] +; GENERIC-NEXT: s_mov_b32 m0, -1 +; GENERIC-NEXT: ds_write_b32 v0, v0 +; GENERIC-NEXT: ds_write_b32 v0, v1 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: multi_same_block: ; NOOPT: ; %bb.0: ; %bb ; NOOPT-NEXT: s_load_dword s0, s[2:3], 0x9 @@ -6596,6 +7911,76 @@ bb: ; offset puts outside of superegister bounaries, so clamp to 1st element. 
define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) { +; GENERIC-LABEL: extract_largest_inbounds_offset: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GENERIC-NEXT: s_mov_b32 s7, 0xf000 +; GENERIC-NEXT: s_mov_b32 s6, -1 +; GENERIC-NEXT: s_load_dword s12, s[2:3], 0xd +; GENERIC-NEXT: s_mov_b32 s2, s6 +; GENERIC-NEXT: s_mov_b32 s3, s7 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_mov_b32 s4, s8 +; GENERIC-NEXT: s_mov_b32 s5, s9 +; GENERIC-NEXT: s_mov_b32 s0, s10 +; GENERIC-NEXT: s_mov_b32 s1, s11 +; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: s_add_i32 s12, s12, 15 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 1 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 2 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 3 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 4 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 5 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 6 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 7 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 8 +; GENERIC-NEXT: 
v_cndmask_b32_e32 v0, v0, v7, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 9 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 10 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 11 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 12 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 13 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 14 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 15 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc +; GENERIC-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: extract_largest_inbounds_offset: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -6790,6 +8175,76 @@ entry: } define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) { +; GENERIC-LABEL: extract_out_of_bounds_offset: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GENERIC-NEXT: s_mov_b32 s7, 0xf000 +; GENERIC-NEXT: s_mov_b32 s6, -1 +; GENERIC-NEXT: s_load_dword s12, s[2:3], 0xd +; GENERIC-NEXT: s_mov_b32 s2, s6 +; GENERIC-NEXT: s_mov_b32 s3, s7 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_mov_b32 s4, s8 +; GENERIC-NEXT: s_mov_b32 s5, s9 +; GENERIC-NEXT: s_mov_b32 s0, s10 +; GENERIC-NEXT: s_mov_b32 s1, s11 +; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; 
GENERIC-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: s_add_i32 s12, s12, 16 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 1 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 2 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 3 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 4 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 5 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 6 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 7 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 8 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 9 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 10 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 11 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 12 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 13 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GENERIC-NEXT: 
s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 14 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s12, 15 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc +; GENERIC-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: extract_out_of_bounds_offset: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -6984,6 +8439,77 @@ entry: } define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx.in) { +; GENERIC-LABEL: extractelement_v16i32_or_index: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GENERIC-NEXT: s_mov_b32 s7, 0xf000 +; GENERIC-NEXT: s_mov_b32 s6, -1 +; GENERIC-NEXT: s_load_dword s12, s[2:3], 0xd +; GENERIC-NEXT: s_mov_b32 s2, s6 +; GENERIC-NEXT: s_mov_b32 s3, s7 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_mov_b32 s4, s8 +; GENERIC-NEXT: s_mov_b32 s5, s9 +; GENERIC-NEXT: s_mov_b32 s0, s10 +; GENERIC-NEXT: s_mov_b32 s1, s11 +; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: s_lshl_b32 s0, s12, 2 +; GENERIC-NEXT: s_or_b32 s0, s0, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s0, 1 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s0, 2 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s0, 3 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; 
GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s0, 4 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s0, 5 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s0, 6 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s0, 7 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s0, 8 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s0, 9 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s0, 10 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s0, 11 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s0, 12 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s0, 13 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s0, 14 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_cmp_eq_u32 s0, 15 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc +; GENERIC-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: extractelement_v16i32_or_index: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -7179,6 +8705,89 @@ entry: } define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, <16 x float> %a, i32 %idx.in) 
nounwind { +; GENERIC-LABEL: insertelement_v16f32_or_index: +; GENERIC: ; %bb.0: +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 +; GENERIC-NEXT: s_load_dword s20, s[2:3], 0x29 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_mov_b32_e32 v10, 0x40a00000 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_lshl_b32 s20, s20, 2 +; GENERIC-NEXT: v_mov_b32_e32 v0, s7 +; GENERIC-NEXT: v_mov_b32_e32 v1, s6 +; GENERIC-NEXT: v_mov_b32_e32 v4, s5 +; GENERIC-NEXT: v_mov_b32_e32 v5, s4 +; GENERIC-NEXT: v_mov_b32_e32 v6, s11 +; GENERIC-NEXT: v_mov_b32_e32 v8, s10 +; GENERIC-NEXT: v_mov_b32_e32 v9, s9 +; GENERIC-NEXT: v_mov_b32_e32 v11, s8 +; GENERIC-NEXT: v_mov_b32_e32 v12, s15 +; GENERIC-NEXT: v_mov_b32_e32 v13, s14 +; GENERIC-NEXT: v_mov_b32_e32 v14, s13 +; GENERIC-NEXT: v_mov_b32_e32 v15, s12 +; GENERIC-NEXT: v_mov_b32_e32 v16, s19 +; GENERIC-NEXT: v_mov_b32_e32 v17, s18 +; GENERIC-NEXT: v_mov_b32_e32 v18, s17 +; GENERIC-NEXT: v_mov_b32_e32 v19, s16 +; GENERIC-NEXT: s_or_b32 s4, s20, 1 +; GENERIC-NEXT: s_cmp_lg_u32 s4, 3 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 2 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v10, v1, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 1 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 0 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v10, v5, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 7 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v10, v6, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 6 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v10, v8, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 5 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v10, 
v9, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 4 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, v10, v11, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 11 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v10, v12, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 10 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v10, v13, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 9 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v10, v14, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 8 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v10, v15, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 15 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v10, v16, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 14 +; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v10, v17, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 13 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v10, v18, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 12 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: insertelement_v16f32_or_index: ; NOOPT: ; %bb.0: ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 @@ -7429,6 +9038,34 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, } define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { +; GENERIC-LABEL: broken_phi_bb: +; GENERIC: ; %bb.0: ; %bb +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_mov_b32 s6, 8 +; 
GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: s_branch .LBB26_2 +; GENERIC-NEXT: .LBB26_1: ; %Flow +; GENERIC-NEXT: ; in Loop: Header=BB26_2 Depth=1 +; GENERIC-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GENERIC-NEXT: s_cbranch_vccz .LBB26_4 +; GENERIC-NEXT: .LBB26_2: ; %bb2 +; GENERIC-NEXT: ; =>This Inner Loop Header: Depth=1 +; GENERIC-NEXT: s_waitcnt lgkmcnt(0) +; GENERIC-NEXT: s_cmp_ge_i32 s6, s0 +; GENERIC-NEXT: s_mov_b64 s[4:5], -1 +; GENERIC-NEXT: ; implicit-def: $sgpr6 +; GENERIC-NEXT: s_cbranch_scc1 .LBB26_1 +; GENERIC-NEXT: ; %bb.3: ; %bb4 +; GENERIC-NEXT: ; in Loop: Header=BB26_2 Depth=1 +; GENERIC-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: s_mov_b64 s[4:5], 0 +; GENERIC-NEXT: s_mov_b32 s6, s1 +; GENERIC-NEXT: s_branch .LBB26_1 +; GENERIC-NEXT: .LBB26_4: ; %bb8 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: broken_phi_bb: ; NOOPT: ; %bb.0: ; %bb ; NOOPT-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 @@ -7846,6 +9483,73 @@ bb8: } define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace(4) %in, i32 %val, <4 x i32> inreg %desc, i32 inreg %A) { +; GENERIC-LABEL: insert_or_disj_index: +; GENERIC: ; %bb.0: ; %entry +; GENERIC-NEXT: v_mov_b32_e32 v2, s4 +; GENERIC-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; GENERIC-NEXT: s_mov_b32 s2, 0 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s0, s2 +; GENERIC-NEXT: s_mov_b32 s1, s2 +; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: v_readfirstlane_b32 s4, v2 +; GENERIC-NEXT: s_or_b32 s4, s4, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 3 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 2 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, 0, v4, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 1 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, 0, v4, vcc +; GENERIC-NEXT: 
s_cmp_eq_u32 s4, 0 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 7 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v12, 0, v4, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 6 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, 0, v4, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 5 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 4 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, 0, v4, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 11 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v13, 0, v4, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 10 +; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], v[0:1], s[0:3], 0 addr64 offset:16 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e32 v12, 0, v4, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 9 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, 0, v4, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 8 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 15 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v14, 0, v4, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 14 +; GENERIC-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[0:3], 0 addr64 offset:32 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: s_waitcnt expcnt(0) +; GENERIC-NEXT: v_cndmask_b32_e32 v13, 0, v4, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 13 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v12, 0, v4, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s4, 12 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, 0, v4, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[11:14], v[0:1], s[0:3], 0 addr64 offset:48 +; 
GENERIC-NEXT: buffer_store_dwordx4 v[5:8], v[0:1], s[0:3], 0 addr64 +; GENERIC-NEXT: s_endpgm +; ; NOOPT-LABEL: insert_or_disj_index: ; NOOPT: ; %bb.0: ; %entry ; NOOPT-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 From 96509bb98fc0a7e929304a64362baaa2589d5a6b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 23 Aug 2024 10:11:52 +0100 Subject: [PATCH 299/426] [Matrix] Preserve signedness when extending matrix index expression. (#103044) As per [1] the indices for a matrix element access operator shall have integral or unscoped enumeration types and be non-negative. At the moment, the index expression is converted to SizeType irrespective of the signedness of the index expression. This causes implicit sign conversion warnings if any of the indices is signed. As per the spec, using signed types as indices is allowed and should not cause any warnings. If the index expression is signed, extend to SignedSizeType to avoid the warning. [1] https://clang.llvm.org/docs/MatrixTypes.html#matrix-type-element-access-operator PR: https://github.com/llvm/llvm-project/pull/103044 --- clang/lib/CodeGen/CGExpr.cpp | 15 +++++++++++++-- clang/lib/CodeGen/CGExprScalar.cpp | 4 ++-- clang/lib/CodeGen/CodeGenFunction.h | 1 + clang/lib/Sema/SemaExpr.cpp | 3 +-- .../matrix-index-operator-sign-conversion.cpp | 7 ++----- 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 48d9a3b8a5acb3..426fccb721db84 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -4392,13 +4392,24 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, return LV; } +llvm::Value *CodeGenFunction::EmitMatrixIndexExpr(const Expr *E) { + llvm::Value *Idx = EmitScalarExpr(E); + if (Idx->getType() == IntPtrTy) + return Idx; + bool IsSigned = E->getType()->isSignedIntegerOrEnumerationType(); + return Builder.CreateIntCast(Idx, IntPtrTy, IsSigned); +} + LValue 
CodeGenFunction::EmitMatrixSubscriptExpr(const MatrixSubscriptExpr *E) { assert( !E->isIncomplete() && "incomplete matrix subscript expressions should be rejected during Sema"); LValue Base = EmitLValue(E->getBase()); - llvm::Value *RowIdx = EmitScalarExpr(E->getRowIdx()); - llvm::Value *ColIdx = EmitScalarExpr(E->getColumnIdx()); + + // Extend or truncate the index type to 32 or 64-bits if needed. + llvm::Value *RowIdx = EmitMatrixIndexExpr(E->getRowIdx()); + llvm::Value *ColIdx = EmitMatrixIndexExpr(E->getColumnIdx()); + llvm::Value *NumRows = Builder.getIntN( RowIdx->getType()->getScalarSizeInBits(), E->getBase()->getType()->castAs()->getNumRows()); diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 3bda254c86adf6..2a726bba2dd304 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -2007,8 +2007,8 @@ Value *ScalarExprEmitter::VisitMatrixSubscriptExpr(MatrixSubscriptExpr *E) { // Handle the vector case. The base must be a vector, the index must be an // integer value. 
- Value *RowIdx = Visit(E->getRowIdx()); - Value *ColumnIdx = Visit(E->getColumnIdx()); + Value *RowIdx = CGF.EmitMatrixIndexExpr(E->getRowIdx()); + Value *ColumnIdx = CGF.EmitMatrixIndexExpr(E->getColumnIdx()); const auto *MatrixTy = E->getBase()->getType()->castAs(); unsigned NumRows = MatrixTy->getNumRows(); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index e1b9ada3c1e1fd..05f85f8b95bfa2 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4280,6 +4280,7 @@ class CodeGenFunction : public CodeGenTypeCache { LValue EmitUnaryOpLValue(const UnaryOperator *E); LValue EmitArraySubscriptExpr(const ArraySubscriptExpr *E, bool Accessed = false); + llvm::Value *EmitMatrixIndexExpr(const Expr *E); LValue EmitMatrixSubscriptExpr(const MatrixSubscriptExpr *E); LValue EmitArraySectionExpr(const ArraySectionExpr *E, bool IsLowerBound = true); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index c67183df335dd5..ea57316ad8014e 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -5033,8 +5033,7 @@ ExprResult Sema::CreateBuiltinMatrixSubscriptExpr(Expr *Base, Expr *RowIdx, } } - ExprResult ConvExpr = - tryConvertExprToType(IndexExpr, Context.getSizeType()); + ExprResult ConvExpr = IndexExpr; assert(!ConvExpr.isInvalid() && "should be able to convert any integer type to size type"); return ConvExpr.get(); diff --git a/clang/test/SemaCXX/matrix-index-operator-sign-conversion.cpp b/clang/test/SemaCXX/matrix-index-operator-sign-conversion.cpp index 4254780651c5f5..e6fe4a6c57ff22 100644 --- a/clang/test/SemaCXX/matrix-index-operator-sign-conversion.cpp +++ b/clang/test/SemaCXX/matrix-index-operator-sign-conversion.cpp @@ -2,19 +2,16 @@ template using m __attribute__((__matrix_type__(R,C))) = T; -// FIXME: should not warn here. 
double index1(m X, int i) { return X[i][0]; } -// expected-warning@-1 {{implicit conversion changes signedness: 'int' to 'unsigned long'}} double index2(m X, unsigned i) { return X[i][0]; } double index3(m X, char i) { return X[i][0]; } -// expected-warning@-1 {{implicit conversion changes signedness: 'char' to 'unsigned long'}} double index4(m X, int i) { return X[0][i]; } -// expected-warning@-1 {{implicit conversion changes signedness: 'int' to 'unsigned long'}} double index5(m X, unsigned i) { return X[0][i]; } double index6(m X, char i) { return X[0][i]; } -// expected-warning@-1 {{implicit conversion changes signedness: 'char' to 'unsigned long'}} + +// expected-no-diagnostics From fa2dccb377d0b712223efe5b62e5fc633580a9e6 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 23 Aug 2024 10:31:33 +0100 Subject: [PATCH 300/426] [AMDGPU] Remove one case of vmcnt loop header flushing for GFX12 (#105550) When a loop contains a VMEM load whose result is only used outside the loop, do not bother to flush vmcnt in the loop head on GFX12. A wait for vmcnt will be required inside the loop anyway, because VMEM instructions can write their VGPR results out of order. 
--- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +- llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 4262e7b5d9c250..eafe20be17d5b9 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2390,7 +2390,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, } if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside) return true; - return HasVMemLoad && UsesVgprLoadedOutside; + return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder(); } bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir index bdef55ab956a01..0ddd2aa285b264 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir @@ -295,7 +295,7 @@ body: | # GFX12-LABEL: waitcnt_vm_loop2 # GFX12-LABEL: bb.0: # GFX12: BUFFER_LOAD_FORMAT_X_IDXEN -# GFX12: S_WAIT_LOADCNT 0 +# GFX12-NOT: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.1: # GFX12: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.2: @@ -342,7 +342,7 @@ body: | # GFX12-LABEL: waitcnt_vm_loop2_store # GFX12-LABEL: bb.0: # GFX12: BUFFER_LOAD_FORMAT_X_IDXEN -# GFX12: S_WAIT_LOADCNT 0 +# GFX12-NOT: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.1: # GFX12: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.2: @@ -499,9 +499,9 @@ body: | # GFX12-LABEL: waitcnt_vm_loop2_reginterval # GFX12-LABEL: bb.0: # GFX12: GLOBAL_LOAD_DWORDX4 -# GFX12: S_WAIT_LOADCNT 0 -# GFX12-LABEL: bb.1: # GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.2: name: waitcnt_vm_loop2_reginterval body: | @@ -600,7 +600,7 @@ body: | # GFX12-LABEL: bb.0: # GFX12: BUFFER_LOAD_FORMAT_X_IDXEN # GFX12: BUFFER_LOAD_FORMAT_X_IDXEN -# GFX12: S_WAIT_LOADCNT 0 
+# GFX12-NOT: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.1: # GFX12: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.2: From cf6cd1fd67356ca0c2972992928592d2430043d2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 23 Aug 2024 10:32:08 +0100 Subject: [PATCH 301/426] [MCA][X86] Add missing 512-bit vpscatterqd/vscatterqps schedule data (REAPPLIED) This doesn't match uops.info yet - but it matches the existing vpscatterdq/vscatterqpd entries like uops.info says it should Reapplied with codegen fix for scatter-schedule.ll Fixes #105675 --- llvm/lib/Target/X86/X86SchedIceLake.td | 2 ++ llvm/lib/Target/X86/X86SchedSkylakeServer.td | 2 ++ llvm/test/CodeGen/X86/scatter-schedule.ll | 4 ++-- .../llvm-mca/X86/IceLakeServer/resources-avx512.s | 10 +++++----- .../llvm-mca/X86/SkylakeServer/resources-avx512.s | 10 +++++----- 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td index fd372ba4656eba..29b1464e19a32b 100644 --- a/llvm/lib/Target/X86/X86SchedIceLake.td +++ b/llvm/lib/Target/X86/X86SchedIceLake.td @@ -1524,8 +1524,10 @@ def ICXWriteResGroup113 : SchedWriteRes<[ICXPort0,ICXPort49,ICXPort78,ICXPort015 let ReleaseAtCycles = [1,8,8,2]; } def: InstRW<[ICXWriteResGroup113], (instrs VPSCATTERDQZmr, + VPSCATTERQDZmr, VPSCATTERQQZmr, VSCATTERDPDZmr, + VSCATTERQPSZmr, VSCATTERQPDZmr)>; def ICXWriteResGroup114 : SchedWriteRes<[ICXPort0,ICXPort49,ICXPort5,ICXPort78,ICXPort0156]> { diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 4fded44085e897..2423602d06c470 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -1499,8 +1499,10 @@ def SKXWriteResGroup113 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort015 let ReleaseAtCycles = [1,8,8,2]; } def: InstRW<[SKXWriteResGroup113], (instrs VPSCATTERDQZmr, + VPSCATTERQDZmr, VPSCATTERQQZmr, VSCATTERDPDZmr, + VSCATTERQPSZmr, 
VSCATTERQPDZmr)>; def SKXWriteResGroup114 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> { diff --git a/llvm/test/CodeGen/X86/scatter-schedule.ll b/llvm/test/CodeGen/X86/scatter-schedule.ll index c841e23eab76b2..762a050247a87e 100644 --- a/llvm/test/CodeGen/X86/scatter-schedule.ll +++ b/llvm/test/CodeGen/X86/scatter-schedule.ll @@ -10,8 +10,8 @@ define void @test(i64 %x272, <16 x ptr> %x335, <16 x i32> %x270) { ; CHECK-LABEL: test: ; CHECK: # %bb.0: ; CHECK-NEXT: kxnorw %k0, %k0, %k1 -; CHECK-NEXT: kxnorw %k0, %k0, %k2 -; CHECK-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k2} +; CHECK-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k1} +; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; CHECK-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s index c4df992f3aebca..c509e766540b15 100644 --- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512.s @@ -1804,7 +1804,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 2 10 1.00 * vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 36 8 8.00 * vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 4.00 * vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: 1 1 0.50 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 4.00 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 4.00 * vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 1 1.00 vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: 2 8 1.00 * vpshufd $0, (%rax), %zmm19 @@ -1871,7 +1871,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 2 8 1.00 * vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 36 7 8.00 * vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 4.00 * vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: 1 1 0.50 * 
vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 4.00 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 4.00 * vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 3 1.00 vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 2 10 1.00 * vshuff32x4 $0, (%rax), %zmm17, %zmm19 @@ -2054,7 +2054,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: - 612.00 408.17 102.67 327.50 327.50 41.50 592.17 5.00 41.50 41.50 41.50 +# CHECK-NEXT: - 612.00 411.17 103.67 327.50 327.50 48.50 593.17 6.00 48.50 48.50 48.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -2774,7 +2774,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 1.50 0.50 - - 8.00 1.50 0.50 8.00 8.00 8.00 vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: - - - - - - 0.50 - - 0.50 0.50 0.50 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - - - vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpshufd $0, (%rax), %zmm19 @@ -2841,7 +2841,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 1.50 0.50 - - 8.00 1.50 0.50 8.00 8.00 8.00 vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: - - - - - - 0.50 - - 0.50 0.50 0.50 
vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 - - 4.00 0.50 0.50 4.00 4.00 4.00 vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - - - vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - vshuff32x4 $0, (%rax), %zmm17, %zmm19 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s index 5eaa0f91fdaaba..9c006d4ebb077d 100644 --- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s +++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx512.s @@ -1804,7 +1804,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 2 10 1.00 * vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 36 8 16.00 * vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 8.00 * vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: 1 1 1.00 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 8.00 * vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 8.00 * vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 1 1.00 vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: 2 8 1.00 * vpshufd $0, (%rax), %zmm19 @@ -1871,7 +1871,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: 2 8 1.00 * vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: 36 7 16.00 * vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 8.00 * vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: 1 1 1.00 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: 19 7 8.00 * vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 19 7 8.00 * vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: 1 3 1.00 vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: 2 10 1.00 * vshuff32x4 $0, (%rax), %zmm17, %zmm19 @@ -2052,7 +2052,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK: Resource 
pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - 612.00 349.67 102.67 355.17 355.17 83.00 650.67 5.00 27.67 +# CHECK-NEXT: - 612.00 352.67 103.67 359.83 359.83 97.00 651.67 6.00 32.33 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -2772,7 +2772,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpermq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 1.50 0.50 5.33 5.33 16.00 1.50 0.50 5.33 vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vpshufd $0, %zmm16, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpshufd $0, (%rax), %zmm19 @@ -2839,7 +2839,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1} # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vpunpcklqdq (%rax){1to8}, %zmm17, %zmm19 {%k1} {z} # CHECK-NEXT: - - 1.50 0.50 5.33 5.33 16.00 1.50 0.50 5.33 vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} -# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +# CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - 1.50 0.50 2.67 2.67 8.00 0.50 0.50 2.67 vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} # CHECK-NEXT: - - - - - - - 1.00 - - vshuff32x4 $0, %zmm16, %zmm17, %zmm19 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - vshuff32x4 $0, (%rax), %zmm17, %zmm19 From 3cca522d21876da36145655bc14f334035b4265d Mon Sep 17 00:00:00 2001 From: 
Chuanqi Xu Date: Fri, 23 Aug 2024 17:42:47 +0800 Subject: [PATCH 302/426] [C++20] [Modules] Warn for duplicated decls in mutliple module units (#105799) It is a long standing issue that the duplicated declarations in multiple module units would cause the compilation performance to get slowed down. And there are many questions or issue reports. So I think it is better to add a warning for it. And given this is not because the users' code violates the language specification or any best practices, the warning is disabled by default even if `-Wall` is specified. The users need to specify the warning explcitly or use `Weverything`. The documentation will add separately. --- .../Basic/DiagnosticSerializationKinds.td | 6 ++ clang/include/clang/Serialization/ASTReader.h | 6 ++ clang/lib/Serialization/ASTReader.cpp | 39 +++++++++ clang/lib/Serialization/ASTReaderDecl.cpp | 21 ++++- ...warn-duplicated-decls-in-module-units.cppm | 83 +++++++++++++++++++ 5 files changed, 152 insertions(+), 3 deletions(-) create mode 100644 clang/test/Modules/warn-duplicated-decls-in-module-units.cppm diff --git a/clang/include/clang/Basic/DiagnosticSerializationKinds.td b/clang/include/clang/Basic/DiagnosticSerializationKinds.td index 9854972cbfe7e4..253a955431997b 100644 --- a/clang/include/clang/Basic/DiagnosticSerializationKinds.td +++ b/clang/include/clang/Basic/DiagnosticSerializationKinds.td @@ -134,6 +134,12 @@ def warn_module_system_bit_conflict : Warning< "as a non-system module; any difference in diagnostic options will be ignored">, InGroup; +def warn_decls_in_multiple_modules : Warning< + "declaration %0 is detected to be defined in multiple module units, first is from '%1' and second is from '%2'; " + "the compiler may not be good at merging the definitions. 
">, + InGroup>, + DefaultIgnore; + def err_failed_to_find_module_file : Error< "failed to find module file for module '%0'">; } // let CategoryName diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index 4593213c5f43ce..2d8952ddbd71df 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -648,6 +648,12 @@ class ASTReader /// performed deduplication. llvm::SetVector PendingMergedDefinitionsToDeduplicate; + /// The duplicated definitions in module units which are pending to be warned. + /// We need to delay it to wait for the loading of definitions since we don't + /// want to warn for forward declarations. + llvm::SmallVector> + PendingWarningForDuplicatedDefsInModuleUnits; + /// Read the record that describes the lexical contents of a DC. bool ReadLexicalDeclContextStorage(ModuleFile &M, llvm::BitstreamCursor &Cursor, diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index fa9b815239dbb6..be83805f1e92b9 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -9955,6 +9955,45 @@ void ASTReader::finishPendingActions() { } PendingDefinitions.clear(); + for (auto [D, Previous] : PendingWarningForDuplicatedDefsInModuleUnits) { + auto hasDefinitionImpl = [this](Decl *D, auto hasDefinitionImpl) { + if (auto *VD = dyn_cast(D)) + return VD->isThisDeclarationADefinition() || + VD->isThisDeclarationADemotedDefinition(); + + if (auto *TD = dyn_cast(D)) + return TD->isThisDeclarationADefinition() || + TD->isThisDeclarationADemotedDefinition(); + + if (auto *FD = dyn_cast(D)) + return FD->isThisDeclarationADefinition() || PendingBodies.count(FD); + + if (auto *RTD = dyn_cast(D)) + return hasDefinitionImpl(RTD->getTemplatedDecl(), hasDefinitionImpl); + + // Conservatively return false here. 
+ return false; + }; + + auto hasDefinition = [this, &hasDefinitionImpl](Decl *D) { + return hasDefinitionImpl(D, hasDefinitionImpl); + }; + + // It is not good to prevent multiple declarations since the forward + // declaration is common. Let's try to avoid duplicated definitions + // only. + if (!hasDefinition(D) || !hasDefinition(Previous)) + continue; + + Module *PM = Previous->getOwningModule(); + Module *DM = D->getOwningModule(); + Diag(D->getLocation(), diag::warn_decls_in_multiple_modules) + << cast(Previous) << PM->getTopLevelModuleName() + << (DM ? DM->getTopLevelModuleName() : "global module"); + Diag(Previous->getLocation(), diag::note_also_found); + } + PendingWarningForDuplicatedDefsInModuleUnits.clear(); + // Load the bodies of any functions or methods we've encountered. We do // this now (delayed) so that we can be sure that the declaration chains // have been fully wired up (hasBody relies on this). diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 4d9d024796716e..d1b77358d0cde4 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -320,6 +320,9 @@ class ASTDeclReader : public DeclVisitor { static void attachPreviousDecl(ASTReader &Reader, Decl *D, Decl *Previous, Decl *Canon); + static void checkMultipleDefinitionInNamedModules(ASTReader &Reader, Decl *D, + Decl *Previous); + template static void attachLatestDeclImpl(Redeclarable *D, Decl *Latest); static void attachLatestDeclImpl(...); @@ -3690,8 +3693,9 @@ static void inheritDefaultTemplateArguments(ASTContext &Context, // [basic.link]/p10: // If two declarations of an entity are attached to different modules, // the program is ill-formed; -static void checkMultipleDefinitionInNamedModules(ASTReader &Reader, Decl *D, - Decl *Previous) { +void ASTDeclReader::checkMultipleDefinitionInNamedModules(ASTReader &Reader, + Decl *D, + Decl *Previous) { // If it is previous implcitly introduced, it 
is not meaningful to // diagnose it. if (Previous->isImplicit()) @@ -3721,8 +3725,19 @@ static void checkMultipleDefinitionInNamedModules(ASTReader &Reader, Decl *D, return; // We only forbids merging decls within named modules. - if (!M->isNamedModule()) + if (!M->isNamedModule()) { + // Try to warn the case that we merged decls from global module. + if (!M->isGlobalModule()) + return; + + if (D->getOwningModule() && + M->getTopLevelModule() == D->getOwningModule()->getTopLevelModule()) + return; + + Reader.PendingWarningForDuplicatedDefsInModuleUnits.push_back( + {D, Previous}); return; + } // It is fine if they are in the same module. if (Reader.getContext().isInSameModule(M, D->getOwningModule())) diff --git a/clang/test/Modules/warn-duplicated-decls-in-module-units.cppm b/clang/test/Modules/warn-duplicated-decls-in-module-units.cppm new file mode 100644 index 00000000000000..f9156497bc6b17 --- /dev/null +++ b/clang/test/Modules/warn-duplicated-decls-in-module-units.cppm @@ -0,0 +1,83 @@ +// RUN: rm -rf %t +// RUN: mkdir %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/m1.cppm -emit-module-interface -o %t/m1.pcm +// RUN: %clang_cc1 -std=c++20 %t/m2.cppm -emit-module-interface -o %t/m2.pcm +// RUN: %clang_cc1 -std=c++20 -fprebuilt-module-path=%t %t/use.cc -fsyntax-only \ +// RUN: -verify +// +// RUN: %clang_cc1 -std=c++20 %t/m1.cppm -Wall -emit-module-interface -o %t/m1.pcm +// RUN: %clang_cc1 -std=c++20 %t/m2.cppm -Wall -emit-module-interface -o %t/m2.pcm +// RUN: %clang_cc1 -std=c++20 -fprebuilt-module-path=%t %t/use.cc -fsyntax-only \ +// RUN: -verify -Wall +// +// RUN: %clang_cc1 -std=c++20 %t/m1.cppm -Wdecls-in-multiple-modules -emit-module-interface -o %t/m1.pcm +// RUN: %clang_cc1 -std=c++20 %t/m2.cppm -Wdecls-in-multiple-modules -emit-module-interface -o %t/m2.pcm +// RUN: %clang_cc1 -std=c++20 -fprebuilt-module-path=%t %t/use.cc -fsyntax-only \ +// RUN: -verify -Wdecls-in-multiple-modules -DWARNING + +//--- foo.h +#ifndef FOO_H 
+#define FOO_H + +enum E { E1, E2 }; + +int a = 43; + +class foo { +public: + void consume(E, int); +}; + +inline void func() {} + +void fwd_decl(); + +#endif + +//--- m1.cppm +module; +#include "foo.h" +export module m1; +export { + using ::foo; + using ::a; + using ::func; + using ::fwd_decl; + using ::E; +} + +//--- m2.cppm +module; +#include "foo.h" +export module m2; +export { + using ::foo; + using ::a; + using ::func; + using ::fwd_decl; + using ::E; +} + +//--- use.cc +import m1; +import m2; +void use(); +void use() { + E e = E1; + foo f; + f.consume(e, a); + func(); + fwd_decl(); +} + +#ifndef WARNING +// expected-no-diagnostics +#else +// expected-warning@* {{declaration 'E' is detected to be defined in multiple module units}} +// expected-warning@* {{declaration 'foo' is detected to be defined in multiple module units}} +// expected-warning@* {{declaration 'a' is detected to be defined in multiple module units}} +// expected-warning@* {{declaration 'func' is detected to be defined in multiple module units}} +// expected-note@* 1+ {{}} +#endif From c8ba31700588eabbace2af40d711e235d13ab9bf Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 23 Aug 2024 10:47:10 +0100 Subject: [PATCH 303/426] [AMDGPU] Remove comment outdated by #96933 --- llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index a41df9606749fd..95afc3fcc8d7d1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -905,8 +905,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, PHI->addIncoming(NewI, SingleLaneTerminator->getParent()); // We need to broadcast the value who was the lowest active lane (the first - // lane) to all other lanes in the wavefront. 
We use an intrinsic for this, - // but have to handle 64-bit broadcasts with two calls to this intrinsic. + // lane) to all other lanes in the wavefront. Value *BroadcastI = nullptr; BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI); From 646478f38b03cbc861ae17533c641c2a944118b3 Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 23 Aug 2024 10:53:31 +0100 Subject: [PATCH 304/426] [AArch64] Scalarize i128 add/sub/mul/and/or/xor vectors This mirrors what we do for SDAG, scalarizing i128 vectors with add/sub/mul/and/or/xor operators. --- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 1 + llvm/test/CodeGen/AArch64/add.ll | 52 +-- llvm/test/CodeGen/AArch64/andorxor.ll | 318 ++++++++++++------ llvm/test/CodeGen/AArch64/mul.ll | 178 +++++++--- llvm/test/CodeGen/AArch64/sub.ll | 52 +-- 5 files changed, 395 insertions(+), 206 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 35d73d36df46fe..7eaf6a84bd204f 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -149,6 +149,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) return Query.Types[0].getNumElements() <= 16; }, 0, s8) + .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) .moreElementsToNextPow2(0); getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR}) diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index 39d1933f0e7b97..ee15445a7bbd62 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -1,10 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s 
--check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for v2i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i128 +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define i8 @i8(i8 %a, i8 %b) { ; CHECK-LABEL: i8: @@ -480,21 +476,37 @@ entry: } define <4 x i128> @v4i128(<4 x i128> %d, <4 x i128> %e) { -; CHECK-LABEL: v4i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp x8, x9, [sp] -; CHECK-NEXT: ldp x11, x10, [sp, #16] -; CHECK-NEXT: ldp x13, x12, [sp, #32] -; CHECK-NEXT: adds x0, x0, x8 -; CHECK-NEXT: adc x1, x1, x9 -; CHECK-NEXT: ldp x8, x9, [sp, #48] -; CHECK-NEXT: adds x2, x2, x11 -; CHECK-NEXT: adc x3, x3, x10 -; CHECK-NEXT: adds x4, x4, x13 -; CHECK-NEXT: adc x5, x5, x12 -; CHECK-NEXT: adds x6, x6, x8 -; CHECK-NEXT: adc x7, x7, x9 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v4i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldp x8, x9, [sp] +; CHECK-SD-NEXT: ldp x11, x10, [sp, #16] +; CHECK-SD-NEXT: ldp x13, x12, [sp, #32] +; CHECK-SD-NEXT: adds x0, x0, x8 +; CHECK-SD-NEXT: adc x1, x1, x9 +; CHECK-SD-NEXT: ldp x8, x9, [sp, #48] +; CHECK-SD-NEXT: adds x2, x2, x11 +; CHECK-SD-NEXT: adc x3, x3, x10 +; CHECK-SD-NEXT: adds x4, x4, x13 +; CHECK-SD-NEXT: adc x5, x5, x12 +; CHECK-SD-NEXT: adds x6, x6, x8 +; CHECK-SD-NEXT: adc x7, x7, x9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v4i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldp x8, x9, [sp] +; CHECK-GI-NEXT: ldp x10, x11, [sp, #16] +; CHECK-GI-NEXT: ldp x12, x13, [sp, #32] +; CHECK-GI-NEXT: adds x0, x0, x8 +; CHECK-GI-NEXT: adc x1, x1, x9 +; CHECK-GI-NEXT: ldp x8, x9, [sp, #48] +; CHECK-GI-NEXT: adds x2, x2, x10 +; CHECK-GI-NEXT: adc x3, x3, x11 +; CHECK-GI-NEXT: adds x4, x4, x12 +; CHECK-GI-NEXT: adc x5, x5, x13 +; CHECK-GI-NEXT: adds x6, x6, x8 +; CHECK-GI-NEXT: adc 
x7, x7, x9 +; CHECK-GI-NEXT: ret entry: %s = add <4 x i128> %d, %e ret <4 x i128> %s diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index efa4be707ceda9..1176c98ce44e34 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -1,16 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for and_v2i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for or_v2i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for xor_v2i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for and_v3i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for or_v3i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for xor_v3i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for and_v4i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for or_v4i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for xor_v4i128 +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define i8 @and_i8(i8 %a, i8 %b) { ; CHECK-LABEL: and_i8: @@ -1369,153 +1359,261 @@ entry: } define <2 x i128> @and_v2i128(<2 x i128> %d, <2 x i128> %e) { -; CHECK-LABEL: and_v2i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and x2, x2, x6 -; CHECK-NEXT: and x0, x0, x4 -; CHECK-NEXT: and x1, x1, x5 -; CHECK-NEXT: and x3, x3, x7 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: and_v2i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: 
and x2, x2, x6 +; CHECK-SD-NEXT: and x0, x0, x4 +; CHECK-SD-NEXT: and x1, x1, x5 +; CHECK-SD-NEXT: and x3, x3, x7 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_v2i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and x0, x0, x4 +; CHECK-GI-NEXT: and x1, x1, x5 +; CHECK-GI-NEXT: and x2, x2, x6 +; CHECK-GI-NEXT: and x3, x3, x7 +; CHECK-GI-NEXT: ret entry: %s = and <2 x i128> %d, %e ret <2 x i128> %s } define <2 x i128> @or_v2i128(<2 x i128> %d, <2 x i128> %e) { -; CHECK-LABEL: or_v2i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: orr x2, x2, x6 -; CHECK-NEXT: orr x0, x0, x4 -; CHECK-NEXT: orr x1, x1, x5 -; CHECK-NEXT: orr x3, x3, x7 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: or_v2i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: orr x2, x2, x6 +; CHECK-SD-NEXT: orr x0, x0, x4 +; CHECK-SD-NEXT: orr x1, x1, x5 +; CHECK-SD-NEXT: orr x3, x3, x7 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_v2i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: orr x0, x0, x4 +; CHECK-GI-NEXT: orr x1, x1, x5 +; CHECK-GI-NEXT: orr x2, x2, x6 +; CHECK-GI-NEXT: orr x3, x3, x7 +; CHECK-GI-NEXT: ret entry: %s = or <2 x i128> %d, %e ret <2 x i128> %s } define <2 x i128> @xor_v2i128(<2 x i128> %d, <2 x i128> %e) { -; CHECK-LABEL: xor_v2i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: eor x2, x2, x6 -; CHECK-NEXT: eor x0, x0, x4 -; CHECK-NEXT: eor x1, x1, x5 -; CHECK-NEXT: eor x3, x3, x7 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: xor_v2i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: eor x2, x2, x6 +; CHECK-SD-NEXT: eor x0, x0, x4 +; CHECK-SD-NEXT: eor x1, x1, x5 +; CHECK-SD-NEXT: eor x3, x3, x7 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_v2i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: eor x0, x0, x4 +; CHECK-GI-NEXT: eor x1, x1, x5 +; CHECK-GI-NEXT: eor x2, x2, x6 +; CHECK-GI-NEXT: eor x3, x3, x7 +; CHECK-GI-NEXT: ret entry: %s = xor <2 x i128> %d, %e ret <2 x i128> %s } define <3 x i128> @and_v3i128(<3 x i128> %d, <3 x i128> %e) { -; CHECK-LABEL: and_v3i128: 
-; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp x8, x9, [sp] -; CHECK-NEXT: and x0, x0, x6 -; CHECK-NEXT: ldp x11, x10, [sp, #16] -; CHECK-NEXT: and x1, x1, x7 -; CHECK-NEXT: and x2, x2, x8 -; CHECK-NEXT: and x3, x3, x9 -; CHECK-NEXT: and x4, x4, x11 -; CHECK-NEXT: and x5, x5, x10 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: and_v3i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldp x8, x9, [sp] +; CHECK-SD-NEXT: and x0, x0, x6 +; CHECK-SD-NEXT: ldp x11, x10, [sp, #16] +; CHECK-SD-NEXT: and x1, x1, x7 +; CHECK-SD-NEXT: and x2, x2, x8 +; CHECK-SD-NEXT: and x3, x3, x9 +; CHECK-SD-NEXT: and x4, x4, x11 +; CHECK-SD-NEXT: and x5, x5, x10 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_v3i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldp x8, x9, [sp] +; CHECK-GI-NEXT: and x0, x0, x6 +; CHECK-GI-NEXT: ldp x10, x11, [sp, #16] +; CHECK-GI-NEXT: and x1, x1, x7 +; CHECK-GI-NEXT: and x2, x2, x8 +; CHECK-GI-NEXT: and x3, x3, x9 +; CHECK-GI-NEXT: and x4, x4, x10 +; CHECK-GI-NEXT: and x5, x5, x11 +; CHECK-GI-NEXT: ret entry: %s = and <3 x i128> %d, %e ret <3 x i128> %s } define <3 x i128> @or_v3i128(<3 x i128> %d, <3 x i128> %e) { -; CHECK-LABEL: or_v3i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp x8, x9, [sp] -; CHECK-NEXT: orr x0, x0, x6 -; CHECK-NEXT: ldp x11, x10, [sp, #16] -; CHECK-NEXT: orr x1, x1, x7 -; CHECK-NEXT: orr x2, x2, x8 -; CHECK-NEXT: orr x3, x3, x9 -; CHECK-NEXT: orr x4, x4, x11 -; CHECK-NEXT: orr x5, x5, x10 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: or_v3i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldp x8, x9, [sp] +; CHECK-SD-NEXT: orr x0, x0, x6 +; CHECK-SD-NEXT: ldp x11, x10, [sp, #16] +; CHECK-SD-NEXT: orr x1, x1, x7 +; CHECK-SD-NEXT: orr x2, x2, x8 +; CHECK-SD-NEXT: orr x3, x3, x9 +; CHECK-SD-NEXT: orr x4, x4, x11 +; CHECK-SD-NEXT: orr x5, x5, x10 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_v3i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldp x8, x9, [sp] +; CHECK-GI-NEXT: orr x0, x0, x6 +; CHECK-GI-NEXT: ldp x10, 
x11, [sp, #16] +; CHECK-GI-NEXT: orr x1, x1, x7 +; CHECK-GI-NEXT: orr x2, x2, x8 +; CHECK-GI-NEXT: orr x3, x3, x9 +; CHECK-GI-NEXT: orr x4, x4, x10 +; CHECK-GI-NEXT: orr x5, x5, x11 +; CHECK-GI-NEXT: ret entry: %s = or <3 x i128> %d, %e ret <3 x i128> %s } define <3 x i128> @xor_v3i128(<3 x i128> %d, <3 x i128> %e) { -; CHECK-LABEL: xor_v3i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp x8, x9, [sp] -; CHECK-NEXT: eor x0, x0, x6 -; CHECK-NEXT: ldp x11, x10, [sp, #16] -; CHECK-NEXT: eor x1, x1, x7 -; CHECK-NEXT: eor x2, x2, x8 -; CHECK-NEXT: eor x3, x3, x9 -; CHECK-NEXT: eor x4, x4, x11 -; CHECK-NEXT: eor x5, x5, x10 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: xor_v3i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldp x8, x9, [sp] +; CHECK-SD-NEXT: eor x0, x0, x6 +; CHECK-SD-NEXT: ldp x11, x10, [sp, #16] +; CHECK-SD-NEXT: eor x1, x1, x7 +; CHECK-SD-NEXT: eor x2, x2, x8 +; CHECK-SD-NEXT: eor x3, x3, x9 +; CHECK-SD-NEXT: eor x4, x4, x11 +; CHECK-SD-NEXT: eor x5, x5, x10 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_v3i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldp x8, x9, [sp] +; CHECK-GI-NEXT: eor x0, x0, x6 +; CHECK-GI-NEXT: ldp x10, x11, [sp, #16] +; CHECK-GI-NEXT: eor x1, x1, x7 +; CHECK-GI-NEXT: eor x2, x2, x8 +; CHECK-GI-NEXT: eor x3, x3, x9 +; CHECK-GI-NEXT: eor x4, x4, x10 +; CHECK-GI-NEXT: eor x5, x5, x11 +; CHECK-GI-NEXT: ret entry: %s = xor <3 x i128> %d, %e ret <3 x i128> %s } define <4 x i128> @and_v4i128(<4 x i128> %d, <4 x i128> %e) { -; CHECK-LABEL: and_v4i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp x9, x8, [sp, #32] -; CHECK-NEXT: ldp x11, x10, [sp] -; CHECK-NEXT: ldp x13, x12, [sp, #16] -; CHECK-NEXT: ldp x15, x14, [sp, #48] -; CHECK-NEXT: and x4, x4, x9 -; CHECK-NEXT: and x0, x0, x11 -; CHECK-NEXT: and x1, x1, x10 -; CHECK-NEXT: and x5, x5, x8 -; CHECK-NEXT: and x2, x2, x13 -; CHECK-NEXT: and x3, x3, x12 -; CHECK-NEXT: and x6, x6, x15 -; CHECK-NEXT: and x7, x7, x14 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: and_v4i128: +; 
CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldp x9, x8, [sp, #32] +; CHECK-SD-NEXT: ldp x11, x10, [sp] +; CHECK-SD-NEXT: ldp x13, x12, [sp, #16] +; CHECK-SD-NEXT: ldp x15, x14, [sp, #48] +; CHECK-SD-NEXT: and x4, x4, x9 +; CHECK-SD-NEXT: and x0, x0, x11 +; CHECK-SD-NEXT: and x1, x1, x10 +; CHECK-SD-NEXT: and x5, x5, x8 +; CHECK-SD-NEXT: and x2, x2, x13 +; CHECK-SD-NEXT: and x3, x3, x12 +; CHECK-SD-NEXT: and x6, x6, x15 +; CHECK-SD-NEXT: and x7, x7, x14 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: and_v4i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldp x8, x9, [sp] +; CHECK-GI-NEXT: ldp x10, x11, [sp, #16] +; CHECK-GI-NEXT: ldp x12, x13, [sp, #32] +; CHECK-GI-NEXT: ldp x14, x15, [sp, #48] +; CHECK-GI-NEXT: and x0, x0, x8 +; CHECK-GI-NEXT: and x1, x1, x9 +; CHECK-GI-NEXT: and x2, x2, x10 +; CHECK-GI-NEXT: and x3, x3, x11 +; CHECK-GI-NEXT: and x4, x4, x12 +; CHECK-GI-NEXT: and x5, x5, x13 +; CHECK-GI-NEXT: and x6, x6, x14 +; CHECK-GI-NEXT: and x7, x7, x15 +; CHECK-GI-NEXT: ret entry: %s = and <4 x i128> %d, %e ret <4 x i128> %s } define <4 x i128> @or_v4i128(<4 x i128> %d, <4 x i128> %e) { -; CHECK-LABEL: or_v4i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp x9, x8, [sp, #32] -; CHECK-NEXT: ldp x11, x10, [sp] -; CHECK-NEXT: ldp x13, x12, [sp, #16] -; CHECK-NEXT: ldp x15, x14, [sp, #48] -; CHECK-NEXT: orr x4, x4, x9 -; CHECK-NEXT: orr x0, x0, x11 -; CHECK-NEXT: orr x1, x1, x10 -; CHECK-NEXT: orr x5, x5, x8 -; CHECK-NEXT: orr x2, x2, x13 -; CHECK-NEXT: orr x3, x3, x12 -; CHECK-NEXT: orr x6, x6, x15 -; CHECK-NEXT: orr x7, x7, x14 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: or_v4i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldp x9, x8, [sp, #32] +; CHECK-SD-NEXT: ldp x11, x10, [sp] +; CHECK-SD-NEXT: ldp x13, x12, [sp, #16] +; CHECK-SD-NEXT: ldp x15, x14, [sp, #48] +; CHECK-SD-NEXT: orr x4, x4, x9 +; CHECK-SD-NEXT: orr x0, x0, x11 +; CHECK-SD-NEXT: orr x1, x1, x10 +; CHECK-SD-NEXT: orr x5, x5, x8 +; CHECK-SD-NEXT: orr x2, x2, x13 +; CHECK-SD-NEXT: 
orr x3, x3, x12 +; CHECK-SD-NEXT: orr x6, x6, x15 +; CHECK-SD-NEXT: orr x7, x7, x14 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: or_v4i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldp x8, x9, [sp] +; CHECK-GI-NEXT: ldp x10, x11, [sp, #16] +; CHECK-GI-NEXT: ldp x12, x13, [sp, #32] +; CHECK-GI-NEXT: ldp x14, x15, [sp, #48] +; CHECK-GI-NEXT: orr x0, x0, x8 +; CHECK-GI-NEXT: orr x1, x1, x9 +; CHECK-GI-NEXT: orr x2, x2, x10 +; CHECK-GI-NEXT: orr x3, x3, x11 +; CHECK-GI-NEXT: orr x4, x4, x12 +; CHECK-GI-NEXT: orr x5, x5, x13 +; CHECK-GI-NEXT: orr x6, x6, x14 +; CHECK-GI-NEXT: orr x7, x7, x15 +; CHECK-GI-NEXT: ret entry: %s = or <4 x i128> %d, %e ret <4 x i128> %s } define <4 x i128> @xor_v4i128(<4 x i128> %d, <4 x i128> %e) { -; CHECK-LABEL: xor_v4i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp x9, x8, [sp, #32] -; CHECK-NEXT: ldp x11, x10, [sp] -; CHECK-NEXT: ldp x13, x12, [sp, #16] -; CHECK-NEXT: ldp x15, x14, [sp, #48] -; CHECK-NEXT: eor x4, x4, x9 -; CHECK-NEXT: eor x0, x0, x11 -; CHECK-NEXT: eor x1, x1, x10 -; CHECK-NEXT: eor x5, x5, x8 -; CHECK-NEXT: eor x2, x2, x13 -; CHECK-NEXT: eor x3, x3, x12 -; CHECK-NEXT: eor x6, x6, x15 -; CHECK-NEXT: eor x7, x7, x14 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: xor_v4i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldp x9, x8, [sp, #32] +; CHECK-SD-NEXT: ldp x11, x10, [sp] +; CHECK-SD-NEXT: ldp x13, x12, [sp, #16] +; CHECK-SD-NEXT: ldp x15, x14, [sp, #48] +; CHECK-SD-NEXT: eor x4, x4, x9 +; CHECK-SD-NEXT: eor x0, x0, x11 +; CHECK-SD-NEXT: eor x1, x1, x10 +; CHECK-SD-NEXT: eor x5, x5, x8 +; CHECK-SD-NEXT: eor x2, x2, x13 +; CHECK-SD-NEXT: eor x3, x3, x12 +; CHECK-SD-NEXT: eor x6, x6, x15 +; CHECK-SD-NEXT: eor x7, x7, x14 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: xor_v4i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldp x8, x9, [sp] +; CHECK-GI-NEXT: ldp x10, x11, [sp, #16] +; CHECK-GI-NEXT: ldp x12, x13, [sp, #32] +; CHECK-GI-NEXT: ldp x14, x15, [sp, #48] +; CHECK-GI-NEXT: eor x0, x0, x8 +; 
CHECK-GI-NEXT: eor x1, x1, x9 +; CHECK-GI-NEXT: eor x2, x2, x10 +; CHECK-GI-NEXT: eor x3, x3, x11 +; CHECK-GI-NEXT: eor x4, x4, x12 +; CHECK-GI-NEXT: eor x5, x5, x13 +; CHECK-GI-NEXT: eor x6, x6, x14 +; CHECK-GI-NEXT: eor x7, x7, x15 +; CHECK-GI-NEXT: ret entry: %s = xor <4 x i128> %d, %e ret <4 x i128> %s diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index d2804329f1e255..02258bc47c54d4 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -1,10 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for v2i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i128 +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define i8 @i8(i8 %a, i8 %b) { ; CHECK-LABEL: i8: @@ -531,69 +527,139 @@ entry: } define <2 x i128> @v2i128(<2 x i128> %d, <2 x i128> %e) { -; CHECK-LABEL: v2i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: umulh x8, x2, x6 -; CHECK-NEXT: umulh x9, x0, x4 -; CHECK-NEXT: madd x8, x2, x7, x8 -; CHECK-NEXT: madd x9, x0, x5, x9 -; CHECK-NEXT: madd x3, x3, x6, x8 -; CHECK-NEXT: madd x1, x1, x4, x9 -; CHECK-NEXT: mul x0, x0, x4 -; CHECK-NEXT: mul x2, x2, x6 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v2i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: umulh x8, x2, x6 +; CHECK-SD-NEXT: umulh x9, x0, x4 +; CHECK-SD-NEXT: madd x8, x2, x7, x8 +; CHECK-SD-NEXT: madd x9, x0, x5, x9 +; CHECK-SD-NEXT: madd x3, x3, x6, x8 +; CHECK-SD-NEXT: 
madd x1, x1, x4, x9 +; CHECK-SD-NEXT: mul x0, x0, x4 +; CHECK-SD-NEXT: mul x2, x2, x6 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mul x9, x0, x5 +; CHECK-GI-NEXT: mul x12, x2, x7 +; CHECK-GI-NEXT: mul x8, x0, x4 +; CHECK-GI-NEXT: umulh x10, x0, x4 +; CHECK-GI-NEXT: madd x11, x1, x4, x9 +; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: mul x9, x2, x6 +; CHECK-GI-NEXT: umulh x13, x2, x6 +; CHECK-GI-NEXT: add x1, x11, x10 +; CHECK-GI-NEXT: madd x12, x3, x6, x12 +; CHECK-GI-NEXT: mov x2, x9 +; CHECK-GI-NEXT: add x3, x12, x13 +; CHECK-GI-NEXT: ret entry: %s = mul <2 x i128> %d, %e ret <2 x i128> %s } define <3 x i128> @v3i128(<3 x i128> %d, <3 x i128> %e) { -; CHECK-LABEL: v3i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: umulh x9, x0, x6 -; CHECK-NEXT: ldp x8, x10, [sp] -; CHECK-NEXT: madd x9, x0, x7, x9 -; CHECK-NEXT: umulh x11, x2, x8 -; CHECK-NEXT: madd x1, x1, x6, x9 -; CHECK-NEXT: ldp x9, x12, [sp, #16] -; CHECK-NEXT: madd x10, x2, x10, x11 -; CHECK-NEXT: umulh x13, x4, x9 -; CHECK-NEXT: madd x3, x3, x8, x10 -; CHECK-NEXT: madd x11, x4, x12, x13 -; CHECK-NEXT: mul x0, x0, x6 -; CHECK-NEXT: madd x5, x5, x9, x11 -; CHECK-NEXT: mul x2, x2, x8 -; CHECK-NEXT: mul x4, x4, x9 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v3i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: umulh x9, x0, x6 +; CHECK-SD-NEXT: ldp x8, x10, [sp] +; CHECK-SD-NEXT: madd x9, x0, x7, x9 +; CHECK-SD-NEXT: umulh x11, x2, x8 +; CHECK-SD-NEXT: madd x1, x1, x6, x9 +; CHECK-SD-NEXT: ldp x9, x12, [sp, #16] +; CHECK-SD-NEXT: madd x10, x2, x10, x11 +; CHECK-SD-NEXT: umulh x13, x4, x9 +; CHECK-SD-NEXT: madd x3, x3, x8, x10 +; CHECK-SD-NEXT: madd x11, x4, x12, x13 +; CHECK-SD-NEXT: mul x0, x0, x6 +; CHECK-SD-NEXT: madd x5, x5, x9, x11 +; CHECK-SD-NEXT: mul x2, x2, x8 +; CHECK-SD-NEXT: mul x4, x4, x9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldp x10, x13, [sp] +; CHECK-GI-NEXT: mul x9, x0, 
x7 +; CHECK-GI-NEXT: mul x8, x0, x6 +; CHECK-GI-NEXT: mul x13, x2, x13 +; CHECK-GI-NEXT: madd x12, x1, x6, x9 +; CHECK-GI-NEXT: mul x9, x2, x10 +; CHECK-GI-NEXT: umulh x14, x2, x10 +; CHECK-GI-NEXT: madd x10, x3, x10, x13 +; CHECK-GI-NEXT: ldp x13, x15, [sp, #16] +; CHECK-GI-NEXT: mov x2, x9 +; CHECK-GI-NEXT: umulh x11, x0, x6 +; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: mul x15, x4, x15 +; CHECK-GI-NEXT: add x3, x10, x14 +; CHECK-GI-NEXT: umulh x16, x4, x13 +; CHECK-GI-NEXT: add x1, x12, x11 +; CHECK-GI-NEXT: madd x15, x5, x13, x15 +; CHECK-GI-NEXT: mul x4, x4, x13 +; CHECK-GI-NEXT: add x5, x15, x16 +; CHECK-GI-NEXT: ret entry: %s = mul <3 x i128> %d, %e ret <3 x i128> %s } define <4 x i128> @v4i128(<4 x i128> %d, <4 x i128> %e) { -; CHECK-LABEL: v4i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp x8, x9, [sp] -; CHECK-NEXT: ldp x11, x12, [sp, #16] -; CHECK-NEXT: umulh x10, x0, x8 -; CHECK-NEXT: umulh x13, x2, x11 -; CHECK-NEXT: madd x9, x0, x9, x10 -; CHECK-NEXT: madd x10, x2, x12, x13 -; CHECK-NEXT: ldp x13, x14, [sp, #48] -; CHECK-NEXT: madd x1, x1, x8, x9 -; CHECK-NEXT: madd x3, x3, x11, x10 -; CHECK-NEXT: ldp x9, x10, [sp, #32] -; CHECK-NEXT: umulh x15, x6, x13 -; CHECK-NEXT: umulh x12, x4, x9 -; CHECK-NEXT: mul x0, x0, x8 -; CHECK-NEXT: madd x10, x4, x10, x12 -; CHECK-NEXT: madd x12, x6, x14, x15 -; CHECK-NEXT: madd x5, x5, x9, x10 -; CHECK-NEXT: madd x7, x7, x13, x12 -; CHECK-NEXT: mul x2, x2, x11 -; CHECK-NEXT: mul x4, x4, x9 -; CHECK-NEXT: mul x6, x6, x13 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v4i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldp x8, x9, [sp] +; CHECK-SD-NEXT: ldp x11, x12, [sp, #16] +; CHECK-SD-NEXT: umulh x10, x0, x8 +; CHECK-SD-NEXT: umulh x13, x2, x11 +; CHECK-SD-NEXT: madd x9, x0, x9, x10 +; CHECK-SD-NEXT: madd x10, x2, x12, x13 +; CHECK-SD-NEXT: ldp x13, x14, [sp, #48] +; CHECK-SD-NEXT: madd x1, x1, x8, x9 +; CHECK-SD-NEXT: madd x3, x3, x11, x10 +; CHECK-SD-NEXT: ldp x9, x10, [sp, #32] +; CHECK-SD-NEXT: umulh x15, x6, 
x13 +; CHECK-SD-NEXT: umulh x12, x4, x9 +; CHECK-SD-NEXT: mul x0, x0, x8 +; CHECK-SD-NEXT: madd x10, x4, x10, x12 +; CHECK-SD-NEXT: madd x12, x6, x14, x15 +; CHECK-SD-NEXT: madd x5, x5, x9, x10 +; CHECK-SD-NEXT: madd x7, x7, x13, x12 +; CHECK-SD-NEXT: mul x2, x2, x11 +; CHECK-SD-NEXT: mul x4, x4, x9 +; CHECK-SD-NEXT: mul x6, x6, x13 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v4i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldp x9, x10, [sp] +; CHECK-GI-NEXT: ldp x15, x16, [sp, #32] +; CHECK-GI-NEXT: mul x10, x0, x10 +; CHECK-GI-NEXT: mul x16, x4, x16 +; CHECK-GI-NEXT: madd x12, x1, x9, x10 +; CHECK-GI-NEXT: ldp x10, x13, [sp, #16] +; CHECK-GI-NEXT: mul x8, x0, x9 +; CHECK-GI-NEXT: mul x13, x2, x13 +; CHECK-GI-NEXT: umulh x11, x0, x9 +; CHECK-GI-NEXT: mul x9, x2, x10 +; CHECK-GI-NEXT: umulh x14, x2, x10 +; CHECK-GI-NEXT: add x1, x12, x11 +; CHECK-GI-NEXT: madd x13, x3, x10, x13 +; CHECK-GI-NEXT: mov x2, x9 +; CHECK-GI-NEXT: mul x10, x4, x15 +; CHECK-GI-NEXT: umulh x17, x4, x15 +; CHECK-GI-NEXT: add x3, x13, x14 +; CHECK-GI-NEXT: madd x15, x5, x15, x16 +; CHECK-GI-NEXT: ldp x16, x18, [sp, #48] +; CHECK-GI-NEXT: mov x4, x10 +; CHECK-GI-NEXT: mul x18, x6, x18 +; CHECK-GI-NEXT: umulh x0, x6, x16 +; CHECK-GI-NEXT: add x5, x15, x17 +; CHECK-GI-NEXT: madd x18, x7, x16, x18 +; CHECK-GI-NEXT: mul x6, x6, x16 +; CHECK-GI-NEXT: add x7, x18, x0 +; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: ret entry: %s = mul <4 x i128> %d, %e ret <4 x i128> %s diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index 0f18ed1006fac5..907605494dfbd0 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -1,10 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs 
%s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for v2i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3i128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i128 +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define i8 @i8(i8 %a, i8 %b) { ; CHECK-LABEL: i8: @@ -480,21 +476,37 @@ entry: } define <4 x i128> @v4i128(<4 x i128> %d, <4 x i128> %e) { -; CHECK-LABEL: v4i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp x8, x9, [sp] -; CHECK-NEXT: ldp x11, x10, [sp, #16] -; CHECK-NEXT: ldp x13, x12, [sp, #32] -; CHECK-NEXT: subs x0, x0, x8 -; CHECK-NEXT: sbc x1, x1, x9 -; CHECK-NEXT: ldp x8, x9, [sp, #48] -; CHECK-NEXT: subs x2, x2, x11 -; CHECK-NEXT: sbc x3, x3, x10 -; CHECK-NEXT: subs x4, x4, x13 -; CHECK-NEXT: sbc x5, x5, x12 -; CHECK-NEXT: subs x6, x6, x8 -; CHECK-NEXT: sbc x7, x7, x9 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v4i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldp x8, x9, [sp] +; CHECK-SD-NEXT: ldp x11, x10, [sp, #16] +; CHECK-SD-NEXT: ldp x13, x12, [sp, #32] +; CHECK-SD-NEXT: subs x0, x0, x8 +; CHECK-SD-NEXT: sbc x1, x1, x9 +; CHECK-SD-NEXT: ldp x8, x9, [sp, #48] +; CHECK-SD-NEXT: subs x2, x2, x11 +; CHECK-SD-NEXT: sbc x3, x3, x10 +; CHECK-SD-NEXT: subs x4, x4, x13 +; CHECK-SD-NEXT: sbc x5, x5, x12 +; CHECK-SD-NEXT: subs x6, x6, x8 +; CHECK-SD-NEXT: sbc x7, x7, x9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v4i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldp x8, x9, [sp] +; CHECK-GI-NEXT: ldp x10, x11, [sp, #16] +; CHECK-GI-NEXT: ldp x12, x13, [sp, #32] +; CHECK-GI-NEXT: subs x0, x0, x8 +; CHECK-GI-NEXT: sbc x1, x1, x9 +; CHECK-GI-NEXT: ldp x8, x9, [sp, #48] +; CHECK-GI-NEXT: subs x2, x2, x10 +; CHECK-GI-NEXT: sbc x3, x3, x11 +; CHECK-GI-NEXT: subs x4, x4, x12 +; CHECK-GI-NEXT: sbc x5, x5, x13 +; CHECK-GI-NEXT: subs x6, x6, 
x8 +; CHECK-GI-NEXT: sbc x7, x7, x9 +; CHECK-GI-NEXT: ret entry: %s = sub <4 x i128> %d, %e ret <4 x i128> %s From 38b8e54682567d685bc03f9fdef26baa6b708ef9 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 23 Aug 2024 11:55:30 +0200 Subject: [PATCH 305/426] [clang][bytecode][NFC] Remove containsErrors() check from delegate (#105804) This check was removed a while ago from visit(), remove it from delegate() as well. --- clang/lib/AST/ByteCode/Compiler.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 655983a1ca0494..d24384ca4ac7a4 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -3262,9 +3262,6 @@ template bool Compiler::discard(const Expr *E) { } template bool Compiler::delegate(const Expr *E) { - if (E->containsErrors()) - return this->emitError(E); - // We're basically doing: // OptionScope Scope(this, DicardResult, Initializing); // but that's unnecessary of course. From 7b4b85b75d22a792b2ef80e6af4f0faf18da0a43 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 23 Aug 2024 11:56:42 +0200 Subject: [PATCH 306/426] [clang][bytecode] Reject void InitListExpr differently (#105802) This reverts c79d1fa540390f6e37e1ea326153559eeadd0de6 and 125aa10b3d645bd26523a1bc321bb2e6b1cf04e1 Instead, use the previous approach but allow void-typed InitListExprs with 0 initializers. 
--- clang/lib/AST/ByteCode/Compiler.cpp | 26 ++++++++++++++++---------- clang/test/AST/ByteCode/c.c | 7 ++++++- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index d24384ca4ac7a4..f11196d2b02707 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -1344,6 +1344,16 @@ bool Compiler::VisitArraySubscriptExpr(const ArraySubscriptExpr *E) { template bool Compiler::visitInitList(ArrayRef Inits, const Expr *ArrayFiller, const Expr *E) { + QualType QT = E->getType(); + if (const auto *AT = QT->getAs()) + QT = AT->getValueType(); + + if (QT->isVoidType()) { + if (Inits.size() == 0) + return true; + return this->emitInvalid(E); + } + // Handle discarding first. if (DiscardResult) { for (const Expr *Init : Inits) { @@ -1353,13 +1363,6 @@ bool Compiler::visitInitList(ArrayRef Inits, return true; } - QualType QT = E->getType(); - if (const auto *AT = QT->getAs()) - QT = AT->getValueType(); - - if (QT->isVoidType()) - return this->emitInvalid(E); - // Primitive values. if (std::optional T = classify(QT)) { assert(!DiscardResult); @@ -3272,9 +3275,12 @@ template bool Compiler::visit(const Expr *E) { if (E->getType().isNull()) return false; + if (E->getType()->isVoidType()) + return this->discard(E); + // Create local variable to hold the return value. - if (!E->getType()->isVoidType() && !E->isGLValue() && - !E->getType()->isAnyComplexType() && !classify(E->getType())) { + if (!E->isGLValue() && !E->getType()->isAnyComplexType() && + !classify(E->getType())) { std::optional LocalIndex = allocateLocal(E); if (!LocalIndex) return false; @@ -5174,7 +5180,7 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { // We should already have a pointer when we get here. 
return this->delegate(SubExpr); case UO_Deref: // *x - if (DiscardResult || E->getType()->isVoidType()) + if (DiscardResult) return this->discard(SubExpr); return this->visit(SubExpr); case UO_Not: // ~x diff --git a/clang/test/AST/ByteCode/c.c b/clang/test/AST/ByteCode/c.c index 60f4d6ad1b2967..7cb7f96049f2de 100644 --- a/clang/test/AST/ByteCode/c.c +++ b/clang/test/AST/ByteCode/c.c @@ -298,7 +298,6 @@ void T1(void) { enum teste1 test1f(void), (*test1)(void) = test1f; // pedantic-warning {{ISO C forbids forward references to 'enum' types}} enum teste1 { TEST1 }; - void func(void) { _Static_assert(func + 1 - func == 1, ""); // pedantic-warning {{arithmetic on a pointer to the function type}} \ // pedantic-warning {{arithmetic on pointers to the function type}} \ @@ -313,3 +312,9 @@ void func(void) { func - 0xdead000000000000UL; // all-warning {{expression result unused}} \ // pedantic-warning {{arithmetic on a pointer to the function type}} } + +void foo3 (void) +{ + void* x = 0; + void* y = &*x; +} From 4a12722110abb2ccb98173c82a7d7b96a5c098e0 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 23 Aug 2024 20:59:24 +1000 Subject: [PATCH 307/426] [ORC] Expose a non-destructive check-macho-buffer overload. This allows clients to check buffers that they don't own. rdar://133536831 --- llvm/include/llvm/ExecutionEngine/Orc/MachO.h | 7 ++++ llvm/lib/ExecutionEngine/Orc/MachO.cpp | 37 +++++++++++-------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachO.h b/llvm/include/llvm/ExecutionEngine/Orc/MachO.h index fdaa2f73cda6a3..8bf2550d2d4f06 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachO.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachO.h @@ -31,6 +31,13 @@ namespace orc { /// given triple. /// ObjIsSlice should be set to true if Obj is a slice of a universal binary /// (that fact will then be reported in the error messages). 
+Error checkMachORelocatableObject(MemoryBufferRef Obj, const Triple &TT, + bool ObjIsSlice); + +/// Check that the given buffer contains a MachO object file compatible with the +/// given triple. +/// This convenience overload returns the buffer if it passes all checks, +/// otherwise it returns an error. Expected> checkMachORelocatableObject(std::unique_ptr Obj, const Triple &TT, bool ObjIsSlice); diff --git a/llvm/lib/ExecutionEngine/Orc/MachO.cpp b/llvm/lib/ExecutionEngine/Orc/MachO.cpp index 8fc262220bf892..7fab56f393c506 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachO.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachO.cpp @@ -17,7 +17,7 @@ namespace llvm { namespace orc { -static std::string objDesc(MemoryBuffer &Obj, const Triple &TT, +static std::string objDesc(const MemoryBufferRef &Obj, const Triple &TT, bool ObjIsSlice) { std::string Desc; if (ObjIsSlice) @@ -27,11 +27,10 @@ static std::string objDesc(MemoryBuffer &Obj, const Triple &TT, } template -static Expected> -checkMachORelocatableObject(std::unique_ptr Obj, - bool SwapEndianness, const Triple &TT, - bool ObjIsSlice) { - StringRef Data = Obj->getBuffer(); +static Error checkMachORelocatableObject(MemoryBufferRef Obj, + bool SwapEndianness, const Triple &TT, + bool ObjIsSlice) { + StringRef Data = Obj.getBuffer(); HeaderType Hdr; memcpy(&Hdr, Data.data(), sizeof(HeaderType)); @@ -40,28 +39,27 @@ checkMachORelocatableObject(std::unique_ptr Obj, swapStruct(Hdr); if (Hdr.filetype != MachO::MH_OBJECT) - return make_error(objDesc(*Obj, TT, ObjIsSlice) + + return make_error(objDesc(Obj, TT, ObjIsSlice) + " is not a MachO relocatable object", inconvertibleErrorCode()); auto ObjArch = object::MachOObjectFile::getArch(Hdr.cputype, Hdr.cpusubtype); if (ObjArch != TT.getArch()) return make_error( - objDesc(*Obj, TT, ObjIsSlice) + Triple::getArchTypeName(ObjArch) + + objDesc(Obj, TT, ObjIsSlice) + Triple::getArchTypeName(ObjArch) + ", cannot be loaded into " + TT.str() + " process", inconvertibleErrorCode()); - 
return std::move(Obj); + return Error::success(); } -Expected> -checkMachORelocatableObject(std::unique_ptr Obj, const Triple &TT, - bool ObjIsSlice) { - StringRef Data = Obj->getBuffer(); +Error checkMachORelocatableObject(MemoryBufferRef Obj, const Triple &TT, + bool ObjIsSlice) { + StringRef Data = Obj.getBuffer(); if (Data.size() < 4) return make_error( - objDesc(*Obj, TT, ObjIsSlice) + + objDesc(Obj, TT, ObjIsSlice) + " is not a valid MachO relocatable object file (truncated header)", inconvertibleErrorCode()); @@ -79,12 +77,21 @@ checkMachORelocatableObject(std::unique_ptr Obj, const Triple &TT, std::move(Obj), Magic == MachO::MH_CIGAM_64, TT, ObjIsSlice); default: return make_error( - objDesc(*Obj, TT, ObjIsSlice) + + objDesc(Obj, TT, ObjIsSlice) + " is not a valid MachO relocatable object (bad magic value)", inconvertibleErrorCode()); } } +Expected> +checkMachORelocatableObject(std::unique_ptr Obj, const Triple &TT, + bool ObjIsSlice) { + if (auto Err = + checkMachORelocatableObject(Obj->getMemBufferRef(), TT, ObjIsSlice)) + return std::move(Err); + return std::move(Obj); +} + Expected> loadMachORelocatableObject(StringRef Path, const Triple &TT, std::optional IdentifierOverride) { From cbf34a5f7701148d68951320a72f483849b22eaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Fri, 23 Aug 2024 14:06:17 +0200 Subject: [PATCH 308/426] [AMDGPU] Remove dead pass: AMDGPUMachineCFGStructurizer (#105645) --- llvm/lib/Target/AMDGPU/AMDGPU.h | 4 - llvm/lib/Target/AMDGPU/AMDGPU.td | 3 - llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 4 +- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 2 - .../AMDGPU/AMDGPUInstructionSelector.cpp | 1 - .../Target/AMDGPU/AMDGPUInstructionSelector.h | 1 - .../AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 2837 ----------------- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 24 +- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 1 - llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 - 
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 101 +- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 8 - llvm/lib/Target/AMDGPU/SIInstructions.td | 9 - 13 files changed, 12 insertions(+), 2984 deletions(-) delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index f5044f52f1648d..afb8f2d93f0f15 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -57,7 +57,6 @@ FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *); ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *); FunctionPass *createAMDGPUCodeGenPreparePass(); FunctionPass *createAMDGPULateCodeGenPrepareLegacyPass(); -FunctionPass *createAMDGPUMachineCFGStructurizerPass(); FunctionPass *createAMDGPURewriteOutArgumentsPass(); ModulePass * createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr); @@ -92,9 +91,6 @@ class SILowerI1CopiesPass : public PassInfoMixin { void initializeAMDGPUDAGToDAGISelLegacyPass(PassRegistry &); -void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&); -extern char &AMDGPUMachineCFGStructurizerID; - void initializeAMDGPUAlwaysInlinePass(PassRegistry&); Pass *createAMDGPUAnnotateKernelFeaturesPass(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 9efdbd751d96e3..5757ac0d4454d0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2253,9 +2253,6 @@ def HasDefaultComponentBroadcast def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">, AssemblerPredicate<(all_of FeatureDsSrc2Insts)>; -def EnableLateCFGStructurize : Predicate< - "EnableLateStructurizeCFG">; - def EnableFlatScratch : Predicate<"Subtarget->enableFlatScratch()">; def DisableFlatScratch : Predicate<"!Subtarget->enableFlatScratch()">; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 
6a0134e07567a1..0daaf6b6576030 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -121,9 +121,7 @@ FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM, AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOptLevel OptLevel) - : SelectionDAGISel(TM, OptLevel) { - EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG; -} + : SelectionDAGISel(TM, OptLevel) {} bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 45a951352c1eb1..11c4cdd560c2f3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -68,8 +68,6 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { // Default FP mode for the current function. SIModeRegisterDefaults Mode; - bool EnableLateStructurizeCFG; - // Instructions that will be lowered with a final instruction that zeros the // high result bits. 
bool fp16SrcZerosHighBits(unsigned Opc) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 05ed1b322c0d1b..17071970ca4bfe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -45,7 +45,6 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector( const AMDGPUTargetMachine &TM) : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), STI(STI), - EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), #define GET_GLOBALISEL_PREDICATES_INIT #include "AMDGPUGenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_INIT diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 69806b240cf2bc..207cd67f0eda0e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -371,7 +371,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector { const AMDGPURegisterBankInfo &RBI; const AMDGPUTargetMachine &TM; const GCNSubtarget &STI; - bool EnableLateStructurizeCFG; #define GET_GLOBALISEL_PREDICATES_DECL #define AMDGPUSubtarget GCNSubtarget #include "AMDGPUGenGlobalISel.inc" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp deleted file mode 100644 index 07b2ecc2fed0e9..00000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ /dev/null @@ -1,2837 +0,0 @@ -//===- AMDGPUMachineCFGStructurizer.cpp - Machine code if conversion pass. ===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements the machine instruction level CFG structurizer pass. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "GCNSubtarget.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegionInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/InitializePasses.h" - -using namespace llvm; - -#define DEBUG_TYPE "amdgpucfgstructurizer" - -namespace { - -class PHILinearizeDestIterator; - -class PHILinearize { - friend class PHILinearizeDestIterator; - -public: - using PHISourceT = std::pair; - -private: - using PHISourcesT = DenseSet; - using PHIInfoElementT = struct { - unsigned DestReg; - DebugLoc DL; - PHISourcesT Sources; - }; - using PHIInfoT = SmallPtrSet; - PHIInfoT PHIInfo; - - static unsigned phiInfoElementGetDest(PHIInfoElementT *Info); - static void phiInfoElementSetDef(PHIInfoElementT *Info, unsigned NewDef); - static PHISourcesT &phiInfoElementGetSources(PHIInfoElementT *Info); - static void phiInfoElementAddSource(PHIInfoElementT *Info, unsigned SourceReg, - MachineBasicBlock *SourceMBB); - static void phiInfoElementRemoveSource(PHIInfoElementT *Info, - unsigned SourceReg, - MachineBasicBlock *SourceMBB); - PHIInfoElementT *findPHIInfoElement(unsigned DestReg); - PHIInfoElementT *findPHIInfoElementFromSource(unsigned SourceReg, - MachineBasicBlock *SourceMBB); - -public: - bool findSourcesFromMBB(MachineBasicBlock *SourceMBB, - SmallVector &Sources); - void addDest(unsigned DestReg, const DebugLoc &DL); - void replaceDef(unsigned OldDestReg, unsigned 
NewDestReg); - void deleteDef(unsigned DestReg); - void addSource(unsigned DestReg, unsigned SourceReg, - MachineBasicBlock *SourceMBB); - void removeSource(unsigned DestReg, unsigned SourceReg, - MachineBasicBlock *SourceMBB = nullptr); - bool findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB, - unsigned &DestReg); - bool isSource(unsigned Reg, MachineBasicBlock *SourceMBB = nullptr); - unsigned getNumSources(unsigned DestReg); - void dump(MachineRegisterInfo *MRI); - void clear(); - - using source_iterator = PHISourcesT::iterator; - using dest_iterator = PHILinearizeDestIterator; - - dest_iterator dests_begin(); - dest_iterator dests_end(); - - source_iterator sources_begin(unsigned Reg); - source_iterator sources_end(unsigned Reg); -}; - -class PHILinearizeDestIterator { -private: - PHILinearize::PHIInfoT::iterator Iter; - -public: - PHILinearizeDestIterator(PHILinearize::PHIInfoT::iterator I) : Iter(I) {} - - unsigned operator*() { return PHILinearize::phiInfoElementGetDest(*Iter); } - PHILinearizeDestIterator &operator++() { - ++Iter; - return *this; - } - bool operator==(const PHILinearizeDestIterator &I) const { - return I.Iter == Iter; - } - bool operator!=(const PHILinearizeDestIterator &I) const { - return I.Iter != Iter; - } -}; - -} // end anonymous namespace - -unsigned PHILinearize::phiInfoElementGetDest(PHIInfoElementT *Info) { - return Info->DestReg; -} - -void PHILinearize::phiInfoElementSetDef(PHIInfoElementT *Info, - unsigned NewDef) { - Info->DestReg = NewDef; -} - -PHILinearize::PHISourcesT & -PHILinearize::phiInfoElementGetSources(PHIInfoElementT *Info) { - return Info->Sources; -} - -void PHILinearize::phiInfoElementAddSource(PHIInfoElementT *Info, - unsigned SourceReg, - MachineBasicBlock *SourceMBB) { - // Assertion ensures we don't use the same SourceMBB for the - // sources, because we cannot have different registers with - // identical predecessors, but we can have the same register for - // multiple predecessors. 
-#if !defined(NDEBUG) - for (auto SI : phiInfoElementGetSources(Info)) { - assert((SI.second != SourceMBB || SourceReg == SI.first)); - } -#endif - - phiInfoElementGetSources(Info).insert(PHISourceT(SourceReg, SourceMBB)); -} - -void PHILinearize::phiInfoElementRemoveSource(PHIInfoElementT *Info, - unsigned SourceReg, - MachineBasicBlock *SourceMBB) { - auto &Sources = phiInfoElementGetSources(Info); - SmallVector ElimiatedSources; - for (auto SI : Sources) { - if (SI.first == SourceReg && - (SI.second == nullptr || SI.second == SourceMBB)) { - ElimiatedSources.push_back(PHISourceT(SI.first, SI.second)); - } - } - - for (auto &Source : ElimiatedSources) { - Sources.erase(Source); - } -} - -PHILinearize::PHIInfoElementT * -PHILinearize::findPHIInfoElement(unsigned DestReg) { - for (auto *I : PHIInfo) { - if (phiInfoElementGetDest(I) == DestReg) { - return I; - } - } - return nullptr; -} - -PHILinearize::PHIInfoElementT * -PHILinearize::findPHIInfoElementFromSource(unsigned SourceReg, - MachineBasicBlock *SourceMBB) { - for (auto *I : PHIInfo) { - for (auto SI : phiInfoElementGetSources(I)) { - if (SI.first == SourceReg && - (SI.second == nullptr || SI.second == SourceMBB)) { - return I; - } - } - } - return nullptr; -} - -bool PHILinearize::findSourcesFromMBB(MachineBasicBlock *SourceMBB, - SmallVector &Sources) { - bool FoundSource = false; - for (auto *I : PHIInfo) { - for (auto SI : phiInfoElementGetSources(I)) { - if (SI.second == SourceMBB) { - FoundSource = true; - Sources.push_back(SI.first); - } - } - } - return FoundSource; -} - -void PHILinearize::addDest(unsigned DestReg, const DebugLoc &DL) { - assert(findPHIInfoElement(DestReg) == nullptr && "Dest already exists"); - PHISourcesT EmptySet; - PHIInfoElementT *NewElement = new PHIInfoElementT(); - NewElement->DestReg = DestReg; - NewElement->DL = DL; - NewElement->Sources = EmptySet; - PHIInfo.insert(NewElement); -} - -void PHILinearize::replaceDef(unsigned OldDestReg, unsigned NewDestReg) { - 
phiInfoElementSetDef(findPHIInfoElement(OldDestReg), NewDestReg); -} - -void PHILinearize::deleteDef(unsigned DestReg) { - PHIInfoElementT *InfoElement = findPHIInfoElement(DestReg); - PHIInfo.erase(InfoElement); - delete InfoElement; -} - -void PHILinearize::addSource(unsigned DestReg, unsigned SourceReg, - MachineBasicBlock *SourceMBB) { - phiInfoElementAddSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB); -} - -void PHILinearize::removeSource(unsigned DestReg, unsigned SourceReg, - MachineBasicBlock *SourceMBB) { - phiInfoElementRemoveSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB); -} - -bool PHILinearize::findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB, - unsigned &DestReg) { - PHIInfoElementT *InfoElement = - findPHIInfoElementFromSource(SourceReg, SourceMBB); - if (InfoElement != nullptr) { - DestReg = phiInfoElementGetDest(InfoElement); - return true; - } - return false; -} - -bool PHILinearize::isSource(unsigned Reg, MachineBasicBlock *SourceMBB) { - unsigned DestReg; - return findDest(Reg, SourceMBB, DestReg); -} - -unsigned PHILinearize::getNumSources(unsigned DestReg) { - return phiInfoElementGetSources(findPHIInfoElement(DestReg)).size(); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void PHILinearize::dump(MachineRegisterInfo *MRI) { - const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); - dbgs() << "=PHIInfo Start=\n"; - for (auto *PII : this->PHIInfo) { - PHIInfoElementT &Element = *PII; - dbgs() << "Dest: " << printReg(Element.DestReg, TRI) - << " Sources: {"; - for (auto &SI : Element.Sources) { - dbgs() << printReg(SI.first, TRI) << '(' << printMBBReference(*SI.second) - << "),"; - } - dbgs() << "}\n"; - } - dbgs() << "=PHIInfo End=\n"; -} -#endif - -void PHILinearize::clear() { PHIInfo = PHIInfoT(); } - -PHILinearize::dest_iterator PHILinearize::dests_begin() { - return PHILinearizeDestIterator(PHIInfo.begin()); -} - -PHILinearize::dest_iterator PHILinearize::dests_end() { - return 
PHILinearizeDestIterator(PHIInfo.end()); -} - -PHILinearize::source_iterator PHILinearize::sources_begin(unsigned Reg) { - auto InfoElement = findPHIInfoElement(Reg); - return phiInfoElementGetSources(InfoElement).begin(); -} - -PHILinearize::source_iterator PHILinearize::sources_end(unsigned Reg) { - auto InfoElement = findPHIInfoElement(Reg); - return phiInfoElementGetSources(InfoElement).end(); -} - -static unsigned getPHINumInputs(MachineInstr &PHI) { - assert(PHI.isPHI()); - return (PHI.getNumOperands() - 1) / 2; -} - -static MachineBasicBlock *getPHIPred(MachineInstr &PHI, unsigned Index) { - assert(PHI.isPHI()); - return PHI.getOperand(Index * 2 + 2).getMBB(); -} - -static void setPhiPred(MachineInstr &PHI, unsigned Index, - MachineBasicBlock *NewPred) { - PHI.getOperand(Index * 2 + 2).setMBB(NewPred); -} - -static unsigned getPHISourceReg(MachineInstr &PHI, unsigned Index) { - assert(PHI.isPHI()); - return PHI.getOperand(Index * 2 + 1).getReg(); -} - -static unsigned getPHIDestReg(MachineInstr &PHI) { - assert(PHI.isPHI()); - return PHI.getOperand(0).getReg(); -} - -namespace { - -class RegionMRT; -class MBBMRT; - -class LinearizedRegion { -protected: - MachineBasicBlock *Entry; - // The exit block is part of the region, and is the last - // merge block before exiting the region. 
- MachineBasicBlock *Exit; - DenseSet LiveOuts; - SmallPtrSet MBBs; - bool HasLoop; - LinearizedRegion *Parent; - RegionMRT *RMRT; - - void storeLiveOutReg(MachineBasicBlock *MBB, Register Reg, - MachineInstr *DefInstr, const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); - - void storeLiveOutRegRegion(RegionMRT *Region, Register Reg, - MachineInstr *DefInstr, - const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI, - PHILinearize &PHIInfo); - - void storeMBBLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI, PHILinearize &PHIInfo, - RegionMRT *TopRegion); - - void storeLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); - - void storeLiveOuts(RegionMRT *Region, const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI, PHILinearize &PHIInfo, - RegionMRT *TopRegion = nullptr); - -public: - LinearizedRegion(); - LinearizedRegion(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); - ~LinearizedRegion() = default; - - void setRegionMRT(RegionMRT *Region) { RMRT = Region; } - - RegionMRT *getRegionMRT() { return RMRT; } - - void setParent(LinearizedRegion *P) { Parent = P; } - - LinearizedRegion *getParent() { return Parent; } - - void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr); - - void setBBSelectRegIn(unsigned Reg); - - unsigned getBBSelectRegIn(); - - void setBBSelectRegOut(unsigned Reg, bool IsLiveOut); - - unsigned getBBSelectRegOut(); - - void setHasLoop(bool Value); - - bool getHasLoop(); - - void addLiveOut(unsigned VReg); - - void removeLiveOut(unsigned Reg); - - void replaceLiveOut(unsigned OldReg, unsigned NewReg); - - void replaceRegister(unsigned Register, class Register NewRegister, - MachineRegisterInfo *MRI, bool ReplaceInside, - bool ReplaceOutside, bool IncludeLoopPHIs); - - void 
replaceRegisterInsideRegion(unsigned Register, unsigned NewRegister, - bool IncludeLoopPHIs, - MachineRegisterInfo *MRI); - - void replaceRegisterOutsideRegion(unsigned Register, unsigned NewRegister, - bool IncludeLoopPHIs, - MachineRegisterInfo *MRI); - - DenseSet *getLiveOuts(); - - void setEntry(MachineBasicBlock *NewEntry); - - MachineBasicBlock *getEntry(); - - void setExit(MachineBasicBlock *NewExit); - - MachineBasicBlock *getExit(); - - void addMBB(MachineBasicBlock *MBB); - - void addMBBs(LinearizedRegion *InnerRegion); - - bool contains(MachineBasicBlock *MBB); - - bool isLiveOut(unsigned Reg); - - bool hasNoDef(unsigned Reg, MachineRegisterInfo *MRI); - - void removeFalseRegisterKills(MachineRegisterInfo *MRI); - - void initLiveOut(RegionMRT *Region, const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); -}; - -class MRT { -protected: - RegionMRT *Parent; - unsigned BBSelectRegIn; - unsigned BBSelectRegOut; - -public: - virtual ~MRT() = default; - - unsigned getBBSelectRegIn() { return BBSelectRegIn; } - - unsigned getBBSelectRegOut() { return BBSelectRegOut; } - - void setBBSelectRegIn(unsigned Reg) { BBSelectRegIn = Reg; } - - void setBBSelectRegOut(unsigned Reg) { BBSelectRegOut = Reg; } - - virtual RegionMRT *getRegionMRT() { return nullptr; } - - virtual MBBMRT *getMBBMRT() { return nullptr; } - - bool isRegion() { return getRegionMRT() != nullptr; } - - bool isMBB() { return getMBBMRT() != nullptr; } - - bool isRoot() { return Parent == nullptr; } - - void setParent(RegionMRT *Region) { Parent = Region; } - - RegionMRT *getParent() { return Parent; } - - static MachineBasicBlock * - initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo, - DenseMap &RegionMap); - - static RegionMRT *buildMRT(MachineFunction &MF, - const MachineRegionInfo *RegionInfo, - const SIInstrInfo *TII, - MachineRegisterInfo *MRI); - - virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) = 0; - - void dumpDepth(int 
depth) { - for (int i = depth; i > 0; --i) { - dbgs() << " "; - } - } -}; - -class MBBMRT : public MRT { - MachineBasicBlock *MBB; - -public: - MBBMRT(MachineBasicBlock *BB) : MBB(BB) { - setParent(nullptr); - setBBSelectRegOut(0); - setBBSelectRegIn(0); - } - - MBBMRT *getMBBMRT() override { return this; } - - MachineBasicBlock *getMBB() { return MBB; } - - void dump(const TargetRegisterInfo *TRI, int depth = 0) override { - dumpDepth(depth); - dbgs() << "MBB: " << getMBB()->getNumber(); - dbgs() << " In: " << printReg(getBBSelectRegIn(), TRI); - dbgs() << ", Out: " << printReg(getBBSelectRegOut(), TRI) << "\n"; - } -}; - -class RegionMRT : public MRT { -protected: - MachineRegion *Region; - LinearizedRegion *LRegion = nullptr; - MachineBasicBlock *Succ = nullptr; - SetVector Children; - -public: - RegionMRT(MachineRegion *MachineRegion) : Region(MachineRegion) { - setParent(nullptr); - setBBSelectRegOut(0); - setBBSelectRegIn(0); - } - - ~RegionMRT() override { - if (LRegion) { - delete LRegion; - } - - for (auto *CI : Children) { - delete &(*CI); - } - } - - RegionMRT *getRegionMRT() override { return this; } - - void setLinearizedRegion(LinearizedRegion *LinearizeRegion) { - LRegion = LinearizeRegion; - } - - LinearizedRegion *getLinearizedRegion() { return LRegion; } - - MachineRegion *getMachineRegion() { return Region; } - - unsigned getInnerOutputRegister() { - return (*(Children.begin()))->getBBSelectRegOut(); - } - - void addChild(MRT *Tree) { Children.insert(Tree); } - - SetVector *getChildren() { return &Children; } - - void dump(const TargetRegisterInfo *TRI, int depth = 0) override { - dumpDepth(depth); - dbgs() << "Region: " << (void *)Region; - dbgs() << " In: " << printReg(getBBSelectRegIn(), TRI); - dbgs() << ", Out: " << printReg(getBBSelectRegOut(), TRI) << "\n"; - - dumpDepth(depth); - if (getSucc()) - dbgs() << "Succ: " << getSucc()->getNumber() << "\n"; - else - dbgs() << "Succ: none \n"; - for (auto *MRTI : Children) { - MRTI->dump(TRI, 
depth + 1); - } - } - - MRT *getEntryTree() { return Children.back(); } - - MRT *getExitTree() { return Children.front(); } - - MachineBasicBlock *getEntry() { - MRT *Tree = Children.back(); - return (Tree->isRegion()) ? Tree->getRegionMRT()->getEntry() - : Tree->getMBBMRT()->getMBB(); - } - - MachineBasicBlock *getExit() { - MRT *Tree = Children.front(); - return (Tree->isRegion()) ? Tree->getRegionMRT()->getExit() - : Tree->getMBBMRT()->getMBB(); - } - - void setSucc(MachineBasicBlock *MBB) { Succ = MBB; } - - MachineBasicBlock *getSucc() { return Succ; } - - bool contains(MachineBasicBlock *MBB) { - for (auto *CI : Children) { - if (CI->isMBB()) { - if (MBB == CI->getMBBMRT()->getMBB()) - return true; - } else { - if (CI->getRegionMRT()->contains(MBB)) - return true; - if (CI->getRegionMRT()->getLinearizedRegion() != nullptr && - CI->getRegionMRT()->getLinearizedRegion()->contains(MBB)) - return true; - } - } - return false; - } - - void replaceLiveOutReg(unsigned Register, unsigned NewRegister) { - LinearizedRegion *LRegion = getLinearizedRegion(); - LRegion->replaceLiveOut(Register, NewRegister); - for (auto &CI : Children) { - if (CI->isRegion()) { - CI->getRegionMRT()->replaceLiveOutReg(Register, NewRegister); - } - } - } -}; - -} // end anonymous namespace - -static unsigned createBBSelectReg(const SIInstrInfo *TII, - MachineRegisterInfo *MRI) { - return MRI->createVirtualRegister(TII->getPreferredSelectRegClass(32)); -} - -MachineBasicBlock * -MRT::initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo, - DenseMap &RegionMap) { - for (auto &MFI : MF) { - MachineBasicBlock *ExitMBB = &MFI; - if (ExitMBB->succ_empty()) { - return ExitMBB; - } - } - llvm_unreachable("CFG has no exit block"); - return nullptr; -} - -RegionMRT *MRT::buildMRT(MachineFunction &MF, - const MachineRegionInfo *RegionInfo, - const SIInstrInfo *TII, MachineRegisterInfo *MRI) { - SmallPtrSet PlacedRegions; - DenseMap RegionMap; - MachineRegion *TopLevelRegion = 
RegionInfo->getTopLevelRegion(); - RegionMRT *Result = new RegionMRT(TopLevelRegion); - RegionMap[TopLevelRegion] = Result; - - // Insert the exit block first, we need it to be the merge node - // for the top level region. - MachineBasicBlock *Exit = initializeMRT(MF, RegionInfo, RegionMap); - - unsigned BBSelectRegIn = createBBSelectReg(TII, MRI); - MBBMRT *ExitMRT = new MBBMRT(Exit); - RegionMap[RegionInfo->getRegionFor(Exit)]->addChild(ExitMRT); - ExitMRT->setBBSelectRegIn(BBSelectRegIn); - - for (auto *MBBI : post_order(&(MF.front()))) { - MachineBasicBlock *MBB = &(*MBBI); - - // Skip Exit since we already added it - if (MBB == Exit) { - continue; - } - - LLVM_DEBUG(dbgs() << "Visiting " << printMBBReference(*MBB) << "\n"); - MBBMRT *NewMBB = new MBBMRT(MBB); - MachineRegion *Region = RegionInfo->getRegionFor(MBB); - - // Ensure we have the MRT region - if (RegionMap.count(Region) == 0) { - RegionMRT *NewMRTRegion = new RegionMRT(Region); - RegionMap[Region] = NewMRTRegion; - - // Ensure all parents are in the RegionMap - MachineRegion *Parent = Region->getParent(); - while (RegionMap.count(Parent) == 0) { - RegionMRT *NewMRTParent = new RegionMRT(Parent); - NewMRTParent->addChild(NewMRTRegion); - NewMRTRegion->setParent(NewMRTParent); - RegionMap[Parent] = NewMRTParent; - NewMRTRegion = NewMRTParent; - Parent = Parent->getParent(); - } - RegionMap[Parent]->addChild(NewMRTRegion); - NewMRTRegion->setParent(RegionMap[Parent]); - } - - // Add MBB to Region MRT - RegionMap[Region]->addChild(NewMBB); - NewMBB->setParent(RegionMap[Region]); - RegionMap[Region]->setSucc(Region->getExit()); - } - return Result; -} - -void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, Register Reg, - MachineInstr *DefInstr, - const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI, - PHILinearize &PHIInfo) { - if (Reg.isVirtual()) { - LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) - << "\n"); - // If this is a source register to a PHI we are 
chaining, it - // must be live out. - if (PHIInfo.isSource(Reg)) { - LLVM_DEBUG(dbgs() << "Add LiveOut (PHI): " << printReg(Reg, TRI) << "\n"); - addLiveOut(Reg); - } else { - // If this is live out of the MBB - for (auto &UI : MRI->use_operands(Reg)) { - if (UI.getParent()->getParent() != MBB) { - LLVM_DEBUG(dbgs() << "Add LiveOut (MBB " << printMBBReference(*MBB) - << "): " << printReg(Reg, TRI) << "\n"); - addLiveOut(Reg); - } else { - // If the use is in the same MBB we have to make sure - // it is after the def, otherwise it is live out in a loop - MachineInstr *UseInstr = UI.getParent(); - for (MachineBasicBlock::instr_iterator - MII = UseInstr->getIterator(), - MIE = UseInstr->getParent()->instr_end(); - MII != MIE; ++MII) { - if ((&(*MII)) == DefInstr) { - LLVM_DEBUG(dbgs() << "Add LiveOut (Loop): " << printReg(Reg, TRI) - << "\n"); - addLiveOut(Reg); - } - } - } - } - } - } -} - -void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, Register Reg, - MachineInstr *DefInstr, - const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI, - PHILinearize &PHIInfo) { - if (Reg.isVirtual()) { - LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) - << "\n"); - for (auto &UI : MRI->use_operands(Reg)) { - if (!Region->contains(UI.getParent()->getParent())) { - LLVM_DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region - << "): " << printReg(Reg, TRI) << "\n"); - addLiveOut(Reg); - } - } - } -} - -void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB, - const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI, - PHILinearize &PHIInfo) { - LLVM_DEBUG(dbgs() << "-Store Live Outs Begin (" << printMBBReference(*MBB) - << ")-\n"); - for (auto &II : *MBB) { - for (auto &RI : II.defs()) { - storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo); - } - for (auto &IRI : II.implicit_operands()) { - if (IRI.isDef()) { - storeLiveOutReg(MBB, IRI.getReg(), IRI.getParent(), MRI, TRI, PHIInfo); - } - } - } - - // If we have 
a successor with a PHI, source coming from this MBB we have to - // add the register as live out - for (MachineBasicBlock *Succ : MBB->successors()) { - for (auto &II : *Succ) { - if (II.isPHI()) { - MachineInstr &PHI = II; - int numPreds = getPHINumInputs(PHI); - for (int i = 0; i < numPreds; ++i) { - if (getPHIPred(PHI, i) == MBB) { - unsigned PHIReg = getPHISourceReg(PHI, i); - LLVM_DEBUG(dbgs() - << "Add LiveOut (PhiSource " << printMBBReference(*MBB) - << " -> " << printMBBReference(*Succ) - << "): " << printReg(PHIReg, TRI) << "\n"); - addLiveOut(PHIReg); - } - } - } - } - } - - LLVM_DEBUG(dbgs() << "-Store Live Outs Endn-\n"); -} - -void LinearizedRegion::storeMBBLiveOuts(MachineBasicBlock *MBB, - const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI, - PHILinearize &PHIInfo, - RegionMRT *TopRegion) { - for (auto &II : *MBB) { - for (auto &RI : II.defs()) { - storeLiveOutRegRegion(TopRegion, RI.getReg(), RI.getParent(), MRI, TRI, - PHIInfo); - } - for (auto &IRI : II.implicit_operands()) { - if (IRI.isDef()) { - storeLiveOutRegRegion(TopRegion, IRI.getReg(), IRI.getParent(), MRI, - TRI, PHIInfo); - } - } - } -} - -void LinearizedRegion::storeLiveOuts(RegionMRT *Region, - const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI, - PHILinearize &PHIInfo, - RegionMRT *CurrentTopRegion) { - MachineBasicBlock *Exit = Region->getSucc(); - - RegionMRT *TopRegion = - CurrentTopRegion == nullptr ? Region : CurrentTopRegion; - - // Check if exit is end of function, if so, no live outs. 
- if (Exit == nullptr) - return; - - auto Children = Region->getChildren(); - for (auto *CI : *Children) { - if (CI->isMBB()) { - auto MBB = CI->getMBBMRT()->getMBB(); - storeMBBLiveOuts(MBB, MRI, TRI, PHIInfo, TopRegion); - } else { - LinearizedRegion *SubRegion = CI->getRegionMRT()->getLinearizedRegion(); - // We should be limited to only store registers that are live out from the - // linearized region - for (auto *MBBI : SubRegion->MBBs) { - storeMBBLiveOuts(MBBI, MRI, TRI, PHIInfo, TopRegion); - } - } - } - - if (CurrentTopRegion == nullptr) { - auto Succ = Region->getSucc(); - for (auto &II : *Succ) { - if (II.isPHI()) { - MachineInstr &PHI = II; - int numPreds = getPHINumInputs(PHI); - for (int i = 0; i < numPreds; ++i) { - if (Region->contains(getPHIPred(PHI, i))) { - unsigned PHIReg = getPHISourceReg(PHI, i); - LLVM_DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region - << "): " << printReg(PHIReg, TRI) << "\n"); - addLiveOut(PHIReg); - } - } - } - } - } -} - -#ifndef NDEBUG -void LinearizedRegion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) { - OS << "Linearized Region {"; - bool IsFirst = true; - for (auto *MBB : MBBs) { - if (IsFirst) { - IsFirst = false; - } else { - OS << " ,"; - } - OS << MBB->getNumber(); - } - OS << "} (" << Entry->getNumber() << ", " - << (Exit == nullptr ? 
-1 : Exit->getNumber()) - << "): In:" << printReg(getBBSelectRegIn(), TRI) - << " Out:" << printReg(getBBSelectRegOut(), TRI) << " {"; - for (auto &LI : LiveOuts) { - OS << printReg(LI, TRI) << " "; - } - OS << "} \n"; -} -#endif - -unsigned LinearizedRegion::getBBSelectRegIn() { - return getRegionMRT()->getBBSelectRegIn(); -} - -unsigned LinearizedRegion::getBBSelectRegOut() { - return getRegionMRT()->getBBSelectRegOut(); -} - -void LinearizedRegion::setHasLoop(bool Value) { HasLoop = Value; } - -bool LinearizedRegion::getHasLoop() { return HasLoop; } - -void LinearizedRegion::addLiveOut(unsigned VReg) { LiveOuts.insert(VReg); } - -void LinearizedRegion::removeLiveOut(unsigned Reg) { - if (isLiveOut(Reg)) - LiveOuts.erase(Reg); -} - -void LinearizedRegion::replaceLiveOut(unsigned OldReg, unsigned NewReg) { - if (isLiveOut(OldReg)) { - removeLiveOut(OldReg); - addLiveOut(NewReg); - } -} - -void LinearizedRegion::replaceRegister(unsigned Register, - class Register NewRegister, - MachineRegisterInfo *MRI, - bool ReplaceInside, bool ReplaceOutside, - bool IncludeLoopPHI) { - assert(Register != NewRegister && "Cannot replace a reg with itself"); - - LLVM_DEBUG( - dbgs() << "Preparing to replace register (region): " - << printReg(Register, MRI->getTargetRegisterInfo()) << " with " - << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); - - // If we are replacing outside, we also need to update the LiveOuts - if (ReplaceOutside && - (isLiveOut(Register) || this->getParent()->isLiveOut(Register))) { - LinearizedRegion *Current = this; - while (Current != nullptr && Current->getEntry() != nullptr) { - LLVM_DEBUG(dbgs() << "Region before register replace\n"); - LLVM_DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo())); - Current->replaceLiveOut(Register, NewRegister); - LLVM_DEBUG(dbgs() << "Region after register replace\n"); - LLVM_DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo())); - Current = Current->getParent(); - } - } - - for 
(MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register), - E = MRI->reg_end(); - I != E;) { - MachineOperand &O = *I; - ++I; - - // We don't rewrite defs. - if (O.isDef()) - continue; - - bool IsInside = contains(O.getParent()->getParent()); - bool IsLoopPHI = IsInside && (O.getParent()->isPHI() && - O.getParent()->getParent() == getEntry()); - bool ShouldReplace = (IsInside && ReplaceInside) || - (!IsInside && ReplaceOutside) || - (IncludeLoopPHI && IsLoopPHI); - if (ShouldReplace) { - - if (NewRegister.isPhysical()) { - LLVM_DEBUG(dbgs() << "Trying to substitute physical register: " - << printReg(NewRegister, MRI->getTargetRegisterInfo()) - << "\n"); - llvm_unreachable("Cannot substitute physical registers"); - } else { - LLVM_DEBUG(dbgs() << "Replacing register (region): " - << printReg(Register, MRI->getTargetRegisterInfo()) - << " with " - << printReg(NewRegister, MRI->getTargetRegisterInfo()) - << "\n"); - O.setReg(NewRegister); - } - } - } -} - -void LinearizedRegion::replaceRegisterInsideRegion(unsigned Register, - unsigned NewRegister, - bool IncludeLoopPHIs, - MachineRegisterInfo *MRI) { - replaceRegister(Register, NewRegister, MRI, true, false, IncludeLoopPHIs); -} - -void LinearizedRegion::replaceRegisterOutsideRegion(unsigned Register, - unsigned NewRegister, - bool IncludeLoopPHIs, - MachineRegisterInfo *MRI) { - replaceRegister(Register, NewRegister, MRI, false, true, IncludeLoopPHIs); -} - -DenseSet *LinearizedRegion::getLiveOuts() { return &LiveOuts; } - -void LinearizedRegion::setEntry(MachineBasicBlock *NewEntry) { - Entry = NewEntry; -} - -MachineBasicBlock *LinearizedRegion::getEntry() { return Entry; } - -void LinearizedRegion::setExit(MachineBasicBlock *NewExit) { Exit = NewExit; } - -MachineBasicBlock *LinearizedRegion::getExit() { return Exit; } - -void LinearizedRegion::addMBB(MachineBasicBlock *MBB) { MBBs.insert(MBB); } - -void LinearizedRegion::addMBBs(LinearizedRegion *InnerRegion) { - for (auto *MBB : InnerRegion->MBBs) { - 
addMBB(MBB); - } -} - -bool LinearizedRegion::contains(MachineBasicBlock *MBB) { - return MBBs.contains(MBB); -} - -bool LinearizedRegion::isLiveOut(unsigned Reg) { - return LiveOuts.contains(Reg); -} - -bool LinearizedRegion::hasNoDef(unsigned Reg, MachineRegisterInfo *MRI) { - return MRI->def_begin(Reg) == MRI->def_end(); -} - -// After the code has been structurized, what was flagged as kills -// before are no longer register kills. -void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) { - const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); - (void)TRI; // It's used by LLVM_DEBUG. - - for (auto *MBBI : MBBs) { - MachineBasicBlock *MBB = MBBI; - for (auto &II : *MBB) { - for (auto &RI : II.uses()) { - if (RI.isReg()) { - Register Reg = RI.getReg(); - if (Reg.isVirtual()) { - if (hasNoDef(Reg, MRI)) - continue; - if (!MRI->hasOneDef(Reg)) { - LLVM_DEBUG(this->getEntry()->getParent()->dump()); - LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << "\n"); - } - - if (MRI->def_begin(Reg) == MRI->def_end()) { - LLVM_DEBUG(dbgs() << "Register " - << printReg(Reg, MRI->getTargetRegisterInfo()) - << " has NO defs\n"); - } else if (!MRI->hasOneDef(Reg)) { - LLVM_DEBUG(dbgs() << "Register " - << printReg(Reg, MRI->getTargetRegisterInfo()) - << " has multiple defs\n"); - } - - assert(MRI->hasOneDef(Reg) && "Register has multiple definitions"); - MachineOperand *Def = &(*(MRI->def_begin(Reg))); - MachineOperand *UseOperand = &(RI); - bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB; - if (UseIsOutsideDefMBB && UseOperand->isKill()) { - LLVM_DEBUG(dbgs() << "Removing kill flag on register: " - << printReg(Reg, TRI) << "\n"); - UseOperand->setIsKill(false); - } - } - } - } - } - } -} - -void LinearizedRegion::initLiveOut(RegionMRT *Region, - const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI, - PHILinearize &PHIInfo) { - storeLiveOuts(Region, MRI, TRI, PHIInfo); -} - -LinearizedRegion::LinearizedRegion(MachineBasicBlock *MBB, - 
const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI, - PHILinearize &PHIInfo) { - setEntry(MBB); - setExit(MBB); - storeLiveOuts(MBB, MRI, TRI, PHIInfo); - MBBs.insert(MBB); - Parent = nullptr; -} - -LinearizedRegion::LinearizedRegion() { - setEntry(nullptr); - setExit(nullptr); - Parent = nullptr; -} - -namespace { - -class AMDGPUMachineCFGStructurizer : public MachineFunctionPass { -private: - const MachineRegionInfo *Regions; - const SIInstrInfo *TII; - const TargetRegisterInfo *TRI; - MachineRegisterInfo *MRI; - PHILinearize PHIInfo; - DenseMap FallthroughMap; - RegionMRT *RMRT; - - void getPHIRegionIndices(RegionMRT *Region, MachineInstr &PHI, - SmallVector &RegionIndices); - void getPHIRegionIndices(LinearizedRegion *Region, MachineInstr &PHI, - SmallVector &RegionIndices); - void getPHINonRegionIndices(LinearizedRegion *Region, MachineInstr &PHI, - SmallVector &PHINonRegionIndices); - - void storePHILinearizationInfoDest( - unsigned LDestReg, MachineInstr &PHI, - SmallVector *RegionIndices = nullptr); - - unsigned storePHILinearizationInfo(MachineInstr &PHI, - SmallVector *RegionIndices); - - void extractKilledPHIs(MachineBasicBlock *MBB); - - bool shrinkPHI(MachineInstr &PHI, SmallVector &PHIIndices, - unsigned *ReplaceReg); - - bool shrinkPHI(MachineInstr &PHI, unsigned CombinedSourceReg, - MachineBasicBlock *SourceMBB, - SmallVector &PHIIndices, unsigned *ReplaceReg); - - void replacePHI(MachineInstr &PHI, unsigned CombinedSourceReg, - MachineBasicBlock *LastMerge, - SmallVector &PHIRegionIndices); - void replaceEntryPHI(MachineInstr &PHI, unsigned CombinedSourceReg, - MachineBasicBlock *IfMBB, - SmallVector &PHIRegionIndices); - void replaceLiveOutRegs(MachineInstr &PHI, - SmallVector &PHIRegionIndices, - unsigned CombinedSourceReg, - LinearizedRegion *LRegion); - void rewriteRegionExitPHI(RegionMRT *Region, MachineBasicBlock *LastMerge, - MachineInstr &PHI, LinearizedRegion *LRegion); - - void rewriteRegionExitPHIs(RegionMRT *Region, 
MachineBasicBlock *LastMerge, - LinearizedRegion *LRegion); - void rewriteRegionEntryPHI(LinearizedRegion *Region, MachineBasicBlock *IfMBB, - MachineInstr &PHI); - void rewriteRegionEntryPHIs(LinearizedRegion *Region, - MachineBasicBlock *IfMBB); - - bool regionIsSimpleIf(RegionMRT *Region); - - void transformSimpleIfRegion(RegionMRT *Region); - - void insertUnconditionalBranch(MachineBasicBlock *MBB, - MachineBasicBlock *Dest, - const DebugLoc &DL = DebugLoc()); - - MachineBasicBlock *createLinearizedExitBlock(RegionMRT *Region); - - void insertMergePHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB, - MachineBasicBlock *MergeBB, unsigned DestRegister, - unsigned IfSourceRegister, unsigned CodeSourceRegister, - bool IsUndefIfSource = false); - - MachineBasicBlock *createIfBlock(MachineBasicBlock *MergeBB, - MachineBasicBlock *CodeBBStart, - MachineBasicBlock *CodeBBEnd, - MachineBasicBlock *SelectBB, unsigned IfReg, - bool InheritPreds); - - void prunePHIInfo(MachineBasicBlock *MBB); - void createEntryPHI(LinearizedRegion *CurrentRegion, unsigned DestReg); - - void createEntryPHIs(LinearizedRegion *CurrentRegion); - void resolvePHIInfos(MachineBasicBlock *FunctionEntry); - - void replaceRegisterWith(unsigned Register, class Register NewRegister); - - MachineBasicBlock *createIfRegion(MachineBasicBlock *MergeBB, - MachineBasicBlock *CodeBB, - LinearizedRegion *LRegion, - unsigned BBSelectRegIn, - unsigned BBSelectRegOut); - - MachineBasicBlock * - createIfRegion(MachineBasicBlock *MergeMBB, LinearizedRegion *InnerRegion, - LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB, - unsigned BBSelectRegIn, unsigned BBSelectRegOut); - void ensureCondIsNotKilled(SmallVector Cond); - - void rewriteCodeBBTerminator(MachineBasicBlock *CodeBB, - MachineBasicBlock *MergeBB, - unsigned BBSelectReg); - - MachineInstr *getDefInstr(unsigned Reg); - void insertChainedPHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB, - MachineBasicBlock *MergeBB, - 
LinearizedRegion *InnerRegion, unsigned DestReg, - unsigned SourceReg); - bool containsDef(MachineBasicBlock *MBB, LinearizedRegion *InnerRegion, - unsigned Register); - void rewriteLiveOutRegs(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB, - MachineBasicBlock *MergeBB, - LinearizedRegion *InnerRegion, - LinearizedRegion *LRegion); - - void splitLoopPHI(MachineInstr &PHI, MachineBasicBlock *Entry, - MachineBasicBlock *EntrySucc, LinearizedRegion *LRegion); - void splitLoopPHIs(MachineBasicBlock *Entry, MachineBasicBlock *EntrySucc, - LinearizedRegion *LRegion); - - MachineBasicBlock *splitExit(LinearizedRegion *LRegion); - - MachineBasicBlock *splitEntry(LinearizedRegion *LRegion); - - LinearizedRegion *initLinearizedRegion(RegionMRT *Region); - - bool structurizeComplexRegion(RegionMRT *Region); - - bool structurizeRegion(RegionMRT *Region); - - bool structurizeRegions(RegionMRT *Region, bool isTopRegion); - -public: - static char ID; - - AMDGPUMachineCFGStructurizer() : MachineFunctionPass(ID) { - initializeAMDGPUMachineCFGStructurizerPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - void initFallthroughMap(MachineFunction &MF); - - void createLinearizedRegion(RegionMRT *Region, unsigned SelectOut); - - unsigned initializeSelectRegisters(MRT *MRT, unsigned ExistingExitReg, - MachineRegisterInfo *MRI, - const SIInstrInfo *TII); - - void setRegionMRT(RegionMRT *RegionTree) { RMRT = RegionTree; } - - RegionMRT *getRegionMRT() { return RMRT; } - - bool runOnMachineFunction(MachineFunction &MF) override; -}; - -} // end anonymous namespace - -char AMDGPUMachineCFGStructurizer::ID = 0; - -bool AMDGPUMachineCFGStructurizer::regionIsSimpleIf(RegionMRT *Region) { - MachineBasicBlock *Entry = Region->getEntry(); - MachineBasicBlock *Succ = Region->getSucc(); - bool FoundBypass = false; - bool FoundIf = false; - - if (Entry->succ_size() != 2) 
{ - return false; - } - - for (MachineBasicBlock *Current : Entry->successors()) { - if (Current == Succ) { - FoundBypass = true; - } else if ((Current->succ_size() == 1) && - *(Current->succ_begin()) == Succ) { - FoundIf = true; - } - } - - return FoundIf && FoundBypass; -} - -void AMDGPUMachineCFGStructurizer::transformSimpleIfRegion(RegionMRT *Region) { - MachineBasicBlock *Entry = Region->getEntry(); - MachineBasicBlock *Exit = Region->getExit(); - TII->convertNonUniformIfRegion(Entry, Exit); -} - -static void fixMBBTerminator(MachineBasicBlock *MBB) { - if (MBB->succ_size() == 1) { - auto *Succ = *(MBB->succ_begin()); - for (auto &TI : MBB->terminators()) { - for (auto &UI : TI.uses()) { - if (UI.isMBB() && UI.getMBB() != Succ) { - UI.setMBB(Succ); - } - } - } - } -} - -static void fixRegionTerminator(RegionMRT *Region) { - MachineBasicBlock *InternalSucc = nullptr; - MachineBasicBlock *ExternalSucc = nullptr; - LinearizedRegion *LRegion = Region->getLinearizedRegion(); - auto Exit = LRegion->getExit(); - - SmallPtrSet Successors; - for (MachineBasicBlock *Succ : Exit->successors()) { - if (LRegion->contains(Succ)) { - // Do not allow re-assign - assert(InternalSucc == nullptr); - InternalSucc = Succ; - } else { - // Do not allow re-assign - assert(ExternalSucc == nullptr); - ExternalSucc = Succ; - } - } - - for (auto &TI : Exit->terminators()) { - for (auto &UI : TI.uses()) { - if (UI.isMBB()) { - auto Target = UI.getMBB(); - if (Target != InternalSucc && Target != ExternalSucc) { - UI.setMBB(ExternalSucc); - } - } - } - } -} - -// If a region is just a sequence of regions (and the exit -// block in the case of the top level region), we can simply skip -// linearizing it, because it is already linear -bool regionIsSequence(RegionMRT *Region) { - auto Children = Region->getChildren(); - for (auto *CI : *Children) { - if (!CI->isRegion()) { - if (CI->getMBBMRT()->getMBB()->succ_size() > 1) { - return false; - } - } - } - return true; -} - -void 
fixupRegionExits(RegionMRT *Region) { - auto Children = Region->getChildren(); - for (auto *CI : *Children) { - if (!CI->isRegion()) { - fixMBBTerminator(CI->getMBBMRT()->getMBB()); - } else { - fixRegionTerminator(CI->getRegionMRT()); - } - } -} - -void AMDGPUMachineCFGStructurizer::getPHIRegionIndices( - RegionMRT *Region, MachineInstr &PHI, - SmallVector &PHIRegionIndices) { - unsigned NumInputs = getPHINumInputs(PHI); - for (unsigned i = 0; i < NumInputs; ++i) { - MachineBasicBlock *Pred = getPHIPred(PHI, i); - if (Region->contains(Pred)) { - PHIRegionIndices.push_back(i); - } - } -} - -void AMDGPUMachineCFGStructurizer::getPHIRegionIndices( - LinearizedRegion *Region, MachineInstr &PHI, - SmallVector &PHIRegionIndices) { - unsigned NumInputs = getPHINumInputs(PHI); - for (unsigned i = 0; i < NumInputs; ++i) { - MachineBasicBlock *Pred = getPHIPred(PHI, i); - if (Region->contains(Pred)) { - PHIRegionIndices.push_back(i); - } - } -} - -void AMDGPUMachineCFGStructurizer::getPHINonRegionIndices( - LinearizedRegion *Region, MachineInstr &PHI, - SmallVector &PHINonRegionIndices) { - unsigned NumInputs = getPHINumInputs(PHI); - for (unsigned i = 0; i < NumInputs; ++i) { - MachineBasicBlock *Pred = getPHIPred(PHI, i); - if (!Region->contains(Pred)) { - PHINonRegionIndices.push_back(i); - } - } -} - -void AMDGPUMachineCFGStructurizer::storePHILinearizationInfoDest( - unsigned LDestReg, MachineInstr &PHI, - SmallVector *RegionIndices) { - if (RegionIndices) { - for (auto i : *RegionIndices) { - PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i)); - } - } else { - unsigned NumInputs = getPHINumInputs(PHI); - for (unsigned i = 0; i < NumInputs; ++i) { - PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i)); - } - } -} - -unsigned AMDGPUMachineCFGStructurizer::storePHILinearizationInfo( - MachineInstr &PHI, SmallVector *RegionIndices) { - unsigned DestReg = getPHIDestReg(PHI); - Register LinearizeDestReg = - 
MRI->createVirtualRegister(MRI->getRegClass(DestReg)); - PHIInfo.addDest(LinearizeDestReg, PHI.getDebugLoc()); - storePHILinearizationInfoDest(LinearizeDestReg, PHI, RegionIndices); - return LinearizeDestReg; -} - -void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) { - // We need to create a new chain for the killed phi, but there is no - // need to do the renaming outside or inside the block. - SmallPtrSet PHIs; - for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(), - E = MBB->instr_end(); - I != E; ++I) { - MachineInstr &Instr = *I; - if (Instr.isPHI()) { - unsigned PHIDestReg = getPHIDestReg(Instr); - LLVM_DEBUG(dbgs() << "Extracting killed phi:\n"); - LLVM_DEBUG(Instr.dump()); - PHIs.insert(&Instr); - PHIInfo.addDest(PHIDestReg, Instr.getDebugLoc()); - storePHILinearizationInfoDest(PHIDestReg, Instr); - } - } - - for (auto *PI : PHIs) { - PI->eraseFromParent(); - } -} - -static bool isPHIRegionIndex(SmallVector PHIRegionIndices, - unsigned Index) { - return llvm::is_contained(PHIRegionIndices, Index); -} - -bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, - SmallVector &PHIIndices, - unsigned *ReplaceReg) { - return shrinkPHI(PHI, 0, nullptr, PHIIndices, ReplaceReg); -} - -bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, - unsigned CombinedSourceReg, - MachineBasicBlock *SourceMBB, - SmallVector &PHIIndices, - unsigned *ReplaceReg) { - LLVM_DEBUG(dbgs() << "Shrink PHI: "); - LLVM_DEBUG(PHI.dump()); - LLVM_DEBUG(dbgs() << " to " << printReg(getPHIDestReg(PHI), TRI) - << " = PHI("); - - bool Replaced = false; - unsigned NumInputs = getPHINumInputs(PHI); - int SingleExternalEntryIndex = -1; - for (unsigned i = 0; i < NumInputs; ++i) { - if (!isPHIRegionIndex(PHIIndices, i)) { - if (SingleExternalEntryIndex == -1) { - // Single entry - SingleExternalEntryIndex = i; - } else { - // Multiple entries - SingleExternalEntryIndex = -2; - } - } - } - - if (SingleExternalEntryIndex > -1) { - *ReplaceReg = 
getPHISourceReg(PHI, SingleExternalEntryIndex); - // We should not rewrite the code, we should only pick up the single value - // that represents the shrunk PHI. - Replaced = true; - } else { - MachineBasicBlock *MBB = PHI.getParent(); - MachineInstrBuilder MIB = - BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), - getPHIDestReg(PHI)); - if (SourceMBB) { - MIB.addReg(CombinedSourceReg); - MIB.addMBB(SourceMBB); - LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " - << printMBBReference(*SourceMBB)); - } - - for (unsigned i = 0; i < NumInputs; ++i) { - if (isPHIRegionIndex(PHIIndices, i)) { - continue; - } - unsigned SourceReg = getPHISourceReg(PHI, i); - MachineBasicBlock *SourcePred = getPHIPred(PHI, i); - MIB.addReg(SourceReg); - MIB.addMBB(SourcePred); - LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " - << printMBBReference(*SourcePred)); - } - LLVM_DEBUG(dbgs() << ")\n"); - } - PHI.eraseFromParent(); - return Replaced; -} - -void AMDGPUMachineCFGStructurizer::replacePHI( - MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *LastMerge, - SmallVector &PHIRegionIndices) { - LLVM_DEBUG(dbgs() << "Replace PHI: "); - LLVM_DEBUG(PHI.dump()); - LLVM_DEBUG(dbgs() << " with " << printReg(getPHIDestReg(PHI), TRI) - << " = PHI("); - - bool HasExternalEdge = false; - unsigned NumInputs = getPHINumInputs(PHI); - for (unsigned i = 0; i < NumInputs; ++i) { - if (!isPHIRegionIndex(PHIRegionIndices, i)) { - HasExternalEdge = true; - } - } - - if (HasExternalEdge) { - MachineBasicBlock *MBB = PHI.getParent(); - MachineInstrBuilder MIB = - BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), - getPHIDestReg(PHI)); - MIB.addReg(CombinedSourceReg); - MIB.addMBB(LastMerge); - LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " - << printMBBReference(*LastMerge)); - for (unsigned i = 0; i < NumInputs; ++i) { - if (isPHIRegionIndex(PHIRegionIndices, i)) { - continue; - } - unsigned SourceReg = getPHISourceReg(PHI, 
i); - MachineBasicBlock *SourcePred = getPHIPred(PHI, i); - MIB.addReg(SourceReg); - MIB.addMBB(SourcePred); - LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " - << printMBBReference(*SourcePred)); - } - LLVM_DEBUG(dbgs() << ")\n"); - } else { - replaceRegisterWith(getPHIDestReg(PHI), CombinedSourceReg); - } - PHI.eraseFromParent(); -} - -void AMDGPUMachineCFGStructurizer::replaceEntryPHI( - MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB, - SmallVector &PHIRegionIndices) { - LLVM_DEBUG(dbgs() << "Replace entry PHI: "); - LLVM_DEBUG(PHI.dump()); - LLVM_DEBUG(dbgs() << " with "); - - unsigned NumInputs = getPHINumInputs(PHI); - unsigned NumNonRegionInputs = NumInputs; - for (unsigned i = 0; i < NumInputs; ++i) { - if (isPHIRegionIndex(PHIRegionIndices, i)) { - NumNonRegionInputs--; - } - } - - if (NumNonRegionInputs == 0) { - auto DestReg = getPHIDestReg(PHI); - replaceRegisterWith(DestReg, CombinedSourceReg); - LLVM_DEBUG(dbgs() << " register " << printReg(CombinedSourceReg, TRI) - << "\n"); - PHI.eraseFromParent(); - } else { - LLVM_DEBUG(dbgs() << printReg(getPHIDestReg(PHI), TRI) << " = PHI("); - MachineBasicBlock *MBB = PHI.getParent(); - MachineInstrBuilder MIB = - BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), - getPHIDestReg(PHI)); - MIB.addReg(CombinedSourceReg); - MIB.addMBB(IfMBB); - LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " - << printMBBReference(*IfMBB)); - unsigned NumInputs = getPHINumInputs(PHI); - for (unsigned i = 0; i < NumInputs; ++i) { - if (isPHIRegionIndex(PHIRegionIndices, i)) { - continue; - } - unsigned SourceReg = getPHISourceReg(PHI, i); - MachineBasicBlock *SourcePred = getPHIPred(PHI, i); - MIB.addReg(SourceReg); - MIB.addMBB(SourcePred); - LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " - << printMBBReference(*SourcePred)); - } - LLVM_DEBUG(dbgs() << ")\n"); - PHI.eraseFromParent(); - } -} - -void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs( - MachineInstr 
&PHI, SmallVector &PHIRegionIndices, - unsigned CombinedSourceReg, LinearizedRegion *LRegion) { - bool WasLiveOut = false; - for (auto PII : PHIRegionIndices) { - unsigned Reg = getPHISourceReg(PHI, PII); - if (LRegion->isLiveOut(Reg)) { - bool IsDead = true; - - // Check if register is live out of the basic block - MachineBasicBlock *DefMBB = getDefInstr(Reg)->getParent(); - for (const MachineOperand &MO : MRI->use_operands(Reg)) - if (MO.getParent()->getParent() != DefMBB) - IsDead = false; - - LLVM_DEBUG(dbgs() << "Register " << printReg(Reg, TRI) << " is " - << (IsDead ? "dead" : "alive") - << " after PHI replace\n"); - if (IsDead) { - LRegion->removeLiveOut(Reg); - } - WasLiveOut = true; - } - } - - if (WasLiveOut) - LRegion->addLiveOut(CombinedSourceReg); -} - -void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHI(RegionMRT *Region, - MachineBasicBlock *LastMerge, - MachineInstr &PHI, - LinearizedRegion *LRegion) { - SmallVector PHIRegionIndices; - getPHIRegionIndices(Region, PHI, PHIRegionIndices); - unsigned LinearizedSourceReg = - storePHILinearizationInfo(PHI, &PHIRegionIndices); - - replacePHI(PHI, LinearizedSourceReg, LastMerge, PHIRegionIndices); - replaceLiveOutRegs(PHI, PHIRegionIndices, LinearizedSourceReg, LRegion); -} - -void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHI(LinearizedRegion *Region, - MachineBasicBlock *IfMBB, - MachineInstr &PHI) { - SmallVector PHINonRegionIndices; - getPHINonRegionIndices(Region, PHI, PHINonRegionIndices); - unsigned LinearizedSourceReg = - storePHILinearizationInfo(PHI, &PHINonRegionIndices); - replaceEntryPHI(PHI, LinearizedSourceReg, IfMBB, PHINonRegionIndices); -} - -static void collectPHIs(MachineBasicBlock *MBB, - SmallVector &PHIs) { - for (auto &BBI : *MBB) { - if (BBI.isPHI()) { - PHIs.push_back(&BBI); - } - } -} - -void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHIs(RegionMRT *Region, - MachineBasicBlock *LastMerge, - LinearizedRegion *LRegion) { - SmallVector PHIs; - auto Exit = 
Region->getSucc(); - if (Exit == nullptr) - return; - - collectPHIs(Exit, PHIs); - - for (auto *PHII : PHIs) { - rewriteRegionExitPHI(Region, LastMerge, *PHII, LRegion); - } -} - -void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHIs(LinearizedRegion *Region, - MachineBasicBlock *IfMBB) { - SmallVector PHIs; - auto Entry = Region->getEntry(); - - collectPHIs(Entry, PHIs); - - for (auto *PHII : PHIs) { - rewriteRegionEntryPHI(Region, IfMBB, *PHII); - } -} - -void AMDGPUMachineCFGStructurizer::insertUnconditionalBranch(MachineBasicBlock *MBB, - MachineBasicBlock *Dest, - const DebugLoc &DL) { - LLVM_DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber() - << " -> " << Dest->getNumber() << "\n"); - MachineBasicBlock::instr_iterator Terminator = MBB->getFirstInstrTerminator(); - bool HasTerminator = Terminator != MBB->instr_end(); - if (HasTerminator) { - TII->ReplaceTailWithBranchTo(Terminator, Dest); - } - if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(Dest)) { - TII->insertUnconditionalBranch(*MBB, Dest, DL); - } -} - -static MachineBasicBlock *getSingleExitNode(MachineFunction &MF) { - MachineBasicBlock *result = nullptr; - for (auto &MFI : MF) { - if (MFI.succ_empty()) { - if (result == nullptr) { - result = &MFI; - } else { - return nullptr; - } - } - } - - return result; -} - -static bool hasOneExitNode(MachineFunction &MF) { - return getSingleExitNode(MF) != nullptr; -} - -MachineBasicBlock * -AMDGPUMachineCFGStructurizer::createLinearizedExitBlock(RegionMRT *Region) { - auto Exit = Region->getSucc(); - - // If the exit is the end of the function, we just use the existing - MachineFunction *MF = Region->getEntry()->getParent(); - if (Exit == nullptr && hasOneExitNode(*MF)) { - return &(*(--(Region->getEntry()->getParent()->end()))); - } - - MachineBasicBlock *LastMerge = MF->CreateMachineBasicBlock(); - if (Exit == nullptr) { - MachineFunction::iterator ExitIter = MF->end(); - MF->insert(ExitIter, LastMerge); - } else { - 
MachineFunction::iterator ExitIter = Exit->getIterator(); - MF->insert(ExitIter, LastMerge); - LastMerge->addSuccessor(Exit); - insertUnconditionalBranch(LastMerge, Exit); - LLVM_DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() - << "\n"); - } - return LastMerge; -} - -void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB, - MachineBasicBlock *CodeBB, - MachineBasicBlock *MergeBB, - unsigned DestRegister, - unsigned IfSourceRegister, - unsigned CodeSourceRegister, - bool IsUndefIfSource) { - // If this is the function exit block, we don't need a phi. - if (MergeBB->succ_empty()) { - return; - } - LLVM_DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB) - << "): " << printReg(DestRegister, TRI) << " = PHI(" - << printReg(IfSourceRegister, TRI) << ", " - << printMBBReference(*IfBB) - << printReg(CodeSourceRegister, TRI) << ", " - << printMBBReference(*CodeBB) << ")\n"); - const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin()); - MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL, - TII->get(TargetOpcode::PHI), DestRegister); - if (IsUndefIfSource && false) { - MIB.addReg(IfSourceRegister, RegState::Undef); - } else { - MIB.addReg(IfSourceRegister); - } - MIB.addMBB(IfBB); - MIB.addReg(CodeSourceRegister); - MIB.addMBB(CodeBB); -} - -static void removeExternalCFGSuccessors(MachineBasicBlock *MBB) { - for (MachineBasicBlock::succ_iterator PI = MBB->succ_begin(), - E = MBB->succ_end(); - PI != E; ++PI) { - if ((*PI) != MBB) { - (MBB)->removeSuccessor(*PI); - } - } -} - -static void removeExternalCFGEdges(MachineBasicBlock *StartMBB, - MachineBasicBlock *EndMBB) { - - // We have to check against the StartMBB successor because a - // structurized region with a loop will have the entry block split, - // and the backedge will go to the entry successor. 
- DenseSet> Succs; - unsigned SuccSize = StartMBB->succ_size(); - if (SuccSize > 0) { - MachineBasicBlock *StartMBBSucc = *(StartMBB->succ_begin()); - for (MachineBasicBlock *Succ : EndMBB->successors()) { - // Either we have a back-edge to the entry block, or a back-edge to the - // successor of the entry block since the block may be split. - if (Succ != StartMBB && - !(Succ == StartMBBSucc && StartMBB != EndMBB && SuccSize == 1)) { - Succs.insert( - std::pair(EndMBB, Succ)); - } - } - } - - for (MachineBasicBlock *Pred : StartMBB->predecessors()) - if (Pred != EndMBB) - Succs.insert(std::pair(Pred, StartMBB)); - - for (auto SI : Succs) { - std::pair Edge = SI; - LLVM_DEBUG(dbgs() << "Removing edge: " << printMBBReference(*Edge.first) - << " -> " << printMBBReference(*Edge.second) << "\n"); - Edge.first->removeSuccessor(Edge.second); - } -} - -MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock( - MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBBStart, - MachineBasicBlock *CodeBBEnd, MachineBasicBlock *SelectBB, unsigned IfReg, - bool InheritPreds) { - MachineFunction *MF = MergeBB->getParent(); - MachineBasicBlock *IfBB = MF->CreateMachineBasicBlock(); - - if (InheritPreds) { - for (MachineBasicBlock *Pred : CodeBBStart->predecessors()) - if (Pred != CodeBBEnd) - Pred->addSuccessor(IfBB); - } - - removeExternalCFGEdges(CodeBBStart, CodeBBEnd); - - auto CodeBBStartI = CodeBBStart->getIterator(); - auto CodeBBEndI = CodeBBEnd->getIterator(); - auto MergeIter = MergeBB->getIterator(); - MF->insert(MergeIter, IfBB); - MF->splice(MergeIter, CodeBBStartI, ++CodeBBEndI); - IfBB->addSuccessor(MergeBB); - IfBB->addSuccessor(CodeBBStart); - - LLVM_DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n"); - // Ensure that the MergeBB is a successor of the CodeEndBB. 
- if (!CodeBBEnd->isSuccessor(MergeBB)) - CodeBBEnd->addSuccessor(MergeBB); - - LLVM_DEBUG(dbgs() << "Moved " << printMBBReference(*CodeBBStart) - << " through " << printMBBReference(*CodeBBEnd) << "\n"); - - // If we have a single predecessor we can find a reasonable debug location - MachineBasicBlock *SinglePred = - CodeBBStart->pred_size() == 1 ? *(CodeBBStart->pred_begin()) : nullptr; - const DebugLoc &DL = SinglePred - ? SinglePred->findDebugLoc(SinglePred->getFirstTerminator()) - : DebugLoc(); - - Register Reg = - TII->insertEQ(IfBB, IfBB->begin(), DL, IfReg, - SelectBB->getNumber() /* CodeBBStart->getNumber() */); - if (&(*(IfBB->getParent()->begin())) == IfBB) { - TII->materializeImmediate(*IfBB, IfBB->begin(), DL, IfReg, - CodeBBStart->getNumber()); - } - MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true); - ArrayRef Cond(RegOp); - TII->insertBranch(*IfBB, MergeBB, CodeBBStart, Cond, DL); - - return IfBB; -} - -void AMDGPUMachineCFGStructurizer::ensureCondIsNotKilled( - SmallVector Cond) { - if (Cond.size() != 1) - return; - if (!Cond[0].isReg()) - return; - - Register CondReg = Cond[0].getReg(); - for (MachineOperand &MO : MRI->use_operands(CondReg)) - MO.setIsKill(false); -} - -void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *CodeBB, - MachineBasicBlock *MergeBB, - unsigned BBSelectReg) { - MachineBasicBlock *TrueBB = nullptr; - MachineBasicBlock *FalseBB = nullptr; - SmallVector Cond; - MachineBasicBlock *FallthroughBB = FallthroughMap[CodeBB]; - TII->analyzeBranch(*CodeBB, TrueBB, FalseBB, Cond); - - const DebugLoc &DL = CodeBB->findDebugLoc(CodeBB->getFirstTerminator()); - - if (FalseBB == nullptr && TrueBB == nullptr && FallthroughBB == nullptr) { - // This is an exit block, hence no successors. We will assign the - // bb select register to the entry block. 
- TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, - BBSelectReg, - CodeBB->getParent()->begin()->getNumber()); - insertUnconditionalBranch(CodeBB, MergeBB, DL); - return; - } - - if (FalseBB == nullptr && TrueBB == nullptr) { - TrueBB = FallthroughBB; - } else if (TrueBB != nullptr) { - FalseBB = - (FallthroughBB && (FallthroughBB != TrueBB)) ? FallthroughBB : FalseBB; - } - - if ((TrueBB != nullptr && FalseBB == nullptr) || (TrueBB == FalseBB)) { - TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, - BBSelectReg, TrueBB->getNumber()); - } else { - const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectReg); - Register TrueBBReg = MRI->createVirtualRegister(RegClass); - Register FalseBBReg = MRI->createVirtualRegister(RegClass); - TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, - TrueBBReg, TrueBB->getNumber()); - TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, - FalseBBReg, FalseBB->getNumber()); - ensureCondIsNotKilled(Cond); - TII->insertVectorSelect(*CodeBB, CodeBB->getFirstTerminator(), DL, - BBSelectReg, Cond, TrueBBReg, FalseBBReg); - } - - insertUnconditionalBranch(CodeBB, MergeBB, DL); -} - -MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) { - if (MRI->def_begin(Reg) == MRI->def_end()) { - LLVM_DEBUG(dbgs() << "Register " - << printReg(Reg, MRI->getTargetRegisterInfo()) - << " has NO defs\n"); - } else if (!MRI->hasOneDef(Reg)) { - LLVM_DEBUG(dbgs() << "Register " - << printReg(Reg, MRI->getTargetRegisterInfo()) - << " has multiple defs\n"); - LLVM_DEBUG(dbgs() << "DEFS BEGIN:\n"); - for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) { - LLVM_DEBUG(DI->getParent()->dump()); - } - LLVM_DEBUG(dbgs() << "DEFS END\n"); - } - - assert(MRI->hasOneDef(Reg) && "Register has multiple definitions"); - return (*(MRI->def_begin(Reg))).getParent(); -} - -void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB, - 
MachineBasicBlock *CodeBB, - MachineBasicBlock *MergeBB, - LinearizedRegion *InnerRegion, - unsigned DestReg, - unsigned SourceReg) { - // In this function we know we are part of a chain already, so we need - // to add the registers to the existing chain, and rename the register - // inside the region. - bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit(); - MachineInstr *DefInstr = getDefInstr(SourceReg); - if (DefInstr->isPHI() && DefInstr->getParent() == CodeBB && IsSingleBB) { - // Handle the case where the def is a PHI-def inside a basic - // block, then we only need to do renaming. Special care needs to - // be taken if the PHI-def is part of an existing chain, or if a - // new one needs to be created. - InnerRegion->replaceRegisterInsideRegion(SourceReg, DestReg, true, MRI); - - // We collect all PHI Information, and if we are at the region entry, - // all PHIs will be removed, and then re-introduced if needed. - storePHILinearizationInfoDest(DestReg, *DefInstr); - // We have picked up all the information we need now and can remove - // the PHI - PHIInfo.removeSource(DestReg, SourceReg, CodeBB); - DefInstr->eraseFromParent(); - } else { - // If this is not a phi-def, or it is a phi-def but from a linearized region - if (IsSingleBB && DefInstr->getParent() == InnerRegion->getEntry()) { - // If this is a single BB and the definition is in this block we - // need to replace any uses outside the region. 
- InnerRegion->replaceRegisterOutsideRegion(SourceReg, DestReg, false, MRI); - } - const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg); - Register NextDestReg = MRI->createVirtualRegister(RegClass); - bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1; - LLVM_DEBUG(dbgs() << "Insert Chained PHI\n"); - insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg, - SourceReg, IsLastDef); - - PHIInfo.removeSource(DestReg, SourceReg, CodeBB); - if (IsLastDef) { - const DebugLoc &DL = IfBB->findDebugLoc(IfBB->getFirstTerminator()); - TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DL, - NextDestReg, 0); - PHIInfo.deleteDef(DestReg); - } else { - PHIInfo.replaceDef(DestReg, NextDestReg); - } - } -} - -bool AMDGPUMachineCFGStructurizer::containsDef(MachineBasicBlock *MBB, - LinearizedRegion *InnerRegion, - unsigned Register) { - return getDefInstr(Register)->getParent() == MBB || - InnerRegion->contains(getDefInstr(Register)->getParent()); -} - -void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB, - MachineBasicBlock *CodeBB, - MachineBasicBlock *MergeBB, - LinearizedRegion *InnerRegion, - LinearizedRegion *LRegion) { - DenseSet *LiveOuts = InnerRegion->getLiveOuts(); - SmallVector OldLiveOuts; - bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit(); - for (auto OLI : *LiveOuts) { - OldLiveOuts.push_back(OLI); - } - - for (auto LI : OldLiveOuts) { - LLVM_DEBUG(dbgs() << "LiveOut: " << printReg(LI, TRI)); - if (!containsDef(CodeBB, InnerRegion, LI) || - (!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) { - // If the register simply lives through the CodeBB, we don't have - // to rewrite anything since the register is not defined in this - // part of the code. 
- LLVM_DEBUG(dbgs() << "- through"); - continue; - } - LLVM_DEBUG(dbgs() << "\n"); - unsigned Reg = LI; - if (/*!PHIInfo.isSource(Reg) &&*/ Reg != InnerRegion->getBBSelectRegOut()) { - // If the register is live out, we do want to create a phi, - // unless it is from the Exit block, because in that case there - // is already a PHI, and no need to create a new one. - - // If the register is just a live out def and not part of a phi - // chain, we need to create a PHI node to handle the if region, - // and replace all uses outside of the region with the new dest - // register, unless it is the outgoing BB select register. We have - // already created phi nodes for these. - const TargetRegisterClass *RegClass = MRI->getRegClass(Reg); - Register PHIDestReg = MRI->createVirtualRegister(RegClass); - Register IfSourceReg = MRI->createVirtualRegister(RegClass); - // Create initializer, this value is never used, but is needed - // to satisfy SSA. - LLVM_DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n"); - TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(), - IfSourceReg, 0); - - InnerRegion->replaceRegisterOutsideRegion(Reg, PHIDestReg, true, MRI); - LLVM_DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n"); - insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, PHIDestReg, - IfSourceReg, Reg, true); - } - } - - // Handle the chained definitions in PHIInfo, checking if this basic block - // is a source block for a definition. 
- SmallVector Sources; - if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) { - LLVM_DEBUG(dbgs() << "Inserting PHI Live Out from " - << printMBBReference(*CodeBB) << "\n"); - for (auto SI : Sources) { - unsigned DestReg; - PHIInfo.findDest(SI, CodeBB, DestReg); - insertChainedPHI(IfBB, CodeBB, MergeBB, InnerRegion, DestReg, SI); - } - LLVM_DEBUG(dbgs() << "Insertion done.\n"); - } - - LLVM_DEBUG(PHIInfo.dump(MRI)); -} - -void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) { - LLVM_DEBUG(dbgs() << "Before PHI Prune\n"); - LLVM_DEBUG(PHIInfo.dump(MRI)); - SmallVector, 4> - ElimiatedSources; - for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; - ++DRI) { - - unsigned DestReg = *DRI; - auto SE = PHIInfo.sources_end(DestReg); - - bool MBBContainsPHISource = false; - // Check if there is a PHI source in this MBB - for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) { - unsigned SourceReg = (*SRI).first; - MachineOperand *Def = &(*(MRI->def_begin(SourceReg))); - if (Def->getParent()->getParent() == MBB) { - MBBContainsPHISource = true; - } - } - - // If so, all other sources are useless since we know this block - // is always executed when the region is executed. 
- if (MBBContainsPHISource) { - for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) { - PHILinearize::PHISourceT Source = *SRI; - unsigned SourceReg = Source.first; - MachineBasicBlock *SourceMBB = Source.second; - MachineOperand *Def = &(*(MRI->def_begin(SourceReg))); - if (Def->getParent()->getParent() != MBB) { - ElimiatedSources.push_back(std::tuple(DestReg, SourceReg, SourceMBB)); - } - } - } - } - - // Remove the PHI sources that are in the given MBB - for (auto &SourceInfo : ElimiatedSources) { - PHIInfo.removeSource(std::get<0>(SourceInfo), std::get<1>(SourceInfo), - std::get<2>(SourceInfo)); - } - LLVM_DEBUG(dbgs() << "After PHI Prune\n"); - LLVM_DEBUG(PHIInfo.dump(MRI)); -} - -void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegion, - unsigned DestReg) { - MachineBasicBlock *Entry = CurrentRegion->getEntry(); - MachineBasicBlock *Exit = CurrentRegion->getExit(); - - LLVM_DEBUG(dbgs() << "RegionExit: " << Exit->getNumber() << " Pred: " - << (*(Entry->pred_begin()))->getNumber() << "\n"); - - int NumSources = 0; - auto SE = PHIInfo.sources_end(DestReg); - - for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) { - NumSources++; - } - - if (NumSources == 1) { - auto SRI = PHIInfo.sources_begin(DestReg); - unsigned SourceReg = (*SRI).first; - replaceRegisterWith(DestReg, SourceReg); - } else { - const DebugLoc &DL = Entry->findDebugLoc(Entry->begin()); - MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL, - TII->get(TargetOpcode::PHI), DestReg); - LLVM_DEBUG(dbgs() << "Entry PHI " << printReg(DestReg, TRI) << " = PHI("); - - unsigned CurrentBackedgeReg = 0; - - for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) { - unsigned SourceReg = (*SRI).first; - - if (CurrentRegion->contains((*SRI).second)) { - if (CurrentBackedgeReg == 0) { - CurrentBackedgeReg = SourceReg; - } else { - MachineInstr *PHIDefInstr = getDefInstr(SourceReg); - MachineBasicBlock *PHIDefMBB = 
PHIDefInstr->getParent(); - const TargetRegisterClass *RegClass = - MRI->getRegClass(CurrentBackedgeReg); - Register NewBackedgeReg = MRI->createVirtualRegister(RegClass); - MachineInstrBuilder BackedgePHI = - BuildMI(*PHIDefMBB, PHIDefMBB->instr_begin(), DL, - TII->get(TargetOpcode::PHI), NewBackedgeReg); - BackedgePHI.addReg(CurrentBackedgeReg); - BackedgePHI.addMBB(getPHIPred(*PHIDefInstr, 0)); - BackedgePHI.addReg(getPHISourceReg(*PHIDefInstr, 1)); - BackedgePHI.addMBB((*SRI).second); - CurrentBackedgeReg = NewBackedgeReg; - LLVM_DEBUG(dbgs() - << "Inserting backedge PHI: " - << printReg(NewBackedgeReg, TRI) << " = PHI(" - << printReg(CurrentBackedgeReg, TRI) << ", " - << printMBBReference(*getPHIPred(*PHIDefInstr, 0)) << ", " - << printReg(getPHISourceReg(*PHIDefInstr, 1), TRI) << ", " - << printMBBReference(*(*SRI).second)); - } - } else { - MIB.addReg(SourceReg); - MIB.addMBB((*SRI).second); - LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " - << printMBBReference(*(*SRI).second) << ", "); - } - } - - // Add the final backedge register source to the entry phi - if (CurrentBackedgeReg != 0) { - MIB.addReg(CurrentBackedgeReg); - MIB.addMBB(Exit); - LLVM_DEBUG(dbgs() << printReg(CurrentBackedgeReg, TRI) << ", " - << printMBBReference(*Exit) << ")\n"); - } else { - LLVM_DEBUG(dbgs() << ")\n"); - } - } -} - -void AMDGPUMachineCFGStructurizer::createEntryPHIs(LinearizedRegion *CurrentRegion) { - LLVM_DEBUG(PHIInfo.dump(MRI)); - - for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; - ++DRI) { - - unsigned DestReg = *DRI; - createEntryPHI(CurrentRegion, DestReg); - } - PHIInfo.clear(); -} - -void AMDGPUMachineCFGStructurizer::replaceRegisterWith( - unsigned Register, class Register NewRegister) { - assert(Register != NewRegister && "Cannot replace a reg with itself"); - - for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register), - E = MRI->reg_end(); - I != E;) { - MachineOperand &O = *I; - ++I; - if (NewRegister.isPhysical()) 
{ - LLVM_DEBUG(dbgs() << "Trying to substitute physical register: " - << printReg(NewRegister, MRI->getTargetRegisterInfo()) - << "\n"); - llvm_unreachable("Cannot substitute physical registers"); - // We don't handle physical registers, but if we need to - // in the future This is how we do it: - // O.substPhysReg(NewRegister, *TRI); - } else { - LLVM_DEBUG(dbgs() << "Replacing register: " - << printReg(Register, MRI->getTargetRegisterInfo()) - << " with " - << printReg(NewRegister, MRI->getTargetRegisterInfo()) - << "\n"); - O.setReg(NewRegister); - } - } - PHIInfo.deleteDef(Register); - - getRegionMRT()->replaceLiveOutReg(Register, NewRegister); - - LLVM_DEBUG(PHIInfo.dump(MRI)); -} - -void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEntry) { - LLVM_DEBUG(dbgs() << "Resolve PHI Infos\n"); - LLVM_DEBUG(PHIInfo.dump(MRI)); - for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; - ++DRI) { - unsigned DestReg = *DRI; - LLVM_DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) << "\n"); - auto SRI = PHIInfo.sources_begin(DestReg); - unsigned SourceReg = (*SRI).first; - LLVM_DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) - << " SourceReg: " << printReg(SourceReg, TRI) << "\n"); - - assert(PHIInfo.sources_end(DestReg) == ++SRI && - "More than one phi source in entry node"); - replaceRegisterWith(DestReg, SourceReg); - } -} - -static bool isFunctionEntryBlock(MachineBasicBlock *MBB) { - return ((&(*(MBB->getParent()->begin()))) == MBB); -} - -MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( - MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBB, - LinearizedRegion *CurrentRegion, unsigned BBSelectRegIn, - unsigned BBSelectRegOut) { - if (isFunctionEntryBlock(CodeBB) && !CurrentRegion->getHasLoop()) { - // Handle non-loop function entry block. 
- // We need to allow loops to the entry block and then - rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut); - resolvePHIInfos(CodeBB); - removeExternalCFGSuccessors(CodeBB); - CodeBB->addSuccessor(MergeBB); - CurrentRegion->addMBB(CodeBB); - return nullptr; - } - if (CurrentRegion->getEntry() == CodeBB && !CurrentRegion->getHasLoop()) { - // Handle non-loop region entry block. - MachineFunction *MF = MergeBB->getParent(); - auto MergeIter = MergeBB->getIterator(); - auto CodeBBStartIter = CodeBB->getIterator(); - auto CodeBBEndIter = ++(CodeBB->getIterator()); - if (CodeBBEndIter != MergeIter) { - MF->splice(MergeIter, CodeBBStartIter, CodeBBEndIter); - } - rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut); - prunePHIInfo(CodeBB); - createEntryPHIs(CurrentRegion); - removeExternalCFGSuccessors(CodeBB); - CodeBB->addSuccessor(MergeBB); - CurrentRegion->addMBB(CodeBB); - return nullptr; - } - // Handle internal block. - const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn); - Register CodeBBSelectReg = MRI->createVirtualRegister(RegClass); - rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg); - bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB; - MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB, - BBSelectRegIn, IsRegionEntryBB); - CurrentRegion->addMBB(IfBB); - // If this is the entry block we need to make the If block the new - // linearized region entry. 
- if (IsRegionEntryBB) { - CurrentRegion->setEntry(IfBB); - - if (CurrentRegion->getHasLoop()) { - MachineBasicBlock *RegionExit = CurrentRegion->getExit(); - MachineBasicBlock *ETrueBB = nullptr; - MachineBasicBlock *EFalseBB = nullptr; - SmallVector ECond; - - const DebugLoc &DL = DebugLoc(); - TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond); - TII->removeBranch(*RegionExit); - - // We need to create a backedge if there is a loop - Register Reg = - TII->insertNE(RegionExit, RegionExit->instr_end(), DL, - CurrentRegion->getRegionMRT()->getInnerOutputRegister(), - CurrentRegion->getRegionMRT()->getEntry()->getNumber()); - MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true); - ArrayRef Cond(RegOp); - LLVM_DEBUG(dbgs() << "RegionExitReg: "); - LLVM_DEBUG(RegOp.print(dbgs(), TRI)); - LLVM_DEBUG(dbgs() << "\n"); - TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit, - Cond, DebugLoc()); - RegionExit->addSuccessor(CurrentRegion->getEntry()); - } - } - CurrentRegion->addMBB(CodeBB); - LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo); - - InnerRegion.setParent(CurrentRegion); - LLVM_DEBUG(dbgs() << "Insert BB Select PHI (BB)\n"); - insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn, - CodeBBSelectReg); - InnerRegion.addMBB(MergeBB); - - LLVM_DEBUG(InnerRegion.print(dbgs(), TRI)); - rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion); - extractKilledPHIs(CodeBB); - if (IsRegionEntryBB) - createEntryPHIs(CurrentRegion); - return IfBB; -} - -MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( - MachineBasicBlock *MergeBB, LinearizedRegion *InnerRegion, - LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB, - unsigned BBSelectRegIn, unsigned BBSelectRegOut) { - unsigned CodeBBSelectReg = - InnerRegion->getRegionMRT()->getInnerOutputRegister(); - MachineBasicBlock *CodeEntryBB = InnerRegion->getEntry(); - MachineBasicBlock *CodeExitBB = InnerRegion->getExit(); - 
MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeEntryBB, CodeExitBB, - SelectBB, BBSelectRegIn, true); - CurrentRegion->addMBB(IfBB); - bool isEntry = CurrentRegion->getEntry() == InnerRegion->getEntry(); - if (isEntry) { - - if (CurrentRegion->getHasLoop()) { - MachineBasicBlock *RegionExit = CurrentRegion->getExit(); - MachineBasicBlock *ETrueBB = nullptr; - MachineBasicBlock *EFalseBB = nullptr; - SmallVector ECond; - - const DebugLoc &DL = DebugLoc(); - TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond); - TII->removeBranch(*RegionExit); - - // We need to create a backedge if there is a loop - Register Reg = - TII->insertNE(RegionExit, RegionExit->instr_end(), DL, - CurrentRegion->getRegionMRT()->getInnerOutputRegister(), - CurrentRegion->getRegionMRT()->getEntry()->getNumber()); - MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true); - ArrayRef Cond(RegOp); - LLVM_DEBUG(dbgs() << "RegionExitReg: "); - LLVM_DEBUG(Cond[0].print(dbgs(), TRI)); - LLVM_DEBUG(dbgs() << "\n"); - TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit, - Cond, DebugLoc()); - RegionExit->addSuccessor(IfBB); - } - } - CurrentRegion->addMBBs(InnerRegion); - LLVM_DEBUG(dbgs() << "Insert BB Select PHI (region)\n"); - insertMergePHI(IfBB, CodeExitBB, MergeBB, BBSelectRegOut, BBSelectRegIn, - CodeBBSelectReg); - - rewriteLiveOutRegs(IfBB, /* CodeEntryBB */ CodeExitBB, MergeBB, InnerRegion, - CurrentRegion); - - rewriteRegionEntryPHIs(InnerRegion, IfBB); - - if (isEntry) { - CurrentRegion->setEntry(IfBB); - } - - if (isEntry) { - createEntryPHIs(CurrentRegion); - } - - return IfBB; -} - -void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI, - MachineBasicBlock *Entry, - MachineBasicBlock *EntrySucc, - LinearizedRegion *LRegion) { - SmallVector PHIRegionIndices; - getPHIRegionIndices(LRegion, PHI, PHIRegionIndices); - - assert(PHIRegionIndices.size() == 1); - - unsigned RegionIndex = PHIRegionIndices[0]; - unsigned RegionSourceReg = 
getPHISourceReg(PHI, RegionIndex); - MachineBasicBlock *RegionSourceMBB = getPHIPred(PHI, RegionIndex); - unsigned PHIDest = getPHIDestReg(PHI); - unsigned PHISource = PHIDest; - unsigned ReplaceReg; - - if (shrinkPHI(PHI, PHIRegionIndices, &ReplaceReg)) { - PHISource = ReplaceReg; - } - - const TargetRegisterClass *RegClass = MRI->getRegClass(PHIDest); - Register NewDestReg = MRI->createVirtualRegister(RegClass); - LRegion->replaceRegisterInsideRegion(PHIDest, NewDestReg, false, MRI); - MachineInstrBuilder MIB = - BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(), - TII->get(TargetOpcode::PHI), NewDestReg); - LLVM_DEBUG(dbgs() << "Split Entry PHI " << printReg(NewDestReg, TRI) - << " = PHI("); - MIB.addReg(PHISource); - MIB.addMBB(Entry); - LLVM_DEBUG(dbgs() << printReg(PHISource, TRI) << ", " - << printMBBReference(*Entry)); - MIB.addReg(RegionSourceReg); - MIB.addMBB(RegionSourceMBB); - LLVM_DEBUG(dbgs() << " ," << printReg(RegionSourceReg, TRI) << ", " - << printMBBReference(*RegionSourceMBB) << ")\n"); -} - -void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry, - MachineBasicBlock *EntrySucc, - LinearizedRegion *LRegion) { - SmallVector PHIs; - collectPHIs(Entry, PHIs); - - for (auto *PHII : PHIs) { - splitLoopPHI(*PHII, Entry, EntrySucc, LRegion); - } -} - -// Split the exit block so that we can insert a end control flow -MachineBasicBlock * -AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) { - auto MRTRegion = LRegion->getRegionMRT(); - auto Exit = LRegion->getExit(); - auto MF = Exit->getParent(); - auto Succ = MRTRegion->getSucc(); - - auto NewExit = MF->CreateMachineBasicBlock(); - auto AfterExitIter = Exit->getIterator(); - AfterExitIter++; - MF->insert(AfterExitIter, NewExit); - Exit->removeSuccessor(Succ); - Exit->addSuccessor(NewExit); - NewExit->addSuccessor(Succ); - insertUnconditionalBranch(NewExit, Succ); - LRegion->addMBB(NewExit); - LRegion->setExit(NewExit); - - LLVM_DEBUG(dbgs() << "Created 
new exit block: " << NewExit->getNumber() - << "\n"); - - // Replace any PHI Predecessors in the successor with NewExit - for (auto &II : *Succ) { - MachineInstr &Instr = II; - - // If we are past the PHI instructions we are done - if (!Instr.isPHI()) - break; - - int numPreds = getPHINumInputs(Instr); - for (int i = 0; i < numPreds; ++i) { - auto Pred = getPHIPred(Instr, i); - if (Pred == Exit) { - setPhiPred(Instr, i, NewExit); - } - } - } - - return NewExit; -} - -static MachineBasicBlock *split(MachineBasicBlock::iterator I) { - // Create the fall-through block. - MachineBasicBlock *MBB = (*I).getParent(); - MachineFunction *MF = MBB->getParent(); - MachineBasicBlock *SuccMBB = MF->CreateMachineBasicBlock(); - auto MBBIter = ++(MBB->getIterator()); - MF->insert(MBBIter, SuccMBB); - SuccMBB->transferSuccessorsAndUpdatePHIs(MBB); - MBB->addSuccessor(SuccMBB); - - // Splice the code over. - SuccMBB->splice(SuccMBB->end(), MBB, I, MBB->end()); - - return SuccMBB; -} - -// Split the entry block separating PHI-nodes and the rest of the code -// This is needed to insert an initializer for the bb select register -// inloop regions. 
- -MachineBasicBlock * -AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) { - MachineBasicBlock *Entry = LRegion->getEntry(); - MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI()); - MachineBasicBlock *Exit = LRegion->getExit(); - - LLVM_DEBUG(dbgs() << "Split " << printMBBReference(*Entry) << " to " - << printMBBReference(*Entry) << " -> " - << printMBBReference(*EntrySucc) << "\n"); - LRegion->addMBB(EntrySucc); - - // Make the backedge go to Entry Succ - if (Exit->isSuccessor(Entry)) { - Exit->removeSuccessor(Entry); - } - Exit->addSuccessor(EntrySucc); - MachineInstr &Branch = *(Exit->instr_rbegin()); - for (auto &UI : Branch.uses()) { - if (UI.isMBB() && UI.getMBB() == Entry) { - UI.setMBB(EntrySucc); - } - } - - splitLoopPHIs(Entry, EntrySucc, LRegion); - - return EntrySucc; -} - -LinearizedRegion * -AMDGPUMachineCFGStructurizer::initLinearizedRegion(RegionMRT *Region) { - LinearizedRegion *LRegion = Region->getLinearizedRegion(); - LRegion->initLiveOut(Region, MRI, TRI, PHIInfo); - LRegion->setEntry(Region->getEntry()); - return LRegion; -} - -static void removeOldExitPreds(RegionMRT *Region) { - MachineBasicBlock *Exit = Region->getSucc(); - if (Exit == nullptr) { - return; - } - for (MachineBasicBlock::pred_iterator PI = Exit->pred_begin(), - E = Exit->pred_end(); - PI != E; ++PI) { - if (Region->contains(*PI)) { - (*PI)->removeSuccessor(Exit); - } - } -} - -static bool mbbHasBackEdge(MachineBasicBlock *MBB, - SmallPtrSet &MBBs) { - for (MachineBasicBlock *Succ : MBB->successors()) - if (MBBs.contains(Succ)) - return true; - return false; -} - -static bool containsNewBackedge(MRT *Tree, - SmallPtrSet &MBBs) { - // Need to traverse this in reverse since it is in post order. 
- if (Tree == nullptr) - return false; - - if (Tree->isMBB()) { - MachineBasicBlock *MBB = Tree->getMBBMRT()->getMBB(); - MBBs.insert(MBB); - if (mbbHasBackEdge(MBB, MBBs)) { - return true; - } - } else { - RegionMRT *Region = Tree->getRegionMRT(); - for (MRT *C : llvm::reverse(*Region->getChildren())) - if (containsNewBackedge(C, MBBs)) - return true; - } - return false; -} - -static bool containsNewBackedge(RegionMRT *Region) { - SmallPtrSet MBBs; - return containsNewBackedge(Region, MBBs); -} - -bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { - auto *LRegion = initLinearizedRegion(Region); - LRegion->setHasLoop(containsNewBackedge(Region)); - MachineBasicBlock *LastMerge = createLinearizedExitBlock(Region); - MachineBasicBlock *CurrentMerge = LastMerge; - LRegion->addMBB(LastMerge); - LRegion->setExit(LastMerge); - - rewriteRegionExitPHIs(Region, LastMerge, LRegion); - removeOldExitPreds(Region); - - LLVM_DEBUG(PHIInfo.dump(MRI)); - - SetVector *Children = Region->getChildren(); - LLVM_DEBUG(dbgs() << "===========If Region Start===============\n"); - if (LRegion->getHasLoop()) { - LLVM_DEBUG(dbgs() << "Has Backedge: Yes\n"); - } else { - LLVM_DEBUG(dbgs() << "Has Backedge: No\n"); - } - - unsigned BBSelectRegIn; - unsigned BBSelectRegOut; - for (MRT *Child : *Children) { - LLVM_DEBUG(dbgs() << "CurrentRegion: \n"); - LLVM_DEBUG(LRegion->print(dbgs(), TRI)); - - if (Child->isRegion()) { - - LinearizedRegion *InnerLRegion = - Child->getRegionMRT()->getLinearizedRegion(); - // We found the block is the exit of an inner region, we need - // to put it in the current linearized region. - - LLVM_DEBUG(dbgs() << "Linearizing region: "); - LLVM_DEBUG(InnerLRegion->print(dbgs(), TRI)); - LLVM_DEBUG(dbgs() << "\n"); - - MachineBasicBlock *InnerEntry = InnerLRegion->getEntry(); - if ((&(*(InnerEntry->getParent()->begin()))) == InnerEntry) { - // Entry has already been linearized, no need to do this region. 
- unsigned OuterSelect = InnerLRegion->getBBSelectRegOut(); - unsigned InnerSelectReg = - InnerLRegion->getRegionMRT()->getInnerOutputRegister(); - replaceRegisterWith(InnerSelectReg, OuterSelect), - resolvePHIInfos(InnerEntry); - if (!InnerLRegion->getExit()->isSuccessor(CurrentMerge)) - InnerLRegion->getExit()->addSuccessor(CurrentMerge); - continue; - } - - BBSelectRegOut = Child->getBBSelectRegOut(); - BBSelectRegIn = Child->getBBSelectRegIn(); - - LLVM_DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI) - << "\n"); - LLVM_DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI) - << "\n"); - - MachineBasicBlock *IfEnd = CurrentMerge; - CurrentMerge = createIfRegion(CurrentMerge, InnerLRegion, LRegion, - Child->getRegionMRT()->getEntry(), - BBSelectRegIn, BBSelectRegOut); - TII->convertNonUniformIfRegion(CurrentMerge, IfEnd); - } else { - MachineBasicBlock *MBB = Child->getMBBMRT()->getMBB(); - LLVM_DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n"); - - if (MBB == getSingleExitNode(*(MBB->getParent()))) { - // If this is the exit block then we need to skip to the next. - // The "in" register will be transferred to "out" in the next - // iteration. - continue; - } - - BBSelectRegOut = Child->getBBSelectRegOut(); - BBSelectRegIn = Child->getBBSelectRegIn(); - - LLVM_DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI) - << "\n"); - LLVM_DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI) - << "\n"); - - MachineBasicBlock *IfEnd = CurrentMerge; - // This is a basic block that is not part of an inner region, we - // need to put it in the current linearized region. 
- CurrentMerge = createIfRegion(CurrentMerge, MBB, LRegion, BBSelectRegIn, - BBSelectRegOut); - if (CurrentMerge) { - TII->convertNonUniformIfRegion(CurrentMerge, IfEnd); - } - - LLVM_DEBUG(PHIInfo.dump(MRI)); - } - } - - LRegion->removeFalseRegisterKills(MRI); - - if (LRegion->getHasLoop()) { - MachineBasicBlock *NewSucc = splitEntry(LRegion); - if (isFunctionEntryBlock(LRegion->getEntry())) { - resolvePHIInfos(LRegion->getEntry()); - } - const DebugLoc &DL = NewSucc->findDebugLoc(NewSucc->getFirstNonPHI()); - unsigned InReg = LRegion->getBBSelectRegIn(); - Register InnerSelectReg = - MRI->createVirtualRegister(MRI->getRegClass(InReg)); - Register NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg)); - TII->materializeImmediate(*(LRegion->getEntry()), - LRegion->getEntry()->getFirstTerminator(), DL, - NewInReg, Region->getEntry()->getNumber()); - // Need to be careful about updating the registers inside the region. - LRegion->replaceRegisterInsideRegion(InReg, InnerSelectReg, false, MRI); - LLVM_DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n"); - insertMergePHI(LRegion->getEntry(), LRegion->getExit(), NewSucc, - InnerSelectReg, NewInReg, - LRegion->getRegionMRT()->getInnerOutputRegister()); - splitExit(LRegion); - TII->convertNonUniformLoopRegion(NewSucc, LastMerge); - } - - if (Region->isRoot()) { - TII->insertReturn(*LastMerge); - } - - LLVM_DEBUG(Region->getEntry()->getParent()->dump()); - LLVM_DEBUG(LRegion->print(dbgs(), TRI)); - LLVM_DEBUG(PHIInfo.dump(MRI)); - - LLVM_DEBUG(dbgs() << "===========If Region End===============\n"); - - Region->setLinearizedRegion(LRegion); - return true; -} - -bool AMDGPUMachineCFGStructurizer::structurizeRegion(RegionMRT *Region) { - if (false && regionIsSimpleIf(Region)) { - transformSimpleIfRegion(Region); - return true; - } - if (regionIsSequence(Region)) - fixupRegionExits(Region); - else - structurizeComplexRegion(Region); - return false; -} - -static int structurize_once = 0; - -bool 
AMDGPUMachineCFGStructurizer::structurizeRegions(RegionMRT *Region, - bool isTopRegion) { - bool Changed = false; - - auto Children = Region->getChildren(); - for (auto *CI : *Children) { - if (CI->isRegion()) { - Changed |= structurizeRegions(CI->getRegionMRT(), false); - } - } - - if (structurize_once < 2 || true) { - Changed |= structurizeRegion(Region); - structurize_once++; - } - return Changed; -} - -void AMDGPUMachineCFGStructurizer::initFallthroughMap(MachineFunction &MF) { - LLVM_DEBUG(dbgs() << "Fallthrough Map:\n"); - for (auto &MBBI : MF) { - MachineBasicBlock *MBB = MBBI.getFallThrough(); - if (MBB != nullptr) { - LLVM_DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> " - << MBB->getNumber() << "\n"); - } - FallthroughMap[&MBBI] = MBB; - } -} - -void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region, - unsigned SelectOut) { - LinearizedRegion *LRegion = new LinearizedRegion(); - if (SelectOut) { - LRegion->addLiveOut(SelectOut); - LLVM_DEBUG(dbgs() << "Add LiveOut (BBSelect): " << printReg(SelectOut, TRI) - << "\n"); - } - LRegion->setRegionMRT(Region); - Region->setLinearizedRegion(LRegion); - LRegion->setParent(Region->getParent() - ? 
Region->getParent()->getLinearizedRegion() - : nullptr); -} - -unsigned -AMDGPUMachineCFGStructurizer::initializeSelectRegisters(MRT *MRT, unsigned SelectOut, - MachineRegisterInfo *MRI, - const SIInstrInfo *TII) { - if (MRT->isRegion()) { - RegionMRT *Region = MRT->getRegionMRT(); - Region->setBBSelectRegOut(SelectOut); - unsigned InnerSelectOut = createBBSelectReg(TII, MRI); - - // Fixme: Move linearization creation to the original spot - createLinearizedRegion(Region, SelectOut); - - for (auto *CI : *Region->getChildren()) - InnerSelectOut = initializeSelectRegisters(CI, InnerSelectOut, MRI, TII); - MRT->setBBSelectRegIn(InnerSelectOut); - return InnerSelectOut; - } - MRT->setBBSelectRegOut(SelectOut); - unsigned NewSelectIn = createBBSelectReg(TII, MRI); - MRT->setBBSelectRegIn(NewSelectIn); - return NewSelectIn; -} - -static void checkRegOnlyPHIInputs(MachineFunction &MF) { - for (auto &MBBI : MF) { - for (MachineInstr &Instr : MBBI.instrs()) { - if (Instr.isPHI()) { - int numPreds = getPHINumInputs(Instr); - for (int i = 0; i < numPreds; ++i) { - assert(Instr.getOperand(i * 2 + 1).isReg() && - "PHI Operand not a register"); - } - } - } - } -} - -bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget(); - const SIInstrInfo *TII = ST.getInstrInfo(); - TRI = ST.getRegisterInfo(); - MRI = &(MF.getRegInfo()); - initFallthroughMap(MF); - - checkRegOnlyPHIInputs(MF); - LLVM_DEBUG(dbgs() << "----STRUCTURIZER START----\n"); - LLVM_DEBUG(MF.dump()); - - Regions = &(getAnalysis().getRegionInfo()); - LLVM_DEBUG(Regions->dump()); - - RegionMRT *RTree = MRT::buildMRT(MF, Regions, TII, MRI); - setRegionMRT(RTree); - initializeSelectRegisters(RTree, 0, MRI, TII); - LLVM_DEBUG(RTree->dump(TRI)); - bool result = structurizeRegions(RTree, true); - delete RTree; - LLVM_DEBUG(dbgs() << "----STRUCTURIZER END----\n"); - initFallthroughMap(MF); - return result; -} - -char AMDGPUMachineCFGStructurizerID = 
AMDGPUMachineCFGStructurizer::ID; - -INITIALIZE_PASS_BEGIN(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer", - "AMDGPU Machine CFG Structurizer", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineRegionInfoPass) -INITIALIZE_PASS_END(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer", - "AMDGPU Machine CFG Structurizer", false, false) - -FunctionPass *llvm::createAMDGPUMachineCFGStructurizerPass() { - return new AMDGPUMachineCFGStructurizer(); -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 2b9e431e86f893..5774045c0d36a6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -263,13 +263,6 @@ static cl::opt EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, cl::desc("Enable AMDGPU Alias Analysis"), cl::init(true)); -// Option to run late CFG structurizer -static cl::opt LateCFGStructurize( - "amdgpu-late-structurize", - cl::desc("Enable late CFG structurization"), - cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), - cl::Hidden); - // Disable structurizer-based control-flow lowering in order to test convergence // control tokens. This should eventually be replaced by the wave-transform. 
static cl::opt DisableStructurizer( @@ -627,7 +620,6 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, } } -bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; bool AMDGPUTargetMachine::DisableStructurizer = false; @@ -1009,7 +1001,6 @@ class GCNPassConfig final : public AMDGPUPassConfig { bool addRegAssignAndRewriteFast() override; bool addRegAssignAndRewriteOptimized() override; - void addPreRegAlloc() override; bool addPreRewrite() override; void addPostRegAlloc() override; void addPreSched2() override; @@ -1248,7 +1239,7 @@ bool GCNPassConfig::addPreISel() { // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. addPass(&AMDGPUUnifyDivergentExitNodesID); - if (!LateCFGStructurize && !DisableStructurizer) { + if (!DisableStructurizer) { if (EnableStructurizerWorkarounds) { addPass(createFixIrreduciblePass()); addPass(createUnifyLoopExitsPass()); @@ -1256,7 +1247,7 @@ bool GCNPassConfig::addPreISel() { addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions } addPass(createAMDGPUAnnotateUniformValuesLegacy()); - if (!LateCFGStructurize && !DisableStructurizer) { + if (!DisableStructurizer) { addPass(createSIAnnotateControlFlowLegacyPass()); // TODO: Move this right after structurizeCFG to avoid extra divergence // analysis. This depends on stopping SIAnnotateControlFlow from making @@ -1347,12 +1338,6 @@ bool GCNPassConfig::addGlobalInstructionSelect() { return false; } -void GCNPassConfig::addPreRegAlloc() { - if (LateCFGStructurize) { - addPass(createAMDGPUMachineCFGStructurizerPass()); - } -} - void GCNPassConfig::addFastRegAlloc() { // FIXME: We have to disable the verifier here because of PHIElimination + // TwoAddressInstructions disabling it. 
@@ -1878,7 +1863,6 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { } void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { - const bool LateCFGStructurize = AMDGPUTargetMachine::EnableLateStructurizeCFG; const bool DisableStructurizer = AMDGPUTargetMachine::DisableStructurizer; const bool EnableStructurizerWorkarounds = AMDGPUTargetMachine::EnableStructurizerWorkarounds; @@ -1896,7 +1880,7 @@ void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { addPass(AMDGPUUnifyDivergentExitNodesPass()); - if (!LateCFGStructurize && !DisableStructurizer) { + if (!DisableStructurizer) { if (EnableStructurizerWorkarounds) { addPass(FixIrreduciblePass()); addPass(UnifyLoopExitsPass()); @@ -1907,7 +1891,7 @@ void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { addPass(AMDGPUAnnotateUniformValuesPass()); - if (!LateCFGStructurize && !DisableStructurizer) { + if (!DisableStructurizer) { addPass(SIAnnotateControlFlowPass(TM)); // TODO: Move this right after structurizeCFG to avoid extra divergence diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 27ff0da5e812f4..c5d079ad7abb62 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -36,7 +36,6 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { StringRef getFeatureString(const Function &F) const; public: - static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; static bool EnableLowerModuleLDS; static bool DisableStructurizer; diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 18a8e917fbb71f..9ed7981b3da5ae 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -74,7 +74,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPULowerKernelArguments.cpp AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp - AMDGPUMachineCFGStructurizer.cpp 
AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp AMDGPUMacroFusion.cpp diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 6dce41d1605fa4..84d25a1fbd2722 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2867,8 +2867,7 @@ SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const { bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const { for (const MachineInstr &MI : MBB->terminators()) { - if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO || - MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE || + if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE || MI.getOpcode() == AMDGPU::SI_LOOP) return true; } @@ -3043,20 +3042,14 @@ bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, return false; } - MachineBasicBlock *CondBB = nullptr; + BranchPredicate Pred = getBranchPredicate(I->getOpcode()); + if (Pred == INVALID_BR) + return true; - if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { - CondBB = I->getOperand(1).getMBB(); - Cond.push_back(I->getOperand(0)); - } else { - BranchPredicate Pred = getBranchPredicate(I->getOpcode()); - if (Pred == INVALID_BR) - return true; + MachineBasicBlock *CondBB = I->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(Pred)); + Cond.push_back(I->getOperand(1)); // Save the branch register. - CondBB = I->getOperand(0).getMBB(); - Cond.push_back(MachineOperand::CreateImm(Pred)); - Cond.push_back(I->getOperand(1)); // Save the branch register. 
- } ++I; if (I == MBB.end()) { @@ -3159,13 +3152,6 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, return 1; } - if(Cond.size() == 1 && Cond[0].isReg()) { - BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) - .add(Cond[0]) - .addMBB(TBB); - return 1; - } - assert(TBB && Cond[0].isImm()); unsigned Opcode @@ -8772,79 +8758,6 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { return false; } -bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { - return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; -} - -void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, - MachineBasicBlock *IfEnd) const { - MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); - assert(TI != IfEntry->end()); - - MachineInstr *Branch = &(*TI); - MachineFunction *MF = IfEntry->getParent(); - MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); - - if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { - Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); - MachineInstr *SIIF = - BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) - .add(Branch->getOperand(0)) - .add(Branch->getOperand(1)); - MachineInstr *SIEND = - BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) - .addReg(DstReg); - - IfEntry->erase(TI); - IfEntry->insert(IfEntry->end(), SIIF); - IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); - } -} - -void SIInstrInfo::convertNonUniformLoopRegion( - MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { - MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); - // We expect 2 terminators, one conditional and one unconditional. 
- assert(TI != LoopEnd->end()); - - MachineInstr *Branch = &(*TI); - MachineFunction *MF = LoopEnd->getParent(); - MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); - - if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { - - Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); - Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); - MachineInstrBuilder HeaderPHIBuilder = - BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); - for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) { - if (PMBB == LoopEnd) { - HeaderPHIBuilder.addReg(BackEdgeReg); - } else { - Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); - materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), - ZeroReg, 0); - HeaderPHIBuilder.addReg(ZeroReg); - } - HeaderPHIBuilder.addMBB(PMBB); - } - MachineInstr *HeaderPhi = HeaderPHIBuilder; - MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), - get(AMDGPU::SI_IF_BREAK), BackEdgeReg) - .addReg(DstReg) - .add(Branch->getOperand(0)); - MachineInstr *SILOOP = - BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) - .addReg(BackEdgeReg) - .addMBB(LoopEntry); - - LoopEntry->insert(LoopEntry->begin(), HeaderPhi); - LoopEnd->erase(TI); - LoopEnd->insert(LoopEnd->end(), SIIFBREAK); - LoopEnd->insert(LoopEnd->end(), SILOOP); - } -} - ArrayRef> SIInstrInfo::getSerializableTargetIndices() const { static const std::pair TargetIndices[] = { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 91855fb14f6f37..badfd91c0b9727 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1307,14 +1307,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool mayAccessFlatAddressSpace(const MachineInstr &MI) const; - bool isNonUniformBranchInstr(MachineInstr &Instr) const; - - void convertNonUniformIfRegion(MachineBasicBlock *IfEntry, - MachineBasicBlock *IfEnd) const; - - void 
convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, - MachineBasicBlock *LoopEnd) const; - std::pair decomposeMachineOperandsTargetFlags(unsigned TF) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index e7831d00a3a4a8..814d3182fb5df8 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -421,15 +421,6 @@ def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask), let isTerminator = 1, isNotDuplicable = 1 in { -let OtherPredicates = [EnableLateCFGStructurize] in { - def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI < - (outs), - (ins SReg_1:$vcc, brtarget:$target), - [(brcond i1:$vcc, bb:$target)]> { - let Size = 12; -} -} - def SI_IF: CFPseudoInstSI < (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target), [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> { From 2b4b909509bc2aa7e7f6b3bc469c214bf42fea49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Fri, 23 Aug 2024 14:14:17 +0200 Subject: [PATCH 309/426] [AMDGPU] Remove unused amdgpu-disable-structurizer flag (#105800) --- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 57 +++++++------------ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 1 - 2 files changed, 21 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 5774045c0d36a6..7a9735790371a1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -263,13 +263,6 @@ static cl::opt EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, cl::desc("Enable AMDGPU Alias Analysis"), cl::init(true)); -// Disable structurizer-based control-flow lowering in order to test convergence -// control tokens. This should eventually be replaced by the wave-transform. 
-static cl::opt DisableStructurizer( - "amdgpu-disable-structurizer", - cl::desc("Disable structurizer for experiments; produces unusable code"), - cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden); - // Enable lib calls simplifications static cl::opt EnableLibCallSimplify( "amdgpu-simplify-libcall", @@ -622,7 +615,6 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, bool AMDGPUTargetMachine::EnableFunctionCalls = false; bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; -bool AMDGPUTargetMachine::DisableStructurizer = false; bool AMDGPUTargetMachine::EnableStructurizerWorkarounds = true; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; @@ -1239,21 +1231,19 @@ bool GCNPassConfig::addPreISel() { // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. addPass(&AMDGPUUnifyDivergentExitNodesID); - if (!DisableStructurizer) { - if (EnableStructurizerWorkarounds) { - addPass(createFixIrreduciblePass()); - addPass(createUnifyLoopExitsPass()); - } - addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions + if (EnableStructurizerWorkarounds) { + addPass(createFixIrreduciblePass()); + addPass(createUnifyLoopExitsPass()); } + addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions + addPass(createAMDGPUAnnotateUniformValuesLegacy()); - if (!DisableStructurizer) { - addPass(createSIAnnotateControlFlowLegacyPass()); - // TODO: Move this right after structurizeCFG to avoid extra divergence - // analysis. This depends on stopping SIAnnotateControlFlow from making - // control flow modifications. - addPass(createAMDGPURewriteUndefForPHILegacyPass()); - } + addPass(createSIAnnotateControlFlowLegacyPass()); + // TODO: Move this right after structurizeCFG to avoid extra divergence + // analysis. This depends on stopping SIAnnotateControlFlow from making + // control flow modifications. 
+ addPass(createAMDGPURewriteUndefForPHILegacyPass()); + addPass(createLCSSAPass()); if (TM->getOptLevel() > CodeGenOptLevel::Less) @@ -1863,7 +1853,6 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { } void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { - const bool DisableStructurizer = AMDGPUTargetMachine::DisableStructurizer; const bool EnableStructurizerWorkarounds = AMDGPUTargetMachine::EnableStructurizerWorkarounds; @@ -1880,25 +1869,21 @@ void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { addPass(AMDGPUUnifyDivergentExitNodesPass()); - if (!DisableStructurizer) { - if (EnableStructurizerWorkarounds) { - addPass(FixIrreduciblePass()); - addPass(UnifyLoopExitsPass()); - } - - addPass(StructurizeCFGPass(/*SkipUniformRegions=*/false)); + if (EnableStructurizerWorkarounds) { + addPass(FixIrreduciblePass()); + addPass(UnifyLoopExitsPass()); } + addPass(StructurizeCFGPass(/*SkipUniformRegions=*/false)); + addPass(AMDGPUAnnotateUniformValuesPass()); - if (!DisableStructurizer) { - addPass(SIAnnotateControlFlowPass(TM)); + addPass(SIAnnotateControlFlowPass(TM)); - // TODO: Move this right after structurizeCFG to avoid extra divergence - // analysis. This depends on stopping SIAnnotateControlFlow from making - // control flow modifications. - addPass(AMDGPURewriteUndefForPHIPass()); - } + // TODO: Move this right after structurizeCFG to avoid extra divergence + // analysis. This depends on stopping SIAnnotateControlFlow from making + // control flow modifications. 
+ addPass(AMDGPURewriteUndefForPHIPass()); addPass(LCSSAPass()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index c5d079ad7abb62..66dfd2f733e3e9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -38,7 +38,6 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { public: static bool EnableFunctionCalls; static bool EnableLowerModuleLDS; - static bool DisableStructurizer; static bool EnableStructurizerWorkarounds; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, From 2f144ac5a13dc39389e1850417f4ac766b1f1ada Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 23 Aug 2024 14:28:52 +0200 Subject: [PATCH 310/426] [clang][bytecode][NFC] Remove containsErrors check from visitInitializer (#105811) --- clang/lib/AST/ByteCode/Compiler.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index f11196d2b02707..0fc942a4f1bc4f 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -3301,9 +3301,6 @@ template bool Compiler::visitInitializer(const Expr *E) { assert(!classify(E->getType())); - if (E->containsErrors()) - return this->emitError(E); - if (!this->checkLiteralType(E)) return false; From 2051a7bcd3f375c063f803df3cfde9e6e6d724ad Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Fri, 23 Aug 2024 14:32:43 +0200 Subject: [PATCH 311/426] [flang][NFC] turn fir.call is_bind_c into enum for procedure flags (#105691) First patch to fix a BIND(C) ABI issue (https://github.com/llvm/llvm-project/issues/102113). I need to keep track of BIND(C) in more locations (fir.dispatch and func.func operations), and I need to fix a few passes that are dropping the attribute on the floor. 
Since I expect more procedure attributes that cannot be reflected in mlir::FunctionType will be needed for ABI, optimizations, or debug info, this NFC patch adds a new enum attribute to keep track of procedure attributes in the IR. This patch is not updating lowering to lower more attributes, this will be done in a separate patch to keep the test changes low here. Adding the attribute on fir.dispatch and func.func will also be done in separate patches. --- .../flang/Optimizer/Dialect/FIRAttr.td | 27 +++++++++++++++++++ .../include/flang/Optimizer/Dialect/FIROps.td | 4 +-- flang/lib/Lower/ConvertCall.cpp | 18 ++++++++----- flang/lib/Lower/ConvertExpr.cpp | 4 +-- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 4 +-- flang/lib/Optimizer/Dialect/FIRAttr.cpp | 3 ++- flang/lib/Optimizer/Dialect/FIROps.cpp | 23 +++++++++++++--- .../ConstantArgumentGlobalisation.cpp | 4 +-- .../Transforms/PolymorphicOpConversion.cpp | 5 +++- .../lib/Optimizer/Transforms/StackArrays.cpp | 8 +++--- flang/test/HLFIR/c_ptr_byvalue.f90 | 4 +-- flang/test/Lower/CUDA/cuda-device-proc.cuf | 8 +++--- flang/test/Lower/HLFIR/assumed-rank-calls.f90 | 2 +- flang/test/Lower/HLFIR/assumed-rank-iface.f90 | 4 +-- .../test/Lower/HLFIR/bindc-value-derived.f90 | 2 +- flang/test/Lower/HLFIR/block_bindc_pocs.f90 | 2 +- .../call-sequence-associated-descriptors.f90 | 8 +++--- .../calls-character-singleton-result.f90 | 2 +- .../Lower/HLFIR/ignore-type-assumed-shape.f90 | 8 +++--- .../OpenMP/threadprivate-default-clause.f90 | 4 +-- flang/test/Lower/block.f90 | 2 +- flang/test/Lower/call-bindc.f90 | 2 +- 22 files changed, 100 insertions(+), 48 deletions(-) diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td index 60281dfa637139..6400756b384482 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td +++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td @@ -58,6 +58,33 @@ def fir_FortranVariableFlagsAttr : fir_Attr<"FortranVariableFlags"> { 
"::fir::FortranVariableFlagsAttr::get($_builder.getContext(), $0)"; } + +/// Fortran procedure attributes (F2023 15.6.2.1). BIND attribute (18.3.7) +/// is also tracked in the same enum. Recursive (resp. Impure) attribute +/// is implied by the absence of opposite NonRecursive (resp. Pure) attribute. +def FIRfuncNoAttributes : I32BitEnumAttrCaseNone<"none">; +def FIRfuncElemental : I32BitEnumAttrCaseBit<"elemental", 0>; +def FIRfuncPure : I32BitEnumAttrCaseBit<"pure", 1>; +def FIRfuncNonRecursive : I32BitEnumAttrCaseBit<"non_recursive", 2>; +def FIRfuncSimple : I32BitEnumAttrCaseBit<"simple", 3>; +def FIRfuncBind_c : I32BitEnumAttrCaseBit<"bind_c", 4>; + +def fir_FortranProcedureFlagsEnum : I32BitEnumAttr< + "FortranProcedureFlagsEnum", + "Fortran procedure attributes", + [FIRfuncNoAttributes, FIRfuncElemental, FIRfuncPure, FIRfuncNonRecursive, + FIRfuncSimple, FIRfuncBind_c]> { + let separator = ", "; + let cppNamespace = "::fir"; + let genSpecializedAttr = 0; + let printBitEnumPrimaryGroups = 1; +} + +def fir_FortranProcedureFlagsAttr : + EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + def fir_BoxFieldAttr : I32EnumAttr< "BoxFieldAttr", "", [ diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index 04f5fedf2783fc..2cd202c3623250 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -2432,9 +2432,9 @@ def fir_CallOp : fir_Op<"call", let arguments = (ins OptionalAttr:$callee, Variadic:$args, + OptionalAttr:$procedure_attrs, DefaultValuedAttr:$fastmath, - UnitAttr:$is_bind_c + "::mlir::arith::FastMathFlags::none">:$fastmath ); let results = (outs Variadic); diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index fd873f55dd844e..f445a21e560bc9 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -372,7 +372,7 @@ std::pair Fortran::lower::genCallOpAndResult( auto 
stackSaveSymbol = bldr->getSymbolRefAttr(stackSaveFn.getName()); mlir::Value sp; fir::CallOp call = bldr->create( - loc, stackSaveFn.getFunctionType().getResults(), stackSaveSymbol, + loc, stackSaveSymbol, stackSaveFn.getFunctionType().getResults(), mlir::ValueRange{}); if (call.getNumResults() != 0) sp = call.getResult(0); @@ -380,9 +380,9 @@ std::pair Fortran::lower::genCallOpAndResult( auto stackRestoreFn = fir::factory::getLlvmStackRestore(*bldr); auto stackRestoreSymbol = bldr->getSymbolRefAttr(stackRestoreFn.getName()); - bldr->create(loc, + bldr->create(loc, stackRestoreSymbol, stackRestoreFn.getFunctionType().getResults(), - stackRestoreSymbol, mlir::ValueRange{sp}); + mlir::ValueRange{sp}); }); } mlir::Value temp = @@ -640,11 +640,15 @@ std::pair Fortran::lower::genCallOpAndResult( if (callNumResults != 0) callResult = dispatch.getResult(0); } else { - // Standard procedure call with fir.call. - auto call = builder.create(loc, funcType.getResults(), - funcSymbolAttr, operands); + // TODO: gather other procedure attributes. + fir::FortranProcedureFlagsEnumAttr procAttrs; if (caller.characterize().IsBindC()) - call.setIsBindC(true); + procAttrs = fir::FortranProcedureFlagsEnumAttr::get( + builder.getContext(), fir::FortranProcedureFlagsEnum::bind_c); + + // Standard procedure call with fir.call. 
+ auto call = builder.create( + loc, funcType.getResults(), funcSymbolAttr, operands, procAttrs); callNumResults = call.getNumResults(); if (callNumResults != 0) diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index 44c3dc88edd32f..7dd317d64436b5 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -6120,7 +6120,7 @@ class ArrayExprLowering { mlir::SymbolRefAttr funcSymAttr = builder.getSymbolRefAttr(memcpyFunc.getName()); mlir::FunctionType funcTy = memcpyFunc.getFunctionType(); - builder.create(loc, funcTy.getResults(), funcSymAttr, args); + builder.create(loc, funcSymAttr, funcTy.getResults(), args); } // Construct code to check for a buffer overrun and realloc the buffer when @@ -6146,7 +6146,7 @@ class ArrayExprLowering { builder.getSymbolRefAttr(reallocFunc.getName()); mlir::FunctionType funcTy = reallocFunc.getFunctionType(); auto newMem = builder.create( - loc, funcTy.getResults(), funcSymAttr, + loc, funcSymAttr, funcTy.getResults(), llvm::ArrayRef{ builder.createConvert(loc, funcTy.getInputs()[0], mem), builder.createConvert(loc, funcTy.getInputs()[1], byteSz)}); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 22439010e7797b..dc0dc47bda9a9d 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -819,8 +819,8 @@ mlir::Value genLibCall(fir::FirOpBuilder &builder, mlir::Location loc, llvm::SmallVector operands{funcPointer}; operands.append(args.begin(), args.end()); - libCall = builder.create(loc, libFuncType.getResults(), - nullptr, operands); + libCall = builder.create(loc, mlir::SymbolRefAttr{}, + libFuncType.getResults(), operands); } LLVM_DEBUG(libCall.dump(); llvm::dbgs() << "\n"); diff --git a/flang/lib/Optimizer/Dialect/FIRAttr.cpp b/flang/lib/Optimizer/Dialect/FIRAttr.cpp index 443e94ae6606f1..4c78e223b41785 100644 --- a/flang/lib/Optimizer/Dialect/FIRAttr.cpp +++ 
b/flang/lib/Optimizer/Dialect/FIRAttr.cpp @@ -296,7 +296,8 @@ void fir::printFirAttribute(FIROpsDialect *dialect, mlir::Attribute attr, //===----------------------------------------------------------------------===// void FIROpsDialect::registerAttributes() { - addAttributes(); diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 9e6b88041ba69d..ce0ae446bb7af8 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -1103,6 +1103,14 @@ void fir::CallOp::print(mlir::OpAsmPrinter &p) { p << getOperand(0); p << '(' << (*this)->getOperands().drop_front(isDirect ? 0 : 1) << ')'; + // Print `proc_attrs<...>`, if present. + fir::FortranProcedureFlagsEnumAttr procAttrs = getProcedureAttrsAttr(); + if (procAttrs && + procAttrs.getValue() != fir::FortranProcedureFlagsEnum::none) { + p << ' ' << fir::FortranProcedureFlagsEnumAttr::getMnemonic(); + p.printStrippedAttrOrType(procAttrs); + } + // Print 'fastmath<...>' (if it has non-default value) before // any other attributes. mlir::arith::FastMathFlagsAttr fmfAttr = getFastmathAttr(); @@ -1111,9 +1119,9 @@ void fir::CallOp::print(mlir::OpAsmPrinter &p) { p.printStrippedAttrOrType(fmfAttr); } - p.printOptionalAttrDict( - (*this)->getAttrs(), - {fir::CallOp::getCalleeAttrNameStr(), getFastmathAttrName()}); + p.printOptionalAttrDict((*this)->getAttrs(), + {fir::CallOp::getCalleeAttrNameStr(), + getFastmathAttrName(), getProcedureAttrsAttrName()}); auto resultTypes{getResultTypes()}; llvm::SmallVector argTypes( llvm::drop_begin(getOperandTypes(), isDirect ? 0 : 1)); @@ -1138,6 +1146,15 @@ mlir::ParseResult fir::CallOp::parse(mlir::OpAsmParser &parser, if (parser.parseOperandList(operands, mlir::OpAsmParser::Delimiter::Paren)) return mlir::failure(); + // Parse `proc_attrs<...>`, if present. 
+ fir::FortranProcedureFlagsEnumAttr procAttr; + if (mlir::succeeded(parser.parseOptionalKeyword( + fir::FortranProcedureFlagsEnumAttr::getMnemonic()))) + if (parser.parseCustomAttributeWithFallback( + procAttr, mlir::Type{}, getProcedureAttrsAttrName(result.name), + attrs)) + return mlir::failure(); + // Parse 'fastmath<...>', if present. mlir::arith::FastMathFlagsAttr fmfAttr; llvm::StringRef fmfAttrName = getFastmathAttrName(result.name); diff --git a/flang/lib/Optimizer/Transforms/ConstantArgumentGlobalisation.cpp b/flang/lib/Optimizer/Transforms/ConstantArgumentGlobalisation.cpp index 1e44288c784c2a..eef6f047fc1bf8 100644 --- a/flang/lib/Optimizer/Transforms/ConstantArgumentGlobalisation.cpp +++ b/flang/lib/Optimizer/Transforms/ConstantArgumentGlobalisation.cpp @@ -126,10 +126,10 @@ class CallOpRewriter : public mlir::OpRewritePattern { newResultTypes.append(callOp.getResultTypes().begin(), callOp.getResultTypes().end()); fir::CallOp newOp = builder.create( - loc, newResultTypes, + loc, callOp.getCallee().has_value() ? callOp.getCallee().value() : mlir::SymbolRefAttr{}, - newOperands); + newResultTypes, newOperands); // Copy all the attributes from the old to new op. newOp->setAttrs(callOp->getAttrs()); rewriter.replaceOp(callOp, newOp); diff --git a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp index 57f19f257b569d..105f275de8b940 100644 --- a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp @@ -205,7 +205,10 @@ struct DispatchOpConv : public OpConversionPattern { // Make the call. llvm::SmallVector args{funcPtr}; args.append(dispatch.getArgs().begin(), dispatch.getArgs().end()); - rewriter.replaceOpWithNewOp(dispatch, resTypes, nullptr, args); + // FIXME: add procedure_attrs to fir.dispatch and propagate to fir.call. 
+ rewriter.replaceOpWithNewOp( + dispatch, resTypes, nullptr, args, + /*procedure_attrs=*/fir::FortranProcedureFlagsEnumAttr{}); return mlir::success(); } diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp index 6bd5724f52043c..a8f1a744cda5fe 100644 --- a/flang/lib/Optimizer/Transforms/StackArrays.cpp +++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp @@ -741,9 +741,9 @@ void AllocMemConversion::insertStackSaveRestore( builder.setInsertionPoint(oldAlloc); mlir::Value sp = builder - .create(oldAlloc.getLoc(), + .create(oldAlloc.getLoc(), stackSaveSym, stackSaveFn.getFunctionType().getResults(), - stackSaveSym, mlir::ValueRange{}) + mlir::ValueRange{}) .getResult(0); mlir::func::FuncOp stackRestoreFn = @@ -753,9 +753,9 @@ void AllocMemConversion::insertStackSaveRestore( auto createStackRestoreCall = [&](mlir::Operation *user) { builder.setInsertionPoint(user); - builder.create(user->getLoc(), + builder.create(user->getLoc(), stackRestoreSym, stackRestoreFn.getFunctionType().getResults(), - stackRestoreSym, mlir::ValueRange{sp}); + mlir::ValueRange{sp}); }; for (mlir::Operation *user : oldAlloc->getUsers()) { diff --git a/flang/test/HLFIR/c_ptr_byvalue.f90 b/flang/test/HLFIR/c_ptr_byvalue.f90 index ea48bdb1a100f0..b2c8da5e22579d 100644 --- a/flang/test/HLFIR/c_ptr_byvalue.f90 +++ b/flang/test/HLFIR/c_ptr_byvalue.f90 @@ -7,7 +7,7 @@ ! CHECK: %[[VAL_113:.*]] = fir.load %[[VAL_112]] : !fir.ref ! CHECK: %[[VAL_114:.*]] = fir.convert %[[VAL_113]] : (i64) -> !fir.ref ! CHECK: hlfir.end_associate %[[VAL_110]]#1, %[[VAL_110]]#2 : !fir.ref>, i1 -! CHECK: fir.call @get_expected_f(%[[VAL_114]]) fastmath {is_bind_c} : (!fir.ref) -> () +! CHECK: fir.call @get_expected_f(%[[VAL_114]]) proc_attrs fastmath : (!fir.ref) -> () subroutine test1 use iso_c_binding interface @@ -28,7 +28,7 @@ end subroutine get_expected_f ! CHECK: %[[VAL_99:.*]] = fir.coordinate_of %[[VAL_97]]#0, %[[VAL_98]] : (!fir.ref>, !fir.field) -> !fir.ref ! 
CHECK: %[[VAL_100:.*]] = fir.load %[[VAL_99]] : !fir.ref ! CHECK: %[[VAL_101:.*]] = fir.convert %[[VAL_100]] : (i64) -> !fir.ref -! CHECK: fir.call @get_expected_f(%[[VAL_101]]) fastmath {is_bind_c} : (!fir.ref) -> () +! CHECK: fir.call @get_expected_f(%[[VAL_101]]) proc_attrs fastmath : (!fir.ref) -> () subroutine test2(cptr) use iso_c_binding interface diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index e259e6ee00f91e..bed0a4574fe94d 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -18,13 +18,13 @@ end ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc} ! CHECK: fir.call @__syncthreads() -! CHECK: fir.call @__syncwarp(%{{.*}}) fastmath {is_bind_c} : (!fir.ref) -> () +! CHECK: fir.call @__syncwarp(%{{.*}}) proc_attrs fastmath : (!fir.ref) -> () ! CHECK: fir.call @__threadfence() ! CHECK: fir.call @__threadfence_block() ! CHECK: fir.call @__threadfence_system() -! CHECK: %{{.*}} = fir.call @__syncthreads_and(%{{.*}}) fastmath {is_bind_c} : (!fir.ref) -> i32 -! CHECK: %{{.*}} = fir.call @__syncthreads_count(%{{.*}}) fastmath {is_bind_c} : (!fir.ref) -> i32 -! CHECK: %{{.*}} = fir.call @__syncthreads_or(%{{.*}}) fastmath {is_bind_c} : (!fir.ref) -> i32 +! CHECK: %{{.*}} = fir.call @__syncthreads_and(%{{.*}}) proc_attrs fastmath : (!fir.ref) -> i32 +! CHECK: %{{.*}} = fir.call @__syncthreads_count(%{{.*}}) proc_attrs fastmath : (!fir.ref) -> i32 +! CHECK: %{{.*}} = fir.call @__syncthreads_or(%{{.*}}) proc_attrs fastmath : (!fir.ref) -> i32 ! CHECK: func.func private @__syncthreads() attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncthreads"} ! 
CHECK: func.func private @__syncwarp(!fir.ref {cuf.data_attr = #cuf.cuda}) attributes {cuf.proc_attr = #cuf.cuda_proc, fir.bindc_name = "__syncwarp"} diff --git a/flang/test/Lower/HLFIR/assumed-rank-calls.f90 b/flang/test/Lower/HLFIR/assumed-rank-calls.f90 index 9d4503fef6fce9..afb2bbac998910 100644 --- a/flang/test/Lower/HLFIR/assumed-rank-calls.f90 +++ b/flang/test/Lower/HLFIR/assumed-rank-calls.f90 @@ -36,7 +36,7 @@ subroutine bindc_func(x) bind(c) ! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_to_bindcEx"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) ! CHECK: %[[VAL_3:.*]] = fir.rebox_assumed_rank %[[VAL_2]]#0 lbs zeroes : (!fir.box>) -> !fir.box> -! CHECK: fir.call @bindc_func(%[[VAL_3]]) fastmath {is_bind_c} : (!fir.box>) -> () +! CHECK: fir.call @bindc_func(%[[VAL_3]]) proc_attrs fastmath : (!fir.box>) -> () ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/HLFIR/assumed-rank-iface.f90 b/flang/test/Lower/HLFIR/assumed-rank-iface.f90 index aaf003a59d84f1..0e094cc6646d1f 100644 --- a/flang/test/Lower/HLFIR/assumed-rank-iface.f90 +++ b/flang/test/Lower/HLFIR/assumed-rank-iface.f90 @@ -38,7 +38,7 @@ subroutine int_scalar_to_assumed_rank_bindc(x) ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_scalar_to_assumed_rank_bindcEx"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.embox %[[VAL_1]]#0 : (!fir.ref) -> !fir.box ! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.box) -> !fir.box> -! CHECK: fir.call @int_assumed_rank_bindc(%[[VAL_3]]) fastmath {is_bind_c} : (!fir.box>) -> () +! CHECK: fir.call @int_assumed_rank_bindc(%[[VAL_3]]) proc_attrs fastmath : (!fir.box>) -> () subroutine int_r1_to_assumed_rank(x) use ifaces, only : int_assumed_rank @@ -94,7 +94,7 @@ subroutine int_assumed_shape_to_assumed_rank_bindc(x) ! 
CHECK: %[[VAL_3:.*]] = fir.shift %[[VAL_2]], %[[VAL_2]] : (index, index) -> !fir.shift<2> ! CHECK: %[[VAL_4:.*]] = fir.rebox %[[VAL_1]]#0(%[[VAL_3]]) : (!fir.box>, !fir.shift<2>) -> !fir.box> ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.box>) -> !fir.box> -! CHECK: fir.call @int_assumed_rank_bindc(%[[VAL_5]]) fastmath {is_bind_c} : (!fir.box>) -> () +! CHECK: fir.call @int_assumed_rank_bindc(%[[VAL_5]]) proc_attrs fastmath : (!fir.box>) -> () subroutine int_allocatable_to_assumed_rank(x) use ifaces, only : int_assumed_rank diff --git a/flang/test/Lower/HLFIR/bindc-value-derived.f90 b/flang/test/Lower/HLFIR/bindc-value-derived.f90 index 8cefd1246173c2..a54b29b470e0b4 100644 --- a/flang/test/Lower/HLFIR/bindc-value-derived.f90 +++ b/flang/test/Lower/HLFIR/bindc-value-derived.f90 @@ -31,7 +31,7 @@ subroutine call_it(x) ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMbindc_byvalFcall_itEx"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#1 : !fir.ref> -! CHECK: fir.call @test(%[[VAL_2]]) fastmath {is_bind_c} : (!fir.type<_QMbindc_byvalTt{i:i32}>) -> () +! CHECK: fir.call @test(%[[VAL_2]]) proc_attrs fastmath : (!fir.type<_QMbindc_byvalTt{i:i32}>) -> () ! CHECK: return ! 
CHECK: } end module diff --git a/flang/test/Lower/HLFIR/block_bindc_pocs.f90 b/flang/test/Lower/HLFIR/block_bindc_pocs.f90 index c6519e92d0b5cd..090eeb35ea88b0 100644 --- a/flang/test/Lower/HLFIR/block_bindc_pocs.f90 +++ b/flang/test/Lower/HLFIR/block_bindc_pocs.f90 @@ -9,7 +9,7 @@ end subroutine test_proc end interface end module m !CHECK-DAG: %[[S0:.*]] = fir.call @llvm.stacksave.p0() fastmath : () -> !fir.ref -!CHECK-DAG: fir.call @test_proc() fastmath {is_bind_c} : () -> () +!CHECK-DAG: fir.call @test_proc() proc_attrs fastmath : () -> () !CHECK-DAG: fir.call @llvm.stackrestore.p0(%[[S0]]) fastmath : (!fir.ref) -> () !CHECK-DAG: func.func private @test_proc() attributes {fir.bindc_name = "test_proc"} subroutine test diff --git a/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90 b/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90 index ccbc1df96a73a5..86464f29e0831b 100644 --- a/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90 +++ b/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90 @@ -46,7 +46,7 @@ subroutine test_char_1(x) ! CHECK: %[[VAL_26:.*]] = fir.box_addr %[[VAL_10]] : (!fir.box>>) -> !fir.ref>> ! CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (!fir.ref>>) -> !fir.ref>> ! CHECK: %[[VAL_28:.*]] = fir.embox %[[VAL_27]](%[[VAL_24]]) typeparams %[[VAL_25]] : (!fir.ref>>, !fir.shapeshift<1>, index) -> !fir.box>> -! CHECK: fir.call @takes_char(%[[VAL_28]], %[[VAL_11]]#1) fastmath {is_bind_c} : (!fir.box>>, !fir.ref) -> () +! CHECK: fir.call @takes_char(%[[VAL_28]], %[[VAL_11]]#1) proc_attrs fastmath : (!fir.box>>, !fir.ref) -> () ! CHECK: hlfir.end_associate %[[VAL_11]]#1, %[[VAL_11]]#2 : !fir.ref, i1 ! CHECK: return ! CHECK: } @@ -80,7 +80,7 @@ subroutine test_char_copy_in_copy_out(x) ! CHECK: %[[VAL_22:.*]] = fir.box_addr %[[VAL_6]] : (!fir.box>>) -> !fir.ref>> ! CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>>) -> !fir.ref>> ! 
CHECK: %[[VAL_24:.*]] = fir.embox %[[VAL_23]](%[[VAL_20]]) typeparams %[[VAL_21]] : (!fir.ref>>, !fir.shapeshift<1>, index) -> !fir.box>> -! CHECK: fir.call @takes_char(%[[VAL_24]], %[[VAL_7]]#1) fastmath {is_bind_c} : (!fir.box>>, !fir.ref) -> () +! CHECK: fir.call @takes_char(%[[VAL_24]], %[[VAL_7]]#1) proc_attrs fastmath : (!fir.box>>, !fir.ref) -> () ! CHECK: hlfir.copy_out %[[TMP_BOX]], %[[VAL_3]]#1 to %[[VAL_1]]#0 : (!fir.ref>>>>, i1, !fir.box>>) -> () ! CHECK: hlfir.end_associate %[[VAL_7]]#1, %[[VAL_7]]#2 : !fir.ref, i1 ! CHECK: return @@ -113,7 +113,7 @@ subroutine test_char_assumed_size(x) ! CHECK: %[[VAL_20:.*]] = fir.box_addr %[[VAL_5]] : (!fir.box>>) -> !fir.ref>> ! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (!fir.ref>>) -> !fir.ref>> ! CHECK: %[[VAL_22:.*]] = fir.embox %[[VAL_21]](%[[VAL_18]]) typeparams %[[VAL_19]] : (!fir.ref>>, !fir.shapeshift<2>, index) -> !fir.box>> -! CHECK: fir.call @takes_char_assumed_size(%[[VAL_22]]) fastmath {is_bind_c} : (!fir.box>>) -> () +! CHECK: fir.call @takes_char_assumed_size(%[[VAL_22]]) proc_attrs fastmath : (!fir.box>>) -> () ! CHECK: hlfir.copy_out %[[TMP_BOX]], %[[VAL_2]]#1 to %[[VAL_1]]#0 : (!fir.ref>>>>, i1, !fir.box>>) -> () ! CHECK: return ! CHECK: } @@ -159,7 +159,7 @@ subroutine test_optional_char(x) ! CHECK: %[[VAL_33:.*]] = fir.absent !fir.box>> ! CHECK: fir.result %[[VAL_33]] : !fir.box>> ! CHECK: } -! CHECK: fir.call @takes_optional_char(%[[VAL_15]], %[[VAL_14]]#1) fastmath {is_bind_c} : (!fir.box>>, !fir.ref) -> () +! CHECK: fir.call @takes_optional_char(%[[VAL_15]], %[[VAL_14]]#1) proc_attrs fastmath : (!fir.box>>, !fir.ref) -> () ! CHECK: hlfir.end_associate %[[VAL_14]]#1, %[[VAL_14]]#2 : !fir.ref, i1 ! CHECK: return ! 
CHECK: } diff --git a/flang/test/Lower/HLFIR/calls-character-singleton-result.f90 b/flang/test/Lower/HLFIR/calls-character-singleton-result.f90 index b97d9774f734c2..6e0f977862b5a4 100644 --- a/flang/test/Lower/HLFIR/calls-character-singleton-result.f90 +++ b/flang/test/Lower/HLFIR/calls-character-singleton-result.f90 @@ -35,7 +35,7 @@ character(1) function bar() bind(c) ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.char<1> ! CHECK: %[[VAL_3:.*]] = fir.convert %{{.*}}#0 : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] typeparams %{{.*}} {{.*}}Ec -! CHECK: %[[VAL_5:.*]] = fir.call @bar() fastmath {is_bind_c} : () -> !fir.char<1> +! CHECK: %[[VAL_5:.*]] = fir.call @bar() proc_attrs fastmath : () -> !fir.char<1> ! CHECK: fir.store %[[VAL_5]] to %[[VAL_1]] : !fir.ref> ! CHECK: %[[VAL_6:.*]] = arith.constant false ! CHECK: %[[VAL_7:.*]] = hlfir.as_expr %[[VAL_1]] move %[[VAL_6]] : (!fir.ref>, i1) -> !hlfir.expr> diff --git a/flang/test/Lower/HLFIR/ignore-type-assumed-shape.f90 b/flang/test/Lower/HLFIR/ignore-type-assumed-shape.f90 index 60951cf1fa73a6..d1c61c6023e089 100644 --- a/flang/test/Lower/HLFIR/ignore-type-assumed-shape.f90 +++ b/flang/test/Lower/HLFIR/ignore-type-assumed-shape.f90 @@ -23,7 +23,7 @@ subroutine test_ignore_t_1(x) ! CHECK: %[[VAL_6:.*]] = fir.shift %[[VAL_5]] : (index) -> !fir.shift<1> ! CHECK: %[[VAL_7:.*]] = fir.rebox %{{.*}}(%[[VAL_6]]) : (!fir.box>, !fir.shift<1>) -> !fir.box> ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.box>) -> !fir.box> -! CHECK: fir.call @takes_assumed_shape_ignore_tkr_t(%[[VAL_8]]) fastmath {is_bind_c} : (!fir.box>) -> () +! CHECK: fir.call @takes_assumed_shape_ignore_tkr_t(%[[VAL_8]]) proc_attrs fastmath : (!fir.box>) -> () subroutine test_ignore_t_2(x) use tkr_ifaces @@ -35,7 +35,7 @@ subroutine test_ignore_t_2(x) ! CHECK: %[[VAL_3:.*]] = fir.shift %[[VAL_2]] : (index) -> !fir.shift<1> ! 
CHECK: %[[VAL_4:.*]] = fir.rebox %{{.*}}(%[[VAL_3]]) : (!fir.class>, !fir.shift<1>) -> !fir.class> ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.class>) -> !fir.box> -! CHECK: fir.call @takes_assumed_shape_ignore_tkr_t(%[[VAL_5]]) fastmath {is_bind_c} : (!fir.box>) -> () +! CHECK: fir.call @takes_assumed_shape_ignore_tkr_t(%[[VAL_5]]) proc_attrs fastmath : (!fir.box>) -> () subroutine test_ignore_t_3(x) use tkr_ifaces @@ -47,7 +47,7 @@ subroutine test_ignore_t_3(x) ! CHECK: %[[VAL_13:.*]] = fir.shift %[[VAL_12]] : (index) -> !fir.shift<1> ! CHECK: %[[VAL_14:.*]] = fir.rebox %{{.*}}(%[[VAL_13]]) : (!fir.box>, !fir.shift<1>) -> !fir.box> ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (!fir.box>) -> !fir.box> -! CHECK: fir.call @takes_assumed_shape_ignore_tkr_t(%[[VAL_15]]) fastmath {is_bind_c} : (!fir.box>) -> () +! CHECK: fir.call @takes_assumed_shape_ignore_tkr_t(%[[VAL_15]]) proc_attrs fastmath : (!fir.box>) -> () subroutine test_ignore_t_4(x) use tkr_ifaces @@ -59,4 +59,4 @@ subroutine test_ignore_t_4(x) ! CHECK: %[[VAL_4:.*]] = fir.shift %[[VAL_3]] : (index) -> !fir.shift<1> ! CHECK: %[[VAL_5:.*]] = fir.rebox %{{.*}}(%[[VAL_4]]) : (!fir.box>>, !fir.shift<1>) -> !fir.box> ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.box>) -> !fir.box> -! CHECK: fir.call @takes_assumed_shape_ignore_tkr_t(%[[VAL_6]]) fastmath {is_bind_c} : (!fir.box>) -> () +! 
CHECK: fir.call @takes_assumed_shape_ignore_tkr_t(%[[VAL_6]]) proc_attrs fastmath : (!fir.box>) -> () diff --git a/flang/test/Lower/OpenMP/threadprivate-default-clause.f90 b/flang/test/Lower/OpenMP/threadprivate-default-clause.f90 index 65ea30de521678..2a52e1c026a9e5 100644 --- a/flang/test/Lower/OpenMP/threadprivate-default-clause.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-default-clause.f90 @@ -12,7 +12,7 @@ !CHECK: omp.parallel { !CHECK: %[[A_TP:.*]] = omp.threadprivate %[[A_DECL]]#1 : !fir.ref -> !fir.ref !CHECK: %[[A_TP_DECL:.*]]:2 = hlfir.declare %[[A_TP]] {uniq_name = "_QFsub1Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[TID:.*]] = fir.call @omp_get_thread_num() fastmath {is_bind_c} : () -> i32 +!CHECK: %[[TID:.*]] = fir.call @omp_get_thread_num() proc_attrs fastmath : () -> i32 !CHECK: hlfir.assign %[[TID]] to %[[A_TP_DECL]]#0 : i32, !fir.ref !CHECK: omp.terminator !CHECK: } @@ -46,7 +46,7 @@ subroutine sub1() !CHECK: %[[A_TP_ADDR:.*]] = fir.coordinate_of %[[BLK_TP_CVT]], %c0_1 : (!fir.ref>, index) -> !fir.ref !CHECK: %[[A_TP_ADDR_CVT:.*]] = fir.convert %[[A_TP_ADDR]] : (!fir.ref) -> !fir.ref !CHECK: %[[A_TP_DECL:.*]]:2 = hlfir.declare %[[A_TP_ADDR_CVT]] {uniq_name = "_QFsub2Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[TID:.*]] = fir.call @omp_get_thread_num() fastmath {is_bind_c} : () -> i32 +!CHECK: %[[TID:.*]] = fir.call @omp_get_thread_num() proc_attrs fastmath : () -> i32 !CHECK: hlfir.assign %[[TID]] to %[[A_TP_DECL]]#0 : i32, !fir.ref !CHECK: omp.terminator !CHECK: } diff --git a/flang/test/Lower/block.f90 b/flang/test/Lower/block.f90 index 9d1cb4c298c0aa..70ff67db718edc 100644 --- a/flang/test/Lower/block.f90 +++ b/flang/test/Lower/block.f90 @@ -81,7 +81,7 @@ program bb ! block stack management and exits ! CHECK: %[[V_51:[0-9]+]] = fir.call @llvm.stacksave.p0() fastmath : () -> !fir.ref ! CHECK: fir.store %c5{{.*}} to %[[V_0]] : !fir.ref - ! CHECK: fir.call @ss(%[[V_0]]) fastmath {is_bind_c} : (!fir.ref) -> () + ! 
CHECK: fir.call @ss(%[[V_0]]) proc_attrs fastmath : (!fir.ref) -> () ! CHECK: fir.call @llvm.stackrestore.p0(%[[V_51]]) fastmath : (!fir.ref) -> () block interface diff --git a/flang/test/Lower/call-bindc.f90 b/flang/test/Lower/call-bindc.f90 index b216a1e4d963d9..f8cafffa3867d8 100644 --- a/flang/test/Lower/call-bindc.f90 +++ b/flang/test/Lower/call-bindc.f90 @@ -18,4 +18,4 @@ program main end ! CHECK-LABEL: func.func @_QQmain() -! CHECK: fir.call %{{.*}}(%{{.*}}) fastmath {is_bind_c} : (i32) -> !fir.complex<4> +! CHECK: fir.call %{{.*}}(%{{.*}}) proc_attrs fastmath : (i32) -> !fir.complex<4> From 04ab647b3f145946397837c6ba10ae0795b9bd01 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 23 Aug 2024 05:39:23 -0700 Subject: [PATCH 312/426] [NFC][TableGen] Refactor StringToOffsetTable (#105655) - Make `EmitString` const by not mutating `AggregateString`. - Use C++17 structured bindings in `GetOrAddStringOffset`. - Use StringExtras version of isDigit instead of std::isdigit. --- .../llvm/TableGen/StringToOffsetTable.h | 36 +++++++++---------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/llvm/include/llvm/TableGen/StringToOffsetTable.h b/llvm/include/llvm/TableGen/StringToOffsetTable.h index f2a20f06ae007f..d4bb685acce327 100644 --- a/llvm/include/llvm/TableGen/StringToOffsetTable.h +++ b/llvm/include/llvm/TableGen/StringToOffsetTable.h @@ -14,7 +14,6 @@ #include "llvm/ADT/StringMap.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" -#include #include namespace llvm { @@ -32,16 +31,15 @@ class StringToOffsetTable { size_t size() const { return AggregateString.size(); } unsigned GetOrAddStringOffset(StringRef Str, bool appendZero = true) { - auto IterBool = - StringOffset.insert(std::make_pair(Str, AggregateString.size())); - if (IterBool.second) { + auto [II, Inserted] = StringOffset.insert({Str, size()}); + if (Inserted) { // Add the string to the aggregate if this is the first time found. 
AggregateString.append(Str.begin(), Str.end()); if (appendZero) AggregateString += '\0'; } - return IterBool.first->second; + return II->second; } // Returns the offset of `Str` in the table if its preset, else return @@ -78,37 +76,35 @@ class StringToOffsetTable { } // Emit the string as one single string. - void EmitString(raw_ostream &O) { + void EmitString(raw_ostream &O) const { // Escape the string. - SmallString<256> Str; - raw_svector_ostream(Str).write_escaped(AggregateString); - AggregateString = std::string(Str); + SmallString<256> EscapedStr; + raw_svector_ostream(EscapedStr).write_escaped(AggregateString); O << " \""; unsigned CharsPrinted = 0; - for (unsigned i = 0, e = AggregateString.size(); i != e; ++i) { + for (unsigned i = 0, e = EscapedStr.size(); i != e; ++i) { if (CharsPrinted > 70) { O << "\"\n \""; CharsPrinted = 0; } - O << AggregateString[i]; + O << EscapedStr[i]; ++CharsPrinted; // Print escape sequences all together. - if (AggregateString[i] != '\\') + if (EscapedStr[i] != '\\') continue; - assert(i + 1 < AggregateString.size() && "Incomplete escape sequence!"); - if (isdigit(AggregateString[i + 1])) { - assert(isdigit(AggregateString[i + 2]) && - isdigit(AggregateString[i + 3]) && + assert(i + 1 < EscapedStr.size() && "Incomplete escape sequence!"); + if (isDigit(EscapedStr[i + 1])) { + assert(isDigit(EscapedStr[i + 2]) && isDigit(EscapedStr[i + 3]) && "Expected 3 digit octal escape!"); - O << AggregateString[++i]; - O << AggregateString[++i]; - O << AggregateString[++i]; + O << EscapedStr[++i]; + O << EscapedStr[++i]; + O << EscapedStr[++i]; CharsPrinted += 3; } else { - O << AggregateString[++i]; + O << EscapedStr[++i]; ++CharsPrinted; } } From 1e3dc8cdb49bf7b8344d1d7f7befbb95a9fbdb63 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 23 Aug 2024 05:49:49 -0700 Subject: [PATCH 313/426] [Serialization] Fix a warning This patch fixes: clang/lib/Serialization/ASTReader.cpp:9978:27: error: lambda capture 'this' is not used 
[-Werror,-Wunused-lambda-capture] --- clang/lib/Serialization/ASTReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index be83805f1e92b9..ffdaec4067e1c4 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -9975,7 +9975,7 @@ void ASTReader::finishPendingActions() { return false; }; - auto hasDefinition = [this, &hasDefinitionImpl](Decl *D) { + auto hasDefinition = [&hasDefinitionImpl](Decl *D) { return hasDefinitionImpl(D, hasDefinitionImpl); }; From 0d1d95ecc8cb0fc716f6535c5ceb403d42ef4862 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 23 Aug 2024 05:56:27 -0700 Subject: [PATCH 314/426] [Transforms] Use a range-based for loop (NFC) (#105769) --- llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp index bf86be0dd387f0..aa059cd99cc1ba 100644 --- a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -331,12 +331,9 @@ bool PlaceSafepointsPass::runImpl(Function &F, const TargetLibraryInfo &TLI) { // and b) edges to distinct loop headers. We need to insert pools on // each. 
SetVector Headers; - for (unsigned i = 0; i < Term->getNumSuccessors(); i++) { - BasicBlock *Succ = Term->getSuccessor(i); - if (DT.dominates(Succ, Term->getParent())) { + for (BasicBlock *Succ : successors(Term->getParent())) + if (DT.dominates(Succ, Term->getParent())) Headers.insert(Succ); - } - } assert(!Headers.empty() && "poll location is not a loop latch?"); // The split loop structure here is so that we only need to recalculate From 5def27c72c1f3e5be6770218fa45a615c411d5b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Fri, 23 Aug 2024 15:04:03 +0200 Subject: [PATCH 315/426] [AMDGPU] Remove "amdgpu-enable-structurizer-workarounds" flag (#105819) --- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 23 ++++--------------- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 1 - 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 7a9735790371a1..7ac7b3315bb972 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -338,12 +338,6 @@ static cl::opt EnableScalarIRPasses( cl::init(true), cl::Hidden); -static cl::opt EnableStructurizerWorkarounds( - "amdgpu-enable-structurizer-workarounds", - cl::desc("Enable workarounds for the StructurizeCFG pass"), - cl::location(AMDGPUTargetMachine::EnableStructurizerWorkarounds), - cl::init(true), cl::Hidden); - static cl::opt EnableLowerModuleLDS( "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), @@ -615,7 +609,6 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, bool AMDGPUTargetMachine::EnableFunctionCalls = false; bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; -bool AMDGPUTargetMachine::EnableStructurizerWorkarounds = true; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; @@ 
-1231,10 +1224,8 @@ bool GCNPassConfig::addPreISel() { // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. addPass(&AMDGPUUnifyDivergentExitNodesID); - if (EnableStructurizerWorkarounds) { - addPass(createFixIrreduciblePass()); - addPass(createUnifyLoopExitsPass()); - } + addPass(createFixIrreduciblePass()); + addPass(createUnifyLoopExitsPass()); addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions addPass(createAMDGPUAnnotateUniformValuesLegacy()); @@ -1853,8 +1844,6 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { } void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { - const bool EnableStructurizerWorkarounds = - AMDGPUTargetMachine::EnableStructurizerWorkarounds; if (TM.getOptLevel() > CodeGenOptLevel::None) addPass(FlattenCFGPass()); @@ -1868,12 +1857,8 @@ void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { // regions formed by them. addPass(AMDGPUUnifyDivergentExitNodesPass()); - - if (EnableStructurizerWorkarounds) { - addPass(FixIrreduciblePass()); - addPass(UnifyLoopExitsPass()); - } - + addPass(FixIrreduciblePass()); + addPass(UnifyLoopExitsPass()); addPass(StructurizeCFGPass(/*SkipUniformRegions=*/false)); addPass(AMDGPUAnnotateUniformValuesPass()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 66dfd2f733e3e9..5b7257ddb36f1e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -38,7 +38,6 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { public: static bool EnableFunctionCalls; static bool EnableLowerModuleLDS; - static bool EnableStructurizerWorkarounds; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, From 4d348f72d3ac4289821f944a99cdb4b6af21aa7b Mon Sep 17 00:00:00 2001 From: KaiWeng Date: Fri, 23 Aug 2024 21:06:12 +0800 
Subject: [PATCH 316/426] [RISCV] Let -data-sections also work on sbss/sdata sections (#87040) Add an unique suffix to .sbss/.sdata if -fdata-sections. Without assigning an unique .sbss/.sdata section to each symbols, a linker may not be able to remove unused part when gc-section since all used and unused symbols are all mixed in the same .sbss/.sdata section. I believe this also matches the behavior of gcc. --- .../Target/RISCV/RISCVTargetObjectFile.cpp | 32 ++++++++++++++-- llvm/test/CodeGen/RISCV/sdata-sections.ll | 38 +++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/sdata-sections.ll diff --git a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp index 065541ba9f5933..b7d112ecfc72a7 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp @@ -105,10 +105,34 @@ bool RISCVELFTargetObjectFile::isGlobalInSmallSection( MCSection *RISCVELFTargetObjectFile::SelectSectionForGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { // Handle Small Section classification here. - if (Kind.isBSS() && isGlobalInSmallSection(GO, TM)) - return SmallBSSSection; - if (Kind.isData() && isGlobalInSmallSection(GO, TM)) - return SmallDataSection; + if (isGlobalInSmallSection(GO, TM)) { + // Emit to an unique sdata/sbss section when -fdata-section is set. + // However, if a symbol has an explicit sdata/sbss section, place it in that + // section. 
+ bool EmitUniquedSection = TM.getDataSections() && !GO->hasSection(); + + if (Kind.isBSS()) { + if (EmitUniquedSection) { + SmallString<128> Name(".sbss."); + Name.append(GO->getName()); + return getContext().getELFSection(Name.str(), ELF::SHT_NOBITS, + ELF::SHF_WRITE | ELF::SHF_ALLOC); + } + + return SmallBSSSection; + } + + if (Kind.isData()) { + if (EmitUniquedSection) { + SmallString<128> Name(".sdata."); + Name.append(GO->getName()); + return getContext().getELFSection(Name.str(), ELF::SHT_PROGBITS, + ELF::SHF_WRITE | ELF::SHF_ALLOC); + } + + return SmallDataSection; + } + } // Otherwise, we work the same as ELF. return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM); diff --git a/llvm/test/CodeGen/RISCV/sdata-sections.ll b/llvm/test/CodeGen/RISCV/sdata-sections.ll new file mode 100644 index 00000000000000..0357422aaf5249 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/sdata-sections.ll @@ -0,0 +1,38 @@ +; RUN: llc -mtriple=riscv32 -data-sections < %s | FileCheck -check-prefix=RV32 %s +; RUN: llc -mtriple=riscv64 -data-sections < %s | FileCheck -check-prefix=RV64 %s + +; Append an unique name to each sdata/sbss section when -data-section. + +@v = dso_local global i32 0, align 4 +@r = dso_local global i64 7, align 8 + +; If a symbol has an explicit section name, we should honor it. +@vv = dso_local global i32 0, section ".sbss", align 4 +@rr = dso_local global i64 7, section ".sdata", align 8 +@bb = dso_local global i32 0, section ".sbss_like", align 4 +@tt = dso_local global i64 7, section ".sdata_like", align 8 +@nn = dso_local global i32 0, section ".custom_a", align 4 +@yy = dso_local global i64 7, section ".custom_b", align 8 + +; SmallDataLimit set to 8, so we expect @v will be put in sbss +; and @r will be put in sdata. 
+!llvm.module.flags = !{!0} +!0 = !{i32 8, !"SmallDataLimit", i32 8} + +; RV32: .section .sbss.v,"aw" +; RV32: .section .sdata.r,"aw" +; RV32: .section .sbss,"aw" +; RV32: .section .sdata,"aw" +; RV32: .section .sbss_like,"aw" +; RV32: .section .sdata_like,"aw" +; RV32: .section .custom_a,"aw" +; RV32: .section .custom_b,"aw" + +; RV64: .section .sbss.v,"aw" +; RV64: .section .sdata.r,"aw" +; RV64: .section .sbss,"aw" +; RV64: .section .sdata,"aw" +; RV64: .section .sbss_like,"aw" +; RV64: .section .sdata_like,"aw" +; RV64: .section .custom_a,"aw" +; RV64: .section .custom_b,"aw" From b084111c8e26f96975f505c37d42e992066776f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Degioanni?= Date: Fri, 23 Aug 2024 15:15:10 +0200 Subject: [PATCH 317/426] [mlir][mem2reg] Fix Mem2Reg attempting to promote in graph regions (#104910) Mem2Reg assumes SSA dependencies but did not check for graph regions. This fixes it. --------- Co-authored-by: Christian Ulmann --- mlir/lib/Transforms/Mem2Reg.cpp | 13 +++++++++++++ mlir/test/Transforms/mem2reg.mlir | 15 ++++++++++++++- mlir/test/lib/Dialect/Test/TestOpDefs.cpp | 8 ++++++++ mlir/test/lib/Dialect/Test/TestOps.td | 12 ++++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Transforms/Mem2Reg.cpp b/mlir/lib/Transforms/Mem2Reg.cpp index 1f6998709ae02e..4bc2bbd12b0945 100644 --- a/mlir/lib/Transforms/Mem2Reg.cpp +++ b/mlir/lib/Transforms/Mem2Reg.cpp @@ -13,6 +13,7 @@ #include "mlir/IR/Builders.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/PatternMatch.h" +#include "mlir/IR/RegionKindInterface.h" #include "mlir/IR/Value.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/MemorySlotInterfaces.h" @@ -255,6 +256,18 @@ LogicalResult MemorySlotPromotionAnalyzer::computeBlockingUses( // delete itself). We thus need to start from the use of the slot pointer and // propagate further requests through the forward slice. 
+ // Because this pass currently only supports analysing the parent region of + // the slot pointer, if a promotable memory op that needs promotion is within + // a graph region, the slot may only be used in a graph region and should + // therefore be ignored. + Region *slotPtrRegion = slot.ptr.getParentRegion(); + auto slotPtrRegionOp = + dyn_cast(slotPtrRegion->getParentOp()); + if (slotPtrRegionOp && + slotPtrRegionOp.getRegionKind(slotPtrRegion->getRegionNumber()) == + RegionKind::Graph) + return failure(); + // First insert that all immediate users of the slot pointer must no longer // use it. for (OpOperand &use : slot.ptr.getUses()) { diff --git a/mlir/test/Transforms/mem2reg.mlir b/mlir/test/Transforms/mem2reg.mlir index daeaa2da076341..89472ac0ca2842 100644 --- a/mlir/test/Transforms/mem2reg.mlir +++ b/mlir/test/Transforms/mem2reg.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --pass-pipeline='builtin.module(func.func(mem2reg))' --split-input-file | FileCheck %s +// RUN: mlir-opt %s --pass-pipeline='builtin.module(any(mem2reg))' --split-input-file | FileCheck %s // Verifies that allocators with mutliple slots are handled properly. @@ -26,3 +26,16 @@ func.func @multi_slot_alloca_only_second() -> (i32, i32) { %4 = memref.load %2[] : memref return %3, %4 : i32, i32 } + +// ----- + +// Checks that slots are not promoted if used in a graph region. 
+ +// CHECK-LABEL: test.isolated_graph_region +test.isolated_graph_region { + // CHECK: %{{[[:alnum:]]+}} = test.multi_slot_alloca + %slot = test.multi_slot_alloca : () -> (memref) + memref.store %a, %slot[] : memref + %a = memref.load %slot[] : memref + "test.foo"() : () -> () +} diff --git a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp index fbaa102d3e33cc..69091fb893fad6 100644 --- a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp +++ b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp @@ -126,6 +126,14 @@ RegionKind GraphRegionOp::getRegionKind(unsigned index) { return RegionKind::Graph; } +//===----------------------------------------------------------------------===// +// IsolatedGraphRegionOp +//===----------------------------------------------------------------------===// + +RegionKind IsolatedGraphRegionOp::getRegionKind(unsigned index) { + return RegionKind::Graph; +} + //===----------------------------------------------------------------------===// // AffineScopeOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 2b55bff3538d39..9e19966414d1d7 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -2048,6 +2048,18 @@ def GraphRegionOp : TEST_Op<"graph_region", [ let assemblyFormat = "attr-dict-with-keyword $region"; } +def IsolatedGraphRegionOp : TEST_Op<"isolated_graph_region", [ + DeclareOpInterfaceMethods, + IsolatedFromAbove]> { + let summary = "isolated from above operation with a graph region"; + let description = [{ + Test op that defines a graph region which is isolated from above. 
+ }]; + + let regions = (region AnyRegion:$region); + let assemblyFormat = "attr-dict-with-keyword $region"; +} + def AffineScopeOp : TEST_Op<"affine_scope", [AffineScope]> { let summary = "affine scope operation"; let description = [{ From 2617023923175b0fd2a8cb94ad677c061c01627f Mon Sep 17 00:00:00 2001 From: SpencerAbson Date: Fri, 23 Aug 2024 14:27:49 +0100 Subject: [PATCH 318/426] [clang][AArch64] Add SME2.1 feature macros (#105657) --- clang/lib/Basic/Targets/AArch64.cpp | 40 ++++++++++++++----- clang/lib/Basic/Targets/AArch64.h | 3 ++ .../Preprocessor/aarch64-target-features.c | 16 ++++++++ 3 files changed, 50 insertions(+), 9 deletions(-) diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 6ba31cc05a0d75..63fc15f916c558 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -471,23 +471,25 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasSVE2 && HasSVE2SM4) Builder.defineMacro("__ARM_FEATURE_SVE2_SM4", "1"); + if (HasSVEB16B16) + Builder.defineMacro("__ARM_FEATURE_SVE_B16B16", "1"); + if (HasSME) { Builder.defineMacro("__ARM_FEATURE_SME"); Builder.defineMacro("__ARM_FEATURE_LOCALLY_STREAMING", "1"); } - if (HasSME2) { - Builder.defineMacro("__ARM_FEATURE_SME", "1"); + if (HasSME2) Builder.defineMacro("__ARM_FEATURE_SME2", "1"); - Builder.defineMacro("__ARM_FEATURE_LOCALLY_STREAMING", "1"); - } - if (HasSME2p1) { - Builder.defineMacro("__ARM_FEATURE_SME", "1"); - Builder.defineMacro("__ARM_FEATURE_SME2", "1"); + if (HasSME2p1) Builder.defineMacro("__ARM_FEATURE_SME2p1", "1"); - Builder.defineMacro("__ARM_FEATURE_LOCALLY_STREAMING", "1"); - } + + if (HasSMEF16F16) + Builder.defineMacro("__ARM_FEATURE_SME_F16F16", "1"); + + if (HasSMEB16B16) + Builder.defineMacro("__ARM_FEATURE_SME_B16B16", "1"); if (HasCRC) Builder.defineMacro("__ARM_FEATURE_CRC32", "1"); @@ -749,6 +751,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("sve", FPU & 
SveMode) .Case("sve-bf16", FPU & SveMode && HasBFloat16) .Case("sve-i8mm", FPU & SveMode && HasMatMul) + .Case("sve-b16b16", HasSVEB16B16) .Case("f32mm", FPU & SveMode && HasMatmulFP32) .Case("f64mm", FPU & SveMode && HasMatmulFP64) .Case("sve2", FPU & SveMode && HasSVE2) @@ -763,6 +766,8 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("sme-f64f64", HasSMEF64F64) .Case("sme-i16i64", HasSMEI16I64) .Case("sme-fa64", HasSMEFA64) + .Case("sme-f16f16", HasSMEF16F16) + .Case("sme-b16b16", HasSMEB16B16) .Cases("memtag", "memtag2", HasMTE) .Case("sb", HasSB) .Case("predres", HasPredRes) @@ -863,6 +868,8 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, HasSVE2 = true; HasSVE2SM4 = true; } + if (Feature == "+sve-b16b16") + HasSVEB16B16 = true; if (Feature == "+sve2-bitperm") { FPU |= NeonMode; FPU |= SveMode; @@ -919,6 +926,21 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, HasSVE2 = true; HasSMEFA64 = true; } + if (Feature == "+sme-f16f16") { + HasSME = true; + HasSME2 = true; + HasBFloat16 = true; + HasFullFP16 = true; + HasSMEF16F16 = true; + } + if (Feature == "+sme-b16b16") { + HasSME = true; + HasSME2 = true; + HasBFloat16 = true; + HasFullFP16 = true; + HasSVEB16B16 = true; + HasSMEB16B16 = true; + } if (Feature == "+sb") HasSB = true; if (Feature == "+predres") diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index 7bdf5a2b4106e4..526f7f30a38618 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -53,6 +53,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasSVE2AES = false; bool HasSVE2SHA3 = false; bool HasSVE2SM4 = false; + bool HasSVEB16B16 = false; bool HasSVE2BitPerm = false; bool HasMatmulFP64 = false; bool HasMatmulFP32 = false; @@ -71,6 +72,8 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasSME2 = false; bool HasSMEF64F64 = false; bool HasSMEI16I64 = false; + 
bool HasSMEF16F16 = false; + bool HasSMEB16B16 = false; bool HasSME2p1 = false; bool HasSB = false; bool HasPredRes = false; diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index 87bd3e142d2c40..ae2bdda6f536c5 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -709,3 +709,19 @@ // CHECK-SME2p1: __ARM_FEATURE_SME 1 // CHECK-SME2p1: __ARM_FEATURE_SME2 1 // CHECK-SME2p1: __ARM_FEATURE_SME2p1 1 + +// RUN: %clang --target=aarch64 -march=armv9-a+sve-b16b16 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVEB16B16 %s +// CHECK-SVEB16B16: __ARM_FEATURE_SVE_B16B16 1 + +// RUN: %clang --target=aarch64 -march=armv9-a+sme-f16f16 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SMEF16F16 %s +// CHECK-SMEF16F16: __ARM_FEATURE_LOCALLY_STREAMING 1 +// CHECK-SMEF16F16: __ARM_FEATURE_SME 1 +// CHECK-SMEF16F16: __ARM_FEATURE_SME2 1 +// CHECK-SMEF16F16: __ARM_FEATURE_SME_F16F16 1 + +// RUN: %clang --target=aarch64 -march=armv9-a+sme-b16b16 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SMEB16B16 %s +// CHECK-SMEB16B16: __ARM_FEATURE_LOCALLY_STREAMING 1 +// CHECK-SMEB16B16: __ARM_FEATURE_SME 1 +// CHECK-SMEB16B16: __ARM_FEATURE_SME2 1 +// CHECK-SMEB16B16: __ARM_FEATURE_SME_B16B16 1 +// CHECK-SMEB16B16: __ARM_FEATURE_SVE_B16B16 1 From c9b6339ad40cacb729cc714342d443e781fdfca3 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee Date: Fri, 23 Aug 2024 06:40:12 -0700 Subject: [PATCH 319/426] [NFC] Use stable_hash_combine instead of hash_combine (#105619) I found the current stable hash is not deterministic across multiple runs on a specific platform. This is because it uses `hash_combine` instead of `stable_hash_combine`. 
--- llvm/lib/CodeGen/MachineStableHash.cpp | 30 ++++++++++++++------------ 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp index d2e02a2d739c1b..fb5e9a37d9b997 100644 --- a/llvm/lib/CodeGen/MachineStableHash.cpp +++ b/llvm/lib/CodeGen/MachineStableHash.cpp @@ -63,10 +63,10 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { case MachineOperand::MO_Register: if (MO.getReg().isVirtual()) { const MachineRegisterInfo &MRI = MO.getParent()->getMF()->getRegInfo(); - SmallVector DefOpcodes; + SmallVector DefOpcodes; for (auto &Def : MRI.def_instructions(MO.getReg())) DefOpcodes.push_back(Def.getOpcode()); - return hash_combine_range(DefOpcodes.begin(), DefOpcodes.end()); + return stable_hash_combine_range(DefOpcodes.begin(), DefOpcodes.end()); } // Register operands don't have target flags. @@ -80,7 +80,7 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { : MO.getFPImm()->getValueAPF().bitcastToAPInt(); auto ValHash = stable_hash_combine_array(Val.getRawData(), Val.getNumWords()); - return hash_combine(MO.getType(), MO.getTargetFlags(), ValHash); + return stable_hash_combine(MO.getType(), MO.getTargetFlags(), ValHash); } case MachineOperand::MO_MachineBasicBlock: @@ -112,8 +112,8 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { MO.getIndex()); case MachineOperand::MO_ExternalSymbol: - return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getOffset(), - xxh3_64bits(MO.getSymbolName())); + return stable_hash_combine(MO.getType(), MO.getTargetFlags(), + MO.getOffset(), xxh3_64bits(MO.getSymbolName())); case MachineOperand::MO_RegisterMask: case MachineOperand::MO_RegisterLiveOut: { @@ -126,15 +126,16 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { const uint32_t *RegMask = MO.getRegMask(); std::vector RegMaskHashes(RegMask, RegMask + RegMaskSize); - return hash_combine(MO.getType(), MO.getTargetFlags(), - 
stable_hash_combine_array(RegMaskHashes.data(), - RegMaskHashes.size())); + return stable_hash_combine( + MO.getType(), MO.getTargetFlags(), + stable_hash_combine_array(RegMaskHashes.data(), + RegMaskHashes.size())); } } } assert(0 && "MachineOperand not associated with any MachineFunction"); - return hash_combine(MO.getType(), MO.getTargetFlags()); + return stable_hash_combine(MO.getType(), MO.getTargetFlags()); } case MachineOperand::MO_ShuffleMask: { @@ -144,14 +145,15 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { MO.getShuffleMask(), std::back_inserter(ShuffleMaskHashes), [](int S) -> llvm::stable_hash { return llvm::stable_hash(S); }); - return hash_combine(MO.getType(), MO.getTargetFlags(), - stable_hash_combine_array(ShuffleMaskHashes.data(), - ShuffleMaskHashes.size())); + return stable_hash_combine( + MO.getType(), MO.getTargetFlags(), + stable_hash_combine_array(ShuffleMaskHashes.data(), + ShuffleMaskHashes.size())); } case MachineOperand::MO_MCSymbol: { auto SymbolName = MO.getMCSymbol()->getName(); - return hash_combine(MO.getType(), MO.getTargetFlags(), - xxh3_64bits(SymbolName)); + return stable_hash_combine(MO.getType(), MO.getTargetFlags(), + xxh3_64bits(SymbolName)); } case MachineOperand::MO_CFIIndex: return stable_hash_combine(MO.getType(), MO.getTargetFlags(), From f142f8afe21bceb00fb495468aa0b5043e98c419 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 23 Aug 2024 14:43:31 +0100 Subject: [PATCH 320/426] [AMDGPU] Improve uniform argument handling in InstCombineIntrinsic (#105812) Common up handling of intrinsics that are a no-op on uniform arguments. This catches a couple of new cases: readlane (readlane x, y), z -> readlane x, y (for any z, does not have to equal y). permlane64 (readfirstlane x) -> readfirstlane x (and likewise for any other uniform argument to permlane64). 
--- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 57 +++++++------------ .../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 19 ++++++- 2 files changed, 36 insertions(+), 40 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 9197404309663a..4da3618357c420 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -440,6 +440,21 @@ static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) { SqrtOp->getType()->isHalfTy(); } +/// Return true if we can easily prove that use U is uniform. +static bool isTriviallyUniform(const Use &U) { + Value *V = U.get(); + if (isa(V)) + return true; + if (const auto *II = dyn_cast(V)) { + if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID())) + return false; + // If II and U are in different blocks then there is a possibility of + // temporal divergence. + return II->getParent() == cast(U.getUser())->getParent(); + } + return false; +} + std::optional GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Intrinsic::ID IID = II.getIntrinsicID(); @@ -1060,46 +1075,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType())); } case Intrinsic::amdgcn_permlane64: - // A constant value is trivially uniform. - if (Constant *C = dyn_cast(II.getArgOperand(0))) { - return IC.replaceInstUsesWith(II, C); - } - break; case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: { - // A constant value is trivially uniform. - if (Constant *C = dyn_cast(II.getArgOperand(0))) { - return IC.replaceInstUsesWith(II, C); - } - - // The rest of these may not be safe if the exec may not be the same between - // the def and use. 
- Value *Src = II.getArgOperand(0); - Instruction *SrcInst = dyn_cast(Src); - if (SrcInst && SrcInst->getParent() != II.getParent()) - break; - - // readfirstlane (readfirstlane x) -> readfirstlane x - // readlane (readfirstlane x), y -> readfirstlane x - if (match(Src, - PatternMatch::m_Intrinsic())) { - return IC.replaceInstUsesWith(II, Src); - } - - if (IID == Intrinsic::amdgcn_readfirstlane) { - // readfirstlane (readlane x, y) -> readlane x, y - if (match(Src, PatternMatch::m_Intrinsic())) { - return IC.replaceInstUsesWith(II, Src); - } - } else { - // readlane (readlane x, y), y -> readlane x, y - if (match(Src, PatternMatch::m_Intrinsic( - PatternMatch::m_Value(), - PatternMatch::m_Specific(II.getArgOperand(1))))) { - return IC.replaceInstUsesWith(II, Src); - } - } - + // If the first argument is uniform these intrinsics return it unchanged. + const Use &Src = II.getArgOperandUse(0); + if (isTriviallyUniform(Src)) + return IC.replaceInstUsesWith(II, Src.get()); break; } case Intrinsic::amdgcn_trig_preop: { diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll index 9cb79b26448658..f3a3b8c1dc5d8a 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -2888,8 +2888,7 @@ define i32 @readlane_idempotent(i32 %arg, i32 %lane) { define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) { ; CHECK-LABEL: @readlane_idempotent_different_lanes( ; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE0:%.*]]) -; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE1:%.*]]) -; CHECK-NEXT: ret i32 [[READ1]] +; CHECK-NEXT: ret i32 [[READ0]] ; %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0) %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane1) @@ -3061,6 +3060,22 @@ define amdgpu_kernel 
void @permlanex16_fetch_invalid_bound_ctrl(ptr addrspace(1) ret void } +; -------------------------------------------------------------------- +; llvm.amdgcn.permlane64 +; -------------------------------------------------------------------- + +define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src0) { +; CHECK-LABEL: @permlane64_uniform( +; CHECK-NEXT: [[SRC1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0:%.*]]) +; CHECK-NEXT: store i32 [[SRC1]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %src1 = call i32 @llvm.amdgcn.readfirstlane(i32 %src0) + %res = call i32 @llvm.amdgcn.permlane64(i32 %src1) + store i32 %res, ptr addrspace(1) %out + ret void +} + ; -------------------------------------------------------------------- ; llvm.amdgcn.image.sample a16 ; -------------------------------------------------------------------- From f3d2609af3031ddb54030548e86335f295cf49ca Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 22 Aug 2024 11:24:08 -0400 Subject: [PATCH 321/426] [SLP]Improve/fix subvectors in gather/buildvector nodes handling SLP vectorizer has an estimation for gather/buildvector nodes, which contain some scalar loads. SLP vectorizer performs pretty similar (but large in SLOCs) estimation, which not always correct. Instead, this patch implements clustering analysis and actual node allocation with the full analysis for the vectorized clustered scalars (not only loads, but also some other instructions) with the correct cost estimation and vector insert instructions. Improves overall vectorization quality and simplifies analysis/estimations. 
Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/104144 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 347 ++++++++--------- .../PhaseOrdering/AArch64/slpordering.ll | 74 ++-- .../SLPVectorizer/AArch64/getelementptr.ll | 11 +- .../SLPVectorizer/AArch64/loadorder.ll | 192 ++++----- .../AArch64/multiple_reduction.ll | 365 +++++++----------- .../AArch64/scalarization-overhead.ll | 62 ++- .../AArch64/shuffle-vectors-mask-size.ll | 7 +- .../SLPVectorizer/AArch64/tsc-s116.ll | 8 +- .../vectorizable-selects-uniform-cmps.ll | 32 +- .../RISCV/combined-loads-stored.ll | 7 +- .../SLPVectorizer/RISCV/reductions.ll | 48 ++- .../SLPVectorizer/SystemZ/pr34619.ll | 11 +- .../Transforms/SLPVectorizer/X86/addsub.ll | 18 +- .../X86/extract-many-users-buildvector.ll | 43 +-- .../X86/extract-scalar-from-undef.ll | 27 +- .../X86/gather-node-same-as-vect-but-order.ll | 13 +- .../SLPVectorizer/X86/horizontal-minmax.ll | 16 +- .../SLPVectorizer/X86/inst_size_bug.ll | 18 +- .../SLPVectorizer/X86/landing_pad.ll | 19 +- llvm/test/Transforms/SLPVectorizer/X86/phi.ll | 36 +- .../SLPVectorizer/X86/reduction-logical.ll | 17 +- .../X86/remark-partial-loads-vectorize.ll | 16 +- .../X86/scatter-vectorize-reused-pointer.ll | 26 +- .../X86/schedule_budget_debug_info.ll | 40 +- .../SLPVectorizer/X86/split-load8_2-unord.ll | 39 +- .../Transforms/SLPVectorizer/X86/tiny-tree.ll | 5 +- .../X86/vect-gather-same-nodes.ll | 6 +- 27 files changed, 718 insertions(+), 785 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index d7763a022f3b6e..caee3bf9c958d5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3094,6 +3094,10 @@ class BoUpSLP { /// The index of this treeEntry in VectorizableTree. 
int Idx = -1; + /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from + /// other nodes as a series of insertvector instructions. + SmallVector, 0> CombinedEntriesWithIndices; + private: /// The operands of each instruction in each lane Operands[op_index][lane]. /// Note: This helps avoid the replication of the code that performs the @@ -3394,7 +3398,9 @@ class BoUpSLP { if (!isConstant(V)) { auto *I = dyn_cast(V); AllConstsOrCasts &= I && I->getType()->isIntegerTy(); - ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); + if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE || + !UserTreeIdx.UserTE->isGather()) + ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); } if (AllConstsOrCasts) CastMaxMinBWSizes = @@ -8349,8 +8355,49 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef Ptrs, void BoUpSLP::transformNodes() { constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - for (std::unique_ptr &TE : VectorizableTree) { - TreeEntry &E = *TE; + // The tree may grow here, so iterate over nodes, built before. + for (unsigned Idx : seq(VectorizableTree.size())) { + TreeEntry &E = *VectorizableTree[Idx]; + if (E.isGather()) { + ArrayRef VL = E.Scalars; + const unsigned Sz = getVectorElementSize(VL.front()); + unsigned MinVF = getMinVF(2 * Sz); + if (VL.size() <= 2 || + (E.getOpcode() && + (E.isAltShuffle() || E.getOpcode() != Instruction::Load))) + continue; + // Try to find vectorizable sequences and transform them into a series of + // insertvector instructions. 
+ unsigned StartIdx = 0; + unsigned End = VL.size(); + for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) { + for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) { + ArrayRef Slice = VL.slice(Cnt, VF); + InstructionsState S = getSameOpcode(Slice, *TLI); + if (!S.getOpcode() || S.isAltShuffle() || + (S.getOpcode() != Instruction::Load && + any_of(Slice, [&](Value *V) { + return !areAllUsersVectorized(cast(V), + UserIgnoreList); + }))) + continue; + if (!getTreeEntry(Slice.front()) && !getTreeEntry(Slice.back())) { + unsigned PrevSize = VectorizableTree.size(); + buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX)); + if (PrevSize + 1 == VectorizableTree.size() && + VectorizableTree[PrevSize]->isGather()) { + VectorizableTree.pop_back(); + continue; + } + E.CombinedEntriesWithIndices.emplace_back(PrevSize, Cnt); + if (StartIdx == Cnt) + StartIdx = Cnt + VF; + if (End == Cnt + VF) + End = Cnt; + } + } + } + } switch (E.getOpcode()) { case Instruction::Load: { // No need to reorder masked gather loads, just reorder the scalar @@ -8473,175 +8520,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { auto *VecTy = getWidenedType(ScalarTy, VL.size()); InstructionCost GatherCost = 0; SmallVector Gathers(VL); - // Improve gather cost for gather of loads, if we can group some of the - // loads into vector loads. 
- InstructionsState S = getSameOpcode(VL, *R.TLI); - const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy); - unsigned MinVF = R.getMinVF(2 * Sz); - if (VL.size() > 2 && - ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) || - (InVectors.empty() && - any_of(seq(0, VL.size() / MinVF), - [&](unsigned Idx) { - ArrayRef SubVL = VL.slice(Idx * MinVF, MinVF); - InstructionsState S = getSameOpcode(SubVL, *R.TLI); - return S.getOpcode() == Instruction::Load && - !S.isAltShuffle(); - }))) && - !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) && - !isSplat(Gathers)) { - InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy); - SetVector VectorizedLoads; - SmallVector> VectorizedStarts; - SmallVector ScatterVectorized; - unsigned StartIdx = 0; - unsigned VF = VL.size() / 2; - for (; VF >= MinVF; VF /= 2) { - for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; - Cnt += VF) { - ArrayRef Slice = VL.slice(Cnt, VF); - if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) { - InstructionsState SliceS = getSameOpcode(Slice, *R.TLI); - if (SliceS.getOpcode() != Instruction::Load || - SliceS.isAltShuffle()) - continue; - } - if (!VectorizedLoads.count(Slice.front()) && - !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { - SmallVector PointerOps; - OrdersType CurrentOrder; - LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(), - CurrentOrder, PointerOps); - switch (LS) { - case LoadsState::Vectorize: - case LoadsState::ScatterVectorize: - case LoadsState::StridedVectorize: - // Mark the vectorized loads so that we don't vectorize them - // again. - // TODO: better handling of loads with reorders. 
- if (((LS == LoadsState::Vectorize || - LS == LoadsState::StridedVectorize) && - CurrentOrder.empty()) || - (LS == LoadsState::StridedVectorize && - isReverseOrder(CurrentOrder))) - VectorizedStarts.emplace_back(Cnt, LS); - else - ScatterVectorized.push_back(Cnt); - VectorizedLoads.insert(Slice.begin(), Slice.end()); - // If we vectorized initial block, no need to try to vectorize - // it again. - if (Cnt == StartIdx) - StartIdx += VF; - break; - case LoadsState::Gather: - break; - } - } - } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= VL.size()) - break; - // Found vectorizable parts - exit. - if (!VectorizedLoads.empty()) - break; - } - if (!VectorizedLoads.empty()) { - unsigned NumParts = TTI.getNumberOfParts(VecTy); - bool NeedInsertSubvectorAnalysis = - !NumParts || (VL.size() / VF) > NumParts; - // Get the cost for gathered loads. - for (unsigned I = 0, End = VL.size(); I < End; I += VF) { - if (VectorizedLoads.contains(VL[I])) - continue; - GatherCost += - getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root); - } - // Exclude potentially vectorized loads from list of gathered - // scalars. - Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType())); - // The cost for vectorized loads. - InstructionCost ScalarsCost = 0; - for (Value *V : VectorizedLoads) { - auto *LI = cast(V); - ScalarsCost += - TTI.getMemoryOpCost(Instruction::Load, LI->getType(), - LI->getAlign(), LI->getPointerAddressSpace(), - CostKind, TTI::OperandValueInfo(), LI); - } - auto *LoadTy = getWidenedType(VL.front()->getType(), VF); - for (const std::pair &P : VectorizedStarts) { - auto *LI = cast(VL[P.first]); - Align Alignment = LI->getAlign(); - GatherCost += - P.second == LoadsState::Vectorize - ? 
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, - LI->getPointerAddressSpace(), CostKind, - TTI::OperandValueInfo(), LI) - : TTI.getStridedMemoryOpCost( - Instruction::Load, LoadTy, LI->getPointerOperand(), - /*VariableMask=*/false, Alignment, CostKind, LI); - // Add external uses costs. - for (auto [Idx, V] : enumerate(VL.slice( - P.first, std::min(VL.size() - P.first, VF)))) - if (!R.areAllUsersVectorized(cast(V))) - GatherCost += TTI.getVectorInstrCost(Instruction::ExtractElement, - LoadTy, CostKind, Idx); - // Estimate GEP cost. - SmallVector PointerOps(VF); - for (auto [I, V] : enumerate(VL.slice(P.first, VF))) - PointerOps[I] = cast(V)->getPointerOperand(); - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, PointerOps, LI->getPointerOperand(), - Instruction::Load, CostKind, LI->getType(), LoadTy); - GatherCost += VectorGEPCost - ScalarGEPCost; - } - for (unsigned P : ScatterVectorized) { - auto *LI0 = cast(VL[P]); - ArrayRef Slice = VL.slice(P, VF); - Align CommonAlignment = computeCommonAlignment(Slice); - GatherCost += TTI.getGatherScatterOpCost( - Instruction::Load, LoadTy, LI0->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, CostKind, LI0); - // Estimate GEP cost. - SmallVector PointerOps(VF); - for (auto [I, V] : enumerate(Slice)) - PointerOps[I] = cast(V)->getPointerOperand(); - OrdersType Order; - if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE, - Order)) { - // TODO: improve checks if GEPs can be vectorized. 
- Value *Ptr0 = PointerOps.front(); - Type *ScalarTy = Ptr0->getType(); - auto *VecTy = getWidenedType(ScalarTy, VF); - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr, - CostKind, ScalarTy, VecTy); - GatherCost += VectorGEPCost - ScalarGEPCost; - if (!Order.empty()) { - SmallVector Mask; - inversePermutation(Order, Mask); - GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, - VecTy, Mask, CostKind); - } - } else { - GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true, - PointerOps.front()->getType()); - } - } - if (NeedInsertSubvectorAnalysis) { - // Add the cost for the subvectors insert. - SmallVector ShuffleMask(VL.size()); - for (unsigned I = VF, E = VL.size(); I < E; I += VF) { - for (unsigned Idx : seq(0, E)) - ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx; - GatherCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, - ShuffleMask, CostKind, I, LoadTy); - } - } - GatherCost -= ScalarsCost; - } - GatherCost = std::min(BaseCost, GatherCost); - } else if (!Root && isSplat(VL)) { + if (!Root && isSplat(VL)) { // Found the broadcasting of the single scalar, calculate the cost as // the broadcast. const auto *It = find_if_not(VL, IsaPred); @@ -9389,7 +9268,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { InstructionCost createFreeze(InstructionCost Cost) { return Cost; } /// Finalize emission of the shuffles. 
InstructionCost - finalize(ArrayRef ExtMask, unsigned VF = 0, + finalize(ArrayRef ExtMask, + ArrayRef> SubVectors, + unsigned VF = 0, function_ref &)> Action = {}) { IsFinalized = true; if (Action) { @@ -9407,6 +9288,47 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { Action(V, CommonMask); InVectors.front() = V; } + if (!SubVectors.empty()) { + const PointerUnion &Vec = InVectors.front(); + if (InVectors.size() == 2) + Cost += createShuffle(Vec, InVectors.back(), CommonMask); + else + Cost += createShuffle(Vec, nullptr, CommonMask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != PoisonMaskElem) + CommonMask[Idx] = Idx; + for (auto [E, Idx] : SubVectors) { + Type *EScalarTy = E->Scalars.front()->getType(); + bool IsSigned = true; + if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) { + EScalarTy = + IntegerType::get(EScalarTy->getContext(), It->second.first); + IsSigned = It->second.second; + } + if (ScalarTy != EScalarTy) { + unsigned CastOpcode = Instruction::Trunc; + unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy); + unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy); + if (DstSz > SrcSz) + CastOpcode = IsSigned ? 
Instruction::SExt : Instruction::ZExt; + Cost += TTI.getCastInstrCost( + CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()), + getWidenedType(EScalarTy, E->getVectorFactor()), + TTI::CastContextHint::Normal, CostKind); + } + Cost += ::getShuffleCost( + TTI, TTI::SK_InsertSubvector, + FixedVectorType::get(ScalarTy, CommonMask.size()), std::nullopt, + CostKind, Idx, + FixedVectorType::get(ScalarTy, E->getVectorFactor())); + if (!CommonMask.empty()) { + std::iota(std::next(CommonMask.begin(), Idx), + std::next(CommonMask.begin(), Idx + E->getVectorFactor()), + Idx); + } + } + } + ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true); if (CommonMask.empty()) { assert(InVectors.size() == 1 && "Expected only one vector with no mask"); @@ -12504,7 +12426,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { /// \param Action the action (if any) to be performed before final applying of /// the \p ExtMask mask. Value * - finalize(ArrayRef ExtMask, unsigned VF = 0, + finalize(ArrayRef ExtMask, + ArrayRef> SubVectors, + unsigned VF = 0, function_ref &)> Action = {}) { IsFinalized = true; SmallVector NewExtMask(ExtMask); @@ -12538,6 +12462,30 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { Action(Vec, CommonMask); InVectors.front() = Vec; } + if (!SubVectors.empty()) { + Value *Vec = InVectors.front(); + if (InVectors.size() == 2) { + Vec = createShuffle(Vec, InVectors.back(), CommonMask); + InVectors.pop_back(); + } else { + Vec = createShuffle(Vec, nullptr, CommonMask); + } + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != PoisonMaskElem) + CommonMask[Idx] = Idx; + for (auto [E, Idx] : SubVectors) { + Value *V = castToScalarTyElem(E->VectorizedValue); + Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V, + Builder.getInt64(Idx)); + if (!CommonMask.empty()) { + std::iota(std::next(CommonMask.begin(), Idx), + std::next(CommonMask.begin(), Idx + 
E->getVectorFactor()), + Idx); + } + } + InVectors.front() = Vec; + } + if (!ExtMask.empty()) { if (CommonMask.empty()) { CommonMask.assign(ExtMask.begin(), ExtMask.end()); @@ -12616,7 +12564,14 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, : ScalarTy, Builder, *this); ShuffleBuilder.add(V, Mask); - return ShuffleBuilder.finalize(std::nullopt); + SmallVector> SubVectors( + E->CombinedEntriesWithIndices.size()); + transform(E->CombinedEntriesWithIndices, SubVectors.begin(), + [&](const auto &P) { + return std::make_pair(VectorizableTree[P.first].get(), + P.second); + }); + return ShuffleBuilder.finalize(std::nullopt, SubVectors); }; Value *V = vectorizeTree(VE, PostponedPHIs); if (VF * getNumElements(VL[0]->getType()) != @@ -12699,6 +12654,17 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, SmallVector ReuseShuffleIndices(E->ReuseShuffleIndices.begin(), E->ReuseShuffleIndices.end()); SmallVector GatheredScalars(E->Scalars.begin(), E->Scalars.end()); + // Clear values, to be replaced by insertvector instructions. + for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices) + for_each(MutableArrayRef(GatheredScalars) + .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()), + [&](Value *&V) { V = PoisonValue::get(V->getType()); }); + SmallVector> SubVectors( + E->CombinedEntriesWithIndices.size()); + transform(E->CombinedEntriesWithIndices, SubVectors.begin(), + [&](const auto &P) { + return std::make_pair(VectorizableTree[P.first].get(), P.second); + }); // Build a mask out of the reorder indices and reorder scalars per this // mask. 
SmallVector ReorderMask; @@ -12836,7 +12802,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } } ShuffleBuilder.add(*FrontTE, Mask); - Res = ShuffleBuilder.finalize(E->getCommonMask()); + Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors); return Res; } if (!Resized) { @@ -13093,10 +13059,10 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, (IsSingleShuffle && ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) && isa(V)); })) - Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); else Res = ShuffleBuilder.finalize( - E->ReuseShuffleIndices, E->Scalars.size(), + E->ReuseShuffleIndices, SubVectors, E->Scalars.size(), [&](Value *&Vec, SmallVectorImpl &Mask) { TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false); Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec); @@ -13107,7 +13073,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true); Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size()); ShuffleBuilder.add(BV, ReuseMask); - Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); } else { // Gather all constants. 
SmallVector Mask(GatheredScalars.size(), PoisonMaskElem); @@ -13117,7 +13083,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } Value *BV = ShuffleBuilder.gather(GatheredScalars); ShuffleBuilder.add(BV, Mask); - Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); } if (NeedFreeze) @@ -13126,6 +13092,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) { + for (auto [EIdx, _] : E->CombinedEntriesWithIndices) + (void)vectorizeTree(VectorizableTree[EIdx].get(), /*PostponedPHIs=*/false); return processBuildVector(E, ScalarTy, Builder, *this); } @@ -13177,7 +13145,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } else { ShuffleBuilder.addOrdered(V, E->ReorderIndices); } - return ShuffleBuilder.finalize(E->ReuseShuffleIndices); + SmallVector> SubVectors( + E->CombinedEntriesWithIndices.size()); + transform( + E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) { + return std::make_pair(VectorizableTree[P.first].get(), P.second); + }); + return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors); }; assert(!E->isGather() && "Unhandled state"); @@ -14580,7 +14554,7 @@ Value *BoUpSLP::vectorizeTree( ShuffleBuilder.add(V1, CombinedMask1); if (V2) ShuffleBuilder.add(V2, CombinedMask2); - return ShuffleBuilder.finalize(std::nullopt); + return ShuffleBuilder.finalize(std::nullopt, std::nullopt); }; auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef Mask, @@ -14718,7 +14692,14 @@ Value *BoUpSLP::vectorizeTree( // Clear up reduction references, if any. 
if (UserIgnoreList) { for (Instruction *I : RemovedInsts) { - if (getTreeEntry(I)->Idx != 0) + const TreeEntry *IE = getTreeEntry(I); + if (IE->Idx != 0 && + !(VectorizableTree.front()->isGather() && isa(I) && + !IE->UserTreeIndices.empty() && + any_of(IE->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.UserTE == VectorizableTree.front().get() && + EI.EdgeIdx == UINT_MAX; + }))) continue; SmallVector LogicalOpSelects; I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) { diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll index 22511c018dca2d..2121775224098e 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll @@ -18,62 +18,62 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[IP2]] to i64 ; CHECK-NEXT: [[RRRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[RDD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[RDD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[RRRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[TBAA0]] -; 
CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[RDD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[RDD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[RRRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[RDD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[RDD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[RRRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_2]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_2]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[TBAA0]] 
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> -; CHECK-NEXT: 
[[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = 
shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], -; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]] +; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]] ; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll index c1cef6ff3d10b4..91c8db14a45aa1 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ 
-169,11 +169,12 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[T11]] to i64 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]] ; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T10]], i64 2 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[T12]], i64 3 -; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP13]], [[SUM_032]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i64 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T12]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP14]], [[SUM_032]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll index d79aed89b0be73..5b878108af59af 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -340,12 +340,12 @@ entry: define i16 @reduce_blockstrided4(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, i32 noundef %stride) { ; CHECK-LABEL: @reduce_blockstrided4( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = 
load <4 x i16>, ptr [[X:%.*]], align 2 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y:%.*]], align 2 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]] @@ -416,31 +416,31 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load 
<4 x i8>, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP12]], <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = 
shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = mul nuw nsw <16 x i32> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = mul nuw nsw <16 x i32> [[TMP11]], [[TMP19]] ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP20]]) ; CHECK-NEXT: ret i32 [[TMP21]] ; @@ -677,63 +677,63 @@ entry: define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, ptr nocapture noundef writeonly %z, i32 noundef %stride) { ; CHECK-LABEL: @store_blockstrided3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 8 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i64 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[STRIDE:%.*]], 1 ; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1 ; CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[MUL]], 1 -; CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64 -; CHECK-NEXT: 
[[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM15]] +; CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL]], 2 +; CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[ADD18]] to i64 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM19]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 ; CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3 ; CHECK-NEXT: [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64 ; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM23]] -; CHECK-NEXT: [[ADD30:%.*]] = add nsw i32 [[MUL21]], 2 -; CHECK-NEXT: [[IDXPROM31:%.*]] = sext i32 [[ADD30]] to i64 -; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM31]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX32]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[Y:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 8 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1 +; CHECK-NEXT: [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64 +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]] +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4 -; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]] +; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM19]] 
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX56]], align 4 ; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]] -; CHECK-NEXT: [[ARRAYIDX68:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM31]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX68]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4 +; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]] ; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i8, ptr [[Z:%.*]], i64 4 -; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP6]], [[TMP1]] +; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]] ; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 24 -; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP5]], [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP7]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[X]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[Y]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]] ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> -; CHECK-NEXT: [[MUL81:%.*]] = mul nsw i32 [[TMP8]], [[TMP3]] -; CHECK-NEXT: [[ARRAYIDX82:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 32 -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX16]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4 +; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 28 +; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX48]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], 
[[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[ARRAYIDX90:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 40 -; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX24]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX60]], align 4 +; CHECK-NEXT: [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]] +; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 44 +; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 36 +; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4 ; CHECK-NEXT: store i32 [[MUL73]], ptr [[Z]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4 -; CHECK-NEXT: store i32 [[MUL81]], ptr [[ARRAYIDX82]], align 4 -; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX76]], align 4 +; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX84]], align 4 +; CHECK-NEXT: store i32 [[MUL85]], ptr [[ARRAYIDX76]], align 4 +; CHECK-NEXT: store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4 ; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[ARRAYIDX90]], align 4 -; CHECK-NEXT: [[MUL91:%.*]] = mul nsw i32 [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 36 -; CHECK-NEXT: store i32 [[MUL91]], ptr [[ARRAYIDX92]], align 4 +; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[ARRAYIDX92]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -833,12 +833,12 @@ entry: define void @store_blockstrided4(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, i32 noundef %stride, ptr %dst0) { ; CHECK-LABEL: @store_blockstrided4( ; CHECK-NEXT: entry: -; 
CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X:%.*]], align 2 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y:%.*]], align 2 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]] @@ -1203,62 +1203,62 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 -; 
CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] 
= load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = 
shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> +; 
CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], -; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]] +; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]] ; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]] ; CHECK-NEXT: [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll index d89d6286703605..07411cacb36268 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll @@ -14,232 +14,161 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) { ; CHECK-LABEL: @straight( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST:%.*]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IDX_EXT]] -; CHECK-NEXT: 
[[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2 ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2 ; CHECK-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2 ; CHECK-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2 ; CHECK-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2 ; CHECK-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[P]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2 ; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 -; CHECK-NEXT: [[CONV_7_7:%.*]] = zext i16 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: 
[[TMP11:%.*]] = shufflevector <64 x i16> [[TMP9]], <64 x i16> [[TMP10]], <64 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <64 x i16> [[TMP11]], <64 x i16> [[TMP12]], <64 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <64 x i16> [[TMP13]], <64 x i16> [[TMP14]], <64 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <64 x i16> [[TMP15]], <64 x i16> [[TMP16]], <64 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <64 x i16> [[TMP17]], <64 x i16> [[TMP18]], <64 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <64 x i16> [[TMP19]], <64 x i16> [[TMP20]], <64 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <64 x i16> [[TMP21]], <64 x i16> [[TMP22]], <64 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = zext <64 x i16> [[TMP23]] to <64 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 -; CHECK-NEXT: [[CONV_6_7:%.*]] = zext i16 [[TMP25]] to i32 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 -; CHECK-NEXT: [[CONV_5_7:%.*]] = zext i16 [[TMP26]] to i32 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 -; CHECK-NEXT: [[CONV_4_7:%.*]] = zext i16 [[TMP27]] to i32 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 -; CHECK-NEXT: [[CONV_3_7:%.*]] = zext i16 [[TMP28]] to i32 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 -; CHECK-NEXT: [[CONV_2_7:%.*]] = zext i16 [[TMP29]] to i32 -; CHECK-NEXT: 
[[TMP30:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 -; CHECK-NEXT: [[CONV_1_7:%.*]] = zext i16 [[TMP30]] to i32 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 -; CHECK-NEXT: [[CONV_764:%.*]] = zext i16 [[TMP31]] to i32 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 -; CHECK-NEXT: [[CONV_7_6:%.*]] = zext i16 [[TMP32]] to i32 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 -; CHECK-NEXT: [[CONV_6_6:%.*]] = zext i16 [[TMP33]] to i32 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 -; CHECK-NEXT: [[CONV_5_6:%.*]] = zext i16 [[TMP34]] to i32 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 -; CHECK-NEXT: [[CONV_4_6:%.*]] = zext i16 [[TMP35]] to i32 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 -; CHECK-NEXT: [[CONV_3_6:%.*]] = zext i16 [[TMP36]] to i32 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 -; CHECK-NEXT: [[CONV_2_6:%.*]] = zext i16 [[TMP37]] to i32 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 -; CHECK-NEXT: [[CONV_1_6:%.*]] = zext i16 [[TMP38]] to i32 -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 -; CHECK-NEXT: [[CONV_660:%.*]] = zext i16 [[TMP39]] to i32 -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 -; CHECK-NEXT: [[CONV_7_5:%.*]] = zext i16 [[TMP40]] to i32 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 -; CHECK-NEXT: [[CONV_6_5:%.*]] = zext i16 [[TMP41]] to i32 -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 -; CHECK-NEXT: [[CONV_5_5:%.*]] = zext i16 [[TMP42]] to i32 -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 -; CHECK-NEXT: [[CONV_4_5:%.*]] = zext i16 [[TMP43]] to i32 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 -; CHECK-NEXT: [[CONV_3_5:%.*]] = zext i16 [[TMP44]] to i32 -; CHECK-NEXT: [[TMP45:%.*]] = 
extractelement <8 x i16> [[TMP5]], i32 2 -; CHECK-NEXT: [[CONV_2_5:%.*]] = zext i16 [[TMP45]] to i32 -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 -; CHECK-NEXT: [[CONV_1_5:%.*]] = zext i16 [[TMP46]] to i32 -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 -; CHECK-NEXT: [[CONV_556:%.*]] = zext i16 [[TMP47]] to i32 -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 -; CHECK-NEXT: [[CONV_7_4:%.*]] = zext i16 [[TMP48]] to i32 -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 -; CHECK-NEXT: [[CONV_6_4:%.*]] = zext i16 [[TMP49]] to i32 -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 -; CHECK-NEXT: [[CONV_5_4:%.*]] = zext i16 [[TMP50]] to i32 -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 -; CHECK-NEXT: [[CONV_4_4:%.*]] = zext i16 [[TMP51]] to i32 -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: [[CONV_3_4:%.*]] = zext i16 [[TMP52]] to i32 -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 -; CHECK-NEXT: [[CONV_2_4:%.*]] = zext i16 [[TMP53]] to i32 -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 -; CHECK-NEXT: [[CONV_1_4:%.*]] = zext i16 [[TMP54]] to i32 -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 -; CHECK-NEXT: [[CONV_452:%.*]] = zext i16 [[TMP55]] to i32 -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 -; CHECK-NEXT: [[CONV_7_3:%.*]] = zext i16 [[TMP56]] to i32 -; CHECK-NEXT: [[TMP57:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 -; CHECK-NEXT: [[CONV_6_3:%.*]] = zext i16 [[TMP57]] to i32 -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 -; CHECK-NEXT: [[CONV_5_3:%.*]] = zext i16 [[TMP58]] to i32 -; CHECK-NEXT: [[TMP59:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 -; CHECK-NEXT: [[CONV_4_3:%.*]] = zext i16 [[TMP59]] to i32 -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <8 x 
i16> [[TMP3]], i32 3 -; CHECK-NEXT: [[CONV_3_3:%.*]] = zext i16 [[TMP60]] to i32 -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 -; CHECK-NEXT: [[CONV_2_3:%.*]] = zext i16 [[TMP61]] to i32 -; CHECK-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 -; CHECK-NEXT: [[CONV_1_3:%.*]] = zext i16 [[TMP62]] to i32 -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 -; CHECK-NEXT: [[CONV_348:%.*]] = zext i16 [[TMP63]] to i32 -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 -; CHECK-NEXT: [[CONV_7_2:%.*]] = zext i16 [[TMP64]] to i32 -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 -; CHECK-NEXT: [[CONV_6_2:%.*]] = zext i16 [[TMP65]] to i32 -; CHECK-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 -; CHECK-NEXT: [[CONV_5_2:%.*]] = zext i16 [[TMP66]] to i32 -; CHECK-NEXT: [[TMP67:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 -; CHECK-NEXT: [[CONV_4_2:%.*]] = zext i16 [[TMP67]] to i32 -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 -; CHECK-NEXT: [[CONV_3_2:%.*]] = zext i16 [[TMP68]] to i32 -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 -; CHECK-NEXT: [[CONV_2_2:%.*]] = zext i16 [[TMP69]] to i32 -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 -; CHECK-NEXT: [[CONV_1_2:%.*]] = zext i16 [[TMP70]] to i32 -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 -; CHECK-NEXT: [[CONV_244:%.*]] = zext i16 [[TMP71]] to i32 -; CHECK-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 -; CHECK-NEXT: [[CONV_7_1:%.*]] = zext i16 [[TMP72]] to i32 -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 -; CHECK-NEXT: [[CONV_6_1:%.*]] = zext i16 [[TMP73]] to i32 -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 -; CHECK-NEXT: [[CONV_5_1:%.*]] = zext i16 [[TMP74]] to i32 -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <8 x i16> [[TMP1]], i32 
4 -; CHECK-NEXT: [[CONV_4_1:%.*]] = zext i16 [[TMP75]] to i32 -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 -; CHECK-NEXT: [[CONV_3_1:%.*]] = zext i16 [[TMP76]] to i32 -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 -; CHECK-NEXT: [[CONV_2_1:%.*]] = zext i16 [[TMP77]] to i32 -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 -; CHECK-NEXT: [[CONV_1_1:%.*]] = zext i16 [[TMP78]] to i32 -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 -; CHECK-NEXT: [[CONV_140:%.*]] = zext i16 [[TMP79]] to i32 -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7 -; CHECK-NEXT: [[CONV_7:%.*]] = zext i16 [[TMP80]] to i32 -; CHECK-NEXT: [[TMP81:%.*]] = extractelement <8 x i16> [[TMP0]], i32 6 -; CHECK-NEXT: [[CONV_6:%.*]] = zext i16 [[TMP81]] to i32 -; CHECK-NEXT: [[TMP82:%.*]] = extractelement <8 x i16> [[TMP0]], i32 5 -; CHECK-NEXT: [[CONV_5:%.*]] = zext i16 [[TMP82]] to i32 -; CHECK-NEXT: [[TMP83:%.*]] = extractelement <8 x i16> [[TMP0]], i32 4 -; CHECK-NEXT: [[CONV_4:%.*]] = zext i16 [[TMP83]] to i32 -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <8 x i16> [[TMP0]], i32 3 -; CHECK-NEXT: [[CONV_3:%.*]] = zext i16 [[TMP84]] to i32 -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2 -; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP85]] to i32 -; CHECK-NEXT: [[TMP86:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0 -; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP86]] to i32 -; CHECK-NEXT: [[TMP87:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i16 [[TMP87]] to i32 -; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]] -; CHECK-NEXT: [[TMP88:%.*]] = mul nuw nsw <64 x i32> [[TMP24]], [[TMP24]] -; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]] -; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]] -; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV_4]] -; CHECK-NEXT: 
[[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]] -; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]] -; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]] -; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[CONV_140]] -; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]] -; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]] -; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]] -; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]] -; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]] -; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]] -; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]] -; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[CONV_244]] -; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]] -; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]] -; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]] -; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]] -; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]] -; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[CONV_6_2]] -; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]] -; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[CONV_348]] -; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]] -; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]] -; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]] -; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]] -; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]] -; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]] -; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]] -; 
CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[CONV_452]] -; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]] -; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]] -; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]] -; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]] -; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]] -; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]] -; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]] -; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[CONV_556]] -; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]] -; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[CONV_2_5]] -; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]] -; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[CONV_4_5]] -; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]] -; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]] -; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]] -; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[CONV_660]] -; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[CONV_1_6]] -; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]] -; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]] -; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]] -; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]] -; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]] -; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]] -; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[CONV_764]] -; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[CONV_1_7]] -; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 
[[ADD_1_7]], [[CONV_2_7]] -; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[CONV_3_7]] -; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[CONV_4_7]] -; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[CONV_5_7]] -; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[CONV_6_7]] -; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[CONV_7_7]] -; CHECK-NEXT: [[TMP89:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP88]]) +; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> poison, <8 x i16> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP8]], <8 x i16> [[TMP1]], i64 8) +; CHECK-NEXT: [[TMP10:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP9]], <8 x i16> [[TMP2]], i64 16) +; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP10]], <8 x i16> [[TMP3]], i64 24) +; CHECK-NEXT: [[TMP12:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP11]], <8 x i16> [[TMP4]], i64 32) +; CHECK-NEXT: [[TMP13:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP12]], <8 x i16> [[TMP5]], i64 40) +; CHECK-NEXT: [[TMP14:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP13]], <8 x i16> [[TMP6]], i64 48) +; CHECK-NEXT: [[TMP15:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP14]], <8 x i16> [[TMP7]], i64 56) +; CHECK-NEXT: [[TMP16:%.*]] = zext <64 x i16> [[TMP15]] to <64 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <64 x i32> [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <64 x i32> [[TMP16]], i32 1 +; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = mul nuw nsw <64 x i32> [[TMP16]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <64 x i32> [[TMP16]], i32 2 +; CHECK-NEXT: [[ADD_2:%.*]] = add nuw 
nsw i32 [[ADD_1]], [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <64 x i32> [[TMP16]], i32 3 +; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i32> [[TMP16]], i32 4 +; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP22]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <64 x i32> [[TMP16]], i32 5 +; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP23]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <64 x i32> [[TMP16]], i32 6 +; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP24]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <64 x i32> [[TMP16]], i32 7 +; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP25]] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <64 x i32> [[TMP16]], i32 8 +; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP26]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <64 x i32> [[TMP16]], i32 9 +; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP27]] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <64 x i32> [[TMP16]], i32 10 +; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP28]] +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <64 x i32> [[TMP16]], i32 11 +; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP29]] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <64 x i32> [[TMP16]], i32 12 +; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP30]] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <64 x i32> [[TMP16]], i32 13 +; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP31]] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <64 x i32> [[TMP16]], i32 14 +; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP32]] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <64 x i32> [[TMP16]], i32 15 +; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP33]] +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <64 x i32> [[TMP16]], i32 16 +; 
CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP34]] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <64 x i32> [[TMP16]], i32 17 +; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP35]] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <64 x i32> [[TMP16]], i32 18 +; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP36]] +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <64 x i32> [[TMP16]], i32 19 +; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP37]] +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <64 x i32> [[TMP16]], i32 20 +; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP38]] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <64 x i32> [[TMP16]], i32 21 +; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP39]] +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <64 x i32> [[TMP16]], i32 22 +; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <64 x i32> [[TMP16]], i32 23 +; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP41]] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <64 x i32> [[TMP16]], i32 24 +; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP42]] +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <64 x i32> [[TMP16]], i32 25 +; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP43]] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <64 x i32> [[TMP16]], i32 26 +; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP44]] +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <64 x i32> [[TMP16]], i32 27 +; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP45]] +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <64 x i32> [[TMP16]], i32 28 +; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP46]] +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <64 x i32> [[TMP16]], i32 29 +; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP47]] +; CHECK-NEXT: 
[[TMP48:%.*]] = extractelement <64 x i32> [[TMP16]], i32 30 +; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP48]] +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <64 x i32> [[TMP16]], i32 31 +; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP49]] +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <64 x i32> [[TMP16]], i32 32 +; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP50]] +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <64 x i32> [[TMP16]], i32 33 +; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP51]] +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <64 x i32> [[TMP16]], i32 34 +; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP52]] +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <64 x i32> [[TMP16]], i32 35 +; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP53]] +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <64 x i32> [[TMP16]], i32 36 +; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP54]] +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <64 x i32> [[TMP16]], i32 37 +; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP55]] +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <64 x i32> [[TMP16]], i32 38 +; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP56]] +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <64 x i32> [[TMP16]], i32 39 +; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP57]] +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <64 x i32> [[TMP16]], i32 40 +; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP58]] +; CHECK-NEXT: [[TMP59:%.*]] = extractelement <64 x i32> [[TMP16]], i32 41 +; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP59]] +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <64 x i32> [[TMP16]], i32 42 +; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP60]] +; CHECK-NEXT: [[TMP61:%.*]] = extractelement <64 x i32> [[TMP16]], i32 43 +; CHECK-NEXT: 
[[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP61]] +; CHECK-NEXT: [[TMP62:%.*]] = extractelement <64 x i32> [[TMP16]], i32 44 +; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP62]] +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <64 x i32> [[TMP16]], i32 45 +; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP63]] +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <64 x i32> [[TMP16]], i32 46 +; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP64]] +; CHECK-NEXT: [[TMP65:%.*]] = extractelement <64 x i32> [[TMP16]], i32 47 +; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP65]] +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <64 x i32> [[TMP16]], i32 48 +; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP66]] +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <64 x i32> [[TMP16]], i32 49 +; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP67]] +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <64 x i32> [[TMP16]], i32 50 +; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP68]] +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <64 x i32> [[TMP16]], i32 51 +; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP69]] +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <64 x i32> [[TMP16]], i32 52 +; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP70]] +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <64 x i32> [[TMP16]], i32 53 +; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP71]] +; CHECK-NEXT: [[TMP72:%.*]] = extractelement <64 x i32> [[TMP16]], i32 54 +; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP72]] +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <64 x i32> [[TMP16]], i32 55 +; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP73]] +; CHECK-NEXT: [[TMP74:%.*]] = extractelement <64 x i32> [[TMP16]], i32 56 +; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP74]] +; CHECK-NEXT: [[TMP75:%.*]] = 
extractelement <64 x i32> [[TMP16]], i32 57 +; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP75]] +; CHECK-NEXT: [[TMP76:%.*]] = extractelement <64 x i32> [[TMP16]], i32 58 +; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP76]] +; CHECK-NEXT: [[TMP77:%.*]] = extractelement <64 x i32> [[TMP16]], i32 59 +; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP77]] +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <64 x i32> [[TMP16]], i32 60 +; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP78]] +; CHECK-NEXT: [[TMP79:%.*]] = extractelement <64 x i32> [[TMP16]], i32 61 +; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP79]] +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <64 x i32> [[TMP16]], i32 62 +; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP80]] +; CHECK-NEXT: [[TMP81:%.*]] = extractelement <64 x i32> [[TMP16]], i32 63 +; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP81]] +; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP19]]) ; CHECK-NEXT: [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64 -; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[TMP89]] to i64 +; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[TMP82]] to i64 ; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32 ; CHECK-NEXT: [[ADD17:%.*]] = or i64 [[SHL]], [[CONV15]] ; CHECK-NEXT: ret i64 [[ADD17]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll index 6f6b66255a4340..8093285ad8717c 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll @@ -3,39 +3,63 @@ ; Test case reported on D134605 where the vectorization was causing a slowdown due to an underestimation in the cost of the extractions. +; NOTE: cost of shuffle <4 x float>, <4 x float>, <2 x i32> is 12! 
+ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: +; CHECK-NEXT: [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[ARG]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> , [[TMP2]] +; CHECK-NEXT: [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00 +; CHECK-NEXT: [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00 ; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[VAL12]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float 0.000000e+00, i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[TMP5]], +; CHECK-NEXT: [[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00 +; CHECK-NEXT: [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00 +; CHECK-NEXT: [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00 +; CHECK-NEXT: [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 ; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]] ; CHECK: bb18: -; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP6]], [[BB:%.*]] ] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP6]], i32 2 -; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP8]], 2.000000e+00 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i32 3 -; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP9]], 3.000000e+00 +; CHECK-NEXT: [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ] +; CHECK-NEXT: [[VAL20:%.*]] = 
phi float [ [[VAL15]], [[BB]] ] +; CHECK-NEXT: [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ] +; CHECK-NEXT: [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ] +; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00 +; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00 ; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]] ; CHECK: bb25: -; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x float> [ [[TMP7]], [[BB18]] ] +; CHECK-NEXT: [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ] +; CHECK-NEXT: [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ] +; CHECK-NEXT: [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ] +; CHECK-NEXT: [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ] ; CHECK-NEXT: br label [[BB30:%.*]] ; CHECK: bb30: ; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] ; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = uitofp <4 x i8> [[TMP11]] to <4 x float> -; CHECK-NEXT: [[TMP13:%.*]] = fsub fast <4 x float> [[TMP12]], [[TMP3]] -; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x float> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP14]]) +; CHECK-NEXT: [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1 +; CHECK-NEXT: [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float +; CHECK-NEXT: [[VAL35:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 1 +; CHECK-NEXT: [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1 +; CHECK-NEXT: [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float +; CHECK-NEXT: [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2 +; CHECK-NEXT: [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1 +; CHECK-NEXT: [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float +; CHECK-NEXT: [[VAL41:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 3 +; CHECK-NEXT: [[VAL42:%.*]] 
= load i8, ptr [[VAL41]], align 1 +; CHECK-NEXT: [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float +; CHECK-NEXT: [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]] +; CHECK-NEXT: [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]] +; CHECK-NEXT: [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]] +; CHECK-NEXT: [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]] +; CHECK-NEXT: [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]] +; CHECK-NEXT: [[VAL49:%.*]] = fmul fast float [[VAL45]], [[VAL27]] +; CHECK-NEXT: [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]] +; CHECK-NEXT: [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]] +; CHECK-NEXT: [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]] +; CHECK-NEXT: [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]] +; CHECK-NEXT: [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]] ; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]]) -; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP15]]) +; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]]) ; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]]) ; CHECK-NEXT: br i1 [[ARG8:%.*]], label [[BB30]], label [[BB57]] ; CHECK: bb57: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll index e39cd8aaa111b1..4f881823746228 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll @@ -7,16 +7,13 @@ define void @p(double %0) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> , double [[TMP0]], i32 3 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[MUL16_150_1_I:%.*]] = fmul double 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> 
zeroinitializer, [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP14]], double [[MUL16_150_1_I]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> , <2 x double> [[TMP7]], i64 2) ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[TMP8]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x double> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fptosi <4 x double> [[TMP12]] to <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index 95aa40f664c0ce..ff1d6253bec928 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -25,11 +25,11 @@ define void @s116_modified(ptr %a) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = 
shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP7]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP9]] ; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[A]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index b59659ca75eb24..f04c359b432b5e 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -241,12 +241,9 @@ entry: define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-LABEL: @select_uniform_ugt_16xi8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR:%.*]], align 1 -; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 8 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i8 8 ; CHECK-NEXT: [[L_8:%.*]] = load i8, ptr [[GEP_8]], align 1 ; CHECK-NEXT: [[CMP_8:%.*]] = icmp ugt i8 [[L_8]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 -; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 9 ; CHECK-NEXT: [[L_9:%.*]] = load i8, ptr [[GEP_9]], align 1 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 10 @@ -254,19 +251,28 @@ define void 
@select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 11 ; CHECK-NEXT: [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1 ; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 +; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[L_9]], i32 9 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L_10]], i32 10 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L_11]], i32 11 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt <16 x i8> [[TMP9]], -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP10]], <16 x i8> [[TMP9]], <16 x i8> [[TMP12]] -; CHECK-NEXT: store <16 x i8> [[TMP13]], ptr [[PTR]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP2]], i64 12) +; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = 
shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[L_9]], i32 9 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[L_10]], i32 10 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[L_11]], i32 11 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP16]], <16 x i8> [[TMP18]] +; CHECK-NEXT: store <16 x i8> [[TMP19]], ptr [[PTR]], align 2 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll index 94a55c435c8c39..cd79250e8fb6be 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll @@ -4,12 +4,11 @@ define void @test(ptr noalias %p, ptr %p1) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ptr noalias [[P:%.*]], ptr [[P1:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[P]], align 2 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[P]], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[GEP2]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> 
[[TMP3]], <4 x i16> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> poison, <2 x i16> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> [[TMP3]], <2 x i16> [[TMP2]], i64 2) ; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[P1]], align 2 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll index ff3d2c4c59394c..151b91184bf428 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll @@ -1013,22 +1013,20 @@ declare i32 @llvm.abs.i32(i32, i1) define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) { ; CHECK-LABEL: @stride_sum_abs_diff( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q:%.*]], align 4 -; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[STRIDE:%.*]] -; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[STRIDE]] +; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[STRIDE:%.*]] +; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 [[STRIDE]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> 
[[TMP4]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP11]], i1 true) -; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) -; CHECK-NEXT: ret i32 [[TMP13]] +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP5]], <2 x i32> [[TMP3]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP7]], <2 x i32> [[TMP4]], i64 2) +; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) +; CHECK-NEXT: ret i32 [[TMP11]] ; %x.0 = load i32, ptr %p %y.0 = load i32, ptr %q @@ -1068,12 +1066,11 @@ define i32 @reduce_sum_2arrays_a(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Q:%.*]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) -; CHECK-NEXT: ret i32 [[TMP6]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> 
@llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) +; CHECK-NEXT: ret i32 [[TMP5]] ; entry: %x.0 = load i8, ptr %p, align 1 @@ -1117,12 +1114,11 @@ define i32 @reduce_sum_2arrays_b(ptr noalias noundef %x, ptr noalias %y) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[X:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) -; CHECK-NEXT: ret i32 [[TMP6]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) +; CHECK-NEXT: ret i32 [[TMP5]] ; entry: %0 = load i8, ptr %x, align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll index 0fcbead65d0d66..413aedefe9b6ad 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll @@ -13,12 +13,11 @@ define void @foo() local_unnamed_addr { ; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr 
inbounds [4 x [4 x i32]], ptr @dct_luma, i64 0, i64 3, i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 2), align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[ADD277]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = ashr <4 x i32> [[TMP6]], -; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[ARRAYIDX372]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP2]], <2 x i32> [[TMP1]], i64 2) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ADD277]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = ashr <4 x i32> [[TMP5]], +; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[ARRAYIDX372]], align 4 ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll index f7bd2431a76054..96b498ced7d0f8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -392,16 +392,14 @@ define void @vec_shuff_reorder() #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fb, align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: 
[[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP7]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fsub <4 x float> [[TMP7]], [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP13]], ptr @fc, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP7]], <2 x float> [[TMP4]], i64 2) +; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP11]], ptr @fc, align 4 ; CHECK-NEXT: ret void ; %1 = load float, ptr @fb, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll index 3b03ca13ea65d0..87b1302e4cecf4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll @@ -6,30 +6,25 @@ define i1 @test(float %0, double %1) { ; CHECK-SAME: (float [[TMP0:%.*]], 
double [[TMP1:%.*]]) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> -; CHECK-NEXT: [[TMP5:%.*]] = fpext float 0.000000e+00 to double -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> zeroinitializer, [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x double> , double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP5]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = fmul <4 x double> [[TMP9]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP14]], <8 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP16]], <8 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = fsub <8 x double> [[TMP15]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = fmul <8 x double> [[TMP15]], [[TMP19]] -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x double> [[TMP20]], <8 x double> [[TMP21]], <8 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = fptrunc <8 x double> [[TMP22]] to <8 x float> -; CHECK-NEXT: [[TMP24:%.*]] = fmul <8 x float> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = fcmp oeq <8 x float> [[TMP24]], zeroinitializer -; CHECK-NEXT: 
[[TMP26:%.*]] = freeze <8 x i1> [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP26]]) -; CHECK-NEXT: ret i1 [[TMP27]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> , <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP11]], i64 0) +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4) +; CHECK-NEXT: [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float> +; CHECK-NEXT: [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]]) +; CHECK-NEXT: ret i1 [[TMP22]] ; %3 = fpext float %0 to double %4 = fpext float 0.000000e+00 to double diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll index d326c855a10912..6ff03acf85cdfd 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll @@ -4,20 +4,19 @@ define i64 @foo(i32 %tmp7) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[TMP7:%.*]], i32 2 -; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = sub i32 undef, 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[TMP24]], i32 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 0, i32 5 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP24]], i32 6 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP77:%.*]] = sub nsw <8 x i32> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP77]], <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP11]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7:%.*]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 undef, i32 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP3]], <2 x i32> zeroinitializer, i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x 
i32> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP10]], 0 ; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64 ; CHECK-NEXT: ret i64 [[TMP64]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll index d80d7b5ecd4e76..757d0b1708b6fb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll @@ -8,19 +8,18 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) { ; CHECK-SAME: ptr [[I7:%.*]], i32 [[TMP0:%.*]], i1 [[TOBOOL62_NOT:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[RC21:%.*]] = alloca [0 x [0 x %struct.rect]], i32 0, align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[RC21]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[X1:%.*]] = getelementptr i8, ptr [[RC21]], i64 4 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr [[X1]], align 4 +; CHECK-NEXT: [[Y0:%.*]] = getelementptr i8, ptr [[RC21]], i64 8 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[Y0]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[I7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = 
shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[RC21]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 2 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index fa022ad69af791..b0d9fea43a0e6c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -1016,15 +1016,13 @@ define i32 @maxi8_wrong_parent(i32) { ; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16 ; THRESH-NEXT: br label [[PP:%.*]] ; THRESH: pp: -; THRESH-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 -; THRESH-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 -; THRESH-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; THRESH-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> -; THRESH-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x 
i32> -; THRESH-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> -; THRESH-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> -; THRESH-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP9]]) -; THRESH-NEXT: ret i32 [[TMP10]] +; THRESH-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 +; THRESH-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16 +; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP4]], i64 4) +; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 0) +; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 2) +; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP7]]) +; THRESH-NEXT: ret i32 [[TMP8]] ; %2 = load i32, ptr @arr, align 16 %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll index 6c4572593027d6..54c950a0785020 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll @@ -4,14 +4,20 @@ define void @inst_size(ptr %a, <2 x i64> %b) { ; CHECK-LABEL: @inst_size( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[VAL:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[T41:%.*]] = icmp sgt i64 0, [[VAL]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMPL1:%.*]] = load i64, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 1 +; 
CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[PTR2]], align 4 +; CHECK-NEXT: [[PTR4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 3 +; CHECK-NEXT: [[TMPL4:%.*]] = load i64, ptr [[PTR4]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[B:%.*]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMPL1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP2]], <2 x i64> [[TMP0]], i64 2) +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[T45:%.*]] = icmp sgt i64 0, [[TMPL4]] ; CHECK-NEXT: br label [[BLOCK:%.*]] ; CHECK: block: -; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[T41]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ] +; CHECK-NEXT: [[PHI5:%.*]] = phi i1 [ [[T45]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x i1> [ [[TMP4]], [[ENTRY]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll index 47b42bc8f32a7d..813c5e7418b30e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll @@ -10,10 +10,10 @@ define void @foo() personality ptr @bar { ; CHECK: bb2.loopexit: ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[TMP7:%.*]], [[BB9:%.*]] ], [ poison, [[BB2_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[TMP8:%.*]], [[BB9:%.*]] ], [ poison, [[BB2_LOOPEXIT:%.*]] ] ; CHECK-NEXT: ret void ; CHECK: bb3: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], [[BB6:%.*]] ], [ poison, [[BB1:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP4:%.*]], [[BB6:%.*]] ], [ poison, [[BB1:%.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 0, i32 0, i32 poison) [ "deopt"() ] ; CHECK-NEXT: to label 
[[BB4:%.*]] unwind label [[BB10:%.*]] ; CHECK: bb4: @@ -21,29 +21,30 @@ define void @foo() personality ptr @bar { ; CHECK: bb5: ; CHECK-NEXT: br label [[BB7:%.*]] ; CHECK: bb6: -; CHECK-NEXT: [[TMP3]] = phi <2 x i32> [ , [[BB8:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ , [[BB8:%.*]] ] +; CHECK-NEXT: [[TMP4]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb7: ; CHECK-NEXT: [[LOCAL_5_84111:%.*]] = phi i32 [ poison, [[BB8]] ], [ poison, [[BB5]] ] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_5_84111]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 poison, i32 poison, i32 poison) [ "deopt"() ] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_5_84111]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 poison, i32 poison, i32 poison) [ "deopt"() ] ; CHECK-NEXT: to label [[BB8]] unwind label [[BB12:%.*]] ; CHECK: bb8: ; CHECK-NEXT: br i1 poison, label [[BB7]], label [[BB6]] ; CHECK: bb9: ; CHECK-NEXT: [[INDVARS_IV528799:%.*]] = phi i64 [ poison, [[BB10]] ], [ poison, [[BB12]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP8:%.*]], [[BB10]] ], [ [[TMP9:%.*]], [[BB12]] ] -; CHECK-NEXT: [[TMP7]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ [[TMP9:%.*]], [[BB10]] ], [ [[TMP10:%.*]], [[BB12]] ] +; CHECK-NEXT: [[TMP8]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP7]], i64 2) ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb10: -; CHECK-NEXT: [[TMP8]] = phi <2 x i32> [ [[TMP1]], [[BB3]] ] +; CHECK-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP1]], [[BB3]] ] ; CHECK-NEXT: [[LANDING_PAD68:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup ; CHECK-NEXT: br label [[BB9]] ; CHECK: bb11: ; CHECK-NEXT: ret void ; CHECK: bb12: -; CHECK-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP4]], [[BB7]] ] 
+; CHECK-NEXT: [[TMP10]] = phi <2 x i32> [ [[TMP5]], [[BB7]] ] ; CHECK-NEXT: [[LANDING_PAD149:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup ; CHECK-NEXT: br label [[BB9]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll index 96151e0bd6c418..7201583f3450e0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -144,8 +144,8 @@ define float @foo3(ptr nocapture readonly %A) #0 { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP14:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00 ; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] @@ -154,23 +154,25 @@ define float @foo3(ptr nocapture readonly %A) #0 { ; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP8]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP10]], -; CHECK-NEXT: [[TMP12]] = fadd <4 x float> 
[[TMP3]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP13]], 121 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP9]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 2) +; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x float> [[TMP12]], +; CHECK-NEXT: [[TMP14]] = fadd <4 x float> [[TMP3]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP15]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP12]], i32 0 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP12]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP12]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP12]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP17]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP14]], i32 0 +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP14]], i32 1 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP14]], i32 2 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP14]], i32 3 +; 
CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP19]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index 865d8178667167..12389f4a3dbf4a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -390,14 +390,15 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) { define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]]) -; CHECK-NEXT: ret i1 [[TMP8]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP2]], <4 x i32> [[X]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <8 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ult <8 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = freeze <8 x i1> [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP8]]) +; CHECK-NEXT: ret i1 [[TMP9]] ; %x0 = extractelement <4 x i32> %x, 
i32 0 %x1 = extractelement <4 x i32> %x, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll index 7de2cde45525ae..8aaa71ef47a8c9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll @@ -10,16 +10,7 @@ ; YAML-NEXT: - String: 'SLP vectorized with cost ' ; YAML-NEXT: - Cost: '-4' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '4' -; YAML-LABEL: --- !Passed -; YAML-NEXT: Pass: slp-vectorizer -; YAML-NEXT: Name: VectorizedList -; YAML-NEXT: Function: test -; YAML-NEXT: Args: -; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-2' -; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '2' +; YAML-NEXT: - TreeSize: '5' define <4 x float> @test(ptr %x, float %v, float %a) { ; CHECK-LABEL: define <4 x float> @test( @@ -28,9 +19,8 @@ define <4 x float> @test(ptr %x, float %v, float %a) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[A]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[V]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP1]], i64 2) ; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP3]], [[TMP7]] ; CHECK-NEXT: ret <4 x float> [[TMP8]] ; diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll index dadf5992ba288d..c01c44ff03c153 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -5,23 +5,25 @@ define void @test(i1 %c, ptr %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG:%.*]], align 8 -; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP5]], <2 x i64> [[TMP2]], i64 2) ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 ; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x 
i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP11]], <2 x i64> [[TMP8]], i64 2) ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: -; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i64> [ [[TMP5]], [[IF]] ], [ [[TMP10]], [[ELSE]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ] ; CHECK-NEXT: ret void ; br i1 %c, label %if, label %else diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll index d45054b6bebce7..207b2d45c335e0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll @@ -14,7 +14,21 @@ declare void @unknown() define void @test(ptr %a, ptr %b, ptr %c, ptr %d) { ; VECTOR_DBG-LABEL: @test( ; VECTOR_DBG-NEXT: entry: -; VECTOR_DBG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 +; VECTOR_DBG-NEXT: [[L0:%.*]] = load float, ptr [[A:%.*]], align 4 +; VECTOR_DBG-NEXT: [[A1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; VECTOR_DBG-NEXT: [[L1:%.*]] = load float, ptr [[A1]], align 4 +; VECTOR_DBG-NEXT: [[A2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3:![0-9]+]], 
!DIExpression(), [[META5:![0-9]+]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: [[B1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 +; VECTOR_DBG-NEXT: [[B2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; VECTOR_DBG-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A2]], align 4 ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() @@ -43,22 +57,22 @@ define void @test(ptr %a, ptr %b, ptr %c, ptr %d) { ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3:![0-9]+]], !DIExpression(), [[META5:![0-9]+]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: store <4 x float> [[TMP0]], ptr [[B:%.*]], align 4 +; VECTOR_DBG-NEXT: store float [[L0]], ptr [[B]], align 4 +; VECTOR_DBG-NEXT: store float [[L1]], ptr [[B1]], align 4 +; VECTOR_DBG-NEXT: store <2 x float> [[TMP0]], ptr 
[[B2]], align 4 ; VECTOR_DBG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 ; VECTOR_DBG-NEXT: store <4 x float> [[TMP1]], ptr [[D:%.*]], align 4 ; VECTOR_DBG-NEXT: ret void ; ; VECTOR_NODBG-LABEL: @test( ; VECTOR_NODBG-NEXT: entry: -; VECTOR_NODBG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 +; VECTOR_NODBG-NEXT: [[L0:%.*]] = load float, ptr [[A:%.*]], align 4 +; VECTOR_NODBG-NEXT: [[A1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; VECTOR_NODBG-NEXT: [[L1:%.*]] = load float, ptr [[A1]], align 4 +; VECTOR_NODBG-NEXT: [[A2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 +; VECTOR_NODBG-NEXT: [[B1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 +; VECTOR_NODBG-NEXT: [[B2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 +; VECTOR_NODBG-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A2]], align 4 ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() @@ -87,7 +101,9 @@ define void @test(ptr %a, ptr %b, ptr %c, ptr %d) { ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() -; VECTOR_NODBG-NEXT: store <4 x float> [[TMP0]], ptr [[B:%.*]], align 4 +; VECTOR_NODBG-NEXT: store float [[L0]], ptr [[B]], align 4 +; VECTOR_NODBG-NEXT: store float [[L1]], ptr [[B1]], align 4 +; VECTOR_NODBG-NEXT: store <2 x float> [[TMP0]], ptr [[B2]], align 4 ; VECTOR_NODBG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 ; VECTOR_NODBG-NEXT: store <4 x float> [[TMP1]], ptr [[D:%.*]], align 4 ; VECTOR_NODBG-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 6825f43b5a9eb4..6ca1f8119c1cf0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -14,22 +14,21 @@ define dso_local void 
@_Z4testP1S(ptr %p) local_unnamed_addr { ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4 ; CHECK-NEXT: [[I7:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12 -; CHECK-NEXT: [[I9:%.*]] = load i32, ptr [[ARRAYIDX27]], align 4 -; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 13 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX34]], align 4 +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14 +; CHECK-NEXT: [[I13:%.*]] = load i32, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 5 ; CHECK-NEXT: [[I15:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX27]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[I1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[I7]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[I9]], i32 4 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[I15]], i32 7 -; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[TMP10]], [[TMP2]] -; CHECK-NEXT: store <8 x i32> [[TMP11]], ptr [[P]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[I13]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 
x i32> [[TMP7]], i32 [[I15]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP8]], <2 x i32> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP2]] +; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[P]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -106,11 +105,10 @@ define dso_local void @test_unordered_splits(ptr nocapture %p) local_unnamed_add ; CHECK-NEXT: [[G20:%.*]] = getelementptr inbounds [16 x i32], ptr [[P2]], i32 0, i64 12 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[G10]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[G20]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP2]], <4 x i32> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -165,14 +163,11 @@ define dso_local void @test_cost_splits(ptr nocapture %p) local_unnamed_addr { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[G12]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[G20]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[G22]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; 
CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> -; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> poison, <2 x i32> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP4]], <2 x i32> [[TMP1]], i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 6) +; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll index eb3d395f4c6a6f..3eabed5882e58b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -160,9 +160,8 @@ define void @tiny_tree_not_fully_vectorizable2(ptr noalias nocapture %dst, ptr n ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP6]], ptr 
[[DST_ADDR_022]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP4]], <2 x float> [[TMP2]], i64 2) +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[DST_ADDR_022]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, ptr [[SRC_ADDR_021]], i64 [[I_023]] ; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, ptr [[DST_ADDR_022]], i64 [[I_023]] ; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll index 6ac6884ca5377f..e1b091cc6fcda7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll @@ -8,14 +8,14 @@ define void @test(ptr %a, ptr %b) { ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr [4 x float], ptr [[B:%.*]], i64 0, i64 3 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX120]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 3 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP6]], <2 x float> [[TMP1]], i64 0) ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]] ; 
CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP5]], zeroinitializer From 002ba17094e8e60c5eb602938637ac97dbf280ed Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Fri, 23 Aug 2024 21:46:01 +0800 Subject: [PATCH 322/426] [RISCV][MC] Name the vector tuple registers. NFC (#102726) Currently vector tuple registers don't have the specified names, the default name is, for example: `VRN3M2` -> `V8M2_V10M2_V12M2`, however it's equivalent to `v8` in the assembly. --- llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index b12634c24622f4..efdf6bebfce301 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -423,10 +423,12 @@ class IndexSet { // This class returns a list of vector register collections. 
// For example, for NF = 2 and LMUL = 4, -// it will return +// L would be: // ([ V8M4, V12M4, V16M4, V20M4, V24M4, V4M4], // [V12M4, V16M4, V20M4, V24M4, V28M4, V8M4]) -// +// Names are the starting register of each register list, +// in this example: +// ["v8", "v12", "v16", "v20", "v24", "v4"] class VRegList LIn, int start, int nf, int lmul, bit isV0> { list L = !if(!ge(start, nf), @@ -440,6 +442,9 @@ class VRegList LIn, int start, int nf, int lmul, bit isV0> { !listsplat("", !size(IndexSet.R)))], VRegList.L)); + list Names = + !if(!ge(start, nf), [], + !foreach(i, IndexSet.R, "v" # i)); } // Vector registers @@ -491,12 +496,16 @@ def VCSR : RISCVRegisterClass<[XLenVT], 32, foreach m = [1, 2, 4] in { foreach n = NFList.L in { + defvar RegListWOV0 = VRegList<[], 0, n, m, false>; + defvar RegListWV0 = VRegList<[], 0, n, m, true>; def "VN" # n # "M" # m # "NoV0": RegisterTuples< SubRegSet.L, - VRegList<[], 0, n, m, false>.L>; + RegListWOV0.L, + RegListWOV0.Names>; def "VN" # n # "M" # m # "V0" : RegisterTuples< SubRegSet.L, - VRegList<[], 0, n, m, true>.L>; + RegListWV0.L, + RegListWV0.Names>; } } From e3ce979f1b3ac1e7f2d0261d3abffbd12064eae6 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Fri, 23 Aug 2024 09:48:02 -0400 Subject: [PATCH 323/426] Revert "[clang] Increase the default expression nesting limit (#104717)" This reverts commit 7597e0930638e0a20ca9bfc193a3d89575ce4469. It caused several buildbot failures due to stack overflows with the parser test. 
--- clang/docs/ReleaseNotes.rst | 2 -- clang/include/clang/Driver/Options.td | 2 +- clang/test/Parser/parser_overflow.c | 4 ++-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 93040c2eee2c0b..70ff5dedab217f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -174,8 +174,6 @@ Deprecated Compiler Flags Modified Compiler Flags ----------------------- -- The compiler flag `-fbracket-depth` default value is increased from 256 to 2048. - - The ``-ffp-model`` option has been updated to enable a more limited set of optimizations when the ``fast`` argument is used and to accept a new argument, ``aggressive``. The behavior of ``-ffp-model=aggressive`` is equivalent diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 111608d30ff827..7e40e99e9ba252 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -7976,7 +7976,7 @@ def fapply_global_visibility_to_externs : Flag<["-"], "fapply-global-visibility- MarshallingInfoFlag>; def fbracket_depth : Separate<["-"], "fbracket-depth">, HelpText<"Maximum nesting level for parentheses, brackets, and braces">, - MarshallingInfoInt, "2048">; + MarshallingInfoInt, "256">; defm const_strings : BoolOption<"f", "const-strings", LangOpts<"ConstStrings">, DefaultFalse, PosFlag, diff --git a/clang/test/Parser/parser_overflow.c b/clang/test/Parser/parser_overflow.c index 53c79bc06d993d..9514e808550a4b 100644 --- a/clang/test/Parser/parser_overflow.c +++ b/clang/test/Parser/parser_overflow.c @@ -1,5 +1,5 @@ // RUN: not %clang_cc1 %s -fsyntax-only -DHUGE 2>&1 | FileCheck %s -// RUN: %clang_cc1 %s -fsyntax-only +// RUN: not %clang_cc1 %s -fsyntax-only 2>&1 | FileCheck %s // RUN: not %clang_cc1 %s -fsyntax-only -fbracket-depth 299 2>&1 | FileCheck %s // RUN: %clang_cc1 %s -fsyntax-only -fbracket-depth 300 // RUN: not %clang %s -fsyntax-only -fbracket-depth=299 
2>&1 | FileCheck %s @@ -15,5 +15,5 @@ void foo(void) { #endif } -// CHECK: fatal error: bracket nesting level exceeded maximum of {{2048|299}} +// CHECK: fatal error: bracket nesting level exceeded maximum of {{256|299}} // CHECK: note: use -fbracket-depth=N to increase maximum nesting level From 67a9093a473c851f1fe60d746354023dd6f39337 Mon Sep 17 00:00:00 2001 From: cceerczw Date: Fri, 23 Aug 2024 22:30:51 +0800 Subject: [PATCH 324/426] [instCombine][bugfix] Fix crash caused by using of cast in instCombineSVECmpNE (#102472) --- .../AArch64/AArch64TargetTransformInfo.cpp | 3 +- .../AArch64/sve-inst-combine-cmpne.ll | 411 ++++++++++++++++++ 2 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-inst-combine-cmpne.ll diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 8c64822c474b61..dc748290f2e21e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1187,7 +1187,8 @@ static std::optional instCombineSVECmpNE(InstCombiner &IC, return std::nullopt; // Where the dupq is a lane 0 replicate of a vector insert - if (!cast(DupQLane->getArgOperand(1))->isZero()) + auto *DupQLaneIdx = dyn_cast(DupQLane->getArgOperand(1)); + if (!DupQLaneIdx || !DupQLaneIdx->isZero()) return std::nullopt; auto *VecIns = dyn_cast(DupQLane->getArgOperand(0)); diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-inst-combine-cmpne.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-inst-combine-cmpne.ll new file mode 100644 index 00000000000000..1e202b631758bc --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-inst-combine-cmpne.ll @@ -0,0 +1,411 @@ +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; DUPQ b8 + +define @dupq_b_idx(i64 %idx) #0 { + %1 = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %2 = tail 
call @llvm.vector.insert.nxv16i8.v16i8( undef, + <16 x i8> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv16i8( %2 , i64 %idx) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv16i8( %1, %3, %4) + ret %5 + ; CHECK: %4 = tail call @llvm.aarch64.sve.cmpne.wide.nxv16i8( %1, %3, zeroinitializer) + ; CHECK-NEXT: ret %4 +} + +define @dupq_b_0() #0 { +; CHECK-LABEL: @dupq_b_0( +; CHECK: ret zeroinitializer + %1 = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv16i8.v16i8( undef, + <16 x i8> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv16i8( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv16i8( %1, %3, %4) + ret %5 +} + +define @dupq_b_d() #0 { +; CHECK-LABEL: @dupq_b_d( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %1) +; CHECK-NEXT: ret %2 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv16i8.v16i8( undef, + <16 x i8> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv16i8( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv16i8( %1, %3, %4) + ret %5 +} + +define @dupq_b_w() #0 { +; CHECK-LABEL: @dupq_b_w( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %1) +; CHECK-NEXT: ret %2 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv16i8.v16i8( undef, + <16 x i8> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv16i8( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv16i8( %1, %3, %4) + ret %5 +} + +define @dupq_b_h() #0 { +; CHECK-LABEL: @dupq_b_h( +; CHECK: %1 = call 
@llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %1) +; CHECK-NEXT: ret %2 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv16i8.v16i8( undef, + <16 x i8> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv16i8( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv16i8( %1, %3, %4) + ret %5 +} + +define @dupq_b_b() #0 { +; CHECK-LABEL: @dupq_b_b( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv16i8.v16i8( undef, + <16 x i8> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv16i8( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv16i8( %1, %3, %4) + ret %5 +} + +; DUPQ b16 + +define @dupq_h_0() #0 { +; CHECK-LABEL: @dupq_h_0( +; CHECK: ret zeroinitializer + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv8i16.v8i16( undef, + <8 x i16> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv8i16( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv8i16( %1, %3, %4) + ret %5 +} + +define @dupq_h_d() #0 { +; CHECK-LABEL: @dupq_h_d( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %1) +; CHECK-NEXT: %3 = call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( %2) +; CHECK-NEXT: ret %3 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv8i16.v8i16( undef, + <8 x i16> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv8i16( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv8i16( %1, %3, %4) + ret %5 
+} + +define @dupq_h_w() #0 { +; CHECK-LABEL: @dupq_h_w( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %1) +; CHECK-NEXT: %3 = call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( %2) +; CHECK-NEXT: ret %3 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv8i16.v8i16( undef, + <8 x i16> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv8i16( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv8i16( %1, %3, %4) + ret %5 +} + +define @dupq_h_h() #0 { +; CHECK-LABEL: @dupq_h_h( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv8i16.v8i16( undef, + <8 x i16> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv8i16( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv8i16( %1, %3, %4) + ret %5 +} + +; DUPQ b32 + +define @dupq_w_0() #0 { +; CHECK-LABEL: @dupq_w_0( +; CHECK: ret zeroinitializer + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv4i32.v4i32( undef, + <4 x i32> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %3, %4) + ret %5 +} + +define @dupq_w_d() #0 { +; CHECK-LABEL: @dupq_w_d( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %1) +; CHECK-NEXT: %3 = call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %2) +; CHECK-NEXT: ret %3 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv4i32.v4i32( undef, + <4 x i32> , i64 0) + %3 = tail call 
@llvm.aarch64.sve.dupq.lane.nxv4i32( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %3, %4) + ret %5 +} + +define @dupq_w_w() #0 { +; CHECK-LABEL: @dupq_w_w( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv4i32.v4i32( undef, + <4 x i32> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %3, %4) + ret %5 +} + +; DUPQ b64 + +define @dupq_d_0() #0 { +; CHECK-LABEL: @dupq_d_0( +; CHECK: ret zeroinitializer + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) + ret %5 +} + +define @dupq_d_d() #0 { +; CHECK-LABEL: @dupq_d_d( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) + ret %5 +} + +; Cases that cannot be converted + +define @dupq_neg1() #0 { +; CHECK-LABEL: @dupq_neg1( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) 
+ ret %5 +} + +define @dupq_neg2() #0 { +; CHECK-LABEL: @dupq_neg2( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv4i32.v4i32( undef, + <4 x i32> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %3, %4) + ret %5 +} + +define @dupq_neg3() #0 { +; CHECK-LABEL: @dupq_neg3( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv4i32.v4i32( undef, + <4 x i32> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %3, %4) + ret %5 +} + +define @dupq_neg4() #0 { +; CHECK-LABEL: @dupq_neg4( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv4i32.v4i32( undef, + <4 x i32> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %3, %4) + ret %5 +} + +define @dupq_neg5() #0 { +; CHECK-LABEL: @dupq_neg5( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv4i32.v4i32( undef, + <4 x i32> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %3, %4) + ret %5 +} + +define @dupq_neg6(i1 %a) #0 { +; CHECK-LABEL: @dupq_neg6( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = zext i1 %a to i32 + %3 = insertelement <4 x i32> , i32 %2, i32 3 + %4 = tail call @llvm.vector.insert.nxv4i32.v4i32( 
undef, <4 x i32> %3, i64 0) + %5 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %4 , i64 0) + %6 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %7 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %5, %6) + ret %7 +} + +define @dupq_neg7() #0 { +; CHECK-LABEL: @dupq_neg7( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 2) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) + ret %5 +} + +define @dupq_neg8() #0 { +; CHECK-LABEL: @dupq_neg8( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 1) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) + ret %5 +} + +define @dupq_neg9( %x) #0 { +; CHECK-LABEL: @dupq_neg9( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv2i64.v2i64( %x, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) + ret %5 +} + +define @dupq_neg10() #0 { +; CHECK-LABEL: @dupq_neg10( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 1) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) + ret %5 +} + +define @dupq_neg11( %pg) #0 { +; CHECK-LABEL: @dupq_neg11( +; CHECK: 
cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %1 , i64 0) + %3 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %4 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %pg, %2, %3) + ret %4 +} + +define @dupq_neg12() #0 { +; CHECK-LABEL: @dupq_neg12( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 15) + %2 = tail call @llvm.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) + ret %5 +} + +define @dupq_neg13( %x) #0 { +; CHECK-LABEL: @dupq_neg13( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %x) + ret %4 +} + +declare @llvm.aarch64.sve.ptrue.nxv16i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv8i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv4i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv2i1(i32) + +declare @llvm.vector.insert.nxv16i8.v16i8(, <16 x i8>, i64) +declare @llvm.vector.insert.nxv8i16.v8i16(, <8 x i16>, i64) +declare @llvm.vector.insert.nxv4i32.v4i32(, <4 x i32>, i64) +declare @llvm.vector.insert.nxv2i64.v2i64(, <2 x i64>, i64) + +declare @llvm.aarch64.sve.dupq.lane.nxv16i8(, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv8i16(, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv4i32(, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv2i64(, i64) + +declare @llvm.aarch64.sve.cmpne.wide.nxv16i8(, , ) +declare @llvm.aarch64.sve.cmpne.wide.nxv8i16(, , ) +declare @llvm.aarch64.sve.cmpne.wide.nxv4i32(, , ) +declare @llvm.aarch64.sve.cmpne.nxv2i64(, , ) + +declare @llvm.aarch64.sve.dup.x.nxv2i64(i64) + 
+attributes #0 = { "target-features"="+sve" } + From dab19dac94eee19483ba1a7c37bdec4b8501acc3 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 23 Aug 2024 07:34:26 -0700 Subject: [PATCH 325/426] [SLP]Fix a crash for the strided nodes with reversed order and externally used pointer. If the strided node is reversed, need to check for the last instruction, not the first one in the list of scalars, when checking if the root pointer must be extracted. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 27 +++++++--- ...reversed-strided-node-with-external-ptr.ll | 49 +++++++++++++++++++ 2 files changed, 70 insertions(+), 6 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index caee3bf9c958d5..949579772b94d5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1020,6 +1020,8 @@ static bool allSameType(ArrayRef VL) { /// possible scalar operand in vectorized instruction. static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI) { + if (!UserInst) + return false; unsigned Opcode = UserInst->getOpcode(); switch (Opcode) { case Instruction::Load: { @@ -2809,6 +2811,11 @@ class BoUpSLP { /// \ returns the graph entry for the \p Idx operand of the \p E entry. const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const; + /// Gets the root instruction for the given node. If the node is a strided + /// load/store node with the reverse order, the root instruction is the last + /// one. + Instruction *getRootEntryInstruction(const TreeEntry &Entry) const; + /// \returns Cast context for the given graph node. 
TargetTransformInfo::CastContextHint getCastContextHint(const TreeEntry &TE) const; @@ -5987,6 +5994,15 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { VectorizableTree.front()->ReorderIndices.clear(); } +Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const { + if ((Entry.getOpcode() == Instruction::Store || + Entry.getOpcode() == Instruction::Load) && + Entry.State == TreeEntry::StridedVectorize && + !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices)) + return dyn_cast(Entry.Scalars[Entry.ReorderIndices.front()]); + return dyn_cast(Entry.Scalars.front()); +} + void BoUpSLP::buildExternalUses( const ExtraValueToDebugLocsMap &ExternallyUsedValues) { DenseMap ScalarToExtUses; @@ -6036,7 +6052,7 @@ void BoUpSLP::buildExternalUses( // be used. if (UseEntry->State == TreeEntry::ScatterVectorize || !doesInTreeUserNeedToExtract( - Scalar, cast(UseEntry->Scalars.front()), TLI)) { + Scalar, getRootEntryInstruction(*UseEntry), TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); assert(!UseEntry->isGather() && "Bad state"); @@ -8450,8 +8466,8 @@ void BoUpSLP::transformNodes() { Instruction::Store, VecTy, BaseSI->getPointerOperand(), /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI); if (StridedCost < OriginalVecCost) - // Strided load is more profitable than consecutive load + reverse - - // transform the node to strided load. + // Strided store is more profitable than reverse + consecutive store - + // transform the node to strided store. 
E.State = TreeEntry::StridedVectorize; } break; @@ -13776,7 +13792,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign()); } else { assert(E->State == TreeEntry::StridedVectorize && - "Expected either strided or conseutive stores."); + "Expected either strided or consecutive stores."); if (!E->ReorderIndices.empty()) { SI = cast(E->Scalars[E->ReorderIndices.front()]); Ptr = SI->getPointerOperand(); @@ -14380,8 +14396,7 @@ Value *BoUpSLP::vectorizeTree( (E->State == TreeEntry::Vectorize || E->State == TreeEntry::StridedVectorize) && doesInTreeUserNeedToExtract( - Scalar, - cast(UseEntry->Scalars.front()), + Scalar, getRootEntryInstruction(*UseEntry), TLI); })) && "Scalar with nullptr User must be registered in " diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll new file mode 100644 index 00000000000000..3fa42047162e45 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -slp-threshold=-99999 -mtriple=riscv64 -mattr=+v < %s | FileCheck %s + +define void @test(ptr %a, i64 %0) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[A:%.*]], i64 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label %[[BB:.*]] +; CHECK: [[BB]]: +; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], 
i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 0, i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> , <2 x double> poison) +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = fsub <2 x double> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = fsub <2 x double> [[TMP7]], [[TMP10]] +; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.v2f64.p0.i64(<2 x double> [[TMP11]], ptr align 8 [[ARRAYIDX17_I28_1]], i64 -8, <2 x i1> , i32 2) +; CHECK-NEXT: br label %[[BB]] +; +entry: + br label %bb + +bb: + %indvars.iv.next239.i = add i64 0, 0 + %arrayidx.i.1 = getelementptr double, ptr %a, i64 %indvars.iv.next239.i + %1 = load double, ptr %arrayidx.i.1, align 8 + %arrayidx10.i.1 = getelementptr double, ptr %a, i64 %0 + %2 = or disjoint i64 %0, 1 + %arrayidx17.i28.1 = getelementptr double, ptr %a, i64 %2 + %3 = load double, ptr %arrayidx17.i28.1, align 8 + %4 = load double, ptr %a, align 8 + %5 = load double, ptr %a, align 8 + %arrayidx38.i.1 = getelementptr double, ptr %a, i64 1 + %6 = load double, ptr %arrayidx38.i.1, align 8 + %arrayidx41.i.1 = getelementptr double, ptr %a, i64 1 + %7 = load double, ptr %arrayidx41.i.1, align 8 + %sub47.i.1 = fsub double %4, %5 + %sub54.i.1 = fsub double %6, %7 + %sub69.i.1 = fsub double %1, %sub54.i.1 + store double %sub69.i.1, ptr %arrayidx10.i.1, align 8 + %sub72.i.1 = fsub double %3, %sub47.i.1 + store double %sub72.i.1, ptr %arrayidx17.i28.1, align 8 + br label %bb +} From 858afe90aad9ca45165d64baec9249dd680c85d5 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Fri, 23 Aug 2024 16:42:04 +0200 Subject: [PATCH 326/426] Revert "[RISCV] Add isel optimization for (and (sra y, c2), c1) to recover regression from #101751. 
(#104114)" This caused an assert to fire: llvm/include/llvm/Support/Casting.h:566: decltype(auto) llvm::cast(const From &) [To = llvm::ConstantSDNode, From = llvm::SDValue]: Assertion `isa(Val) && "cast() argument of incompatible type!"' failed. see comment on the PR. > If c1 is a shifted mask with c3 leading zeros and c4 trailing zeros. If > c2 is greater than c3, we can use (srli (srai y, c2 - c3), c3 + c4) > followed by a SHXADD with c4 as the X amount. > > Without Zba we can use (slli (srli (srai y, c2 - c3), c3 + c4), c4). > Alive2: https://alive2.llvm.org/ce/z/AwhheR This reverts commit 514481736cf943464125ef34570a7df0a19290de. --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 52 +--------------- llvm/test/CodeGen/RISCV/rv64zba.ll | 66 --------------------- 2 files changed, 2 insertions(+), 116 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 1d0c264ca551d2..11210e6cec177f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -1462,6 +1462,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { const uint64_t C1 = N1C->getZExtValue(); + // Turn (and (sra x, c2), c1) -> (srli (srai x, c2-c3), c3) if c1 is a mask + // with c3 leading zeros and c2 is larger than c3. if (N0.getOpcode() == ISD::SRA && isa(N0.getOperand(1)) && N0.hasOneUse()) { unsigned C2 = N0.getConstantOperandVal(1); @@ -1475,8 +1477,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { X.getOpcode() == ISD::SHL && isa(X.getOperand(1)) && X.getConstantOperandVal(1) == 32; - // Turn (and (sra x, c2), c1) -> (srli (srai x, c2-c3), c3) if c1 is a - // mask with c3 leading zeros and c2 is larger than c3. 
if (isMask_64(C1) && !Skip) { unsigned Leading = XLen - llvm::bit_width(C1); if (C2 > Leading) { @@ -1490,27 +1490,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { return; } } - - // Look for (and (sra y, c2), c1) where c1 is a shifted mask with c3 - // leading zeros and c4 trailing zeros. If c2 is greater than c3, we can - // use (slli (srli (srai y, c2 - c3), c3 + c4), c4). - if (isShiftedMask_64(C1) && !Skip) { - unsigned Leading = XLen - llvm::bit_width(C1); - unsigned Trailing = llvm::countr_zero(C1); - if (C2 > Leading && Leading > 0 && Trailing > 0) { - SDNode *SRAI = CurDAG->getMachineNode( - RISCV::SRAI, DL, VT, N0.getOperand(0), - CurDAG->getTargetConstant(C2 - Leading, DL, VT)); - SDNode *SRLI = CurDAG->getMachineNode( - RISCV::SRLI, DL, VT, SDValue(SRAI, 0), - CurDAG->getTargetConstant(Leading + Trailing, DL, VT)); - SDNode *SLLI = CurDAG->getMachineNode( - RISCV::SLLI, DL, VT, SDValue(SRLI, 0), - CurDAG->getTargetConstant(Trailing, DL, VT)); - ReplaceNode(Node, SLLI); - return; - } - } } // If C1 masks off the upper bits only (but can't be formed as an @@ -3053,33 +3032,6 @@ bool RISCVDAGToDAGISel::selectSHXADDOp(SDValue N, unsigned ShAmt, return true; } } - } else if (N0.getOpcode() == ISD::SRA && N0.hasOneUse() && - isa(N.getOperand(1))) { - uint64_t Mask = N.getConstantOperandVal(1); - unsigned C2 = N0.getConstantOperandVal(1); - - // Look for (and (sra y, c2), c1) where c1 is a shifted mask with c3 - // leading zeros and c4 trailing zeros. If c2 is greater than c3, we can - // use (srli (srai y, c2 - c3), c3 + c4) followed by a SHXADD with c4 as - // the X amount. 
- if (isShiftedMask_64(Mask)) { - unsigned XLen = Subtarget->getXLen(); - unsigned Leading = XLen - llvm::bit_width(Mask); - unsigned Trailing = llvm::countr_zero(Mask); - if (C2 > Leading && Leading > 0 && Trailing == ShAmt) { - SDLoc DL(N); - EVT VT = N.getValueType(); - Val = SDValue(CurDAG->getMachineNode( - RISCV::SRAI, DL, VT, N0.getOperand(0), - CurDAG->getTargetConstant(C2 - Leading, DL, VT)), - 0); - Val = SDValue(CurDAG->getMachineNode( - RISCV::SRLI, DL, VT, Val, - CurDAG->getTargetConstant(Leading + ShAmt, DL, VT)), - 0); - return true; - } - } } } else if (bool LeftShift = N.getOpcode() == ISD::SHL; (LeftShift || N.getOpcode() == ISD::SRL) && diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index 62595fd4a7ad69..87796e2c7b72e9 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -2988,69 +2988,3 @@ entry: %2 = and i64 %1, 34359738360 ret i64 %2 } - -define ptr @srai_srli_sh3add(ptr %0, i64 %1) nounwind { -; RV64I-LABEL: srai_srli_sh3add: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: srai a1, a1, 32 -; RV64I-NEXT: srli a1, a1, 6 -; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: ret -; -; RV64ZBA-LABEL: srai_srli_sh3add: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: srai a1, a1, 32 -; RV64ZBA-NEXT: srli a1, a1, 6 -; RV64ZBA-NEXT: sh3add a0, a1, a0 -; RV64ZBA-NEXT: ret -entry: - %2 = ashr i64 %1, 32 - %3 = lshr i64 %2, 6 - %4 = getelementptr i64, ptr %0, i64 %3 - ret ptr %4 -} - -define ptr @srai_srli_slli(ptr %0, i64 %1) nounwind { -; CHECK-LABEL: srai_srli_slli: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: srai a1, a1, 32 -; CHECK-NEXT: srli a1, a1, 6 -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: ret -entry: - %2 = ashr i64 %1, 32 - %3 = lshr i64 %2, 6 - %4 = getelementptr i128, ptr %0, i64 %3 - ret ptr %4 -} - -; Negative to make sure the peephole added for srai_srli_slli and -; srai_srli_sh3add doesn't break this. 
-define i64 @srai_andi(i64 %x) nounwind { -; CHECK-LABEL: srai_andi: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: srai a0, a0, 8 -; CHECK-NEXT: andi a0, a0, -8 -; CHECK-NEXT: ret -entry: - %y = ashr i64 %x, 8 - %z = and i64 %y, -8 - ret i64 %z -} - -; Negative to make sure the peephole added for srai_srli_slli and -; srai_srli_sh3add doesn't break this. -define i64 @srai_lui_and(i64 %x) nounwind { -; CHECK-LABEL: srai_lui_and: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: srai a0, a0, 8 -; CHECK-NEXT: lui a1, 1048574 -; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: ret -entry: - %y = ashr i64 %x, 8 - %z = and i64 %y, -8192 - ret i64 %z -} From 05ce95ef0412ba8b3e3189db5ed130a9949bbefd Mon Sep 17 00:00:00 2001 From: Edd Dawson Date: Fri, 23 Aug 2024 15:58:00 +0100 Subject: [PATCH 327/426] [PS5][clang][test] x86_64-scei-ps5 -> x86_64-sie-ps5 in tests (#105810) `x86_64-sie-ps5` is the triple we share with PS5 toolchain users who have reason to care about such things. The vast majority of PS5 checks and tests already use this variant. Quashing the handful of stragglers will help prevent future copy+paste of the discouraged variant. --- clang/test/CodeGen/tls-maxalign-modflag.c | 2 +- .../CodeGenCXX/windows-itanium-init-guard.cpp | 6 +++--- clang/test/Driver/debug-options.c | 4 ++-- clang/test/Driver/ps5-linker.c | 20 +++++++++---------- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/clang/test/CodeGen/tls-maxalign-modflag.c b/clang/test/CodeGen/tls-maxalign-modflag.c index 685057c3551a00..26dde569f389c9 100644 --- a/clang/test/CodeGen/tls-maxalign-modflag.c +++ b/clang/test/CodeGen/tls-maxalign-modflag.c @@ -2,7 +2,7 @@ // Test that we get the module flag TLSMaxAlign on the PS platforms. 
// RUN: %clang_cc1 -triple x86_64-scei-ps4 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-scei-ps5 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-sie-ps5 -emit-llvm -o - %s | FileCheck %s int main(void) { return 0; diff --git a/clang/test/CodeGenCXX/windows-itanium-init-guard.cpp b/clang/test/CodeGenCXX/windows-itanium-init-guard.cpp index 8bcfd272ae8f16..c51ce470061d64 100644 --- a/clang/test/CodeGenCXX/windows-itanium-init-guard.cpp +++ b/clang/test/CodeGenCXX/windows-itanium-init-guard.cpp @@ -11,9 +11,9 @@ // RUN: %clang_cc1 -emit-llvm -triple x86_64-scei-ps4 -fdeclspec %s -O1 -disable-llvm-passes -o - -DAPI="__declspec(dllexport)" | FileCheck %s --check-prefixes=EXPORT // RUN: %clang_cc1 -emit-llvm -triple x86_64-scei-ps4 -fdeclspec %s -O1 -disable-llvm-passes -o - -DAPI="__declspec(dllimport)" | FileCheck %s --check-prefixes=IMPORT -// RUN: %clang_cc1 -emit-llvm -triple x86_64-scei-ps5 -fdeclspec %s -O1 -disable-llvm-passes -o - -DAPI= | FileCheck %s --check-prefixes=NONE -// RUN: %clang_cc1 -emit-llvm -triple x86_64-scei-ps5 -fdeclspec %s -O1 -disable-llvm-passes -o - -DAPI="__declspec(dllexport)" | FileCheck %s --check-prefixes=EXPORT -// RUN: %clang_cc1 -emit-llvm -triple x86_64-scei-ps5 -fdeclspec %s -O1 -disable-llvm-passes -o - -DAPI="__declspec(dllimport)" | FileCheck %s --check-prefixes=IMPORT +// RUN: %clang_cc1 -emit-llvm -triple x86_64-sie-ps5 -fdeclspec %s -O1 -disable-llvm-passes -o - -DAPI= | FileCheck %s --check-prefixes=NONE +// RUN: %clang_cc1 -emit-llvm -triple x86_64-sie-ps5 -fdeclspec %s -O1 -disable-llvm-passes -o - -DAPI="__declspec(dllexport)" | FileCheck %s --check-prefixes=EXPORT +// RUN: %clang_cc1 -emit-llvm -triple x86_64-sie-ps5 -fdeclspec %s -O1 -disable-llvm-passes -o - -DAPI="__declspec(dllimport)" | FileCheck %s --check-prefixes=IMPORT //NONE: @_ZZN3foo3GetEvE9Singleton = linkonce_odr {{(dso_local )?}}global //NONE: @_ZGVZN3foo3GetEvE9Singleton = linkonce_odr {{(dso_local )?}}global 
diff --git a/clang/test/Driver/debug-options.c b/clang/test/Driver/debug-options.c index 21785ba01cb410..73f2f402efa97a 100644 --- a/clang/test/Driver/debug-options.c +++ b/clang/test/Driver/debug-options.c @@ -138,9 +138,9 @@ // RUN: | FileCheck -check-prefix=LDGARANGE %s // RUN: %clang -### %s -g -flto=full -target x86_64-scei-ps4 2>&1 \ // RUN: | FileCheck -check-prefix=LDGARANGE %s -// RUN: %clang -### %s -g -flto -target x86_64-scei-ps5 2>&1 \ +// RUN: %clang -### %s -g -flto -target x86_64-sie-ps5 2>&1 \ // RUN: | FileCheck -check-prefix=LDGARANGE %s -// RUN: %clang -### %s -g -target x86_64-scei-ps5 2>&1 \ +// RUN: %clang -### %s -g -target x86_64-sie-ps5 2>&1 \ // RUN: | FileCheck -check-prefix=LDGARANGE %s // On the AIX, -g defaults to limited debug info. diff --git a/clang/test/Driver/ps5-linker.c b/clang/test/Driver/ps5-linker.c index c462e5a178e4a6..84363deb0337f7 100644 --- a/clang/test/Driver/ps5-linker.c +++ b/clang/test/Driver/ps5-linker.c @@ -1,14 +1,14 @@ // Test that PIE is the default for main components -// RUN: %clang --target=x86_64-scei-ps5 %s -### 2>&1 | FileCheck --check-prefixes=CHECK-PIE %s +// RUN: %clang --target=x86_64-sie-ps5 %s -### 2>&1 | FileCheck --check-prefixes=CHECK-PIE %s // CHECK-PIE: {{ld(\.exe)?}}" // CHECK-PIE-SAME: "-pie" -// RUN: %clang --target=x86_64-scei-ps5 -no-pie %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-PIE %s -// RUN: %clang --target=x86_64-scei-ps5 -r %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-PIE %s -// RUN: %clang --target=x86_64-scei-ps5 -shared %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-PIE,CHECK-SHARED %s -// RUN: %clang --target=x86_64-scei-ps5 -static %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-PIE %s +// RUN: %clang --target=x86_64-sie-ps5 -no-pie %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-PIE %s +// RUN: %clang --target=x86_64-sie-ps5 -r %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-PIE %s +// RUN: %clang --target=x86_64-sie-ps5 -shared %s -### 2>&1 | 
FileCheck --check-prefixes=CHECK-NO-PIE,CHECK-SHARED %s +// RUN: %clang --target=x86_64-sie-ps5 -static %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-PIE %s // CHECK-NO-PIE: {{ld(\.exe)?}}" // CHECK-NO-PIE-NOT: "-pie" @@ -16,15 +16,15 @@ // Test that -static is forwarded to the linker -// RUN: %clang --target=x86_64-scei-ps5 -static %s -### 2>&1 | FileCheck --check-prefixes=CHECK-STATIC %s +// RUN: %clang --target=x86_64-sie-ps5 -static %s -### 2>&1 | FileCheck --check-prefixes=CHECK-STATIC %s // CHECK-STATIC: {{ld(\.exe)?}}" // CHECK-STATIC-SAME: "-static" // Test the driver's control over the JustMyCode behavior with linker flags. -// RUN: %clang --target=x86_64-scei-ps5 -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-LIB %s -// RUN: %clang --target=x86_64-scei-ps5 -flto -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-LIB %s +// RUN: %clang --target=x86_64-sie-ps5 -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-LIB %s +// RUN: %clang --target=x86_64-sie-ps5 -flto -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-LIB %s // CHECK: -plugin-opt=-enable-jmc-instrument @@ -33,7 +33,7 @@ // Test the driver's control over the -fcrash-diagnostics-dir behavior with linker flags. 
-// RUN: %clang --target=x86_64-scei-ps5 -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG %s -// RUN: %clang --target=x86_64-scei-ps5 -flto -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG %s +// RUN: %clang --target=x86_64-sie-ps5 -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG %s +// RUN: %clang --target=x86_64-sie-ps5 -flto -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG %s // CHECK-DIAG: -plugin-opt=-crash-diagnostics-dir=mydumps From 885c4365c1e8b80bdbbdfecf9b6d436e96be52ac Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 23 Aug 2024 15:58:28 +0100 Subject: [PATCH 328/426] [VPlan] Skip branches marked as dead in cost precomputation. Don't consider the cost of branches marked to be skipped in VPlan cost pre-computation. Those aren't included in the legacy cost, so they should not be included in the VPlan cast. --- .../Transforms/Vectorize/LoopVectorize.cpp | 4 +- .../AArch64/conditional-branches-cost.ll | 152 ++++++++++++++++++ 2 files changed, 155 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f3fb888f20cbbd..b12121d4688c65 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7215,9 +7215,11 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, // TODO: Compute cost of branches for each replicate region in the VPlan, // which is more accurate than the legacy cost model. 
for (BasicBlock *BB : OrigLoop->blocks()) { - if (BB == OrigLoop->getLoopLatch()) + if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector())) continue; CostCtx.SkipCostComputation.insert(BB->getTerminator()); + if (BB == OrigLoop->getLoopLatch()) + continue; auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF); Cost += BranchCost; } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 78452a9c884eed..9910be7224674c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -1697,6 +1697,154 @@ exit: ret void } +define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize { +; DEFAULT-LABEL: define void @redundant_branch_and_tail_folding( +; DEFAULT-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; DEFAULT: vector.ph: +; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; DEFAULT: vector.body: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] +; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], +; DEFAULT-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], +; DEFAULT-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32> +; DEFAULT-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; DEFAULT-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; DEFAULT: pred.store.if: +; DEFAULT-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DST]], align 4 +; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE]] +; DEFAULT: 
pred.store.continue: +; DEFAULT-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; DEFAULT-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] +; DEFAULT: pred.store.if1: +; DEFAULT-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; DEFAULT-NEXT: store i32 [[TMP6]], ptr [[DST]], align 4 +; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE2]] +; DEFAULT: pred.store.continue2: +; DEFAULT-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; DEFAULT-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; DEFAULT: pred.store.if3: +; DEFAULT-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; DEFAULT-NEXT: store i32 [[TMP8]], ptr [[DST]], align 4 +; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE4]] +; DEFAULT: pred.store.continue4: +; DEFAULT-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; DEFAULT-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] +; DEFAULT: pred.store.if5: +; DEFAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; DEFAULT-NEXT: store i32 [[TMP10]], ptr [[DST]], align 4 +; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE6]] +; DEFAULT: pred.store.continue6: +; DEFAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 +; DEFAULT-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; DEFAULT: middle.block: +; DEFAULT-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; DEFAULT: scalar.ph: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; DEFAULT-NEXT: br label [[LOOP_HEADER:%.*]] +; DEFAULT: loop.header: +; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; DEFAULT-NEXT: br i1 [[C]], label 
[[LOOP_LATCH]], label [[THEN:%.*]] +; DEFAULT: then: +; DEFAULT-NEXT: br label [[LOOP_LATCH]] +; DEFAULT: loop.latch: +; DEFAULT-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; DEFAULT-NEXT: [[T:%.*]] = trunc nuw nsw i64 [[IV_NEXT]] to i32 +; DEFAULT-NEXT: store i32 [[T]], ptr [[DST]], align 4 +; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 21 +; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP31:![0-9]+]] +; DEFAULT: exit: +; DEFAULT-NEXT: ret void +; +; PRED-LABEL: define void @redundant_branch_and_tail_folding( +; PRED-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] { +; PRED-NEXT: entry: +; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; PRED: vector.ph: +; PRED-NEXT: br label [[VECTOR_BODY:%.*]] +; PRED: vector.body: +; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] +; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] +; PRED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], +; PRED-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], +; PRED-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32> +; PRED-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; PRED-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; PRED: pred.store.if: +; PRED-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; PRED-NEXT: store i32 [[TMP4]], ptr [[DST]], align 4 +; PRED-NEXT: br label [[PRED_STORE_CONTINUE]] +; PRED: pred.store.continue: +; PRED-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; PRED-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] +; PRED: pred.store.if1: +; PRED-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; PRED-NEXT: store i32 [[TMP6]], ptr [[DST]], align 4 +; PRED-NEXT: br label [[PRED_STORE_CONTINUE2]] +; PRED: 
pred.store.continue2: +; PRED-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; PRED-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; PRED: pred.store.if3: +; PRED-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; PRED-NEXT: store i32 [[TMP8]], ptr [[DST]], align 4 +; PRED-NEXT: br label [[PRED_STORE_CONTINUE4]] +; PRED: pred.store.continue4: +; PRED-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; PRED-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] +; PRED: pred.store.if5: +; PRED-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; PRED-NEXT: store i32 [[TMP10]], ptr [[DST]], align 4 +; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]] +; PRED: pred.store.continue6: +; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; PRED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 +; PRED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; PRED: middle.block: +; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; PRED: scalar.ph: +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; PRED-NEXT: br label [[LOOP_HEADER:%.*]] +; PRED: loop.header: +; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; PRED-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; PRED: then: +; PRED-NEXT: br label [[LOOP_LATCH]] +; PRED: loop.latch: +; PRED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; PRED-NEXT: [[T:%.*]] = trunc nuw nsw i64 [[IV_NEXT]] to i32 +; PRED-NEXT: store i32 [[T]], ptr [[DST]], align 4 +; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 21 +; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP28:![0-9]+]] +; PRED: exit: +; PRED-NEXT: ret void +; +entry: + br label 
%loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %c, label %loop.latch, label %then + +then: + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i64 %iv, 1 + %t = trunc nuw nsw i64 %iv.next to i32 + store i32 %t, ptr %dst, align 4 + %ec = icmp eq i64 %iv.next, 21 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.fmuladd.f32(float, float, float) #1 @@ -1734,6 +1882,8 @@ attributes #2 = { vscale_range(2,2) "target-cpu"="neoverse-512tvb" } ; DEFAULT: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]} ; DEFAULT: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]} ; DEFAULT: [[LOOP29]] = distinct !{[[LOOP29]], [[META1]]} +; DEFAULT: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]} +; DEFAULT: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]} ;. ; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -1762,4 +1912,6 @@ attributes #2 = { vscale_range(2,2) "target-cpu"="neoverse-512tvb" } ; PRED: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]} ; PRED: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]], [[META2]]} ; PRED: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]]} +; PRED: [[LOOP27]] = distinct !{[[LOOP27]], [[META1]], [[META2]]} +; PRED: [[LOOP28]] = distinct !{[[LOOP28]], [[META2]], [[META1]]} ;. From 6a8f73803a32db75d22490d341bf8744722a9025 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Fri, 23 Aug 2024 17:01:04 +0200 Subject: [PATCH 329/426] Revert "Reland "[asan] Remove debug tracing from `report_globals` (#104404)" (#105601)" that change still breaks SanitizerCommon-asan-x86_64-Darwin :: Darwin/print-stack-trace-in-code-loaded-after-fork.cpp > This reverts commit 2704b804bec50c2b016bf678bd534c330ec655b6 > and relands #104404. 
> > The Darwin should not fail after #105599. This reverts commit 8c6f8c29e90666b747fc4b4612647554206a2be5. --- compiler-rt/lib/asan/asan_flags.inc | 7 +++++-- compiler-rt/lib/asan/asan_globals.cpp | 19 +++++++++++-------- .../Linux/initialization-nobug-lld.cpp | 2 +- .../Linux/odr_indicator_unregister.cpp | 2 +- .../asan/TestCases/Linux/odr_indicators.cpp | 4 ++-- .../TestCases/Windows/dll_global_dead_strip.c | 4 ++-- ...eport_globals_symbolization_at_startup.cpp | 2 +- .../TestCases/Windows/global_dead_strip.c | 4 ++-- .../Windows/report_globals_vs_freelibrary.cpp | 2 +- .../asan/TestCases/initialization-nobug.cpp | 8 ++++---- 10 files changed, 30 insertions(+), 24 deletions(-) diff --git a/compiler-rt/lib/asan/asan_flags.inc b/compiler-rt/lib/asan/asan_flags.inc index 5e0ced9706e664..fad1577d912a5e 100644 --- a/compiler-rt/lib/asan/asan_flags.inc +++ b/compiler-rt/lib/asan/asan_flags.inc @@ -36,8 +36,11 @@ ASAN_FLAG(int, max_redzone, 2048, ASAN_FLAG( bool, debug, false, "If set, prints some debugging information and does additional checks.") -ASAN_FLAG(bool, report_globals, true, - "If set, detect and report errors on globals .") +ASAN_FLAG( + int, report_globals, 1, + "Controls the way to handle globals (0 - don't detect buffer overflow on " + "globals, 1 - detect buffer overflow, 2 - print data about registered " + "globals).") ASAN_FLAG(bool, check_initialization_order, false, "If set, attempts to catch initialization order issues.") ASAN_FLAG( diff --git a/compiler-rt/lib/asan/asan_globals.cpp b/compiler-rt/lib/asan/asan_globals.cpp index a1211430b1268a..c83b782cb85f89 100644 --- a/compiler-rt/lib/asan/asan_globals.cpp +++ b/compiler-rt/lib/asan/asan_globals.cpp @@ -22,7 +22,6 @@ #include "asan_thread.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_dense_map.h" -#include "sanitizer_common/sanitizer_internal_defs.h" #include "sanitizer_common/sanitizer_list.h" #include "sanitizer_common/sanitizer_mutex.h" #include 
"sanitizer_common/sanitizer_placement_new.h" @@ -180,7 +179,7 @@ int GetGlobalsForAddress(uptr addr, Global *globals, u32 *reg_sites, int res = 0; for (const auto &l : list_of_all_globals) { const Global &g = *l.g; - if (UNLIKELY(common_flags()->verbosity >= 3)) + if (flags()->report_globals >= 2) ReportGlobal(g, "Search"); if (IsAddressNearGlobal(addr, g)) { internal_memcpy(&globals[res], &g, sizeof(g)); @@ -271,7 +270,7 @@ static inline bool UseODRIndicator(const Global *g) { // so we store the globals in a map. static void RegisterGlobal(const Global *g) SANITIZER_REQUIRES(mu_for_globals) { CHECK(AsanInited()); - if (UNLIKELY(common_flags()->verbosity >= 3)) + if (flags()->report_globals >= 2) ReportGlobal(*g, "Added"); CHECK(flags()->report_globals); CHECK(AddrIsInMem(g->beg)); @@ -308,7 +307,7 @@ static void RegisterGlobal(const Global *g) SANITIZER_REQUIRES(mu_for_globals) { static void UnregisterGlobal(const Global *g) SANITIZER_REQUIRES(mu_for_globals) { CHECK(AsanInited()); - if (UNLIKELY(common_flags()->verbosity >= 3)) + if (flags()->report_globals >= 2) ReportGlobal(*g, "Removed"); CHECK(flags()->report_globals); CHECK(AddrIsInMem(g->beg)); @@ -439,7 +438,7 @@ void __asan_register_globals(__asan_global *globals, uptr n) { } GlobalRegistrationSite site = {stack_id, &globals[0], &globals[n - 1]}; global_registration_site_vector->push_back(site); - if (UNLIKELY(common_flags()->verbosity >= 3)) { + if (flags()->report_globals >= 2) { PRINT_CURRENT_STACK(); Printf("=== ID %d; %p %p\n", stack_id, (void *)&globals[0], (void *)&globals[n - 1]); @@ -498,7 +497,9 @@ void __asan_before_dynamic_init(const char *module_name) { Lock lock(&mu_for_globals); if (current_dynamic_init_module_name == module_name) return; - VPrintf(2, "DynInitPoison module: %s\n", module_name); + if (flags()->report_globals >= 3) + Printf("DynInitPoison module: %s\n", module_name); + if (current_dynamic_init_module_name == nullptr) { // First call, poison all globals from other modules. 
DynInitGlobals().forEach([&](auto &kv) { @@ -544,7 +545,8 @@ static void UnpoisonBeforeMain(void) { return; allow_after_dynamic_init = true; } - VPrintf(2, "UnpoisonBeforeMain\n"); + if (flags()->report_globals >= 3) + Printf("UnpoisonBeforeMain\n"); __asan_after_dynamic_init(); } @@ -568,7 +570,8 @@ void __asan_after_dynamic_init() { if (!current_dynamic_init_module_name) return; - VPrintf(2, "DynInitUnpoison\n"); + if (flags()->report_globals >= 3) + Printf("DynInitUnpoison\n"); DynInitGlobals().forEach([&](auto &kv) { UnpoisonDynamicGlobals(kv.second, /*mark_initialized=*/false); diff --git a/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp b/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp index ef82c7a29575eb..5cec029811cbc8 100644 --- a/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx_asan -O3 %S/../initialization-nobug.cpp %S/../Helpers/initialization-nobug-extra.cpp -fuse-ld=lld -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInit" +// RUN: %clangxx_asan -O3 %S/../initialization-nobug.cpp %S/../Helpers/initialization-nobug-extra.cpp -fuse-ld=lld -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInit" // Same as initialization-nobug.cpp, but with lld we expect just one // `DynInitUnpoison` executed after `AfterDynamicInit` at the end. 
diff --git a/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp b/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp index b75f5be101ef8a..0f2ed6597154bb 100644 --- a/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp @@ -4,7 +4,7 @@ // RUN: %clangxx_asan -g -O0 -DSHARED_LIB -DSIZE=1 %s -fPIC -shared -o %t-so-1.so // RUN: %clangxx_asan -g -O0 -DSHARED_LIB -DSIZE=2 %s -fPIC -shared -o %t-so-2.so // RUN: %clangxx_asan -g -O0 %s %libdl -Wl,--export-dynamic -o %t -// RUN: %env_asan_opts=report_globals=1:detect_odr_violation=1:verbosity=3 %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=report_globals=2:detect_odr_violation=1 %run %t 2>&1 | FileCheck %s // FIXME: Checks do not match on Android. // UNSUPPORTED: android diff --git a/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp b/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp index f28a9f6d07386d..8af3ec09be78c4 100644 --- a/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp @@ -1,8 +1,8 @@ // RUN: %clangxx_asan -fno-sanitize-address-use-odr-indicator -fPIC %s -o %t -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR0 +// RUN: %env_asan_opts=report_globals=2 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR0 // RUN: %clangxx_asan -fsanitize-address-use-odr-indicator -fPIC %s -o %t -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR1 +// RUN: %env_asan_opts=report_globals=2 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR1 #include diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c b/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c index e5bd27bdf65fdf..a0c96622efeea4 100644 --- 
a/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c +++ b/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c @@ -1,11 +1,11 @@ // RUN: %clang_cl_asan %Od %p/dll_host.cpp %Fe%t // // RUN: %clang_cl_nocxx_asan %Gw %LD %Od %s %Fe%t.dll -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=NOSTRIP +// RUN: %env_asan_opts=report_globals=2 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=NOSTRIP // RUN: %clang_cl_nocxx_asan %Gw %LD -O2 %s %Fe%t.dll \ // RUN: %if target={{.*-windows-gnu}} %{ -Wl,--gc-sections %} \ // RUN: %else %{ -link -opt:ref %} -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=STRIP +// RUN: %env_asan_opts=report_globals=2 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=STRIP #include diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp index c74b66f2b43b3e..06a632e6708b1e 100644 --- a/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp +++ b/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cl_asan %LD %Od -DDLL %s %Fe%t.dll \ // RUN: %if target={{.*-windows-gnu}} %{ -Wl,--out-implib,%t.lib %} // RUN: %clang_cl_asan %Od -DEXE %s %t.lib %Fe%te.exe -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %te.exe 2>&1 | FileCheck %s +// RUN: %env_asan_opts=report_globals=2 %run %te.exe 2>&1 | FileCheck %s // FIXME: Currently, the MT runtime build crashes on startup due to dbghelp.dll // initialization failure. 
diff --git a/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c b/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c index 7f2405fdfc8364..0e15120a46f776 100644 --- a/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c +++ b/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c @@ -1,9 +1,9 @@ // RUN: %clang_cl_nocxx_asan %Gw %Od %s %Fe%t.exe -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %t.exe 2>&1 | FileCheck %s --check-prefix=NOSTRIP +// RUN: %env_asan_opts=report_globals=2 %t.exe 2>&1 | FileCheck %s --check-prefix=NOSTRIP // RUN: %clang_cl_nocxx_asan %Gw -O2 %s %Fe%t.exe \ // RUN: %if target={{.*-windows-gnu}} %{ -Wl,--gc-sections %} \ // RUN: %else %{ -link -opt:ref %} -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %t.exe 2>&1 | FileCheck %s --check-prefix=STRIP +// RUN: %env_asan_opts=report_globals=2 %t.exe 2>&1 | FileCheck %s --check-prefix=STRIP #include int dead_global = 42; diff --git a/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp b/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp index 34ce18e146d677..7cad3f39be1ec2 100644 --- a/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp +++ b/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp @@ -1,6 +1,6 @@ // RUN: %clang_cl_asan %LD %Od -DDLL %s %Fe%t.dll // RUN: %clang_cl_asan %Od -DEXE %s %Fe%te.exe -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %te.exe %t.dll 2>&1 | FileCheck %s +// RUN: %env_asan_opts=report_globals=2 %run %te.exe %t.dll 2>&1 | FileCheck %s #include #include diff --git a/compiler-rt/test/asan/TestCases/initialization-nobug.cpp b/compiler-rt/test/asan/TestCases/initialization-nobug.cpp index 61328b9de28ae6..f66d501124bc48 100644 --- a/compiler-rt/test/asan/TestCases/initialization-nobug.cpp +++ b/compiler-rt/test/asan/TestCases/initialization-nobug.cpp @@ -1,10 +1,10 @@ // A collection of various initializers which shouldn't trip up 
initialization // order checking. If successful, this will just return 0. -// RUN: %clangxx_asan -O0 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" -// RUN: %clangxx_asan -O1 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" -// RUN: %clangxx_asan -O2 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" -// RUN: %clangxx_asan -O3 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O0 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O1 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O2 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O3 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" // Simple access: // Make sure that accessing a global in the same TU is safe From f77e8f765e425a575516c16e7034cb448d270fcc Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Fri, 23 Aug 2024 
08:16:52 -0700 Subject: [PATCH 330/426] [clang][rtsan] Reland realtime sanitizer codegen and driver (#102622) This reverts commit a1e9b7e646b76bf844e8a9a101ebd27de11992ff This relands commit d010ec6af8162a8ae4e42d2cac5282f83db0ce07 No modifications from the original patch. It was determined that the ubsan build failure was happening even after the revert, some examples: https://lab.llvm.org/buildbot/#/builders/159/builds/4477 https://lab.llvm.org/buildbot/#/builders/159/builds/4478 https://lab.llvm.org/buildbot/#/builders/159/builds/4479 --- clang/docs/RealtimeSanitizer.rst | 85 +++++++++++++++++++ clang/docs/ReleaseNotes.rst | 5 ++ clang/docs/UsersManual.rst | 2 + clang/docs/index.rst | 1 + clang/include/clang/Basic/Sanitizers.def | 3 + clang/include/clang/Driver/SanitizerArgs.h | 1 + clang/lib/CodeGen/BackendUtil.cpp | 8 ++ clang/lib/CodeGen/CodeGenFunction.cpp | 7 ++ clang/lib/Driver/SanitizerArgs.cpp | 14 +-- clang/lib/Driver/ToolChains/CommonArgs.cpp | 6 ++ clang/lib/Driver/ToolChains/Darwin.cpp | 8 ++ clang/lib/Driver/ToolChains/Linux.cpp | 1 + clang/test/CodeGen/rtsan_attribute_inserted.c | 7 ++ .../test/CodeGen/rtsan_entry_exit_insertion.c | 13 +++ .../rtsan_no_attribute_sanitizer_disabled.c | 6 ++ clang/test/Driver/fsanitize.c | 46 ++++++++++ 16 files changed, 208 insertions(+), 5 deletions(-) create mode 100644 clang/docs/RealtimeSanitizer.rst create mode 100644 clang/test/CodeGen/rtsan_attribute_inserted.c create mode 100644 clang/test/CodeGen/rtsan_entry_exit_insertion.c create mode 100644 clang/test/CodeGen/rtsan_no_attribute_sanitizer_disabled.c diff --git a/clang/docs/RealtimeSanitizer.rst b/clang/docs/RealtimeSanitizer.rst new file mode 100644 index 00000000000000..799cd43509c6e6 --- /dev/null +++ b/clang/docs/RealtimeSanitizer.rst @@ -0,0 +1,85 @@ +================= +RealtimeSanitizer +================= + +.. contents:: + :local: + +Introduction +============ +RealtimeSanitizer (a.k.a. 
RTSan) is a real-time safety testing tool for C and C++ +projects. RTSan can be used to detect real-time violations, i.e. calls to methods +that are not safe for use in functions with deterministic runtime requirements. +RTSan considers any function marked with the ``[[clang::nonblocking]]`` attribute +to be a real-time function. If RTSan detects a call to ``malloc``, ``free``, +``pthread_mutex_lock``, or anything else that could have a non-deterministic +execution time in a function marked ``[[clang::nonblocking]]`` +RTSan raises an error. + +The runtime slowdown introduced by RealtimeSanitizer is negligible. + +How to build +============ + +Build LLVM/Clang with `CMake ` and enable the +``compiler-rt`` runtime. An example CMake configuration that will allow for the +use/testing of RealtimeSanitizer: + +.. code-block:: console + + $ cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="compiler-rt" /llvm + +Usage +===== + +There are two requirements: + +1. The code must be compiled with the ``-fsanitize=realtime`` flag. +2. Functions that are subject to real-time constraints must be marked + with the ``[[clang::nonblocking]]`` attribute. + +Typically, these attributes should be added onto the functions that are entry +points for threads with real-time priority. These threads are subject to a fixed +callback time, such as audio callback threads or rendering loops in video game +code. + +.. code-block:: console + + % cat example_realtime_violation.cpp + #include + + void violation() [[clang::nonblocking]]{ + std::vector v; + v.resize(100); + } + + int main() { + violation(); + return 0; + } + # Compile and link + % clang++ -fsanitize=realtime -g example_realtime_violation.cpp + +If a real-time safety violation is detected in a ``[[clang::nonblocking]]`` +context, or any function invoked by that function, the program will exit with a +non-zero exit code. + +.. 
code-block:: console + + % clang++ -fsanitize=realtime -g example_realtime_violation.cpp + % ./a.out + Real-time violation: intercepted call to real-time unsafe function `malloc` in real-time context! Stack trace: + #0 0x000102893034 in __rtsan::PrintStackTrace() rtsan_stack.cpp:45 + #1 0x000102892e64 in __rtsan::Context::ExpectNotRealtime(char const*) rtsan_context.cpp:78 + #2 0x00010289397c in malloc rtsan_interceptors.cpp:286 + #3 0x000195bd7bd0 in operator new(unsigned long)+0x1c (libc++abi.dylib:arm64+0x16bd0) + #4 0x5c7f00010230f07c () + #5 0x00010230f058 in std::__1::__libcpp_allocate[abi:ue170006](unsigned long, unsigned long) new:324 + #6 0x00010230effc in std::__1::allocator::allocate[abi:ue170006](unsigned long) allocator.h:114 + ... snip ... + #10 0x00010230e4bc in std::__1::vector>::__append(unsigned long) vector:1162 + #11 0x00010230dcdc in std::__1::vector>::resize(unsigned long) vector:1981 + #12 0x00010230dc28 in violation() main.cpp:5 + #13 0x00010230dd64 in main main.cpp:9 + #14 0x0001958960dc () + #15 0x2f557ffffffffffc () diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 70ff5dedab217f..fb3c2f699964c3 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -444,6 +444,11 @@ Moved checkers Sanitizers ---------- +- Introduced Realtime Sanitizer, activated by using the -fsanitize=realtime + flag. This sanitizer detects unsafe system library calls, such as memory + allocations and mutex locks. If any such function is called during invocation + of a function marked with the ``[[clang::nonblocking]]`` attribute, an error + is printed to the console and the process exits non-zero. - Added the ``-fsanitize-undefined-ignore-overflow-pattern`` flag which can be used to disable specific overflow-dependent code patterns. 
The supported diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index d19b77ae40b0d7..069ecba875cd59 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2068,6 +2068,8 @@ are listed below. integrity. - ``-fsanitize=safe-stack``: :doc:`safe stack ` protection against stack-based memory corruption errors. + - ``-fsanitize=realtime``: :doc:`RealtimeSanitizer`, + a real-time safety checker. There are more fine-grained checks available: see the :ref:`list ` of specific kinds of diff --git a/clang/docs/index.rst b/clang/docs/index.rst index 9bae0bd83243bd..4a497f4d9bcc3c 100644 --- a/clang/docs/index.rst +++ b/clang/docs/index.rst @@ -32,6 +32,7 @@ Using Clang as a Compiler UndefinedBehaviorSanitizer DataFlowSanitizer LeakSanitizer + RealtimeSanitizer SanitizerCoverage SanitizerStats SanitizerSpecialCaseList diff --git a/clang/include/clang/Basic/Sanitizers.def b/clang/include/clang/Basic/Sanitizers.def index bee35e9dca7c39..9223f62b3639a7 100644 --- a/clang/include/clang/Basic/Sanitizers.def +++ b/clang/include/clang/Basic/Sanitizers.def @@ -79,6 +79,9 @@ SANITIZER("thread", Thread) // Numerical stability sanitizer. 
SANITIZER("numerical", NumericalStability) +// RealtimeSanitizer +SANITIZER("realtime", Realtime) + // LeakSanitizer SANITIZER("leak", Leak) diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h index e64ec463ca8907..0c6f3869549ef7 100644 --- a/clang/include/clang/Driver/SanitizerArgs.h +++ b/clang/include/clang/Driver/SanitizerArgs.h @@ -107,6 +107,7 @@ class SanitizerArgs { bool needsNsanRt() const { return Sanitizers.has(SanitizerKind::NumericalStability); } + bool needsRtsanRt() const { return Sanitizers.has(SanitizerKind::Realtime); } bool hasMemTag() const { return hasMemtagHeap() || hasMemtagStack() || hasMemtagGlobals(); diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index fdd89edd72e109..026f16484c0949 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -78,6 +78,7 @@ #include "llvm/Transforms/Instrumentation/MemorySanitizer.h" #include "llvm/Transforms/Instrumentation/NumericalStabilitySanitizer.h" #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" +#include "llvm/Transforms/Instrumentation/RealtimeSanitizer.h" #include "llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h" #include "llvm/Transforms/Instrumentation/SanitizerCoverage.h" #include "llvm/Transforms/Instrumentation/ThreadSanitizer.h" @@ -990,6 +991,13 @@ void EmitAssemblyHelper::RunOptimizationPipeline( FPM.addPass(BoundsCheckingPass()); }); + if (LangOpts.Sanitize.has(SanitizerKind::Realtime)) + PB.registerScalarOptimizerLateEPCallback( + [](FunctionPassManager &FPM, OptimizationLevel Level) { + RealtimeSanitizerOptions Opts; + FPM.addPass(RealtimeSanitizerPass(Opts)); + }); + // Don't add sanitizers if we are here from ThinLTO PostLink. That already // done on PreLink stage. 
if (!IsThinLTOPostLink) { diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index eff8c9f5694084..c89eaa0f4e3bfc 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -845,6 +845,13 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, if (SanOpts.has(SanitizerKind::ShadowCallStack)) Fn->addFnAttr(llvm::Attribute::ShadowCallStack); + if (SanOpts.has(SanitizerKind::Realtime)) + if (FD && FD->getASTContext().hasAnyFunctionEffects()) + for (const FunctionEffectWithCondition &Fe : FD->getFunctionEffects()) { + if (Fe.Effect.kind() == FunctionEffect::Kind::NonBlocking) + Fn->addFnAttr(llvm::Attribute::SanitizeRealtime); + } + // Apply fuzzing attribute to the function. if (SanOpts.hasOneOf(SanitizerKind::Fuzzer | SanitizerKind::FuzzerNoLink)) Fn->addFnAttr(llvm::Attribute::OptForFuzzing); diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 9d9ad79d51d7f8..09262f40b5b50c 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -558,11 +558,15 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC, SanitizerKind::Leak | SanitizerKind::Thread | SanitizerKind::Memory | SanitizerKind::KernelAddress | SanitizerKind::Scudo | SanitizerKind::SafeStack), - std::make_pair(SanitizerKind::MemTag, - SanitizerKind::Address | SanitizerKind::KernelAddress | - SanitizerKind::HWAddress | - SanitizerKind::KernelHWAddress), - std::make_pair(SanitizerKind::KCFI, SanitizerKind::Function)}; + std::make_pair(SanitizerKind::MemTag, SanitizerKind::Address | + SanitizerKind::KernelAddress | + SanitizerKind::HWAddress | + SanitizerKind::KernelHWAddress), + std::make_pair(SanitizerKind::KCFI, SanitizerKind::Function), + std::make_pair(SanitizerKind::Realtime, + SanitizerKind::Address | SanitizerKind::Thread | + SanitizerKind::Undefined | SanitizerKind::Memory)}; + // Enable toolchain specific default sanitizers if not explicitly 
disabled. SanitizerMask Default = TC.getDefaultSanitizers() & ~AllRemove; diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 0738ed18f54078..0601016c3b14b8 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1456,6 +1456,8 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, if (!Args.hasArg(options::OPT_shared)) HelperStaticRuntimes.push_back("hwasan-preinit"); } + if (SanArgs.needsRtsanRt() && SanArgs.linkRuntimes()) + SharedRuntimes.push_back("rtsan"); } // The stats_client library is also statically linked into DSOs. @@ -1481,6 +1483,10 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, StaticRuntimes.push_back("asan_cxx"); } + if (!SanArgs.needsSharedRt() && SanArgs.needsRtsanRt() && + SanArgs.linkRuntimes()) + StaticRuntimes.push_back("rtsan"); + if (!SanArgs.needsSharedRt() && SanArgs.needsMemProfRt()) { StaticRuntimes.push_back("memprof"); if (SanArgs.linkCXXRuntimes()) diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index 2550541a438481..5e7f9290e2009d 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -1519,6 +1519,8 @@ void DarwinClang::AddLinkRuntimeLibArgs(const ArgList &Args, const char *sanitizer = nullptr; if (Sanitize.needsUbsanRt()) { sanitizer = "UndefinedBehaviorSanitizer"; + } else if (Sanitize.needsRtsanRt()) { + sanitizer = "RealtimeSanitizer"; } else if (Sanitize.needsAsanRt()) { sanitizer = "AddressSanitizer"; } else if (Sanitize.needsTsanRt()) { @@ -1541,6 +1543,11 @@ void DarwinClang::AddLinkRuntimeLibArgs(const ArgList &Args, AddLinkSanitizerLibArgs(Args, CmdArgs, "asan"); } } + if (Sanitize.needsRtsanRt()) { + assert(Sanitize.needsSharedRt() && + "Static sanitizer runtimes not supported"); + AddLinkSanitizerLibArgs(Args, CmdArgs, "rtsan"); + } if (Sanitize.needsLsanRt()) 
AddLinkSanitizerLibArgs(Args, CmdArgs, "lsan"); if (Sanitize.needsUbsanRt()) { @@ -3539,6 +3546,7 @@ SanitizerMask Darwin::getSupportedSanitizers() const { Res |= SanitizerKind::Address; Res |= SanitizerKind::PointerCompare; Res |= SanitizerKind::PointerSubtract; + Res |= SanitizerKind::Realtime; Res |= SanitizerKind::Leak; Res |= SanitizerKind::Fuzzer; Res |= SanitizerKind::FuzzerNoLink; diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index 2265138edbffbe..96680b3412a2db 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -800,6 +800,7 @@ SanitizerMask Linux::getSupportedSanitizers() const { Res |= SanitizerKind::Address; Res |= SanitizerKind::PointerCompare; Res |= SanitizerKind::PointerSubtract; + Res |= SanitizerKind::Realtime; Res |= SanitizerKind::Fuzzer; Res |= SanitizerKind::FuzzerNoLink; Res |= SanitizerKind::KernelAddress; diff --git a/clang/test/CodeGen/rtsan_attribute_inserted.c b/clang/test/CodeGen/rtsan_attribute_inserted.c new file mode 100644 index 00000000000000..05a1d9a8c2047a --- /dev/null +++ b/clang/test/CodeGen/rtsan_attribute_inserted.c @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=realtime %s -emit-llvm -o - %s | FileCheck %s + +float process(float *a) [[clang::nonblocking]] { return *a; } + +// CHECK-LABEL: @process{{.*}}#0 { +// CHECK: attributes #0 = { +// CHECK-SAME: {{.*sanitize_realtime.*}} diff --git a/clang/test/CodeGen/rtsan_entry_exit_insertion.c b/clang/test/CodeGen/rtsan_entry_exit_insertion.c new file mode 100644 index 00000000000000..9ba0103ca1e353 --- /dev/null +++ b/clang/test/CodeGen/rtsan_entry_exit_insertion.c @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -fsanitize=realtime -emit-llvm -o - %s | FileCheck %s + +int foo(int *a) [[clang::nonblocking]] { return *a; } + +// The first instruction after the function is entred should be a call to +// enable the realtime sanitizer stack. 
+// CHECK-LABEL: define{{.*}}@foo +// CHECK-NEXT: entry: +// CHECK-NEXT: call{{.*}}__rtsan_realtime_enter + +// __rtsan_realtime_exit should be inserted at all function returns. +// CHECK-LABEL: call{{.*}}__rtsan_realtime_exit +// CHECK-NEXT: ret diff --git a/clang/test/CodeGen/rtsan_no_attribute_sanitizer_disabled.c b/clang/test/CodeGen/rtsan_no_attribute_sanitizer_disabled.c new file mode 100644 index 00000000000000..43ad6ed1a429ee --- /dev/null +++ b/clang/test/CodeGen/rtsan_no_attribute_sanitizer_disabled.c @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s + +float process(float *a) [[clang::nonblocking]] { return *a; } + +// Without the -fsanitize=realtime flag, we shouldn't attach the attribute. +// CHECK-NOT: {{.*sanitize_realtime.*}} diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c index 678fa432fb0a0a..f86c978f221cd4 100644 --- a/clang/test/Driver/fsanitize.c +++ b/clang/test/Driver/fsanitize.c @@ -1040,3 +1040,49 @@ // RUN: not %clang --target=aarch64-none-elf -fsanitize=dataflow %s -### 2>&1 | FileCheck %s -check-prefix=UNSUPPORTED-BAREMETAL // RUN: not %clang --target=arm-arm-none-eabi -fsanitize=shadow-call-stack %s -### 2>&1 | FileCheck %s -check-prefix=UNSUPPORTED-BAREMETAL // UNSUPPORTED-BAREMETAL: unsupported option '-fsanitize={{.*}}' for target + +// RUN: %clang --target=x86_64-apple-darwin -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-DARWIN +// CHECK-RTSAN-X86-64-DARWIN-NOT: unsupported option + +// RUN: %clang --target=x86_64-apple-darwin -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-DARWIN +// CHECK-RTSAN-X86-64-DARWIN-NOT: unsupported option +// RUN: %clang --target=x86_64-apple-macos -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-MACOS +// CHECK-RTSAN-X86-64-MACOS-NOT: unsupported option +// RUN: %clang --target=arm64-apple-macos -fsanitize=realtime %s -### 
2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-ARM64-MACOS +// CHECK-RTSAN-ARM64-MACOS-NOT: unsupported option + +// RUN: %clang --target=arm64-apple-ios-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-ARM64-IOSSIMULATOR +// CHECK-RTSAN-ARM64-IOSSIMULATOR-NOT: unsupported option + +// RUN: %clang --target=arm64-apple-watchos-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-ARM64-WATCHOSSIMULATOR +// CHECK-RTSAN-ARM64-WATCHOSSIMULATOR-NOT: unsupported option + +// RUN: %clang --target=arm64-apple-tvos-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-ARM64-TVOSSIMULATOR +// CHECK-RTSAN-ARM64-TVOSSIMULATOR-NOT: unsupported option + +// RUN: %clang --target=x86_64-apple-ios-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-IOSSIMULATOR +// CHECK-RTSAN-X86-64-IOSSIMULATOR-NOT: unsupported option + +// RUN: %clang --target=x86_64-apple-watchos-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-WATCHOSSIMULATOR +// CHECK-RTSAN-X86-64-WATCHOSSIMULATOR-NOT: unsupported option + +// RUN: %clang --target=x86_64-apple-tvos-simulator -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-TVOSSIMULATOR +// CHECK-RTSAN-X86-64-TVOSSIMULATOR-NOT: unsupported option + +// RUN: %clang --target=x86_64-linux-gnu -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-X86-64-LINUX +// CHECK-RTSAN-X86-64-LINUX-NOT: unsupported option + +// RUN: not %clang --target=i386-pc-openbsd -fsanitize=realtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-RTSAN-OPENBSD +// CHECK-RTSAN-OPENBSD: unsupported option '-fsanitize=realtime' for target 'i386-pc-openbsd' + +// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=realtime,thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-REALTIME-TSAN +// CHECK-REALTIME-TSAN: error: invalid argument 
'-fsanitize=realtime' not allowed with '-fsanitize=thread' + +// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=realtime,address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-REALTIME-ASAN +// CHECK-REALTIME-ASAN: error: invalid argument '-fsanitize=realtime' not allowed with '-fsanitize=address' + +// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=realtime,memory %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-REALTIME-MSAN +// CHECK-REALTIME-MSAN: error: invalid argument '-fsanitize=realtime' not allowed with '-fsanitize=memory' + +// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=realtime,undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-REALTIME-UBSAN +// CHECK-REALTIME-UBSAN: error: invalid argument '-fsanitize=realtime' not allowed with '-fsanitize=undefined' From 3faf5b93cfd2b2723851191a244a9616d40771e7 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Fri, 23 Aug 2024 11:17:34 -0400 Subject: [PATCH 331/426] [C23] Update status page for TS 18661 integration (#105693) WG14 N2401 was removed from the list because it was library-only changes that don't impact the compiler. Everything having to do with decimal floating-point types was changed to No because we do not currently have any support for those. WG14 N2314 remains Unknown because it has changes to Annex F for binary floating-point types. --- clang/www/c_status.html | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/clang/www/c_status.html b/clang/www/c_status.html index 6555b8e5e3da39..1a0f320de04e83 100644 --- a/clang/www/c_status.html +++ b/clang/www/c_status.html @@ -216,11 +216,7 @@

C23 implementation status

N2341 - Unknown - - - N2401 - Unknown + No N2359 @@ -228,23 +224,19 @@

C23 implementation status

N2546 - Unknown - - - N2580 - Unknown + No N2640 - Unknown + Yes N2755 - Unknown + No N2931 - Unknown + No Preprocessor line numbers unspecified From 7f3793207bfcbb52b1367baefdfa7a6453041ade Mon Sep 17 00:00:00 2001 From: Harini0924 <79345568+Harini0924@users.noreply.github.com> Date: Fri, 23 Aug 2024 08:20:11 -0700 Subject: [PATCH 332/426] [BOLT][test] Removed the use of parentheses in BOLT tests with lit internal shell (#105720) This patch addresses compatibility issues with the lit internal shell by removing the use of subshell execution (parentheses and subshell syntax) in the `BOLT` tests. The lit internal shell does not support parentheses, so the tests have been refactored to use separate command invocations, with outputs redirected to temporary files where necessary. This change is relevant for enabling the lit internal shell by default, as outlined in [[RFC] Enabling the Lit Internal Shell by Default](https://discourse.llvm.org/t/rfc-enabling-the-lit-internal-shell-by-default/80179) fixes: #102401 --- bolt/test/X86/end-symbol.test | 6 ++++-- bolt/test/X86/instrumentation-eh_frame_hdr.cpp | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/bolt/test/X86/end-symbol.test b/bolt/test/X86/end-symbol.test index 18c11a8a749fc4..69087b7f2597e0 100644 --- a/bolt/test/X86/end-symbol.test +++ b/bolt/test/X86/end-symbol.test @@ -1,7 +1,9 @@ # RUN: yaml2obj %p/Inputs/plt-sec.yaml &> %t.exe # RUN: llvm-bolt %t.exe -o %t.out -# RUN: (llvm-readelf --program-headers %t.out | grep LOAD | tail -n 1 ; llvm-nm %t.out) \ -# RUN: | FileCheck %s + +# RUN: llvm-readelf --program-headers %t.out | grep LOAD | tail -n 1 > %t.load +# RUN: llvm-nm %t.out >> %t.load +# RUN: FileCheck %s < %t.load ## Check that llvm-bolt correctly updates _end symbol to match the end of the ## last loadable segment. 
diff --git a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp index 4ed8be42cd0f37..b360530099ce32 100644 --- a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp +++ b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp @@ -6,8 +6,9 @@ // RUN: %clangxx %cxxflags -static -Wl,-q %s -o %t.exe -Wl,--entry=_start // RUN: llvm-bolt %t.exe -o %t.instr -instrument \ // RUN: --instrumentation-file=%t.fdata -instrumentation-sleep-time=1 -// RUN: (llvm-readelf -SW %t.instr | grep -v bolt; llvm-readelf -lW %t.instr | \ -// RUN: grep LOAD | tail -n 1) | FileCheck %s +// RUN: llvm-readelf -SW %t.instr | grep -v bolt > %t.sections +// RUN: llvm-readelf -lW %t.instr | grep LOAD | tail -n 1 >> %t.sections +// RUN: FileCheck %s < %t.sections // CHECK: {{.*}} .eh_frame_hdr PROGBITS [[#%x, EH_ADDR:]] // CHECK: LOAD 0x[[#%x, LD_OFFSET:]] 0x[[#%x, LD_VADDR:]] 0x[[#%x, LD_FSIZE:]] From 7c9008115a2a24788f07bb476fb28dcf5e661ae4 Mon Sep 17 00:00:00 2001 From: pawelszczerbuk <153013546+pawelszczerbuk@users.noreply.github.com> Date: Fri, 23 Aug 2024 08:23:11 -0700 Subject: [PATCH 333/426] [SCF][PIPELINE] Handle the case when values from the peeled prologue may escape out of the loop (#105755) Previously the values in the peeled prologue that weren't treated with the `predicateFn` were passed to the loop body without any other predication. If those values are later used outside of the loop body, they may be incorrect if the num iterations is smaller than num stages - 1. We need similar masking for those, as is done in the main loop body, using already existing predicates. 
--- .../Dialect/SCF/Transforms/LoopPipelining.cpp | 20 ++++++++++---- mlir/test/Dialect/SCF/loop-pipelining.mlir | 26 ++++++++++++------- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp index cc1a22d0d48a18..d8e1cc0ecef88e 100644 --- a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp @@ -268,7 +268,7 @@ cloneAndUpdateOperands(RewriterBase &rewriter, Operation *op, } void LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) { - // Initialize the iteration argument to the loop initiale values. + // Initialize the iteration argument to the loop initial values. for (auto [arg, operand] : llvm::zip(forOp.getRegionIterArgs(), forOp.getInitsMutable())) { setValueMapping(arg, operand.get(), 0); @@ -320,16 +320,26 @@ void LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) { if (annotateFn) annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i); for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) { - setValueMapping(op->getResult(destId), newOp->getResult(destId), - i - stages[op]); + Value source = newOp->getResult(destId); // If the value is a loop carried dependency update the loop argument - // mapping. for (OpOperand &operand : yield->getOpOperands()) { if (operand.get() != op->getResult(destId)) continue; + if (predicates[predicateIdx] && + !forOp.getResult(operand.getOperandNumber()).use_empty()) { + // If the value is used outside the loop, we need to make sure we + // return the correct version of it. 
+ Value prevValue = valueMapping + [forOp.getRegionIterArgs()[operand.getOperandNumber()]] + [i - stages[op]]; + source = rewriter.create( + loc, predicates[predicateIdx], source, prevValue); + } setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()], - newOp->getResult(destId), i - stages[op] + 1); + source, i - stages[op] + 1); } + setValueMapping(op->getResult(destId), newOp->getResult(destId), + i - stages[op]); } } } diff --git a/mlir/test/Dialect/SCF/loop-pipelining.mlir b/mlir/test/Dialect/SCF/loop-pipelining.mlir index 46e7feca4329ee..9687f80f5ddfc8 100644 --- a/mlir/test/Dialect/SCF/loop-pipelining.mlir +++ b/mlir/test/Dialect/SCF/loop-pipelining.mlir @@ -703,18 +703,26 @@ func.func @distance_1_use(%A: memref, %result: memref) { // ----- // NOEPILOGUE-LABEL: stage_0_value_escape( -func.func @stage_0_value_escape(%A: memref, %result: memref) { +func.func @stage_0_value_escape(%A: memref, %result: memref, %ub: index) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index %cf = arith.constant 1.0 : f32 -// NOEPILOGUE: %[[C3:.+]] = arith.constant 3 : index -// NOEPILOGUE: %[[A:.+]] = arith.addf -// NOEPILOGUE: scf.for %[[IV:.+]] = {{.*}} iter_args(%[[ARG:.+]] = %[[A]], -// NOEPILOGUE: %[[C:.+]] = arith.cmpi slt, %[[IV]], %[[C3]] : index -// NOEPILOGUE: %[[S:.+]] = arith.select %[[C]], %{{.+}}, %[[ARG]] : f32 -// NOEPILOGUE: scf.yield %[[S]] - %r = scf.for %i0 = %c0 to %c4 step %c1 iter_args(%arg0 = %cf) -> (f32) { +// NOEPILOGUE: %[[UB:[^,]+]]: index) +// NOEPILOGUE-DAG: %[[C0:.+]] = arith.constant 0 : index +// NOEPILOGUE-DAG: %[[C1:.+]] = arith.constant 1 : index +// NOEPILOGUE-DAG: %[[CF:.+]] = arith.constant 1.000000e+00 +// NOEPILOGUE: %[[CND0:.+]] = arith.cmpi sgt, %[[UB]], %[[C0]] +// NOEPILOGUE: scf.if +// NOEPILOGUE: %[[IF:.+]] = scf.if %[[CND0]] +// NOEPILOGUE: %[[A:.+]] = arith.addf +// NOEPILOGUE: scf.yield %[[A]] +// NOEPILOGUE: %[[S0:.+]] = arith.select %[[CND0]], %[[IF]], %[[CF]] +// 
NOEPILOGUE: scf.for %[[IV:.+]] = {{.*}} iter_args(%[[ARG:.+]] = %[[S0]], +// NOEPILOGUE: %[[UB_1:.+]] = arith.subi %[[UB]], %[[C1]] : index +// NOEPILOGUE: %[[CND1:.+]] = arith.cmpi slt, %[[IV]], %[[UB_1]] : index +// NOEPILOGUE: %[[S1:.+]] = arith.select %[[CND1]], %{{.+}}, %[[ARG]] : f32 +// NOEPILOGUE: scf.yield %[[S1]] + %r = scf.for %i0 = %c0 to %ub step %c1 iter_args(%arg0 = %cf) -> (f32) { %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 1 } : memref %A1_elem = arith.addf %A_elem, %arg0 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : f32 memref.store %A1_elem, %result[%c0] { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 2 } : memref From 6e78aef646c22b7087cbf7939c8016f4f59614a1 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Fri, 23 Aug 2024 17:24:08 +0200 Subject: [PATCH 334/426] [Clang] Implement P2747 constexpr placement new (#104586) The implementation follows the resolution of CWG2922 --- clang/docs/ReleaseNotes.rst | 3 + .../include/clang/Basic/DiagnosticASTKinds.td | 3 +- clang/lib/AST/ExprConstant.cpp | 63 ++++++---- clang/lib/Frontend/InitPreprocessor.cpp | 2 +- clang/test/AST/ByteCode/new-delete.cpp | 6 +- clang/test/CXX/drs/cwg29xx.cpp | 26 ++++ clang/test/Lexer/cxx-features.cpp | 2 +- .../SemaCXX/constant-expression-cxx2a.cpp | 4 +- .../test/SemaCXX/cxx2a-constexpr-dynalloc.cpp | 12 +- .../SemaCXX/cxx2c-constexpr-placement-new.cpp | 116 ++++++++++++++++++ clang/www/cxx_dr_status.html | 2 +- clang/www/cxx_status.html | 2 +- 12 files changed, 203 insertions(+), 38 deletions(-) create mode 100644 clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index fb3c2f699964c3..baedc3cd6f03fc 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -118,6 +118,8 @@ C++2c Feature Support - Implemented `P2893R3 Variadic Friends `_ +- Implemented `P2747R2 constexpr placement new `_. 
+ C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ - Removed the restriction to literal types in constexpr functions in C++23 mode. @@ -125,6 +127,7 @@ C++23 Feature Support C++20 Feature Support ^^^^^^^^^^^^^^^^^^^^^ + Resolutions to C++ Defect Reports ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td index f317c5ac44f32b..45ad84831589b1 100644 --- a/clang/include/clang/Basic/DiagnosticASTKinds.td +++ b/clang/include/clang/Basic/DiagnosticASTKinds.td @@ -333,7 +333,8 @@ def note_constexpr_new : Note< def note_constexpr_new_non_replaceable : Note< "call to %select{placement|class-specific}0 %1">; def note_constexpr_new_placement : Note< - "this placement new expression is not yet supported in constant expressions">; + "this placement new expression is not supported in constant expressions " + "%select{|before C++2c}0">; def note_constexpr_placement_new_wrong_type : Note< "placement new would change type of storage from %0 to %1">; def note_constexpr_new_negative : Note< diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 5540f58b526705..826cc5f58bdf51 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -6691,7 +6691,9 @@ static bool HandleDestructionImpl(EvalInfo &Info, SourceRange CallRange, if (Size && Size > Value.getArrayInitializedElts()) expandArray(Value, Value.getArraySize() - 1); - for (; Size != 0; --Size) { + // The size of the array might have been reduced by + // a placement new. 
+ for (Size = Value.getArraySize(); Size != 0; --Size) { APValue &Elem = Value.getArrayInitializedElt(Size - 1); if (!HandleLValueArrayAdjustment(Info, &LocE, ElemLV, ElemT, -1) || !HandleDestructionImpl(Info, CallRange, ElemLV, Elem, ElemT)) @@ -10003,23 +10005,14 @@ bool PointerExprEvaluator::VisitCXXNewExpr(const CXXNewExpr *E) { return false; FunctionDecl *OperatorNew = E->getOperatorNew(); + QualType AllocType = E->getAllocatedType(); + QualType TargetType = AllocType; bool IsNothrow = false; bool IsPlacement = false; - if (OperatorNew->isReservedGlobalPlacementOperator() && - Info.CurrentCall->isStdFunction() && !E->isArray()) { - // FIXME Support array placement new. - assert(E->getNumPlacementArgs() == 1); - if (!EvaluatePointer(E->getPlacementArg(0), Result, Info)) - return false; - if (Result.Designator.Invalid) - return false; - IsPlacement = true; - } else if (!OperatorNew->isReplaceableGlobalAllocationFunction()) { - Info.FFDiag(E, diag::note_constexpr_new_non_replaceable) - << isa(OperatorNew) << OperatorNew; - return false; - } else if (E->getNumPlacementArgs()) { + + if (E->getNumPlacementArgs() == 1 && + E->getPlacementArg(0)->getType()->isNothrowT()) { // The only new-placement list we support is of the form (std::nothrow). // // FIXME: There is no restriction on this, but it's not clear that any @@ -10030,14 +10023,31 @@ bool PointerExprEvaluator::VisitCXXNewExpr(const CXXNewExpr *E) { // (which should presumably be valid only if N is a multiple of // alignof(int), and in any case can't be deallocated unless N is // alignof(X) and X has new-extended alignment). 
- if (E->getNumPlacementArgs() != 1 || - !E->getPlacementArg(0)->getType()->isNothrowT()) - return Error(E, diag::note_constexpr_new_placement); - LValue Nothrow; if (!EvaluateLValue(E->getPlacementArg(0), Nothrow, Info)) return false; IsNothrow = true; + } else if (OperatorNew->isReservedGlobalPlacementOperator()) { + if (Info.CurrentCall->isStdFunction() || Info.getLangOpts().CPlusPlus26) { + if (!EvaluatePointer(E->getPlacementArg(0), Result, Info)) + return false; + if (Result.Designator.Invalid) + return false; + TargetType = E->getPlacementArg(0)->getType(); + IsPlacement = true; + } else { + Info.FFDiag(E, diag::note_constexpr_new_placement) + << /*C++26 feature*/ 1 << E->getSourceRange(); + return false; + } + } else if (E->getNumPlacementArgs()) { + Info.FFDiag(E, diag::note_constexpr_new_placement) + << /*Unsupported*/ 0 << E->getSourceRange(); + return false; + } else if (!OperatorNew->isReplaceableGlobalAllocationFunction()) { + Info.FFDiag(E, diag::note_constexpr_new_non_replaceable) + << isa(OperatorNew) << OperatorNew; + return false; } const Expr *Init = E->getInitializer(); @@ -10045,7 +10055,6 @@ bool PointerExprEvaluator::VisitCXXNewExpr(const CXXNewExpr *E) { const CXXConstructExpr *ResizedArrayCCE = nullptr; bool ValueInit = false; - QualType AllocType = E->getAllocatedType(); if (std::optional ArraySize = E->getArraySize()) { const Expr *Stripped = *ArraySize; for (; auto *ICE = dyn_cast(Stripped); @@ -10139,9 +10148,17 @@ bool PointerExprEvaluator::VisitCXXNewExpr(const CXXNewExpr *E) { bool found(APValue &Subobj, QualType SubobjType) { // FIXME: Reject the cases where [basic.life]p8 would not permit the // old name of the object to be used to name the new object. 
- if (!Info.Ctx.hasSameUnqualifiedType(SubobjType, AllocType)) { - Info.FFDiag(E, diag::note_constexpr_placement_new_wrong_type) << - SubobjType << AllocType; + unsigned SubobjectSize = 1; + unsigned AllocSize = 1; + if (auto *CAT = dyn_cast(AllocType)) + AllocSize = CAT->getZExtSize(); + if (auto *CAT = dyn_cast(SubobjType)) + SubobjectSize = CAT->getZExtSize(); + if (SubobjectSize < AllocSize || + !Info.Ctx.hasSimilarType(Info.Ctx.getBaseElementType(SubobjType), + Info.Ctx.getBaseElementType(AllocType))) { + Info.FFDiag(E, diag::note_constexpr_placement_new_wrong_type) + << SubobjType << AllocType; return false; } Value = &Subobj; diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 4f2856dd2247f8..61260a3379828d 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -660,7 +660,7 @@ static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts, Builder.defineMacro("__cpp_unicode_literals", "200710L"); Builder.defineMacro("__cpp_user_defined_literals", "200809L"); Builder.defineMacro("__cpp_lambdas", "200907L"); - Builder.defineMacro("__cpp_constexpr", LangOpts.CPlusPlus26 ? "202306L" + Builder.defineMacro("__cpp_constexpr", LangOpts.CPlusPlus26 ? "202406L" : LangOpts.CPlusPlus23 ? "202211L" : LangOpts.CPlusPlus20 ? "201907L" : LangOpts.CPlusPlus17 ? 
"201603L" diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp index 6bb30bc19f110c..a7be4102fd0a05 100644 --- a/clang/test/AST/ByteCode/new-delete.cpp +++ b/clang/test/AST/ByteCode/new-delete.cpp @@ -245,7 +245,7 @@ namespace std { namespace PlacementNew { constexpr int foo() { // both-error {{never produces a constant expression}} char c[sizeof(int)]; - new (c) int{12}; // ref-note {{call to placement 'operator new'}} \ + new (c) int{12}; // ref-note {{this placement new expression is not supported in constant expressions before C++2c}} \ // expected-note {{subexpression not valid in a constant expression}} return 0; } @@ -309,7 +309,7 @@ namespace placement_new_delete { constexpr bool bad(int which) { switch (which) { case 0: - delete new (placement_new_arg{}) int; // ref-note {{call to placement 'operator new'}} \ + delete new (placement_new_arg{}) int; // ref-note {{this placement new expression is not supported in constant expressions}} \ // expected-note {{subexpression not valid in a constant expression}} break; @@ -328,7 +328,7 @@ namespace placement_new_delete { case 4: // FIXME: This technically follows the standard's rules, but it seems // unreasonable to expect implementations to support this. 
- delete new (std::align_val_t{64}) Overaligned; // ref-note {{placement new expression is not yet supported}} \ + delete new (std::align_val_t{64}) Overaligned; // ref-note {{this placement new expression is not supported in constant expressions}} \ // expected-note {{subexpression not valid in a constant expression}} break; } diff --git a/clang/test/CXX/drs/cwg29xx.cpp b/clang/test/CXX/drs/cwg29xx.cpp index 8cac9f283980b6..2515785f47bf19 100644 --- a/clang/test/CXX/drs/cwg29xx.cpp +++ b/clang/test/CXX/drs/cwg29xx.cpp @@ -23,3 +23,29 @@ struct S { friend class C::Nested...; // expected-error {{friend declaration expands pack 'Ts' that is declared it its own template parameter list}} }; } // namespace cwg2917 + +#if __cplusplus >= 202400L + +namespace std { + using size_t = decltype(sizeof(0)); +}; +void *operator new(std::size_t, void *p) { return p; } +void* operator new[] (std::size_t, void* p) {return p;} + + +namespace cwg2922 { // cwg2922: 20 open 2024-07-10 +union U { int a, b; }; +constexpr U nondeterministic(bool i) { + if(i) { + U u; + new (&u) int(); + // expected-note@-1 {{placement new would change type of storage from 'U' to 'int'}} + return u; + } + return {}; +} +constexpr U _ = nondeterministic(true); +// expected-error@-1 {{constexpr variable '_' must be initialized by a constant expression}} \ +// expected-note@-1 {{in call to 'nondeterministic(true)'}} +} +#endif diff --git a/clang/test/Lexer/cxx-features.cpp b/clang/test/Lexer/cxx-features.cpp index 1c51013ca06f77..4a06d29ae9dbc6 100644 --- a/clang/test/Lexer/cxx-features.cpp +++ b/clang/test/Lexer/cxx-features.cpp @@ -317,7 +317,7 @@ #error "wrong value for __cpp_lambdas" #endif -#if check(constexpr, 0, 200704, 201304, 201603, 201907, 202211, 202306) +#if check(constexpr, 0, 200704, 201304, 201603, 201907, 202211, 202406L) #error "wrong value for __cpp_constexpr" #endif diff --git a/clang/test/SemaCXX/constant-expression-cxx2a.cpp b/clang/test/SemaCXX/constant-expression-cxx2a.cpp index 
e4d97dcb73562d..36d4d25c48471b 100644 --- a/clang/test/SemaCXX/constant-expression-cxx2a.cpp +++ b/clang/test/SemaCXX/constant-expression-cxx2a.cpp @@ -994,7 +994,7 @@ namespace placement_new_delete { constexpr bool bad(int which) { switch (which) { case 0: - delete new (placement_new_arg{}) int; // expected-note {{call to placement 'operator new'}} + delete new (placement_new_arg{}) int; // expected-note {{this placement new expression is not supported in constant expressions}} break; case 1: @@ -1012,7 +1012,7 @@ namespace placement_new_delete { case 4: // FIXME: This technically follows the standard's rules, but it seems // unreasonable to expect implementations to support this. - delete new (std::align_val_t{64}) Overaligned; // expected-note {{placement new expression is not yet supported}} + delete new (std::align_val_t{64}) Overaligned; // expected-note {{this placement new expression is not supported in constant expressions}} break; } diff --git a/clang/test/SemaCXX/cxx2a-constexpr-dynalloc.cpp b/clang/test/SemaCXX/cxx2a-constexpr-dynalloc.cpp index 357dc67bd5ad22..6d9c0b607d8a67 100644 --- a/clang/test/SemaCXX/cxx2a-constexpr-dynalloc.cpp +++ b/clang/test/SemaCXX/cxx2a-constexpr-dynalloc.cpp @@ -1,6 +1,7 @@ -// RUN: %clang_cc1 -std=c++2a -verify %s -DNEW=__builtin_operator_new -DDELETE=__builtin_operator_delete -// RUN: %clang_cc1 -std=c++2a -verify %s "-DNEW=operator new" "-DDELETE=operator delete" -// RUN: %clang_cc1 -std=c++2a -verify %s "-DNEW=::operator new" "-DDELETE=::operator delete" +// RUN: %clang_cc1 -std=c++2a -verify=expected,cxx20 %s -DNEW=__builtin_operator_new -DDELETE=__builtin_operator_delete +// RUN: %clang_cc1 -std=c++2a -verify=expected,cxx20 %s "-DNEW=operator new" "-DDELETE=operator delete" +// RUN: %clang_cc1 -std=c++2a -verify=expected,cxx20 %s "-DNEW=::operator new" "-DDELETE=::operator delete" +// RUN: %clang_cc1 -std=c++2c -verify=expected,cxx26 %s "-DNEW=::operator new" "-DDELETE=::operator delete" constexpr bool 
alloc_from_user_code() { void *p = NEW(sizeof(int)); // expected-note {{cannot allocate untyped memory in a constant expression; use 'std::allocator::allocate'}} @@ -90,9 +91,10 @@ constexpr int no_deallocate_nonalloc = (std::allocator().deallocate((int*)& // expected-note@-2 {{declared here}} void *operator new(std::size_t, void *p) { return p; } -constexpr bool no_placement_new_in_user_code() { // expected-error {{never produces a constant expression}} +void* operator new[] (std::size_t, void* p) {return p;} +constexpr bool no_placement_new_in_user_code() { // cxx20-error {{constexpr function never produces a constant expression}} int a; - new (&a) int(42); // expected-note {{call to placement 'operator new'}} + new (&a) int(42); // cxx20-note {{this placement new expression is not supported in constant expressions before C++2c}} return a == 42; } diff --git a/clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp b/clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp new file mode 100644 index 00000000000000..a29fb981cedbf0 --- /dev/null +++ b/clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp @@ -0,0 +1,116 @@ +// RUN: %clang_cc1 -std=c++2c -verify %s + + +namespace std { + using size_t = decltype(sizeof(0)); +} + +void *operator new(std::size_t, void *p) { return p; } +void* operator new[] (std::size_t, void* p) {return p;} + + +consteval int ok() { + int i; + new (&i) int(0); + new (&i) int[1]{1}; + new (static_cast(&i)) int(0); + return 0; +} + +consteval int conversion() { + int i; + new (static_cast(&i)) float(0); + // expected-note@-1 {{placement new would change type of storage from 'int' to 'float'}} + return 0; +} + +consteval int indeterminate() { + int * indeterminate; + new (indeterminate) int(0); + // expected-note@-1 {{read of uninitialized object is not allowed in a constant expression}} + return 0; +} + +consteval int array1() { + int i[2]; + new (&i) int[]{1,2}; + new (&i) int[]{1}; + new (&i) int(0); + new (static_cast(&i)) int[]{1,2}; + new 
(static_cast(&i)) int[]{1}; + return 0; +} + +consteval int array2() { + int i[1]; + new (&i) int[2]; + //expected-note@-1 {{placement new would change type of storage from 'int[1]' to 'int[2]'}} + return 0; +} + +struct S{ + int* i; + constexpr S() : i(new int(42)) {} // #no-deallocation + constexpr ~S() {delete i;} +}; + +consteval void alloc() { + S* s = new S(); + s->~S(); + new (s) S(); + delete s; +} + + +consteval void alloc_err() { + S* s = new S(); + new (s) S(); + delete s; +} + + + +int a = ok(); +int b = conversion(); // expected-error {{call to consteval function 'conversion' is not a constant expression}} \ + // expected-note {{in call to 'conversion()'}} +int c = indeterminate(); // expected-error {{call to consteval function 'indeterminate' is not a constant expression}} \ + // expected-note {{in call to 'indeterminate()'}} +int d = array1(); +int e = array2(); // expected-error {{call to consteval function 'array2' is not a constant expression}} \ + // expected-note {{in call to 'array2()'}} +int alloc1 = (alloc(), 0); +int alloc2 = (alloc_err(), 0); // expected-error {{call to consteval function 'alloc_err' is not a constant expression}} + // expected-note@#no-deallocation {{allocation performed here was not deallocated}} + +constexpr int *intptr() { + return new int; +} + +constexpr bool yay() { + int *ptr = new (intptr()) int(42); + bool ret = *ptr == 42; + delete ptr; + return ret; +} +static_assert(yay()); + +constexpr bool blah() { + int *ptr = new (intptr()) int[3]{ 1, 2, 3 }; // expected-note {{placement new would change type of storage from 'int' to 'int[3]'}} + bool ret = ptr[0] == 1 && ptr[1] == 2 && ptr[2] == 3; + delete [] ptr; + return ret; +} +static_assert(blah()); // expected-error {{not an integral constant expression}} \ + // expected-note {{in call to 'blah()'}} + +constexpr int *get_indeterminate() { + int *evil; + return evil; // expected-note {{read of uninitialized object is not allowed in a constant expression}} +} + 
+constexpr bool bleh() { + int *ptr = new (get_indeterminate()) int; // expected-note {{in call to 'get_indeterminate()'}} + return true; +} +static_assert(bleh()); // expected-error {{not an integral constant expression}} \ + // expected-note {{in call to 'bleh()'}} diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index a8d2d813d0f536..395b5d3bff49a6 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -17348,7 +17348,7 @@

C++ defect report implementation status

2922 open constexpr placement-new is too permissive - Not resolved + Not Resolved* diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html index faee8b578b6242..58bbb12a76dd75 100755 --- a/clang/www/cxx_status.html +++ b/clang/www/cxx_status.html @@ -208,7 +208,7 @@

C++2c implementation status

constexpr placement new P2747R2 - No + Clang 20 Deleting a Pointer to an Incomplete Type Should be Ill-formed From 807557654a3c1c75b9ca3aedf8672805c7b441d4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 23 Aug 2024 16:09:17 +0100 Subject: [PATCH 335/426] [DAG] visitTRUNCATE_USAT_U - use sd_match to match FP_TO_UINT_SAT pattern. NFC. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 23 ++++++------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 11935cbc309f01..b27f06f94ff0e7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -14919,23 +14919,14 @@ SDValue DAGCombiner::visitTRUNCATE_USAT_U(SDNode *N) { EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); - std::function MatchFPTOINT = [&](SDValue Val) -> SDValue { - if (Val.getOpcode() == ISD::FP_TO_UINT) - return Val; - return SDValue(); - }; - - SDValue FPInstr = MatchFPTOINT(N0); - if (!FPInstr) - return SDValue(); + SDValue FPVal; + if (sd_match(N0, m_FPToUI(m_Value(FPVal))) && + DAG.getTargetLoweringInfo().shouldConvertFpToSat( + ISD::FP_TO_UINT_SAT, FPVal.getValueType(), VT)) + return DAG.getNode(ISD::FP_TO_UINT_SAT, SDLoc(N0), VT, FPVal, + DAG.getValueType(VT.getScalarType())); - EVT FPVT = FPInstr.getOperand(0).getValueType(); - if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat(ISD::FP_TO_UINT_SAT, - FPVT, VT)) - return SDValue(); - return DAG.getNode(ISD::FP_TO_UINT_SAT, SDLoc(FPInstr), VT, - FPInstr.getOperand(0), - DAG.getValueType(VT.getScalarType())); + return SDValue(); } /// Detect patterns of truncation with unsigned saturation: From ff5552c1b82ada19750792fa1f28a23a33ee39b3 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 23 Aug 2024 11:44:45 -0400 Subject: [PATCH 336/426] [libc++] Remove status pages tracking SpecialMath and Zip (#105672) Instead of tracking those using our static CSV files, 
I created lists of subtasks in their respective issues (#99939 and #105169) to track the work that is still left. --- libcxx/docs/ReleaseNotes/19.rst | 3 +- libcxx/docs/Status/Cxx17.rst | 2 +- libcxx/docs/Status/SpecialMath.rst | 35 ---------------------- libcxx/docs/Status/SpecialMathProjects.csv | 22 -------------- libcxx/docs/Status/Zip.rst | 29 ------------------ libcxx/docs/Status/ZipProjects.csv | 27 ----------------- libcxx/docs/index.rst | 2 -- 7 files changed, 3 insertions(+), 117 deletions(-) delete mode 100644 libcxx/docs/Status/SpecialMath.rst delete mode 100644 libcxx/docs/Status/SpecialMathProjects.csv delete mode 100644 libcxx/docs/Status/Zip.rst delete mode 100644 libcxx/docs/Status/ZipProjects.csv diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index e167d21e39f93c..b55f3fa04f24b6 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -46,7 +46,8 @@ Work on the experimental C++17 Parallel STL has progressed. See :ref:`pstl-status` for the current status. Work on the C++17 mathematical special functions has started. See -:ref:`special-math-status` for the current status. +`this issue `__ +for the current status. Implemented Papers ------------------ diff --git a/libcxx/docs/Status/Cxx17.rst b/libcxx/docs/Status/Cxx17.rst index 94f9d890c36417..d55fad2a7ec7ef 100644 --- a/libcxx/docs/Status/Cxx17.rst +++ b/libcxx/docs/Status/Cxx17.rst @@ -39,7 +39,7 @@ Paper Status .. note:: .. [#note-P0067R5] P0067R5: ``std::(to|from)_chars`` for integrals has been available since version 7.0. ``std::to_chars`` for ``float`` and ``double`` since version 14.0 ``std::to_chars`` for ``long double`` uses the implementation for ``double``. - .. [#note-P0226] P0226: Progress is tracked `here `_. + .. [#note-P0226] P0226: Progress is tracked `here `_. .. [#note-P0607] P0607: The parts of P0607 that are not done are the ```` bits. .. [#note-P0154] P0154: The required macros are only implemented as of clang 19. .. 
[#note-P0452] P0452: The changes to ``std::transform_inclusive_scan`` and ``std::transform_exclusive_scan`` have not yet been implemented. diff --git a/libcxx/docs/Status/SpecialMath.rst b/libcxx/docs/Status/SpecialMath.rst deleted file mode 100644 index 46e5c97cdaab2c..00000000000000 --- a/libcxx/docs/Status/SpecialMath.rst +++ /dev/null @@ -1,35 +0,0 @@ -.. _special-math-status: - -====================================================== -libc++ Mathematical Special Functions Status (P0226R1) -====================================================== - -.. include:: ../Helpers/Styles.rst - -.. contents:: - :local: - -Overview -======== - -This document contains the status of the C++17 mathematical special functions implementation in libc++. -It is used to track both the status of the sub-projects of the effort and who is assigned to these sub-projects. -This avoids duplicating effort. - -If you are interested in contributing to this effort, please send a message -to the #libcxx channel in the LLVM discord. Please *do not* start working -on any items below that has already been assigned to someone else. - -Sub-projects in the Implementation Effort -========================================= - -.. csv-table:: - :file: SpecialMathProjects.csv - :header-rows: 1 - :widths: auto - -Paper and Issue Status -====================== - -The underlying paper is `Mathematical Special Functions for C++17 (P0226) `_ and is included in C++17. -Implementation is *In Progress*. 
diff --git a/libcxx/docs/Status/SpecialMathProjects.csv b/libcxx/docs/Status/SpecialMathProjects.csv deleted file mode 100644 index f964e79de91d3c..00000000000000 --- a/libcxx/docs/Status/SpecialMathProjects.csv +++ /dev/null @@ -1,22 +0,0 @@ -Section,Description,Assignee,Complete -| `[sf.cmath.assoc.laguerre] `_, std::assoc_laguerre, None, |Not Started| -| `[sf.cmath.assoc.legendre] `_, std::assoc_legendre, None, |Not Started| -| `[sf.cmath.beta] `_, std::beta, None, |Not Started| -| `[sf.cmath.comp.ellint.1] `_, std::comp_ellint_1, None, |Not Started| -| `[sf.cmath.comp.ellint.2] `_, std::comp_ellint_2, None, |Not Started| -| `[sf.cmath.comp.ellint.3] `_, std::comp_ellint_3, None, |Not Started| -| `[sf.cmath.cyl.bessel.i] `_, std::cyl_bessel_i, None, |Not Started| -| `[sf.cmath.cyl.bessel.j] `_, std::cyl_bessel_j, None, |Not Started| -| `[sf.cmath.cyl.bessel.k] `_, std::cyl_bessel_k, None, |Not Started| -| `[sf.cmath.cyl.neumann] `_, std::cyl_neumann, None, |Not Started| -| `[sf.cmath.ellint.1] `_, std::ellint_1, None, |Not Started| -| `[sf.cmath.ellint.2] `_, std::ellint_2, None, |Not Started| -| `[sf.cmath.ellint.3] `_, std::ellint_3, None, |Not Started| -| `[sf.cmath.expint] `_, std::expint, None, |Not Started| -| `[sf.cmath.hermite] `_, std::hermite, Paul Xi Cao, |Complete| -| `[sf.cmath.laguerre] `_, std::laguerre, None, |Not Started| -| `[sf.cmath.legendre] `_, std::legendre, None, |Not Started| -| `[sf.cmath.riemann.zeta] `_, std::riemann_zeta, None, |Not Started| -| `[sf.cmath.sph.bessel] `_, std::sph_bessel, None, |Not Started| -| `[sf.cmath.sph.legendre] `_, std::sph_legendre, None, |Not Started| -| `[sf.cmath.sph.neumann] `_, std::sph_neumann, None, |Not Started| diff --git a/libcxx/docs/Status/Zip.rst b/libcxx/docs/Status/Zip.rst deleted file mode 100644 index 9f713866973530..00000000000000 --- a/libcxx/docs/Status/Zip.rst +++ /dev/null @@ -1,29 +0,0 @@ -.. 
zip-status: - -=========================== -libc++ Zip Status (P2321R2) -=========================== - -.. include:: ../Helpers/Styles.rst - -.. contents:: - :local: - -Overview -======== - -This document contains the status of the C++23 zip implementation in libc++. -It is used to track both the status of the sub-projects of the effort and who -is assigned to these sub-projects. This avoids duplicating effort. - -If you are interested in contributing to this effort, please send a message -to the #libcxx channel in the LLVM discord. Please *do not* start working -on any items below that has already been assigned to someone else. - -Sub-projects in the Implementation Effort -========================================= - -.. csv-table:: - :file: ZipProjects.csv - :header-rows: 1 - :widths: auto diff --git a/libcxx/docs/Status/ZipProjects.csv b/libcxx/docs/Status/ZipProjects.csv deleted file mode 100644 index 699a382ff66b73..00000000000000 --- a/libcxx/docs/Status/ZipProjects.csv +++ /dev/null @@ -1,27 +0,0 @@ -Section,Description,Dependencies,Assignee,Complete -| `[tuple.syn] `_, "`[tuple] basic_common_reference, common_type `_", None, Nikolas Klauser, |Complete| -| `[tuple.tuple] `_, "`[tuple] constructor, assignment and swap overloads `_", None, Hui Xie, |Complete| -| `[utility.syn] `_, "[pair] basic_common_reference, common_type", None, Nikolas Klauser, |Complete| -| `[pairs.pair] `_, "`[pair] constructor, assignment and swap overloads `_", None, Hui Xie, |Complete| -"| `[memory.syn] `_ -| `[allocator.uses.construction] `_", "[pair] uses_allocator_construction_args overloads", None, Nikolas Klauser, |Complete| -| `[vector.bool] `_, "[vector::reference] add const operator= overload", None, Hui Xie, |Not Started| -| `[iterator.concept.winc] `_, "Update weakly_comparable", None, Hui Xie, |Not Started| -| `[range.zip] `_, "`zip_view `_", "| `zip_view::iterator` -| `zip_view::sentinel`", Hui Xie, |Complete| -| `[range.zip.iterator] `_, "`zip_view::iterator `_", None, Hui 
Xie, |Complete| -| `[range.zip.sentinel] `_, "`zip_view::sentinel `_", None, Hui Xie, |Complete| -| `[range.zip.transform.view] `_, "zip_transform_view", "| `zip_transform_view::iterator` -| `zip_transform_view::sentinel`", Hui Xie, |Not Started| -| `[range.zip.transform.iterator] `_, "zip_transform_view::iterator", None, Hui Xie, |Not Started| -| `[range.zip.transform.sentinel] `_, "zip_transform_view::sentinel", None, Hui Xie, |Not Started| -| `[range.adjacent.view] `_, "adjacent_view", "| `adjacent_view::iterator` -| `adjacent_view::sentinel`", Hui Xie, |Not Started| -| `[range.adjacent.iterator] `_, "adjacent_view::iterator", None, unassigned, |Not Started| -| `[range.adjacent.sentinel] `_, "adjacent_view::sentinel", None, unassigned, |Not Started| -| `[range.adjacent.transform.view] `_, "adjacent_transform_view", "| `adjacent_transform_view::iterator`, -| `adjacent_transform_view::sentinel`", Hui Xie, |Not Started| -| `[range.adjacent.transform.iterator] `_, "adjacent_transform_view::iterator", None, Hui Xie, |Not Started| -| `[range.adjacent.transform.sentinel] `_, "adjacent_transform_view::sentinel", None, Hui Xie, |Not Started| -| `[ranges.syn] `_, "enable_borrowed_range zip_view and adjacent_view", "| `zip_view` -| `adjacent_view`", Hui Xie, |Not Started| diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index a77405eb138124..a9610cbb4db3a4 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -50,8 +50,6 @@ Getting Started with libc++ Status/Format Status/Parallelism Status/PSTL - Status/SpecialMath - Status/Zip .. 
toctree:: From b8f15051369978c423d74a3bd48a1b9ab6d31ee6 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 23 Aug 2024 08:45:19 -0700 Subject: [PATCH 337/426] [IR] Use a range-based for loop (NFC) (#105826) --- llvm/lib/IR/Constants.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index a1c9e925a024fe..e32a54fa346a9a 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -681,9 +681,8 @@ Constant::PossibleRelocationsTy Constant::getRelocationInfo() const { } PossibleRelocationsTy Result = NoRelocation; - for (unsigned i = 0, e = getNumOperands(); i != e; ++i) - Result = - std::max(cast(getOperand(i))->getRelocationInfo(), Result); + for (const Value *Op : operands()) + Result = std::max(cast(Op)->getRelocationInfo(), Result); return Result; } From 5a25854ed18ec5a51df6d1f7a2366a574a6846b0 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 23 Aug 2024 08:45:29 -0700 Subject: [PATCH 338/426] [clangd] Construct SmallVector with ArrayRef (NFC) (#105829) --- clang-tools-extra/clangd/TUScheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clangd/TUScheduler.cpp b/clang-tools-extra/clangd/TUScheduler.cpp index 324ba1fc8cb895..71548b59cc3088 100644 --- a/clang-tools-extra/clangd/TUScheduler.cpp +++ b/clang-tools-extra/clangd/TUScheduler.cpp @@ -1838,7 +1838,7 @@ DebouncePolicy::compute(llvm::ArrayRef History) const { // Base the result on the median rebuild. // nth_element needs a mutable array, take the chance to bound the data size. 
History = History.take_back(15); - llvm::SmallVector Recent(History.begin(), History.end()); + llvm::SmallVector Recent(History); auto *Median = Recent.begin() + Recent.size() / 2; std::nth_element(Recent.begin(), Median, Recent.end()); From a9f62244f28a64e7b7338c2299ba169df70fbb03 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Aug 2024 08:46:31 -0700 Subject: [PATCH 339/426] [mlir][Transforms][NFC] Move `ReconcileUnrealizedCasts` implementation (#104671) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the implementation of `ReconcileUnrealizedCasts` to `DialectConversion.cpp`, so that it can be called from there in a future commit. This commit is in preparation of decoupling argument/source/target materializations from the dialect conversion framework. The existing logic around unresolved materializations that predicts IR changes to decide if a cast op can be folded/erased will become obsolete, as `ReconcileUnrealizedCasts` will perform these kind of foldings on fully materialized IR. 
--------- Co-authored-by: Markus Böck --- .../mlir/Transforms/DialectConversion.h | 23 ++++++ .../ReconcileUnrealizedCasts.cpp | 60 +-------------- .../Transforms/Utils/DialectConversion.cpp | 74 +++++++++++++++++++ 3 files changed, 101 insertions(+), 56 deletions(-) diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index a51b00271f0aeb..60113bdef16a23 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -1126,6 +1126,29 @@ struct ConversionConfig { RewriterBase::Listener *listener = nullptr; }; +//===----------------------------------------------------------------------===// +// Reconcile Unrealized Casts +//===----------------------------------------------------------------------===// + +/// Try to reconcile all given UnrealizedConversionCastOps and store the +/// left-over ops in `remainingCastOps` (if provided). +/// +/// This function processes cast ops in a worklist-driven fashion. For each +/// cast op, if the chain of input casts eventually reaches a cast op where the +/// input types match the output types of the matched op, replace the matched +/// op with the inputs. +/// +/// Example: +/// %1 = unrealized_conversion_cast %0 : !A to !B +/// %2 = unrealized_conversion_cast %1 : !B to !C +/// %3 = unrealized_conversion_cast %2 : !C to !A +/// +/// In the above example, %0 can be used instead of %3 and all cast ops are +/// folded away. 
+void reconcileUnrealizedCasts( + ArrayRef castOps, + SmallVectorImpl *remainingCastOps = nullptr); + //===----------------------------------------------------------------------===// // Op Conversion Entry Points //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.cpp b/mlir/lib/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.cpp index 12e0029cebfd0d..2ce6dcbb490149 100644 --- a/mlir/lib/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.cpp +++ b/mlir/lib/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.cpp @@ -10,6 +10,7 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" namespace mlir { #define GEN_PASS_DEF_RECONCILEUNREALIZEDCASTS @@ -39,63 +40,10 @@ struct ReconcileUnrealizedCasts ReconcileUnrealizedCasts() = default; void runOnOperation() override { - // Gather all unrealized_conversion_cast ops. - SetVector worklist; + SmallVector ops; getOperation()->walk( - [&](UnrealizedConversionCastOp castOp) { worklist.insert(castOp); }); - - // Helper function that adds all operands to the worklist that are an - // unrealized_conversion_cast op result. - auto enqueueOperands = [&](UnrealizedConversionCastOp castOp) { - for (Value v : castOp.getInputs()) - if (auto inputCastOp = v.getDefiningOp()) - worklist.insert(inputCastOp); - }; - - // Helper function that return the unrealized_conversion_cast op that - // defines all inputs of the given op (in the same order). Return "nullptr" - // if there is no such op. 
- auto getInputCast = - [](UnrealizedConversionCastOp castOp) -> UnrealizedConversionCastOp { - if (castOp.getInputs().empty()) - return {}; - auto inputCastOp = castOp.getInputs() - .front() - .getDefiningOp(); - if (!inputCastOp) - return {}; - if (inputCastOp.getOutputs() != castOp.getInputs()) - return {}; - return inputCastOp; - }; - - // Process ops in the worklist bottom-to-top. - while (!worklist.empty()) { - UnrealizedConversionCastOp castOp = worklist.pop_back_val(); - if (castOp->use_empty()) { - // DCE: If the op has no users, erase it. Add the operands to the - // worklist to find additional DCE opportunities. - enqueueOperands(castOp); - castOp->erase(); - continue; - } - - // Traverse the chain of input cast ops to see if an op with the same - // input types can be found. - UnrealizedConversionCastOp nextCast = castOp; - while (nextCast) { - if (nextCast.getInputs().getTypes() == castOp.getResultTypes()) { - // Found a cast where the input types match the output types of the - // matched op. We can directly use those inputs and the matched op can - // be removed. 
- enqueueOperands(castOp); - castOp.replaceAllUsesWith(nextCast.getInputs()); - castOp->erase(); - break; - } - nextCast = getInputCast(nextCast); - } - } + [&](UnrealizedConversionCastOp castOp) { ops.push_back(castOp); }); + reconcileUnrealizedCasts(ops); } }; diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index bf2990c257bad2..adf012a261cb7e 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -2870,6 +2870,80 @@ LogicalResult OperationConverter::legalizeErasedResult( return success(); } +//===----------------------------------------------------------------------===// +// Reconcile Unrealized Casts +//===----------------------------------------------------------------------===// + +void mlir::reconcileUnrealizedCasts( + ArrayRef castOps, + SmallVectorImpl *remainingCastOps) { + SetVector worklist(castOps.begin(), + castOps.end()); + // This set is maintained only if `remainingCastOps` is provided. + DenseSet erasedOps; + + // Helper function that adds all operands to the worklist that are an + // unrealized_conversion_cast op result. + auto enqueueOperands = [&](UnrealizedConversionCastOp castOp) { + for (Value v : castOp.getInputs()) + if (auto inputCastOp = v.getDefiningOp()) + worklist.insert(inputCastOp); + }; + + // Helper function that return the unrealized_conversion_cast op that + // defines all inputs of the given op (in the same order). Return "nullptr" + // if there is no such op. + auto getInputCast = + [](UnrealizedConversionCastOp castOp) -> UnrealizedConversionCastOp { + if (castOp.getInputs().empty()) + return {}; + auto inputCastOp = + castOp.getInputs().front().getDefiningOp(); + if (!inputCastOp) + return {}; + if (inputCastOp.getOutputs() != castOp.getInputs()) + return {}; + return inputCastOp; + }; + + // Process ops in the worklist bottom-to-top. 
+ while (!worklist.empty()) { + UnrealizedConversionCastOp castOp = worklist.pop_back_val(); + if (castOp->use_empty()) { + // DCE: If the op has no users, erase it. Add the operands to the + // worklist to find additional DCE opportunities. + enqueueOperands(castOp); + if (remainingCastOps) + erasedOps.insert(castOp.getOperation()); + castOp->erase(); + continue; + } + + // Traverse the chain of input cast ops to see if an op with the same + // input types can be found. + UnrealizedConversionCastOp nextCast = castOp; + while (nextCast) { + if (nextCast.getInputs().getTypes() == castOp.getResultTypes()) { + // Found a cast where the input types match the output types of the + // matched op. We can directly use those inputs and the matched op can + // be removed. + enqueueOperands(castOp); + castOp.replaceAllUsesWith(nextCast.getInputs()); + if (remainingCastOps) + erasedOps.insert(castOp.getOperation()); + castOp->erase(); + break; + } + nextCast = getInputCast(nextCast); + } + } + + if (remainingCastOps) + for (UnrealizedConversionCastOp op : castOps) + if (!erasedOps.contains(op.getOperation())) + remainingCastOps->push_back(op); +} + //===----------------------------------------------------------------------===// // Type Conversion //===----------------------------------------------------------------------===// From b1560bdb2bc67006f3b8f7e84ee0356632bf8126 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Fri, 23 Aug 2024 17:50:27 +0200 Subject: [PATCH 340/426] Reland "[clang] Merge lifetimebound and GSL code paths for lifetime analysis (#104906)" (#105838) Reland without the `EnableLifetimeWarnings` removal. I will remove the EnableLifetimeWarnings in a follow-up patch. I have added a test to prevent regression. 
--- clang/docs/ReleaseNotes.rst | 2 + clang/lib/Sema/CheckExprLifetime.cpp | 127 ++++++++---------- .../Sema/warn-lifetime-analysis-nocfg.cpp | 20 +++ 3 files changed, 77 insertions(+), 72 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index baedc3cd6f03fc..17a707102d041f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -234,6 +234,8 @@ Improvements to Clang's diagnostics - Clang now diagnoses when the result of a [[nodiscard]] function is discarded after being cast in C. Fixes #GH104391. +- Don't emit duplicated dangling diagnostics. (#GH93386). + - Improved diagnostic when trying to befriend a concept. (#GH45182). Improvements to Clang's time-trace diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 7389046eaddde1..c1362559536962 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -326,66 +326,6 @@ static bool shouldTrackFirstArgument(const FunctionDecl *FD) { return false; } -static void handleGslAnnotatedTypes(IndirectLocalPath &Path, Expr *Call, - LocalVisitor Visit) { - auto VisitPointerArg = [&](const Decl *D, Expr *Arg, bool Value) { - // We are not interested in the temporary base objects of gsl Pointers: - // Temp().ptr; // Here ptr might not dangle. - if (isa(Arg->IgnoreImpCasts())) - return; - // Once we initialized a value with a reference, it can no longer dangle. - if (!Value) { - for (const IndirectLocalPathEntry &PE : llvm::reverse(Path)) { - if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit) - continue; - if (PE.Kind == IndirectLocalPathEntry::GslPointerInit || - PE.Kind == IndirectLocalPathEntry::GslPointerAssignment) - return; - break; - } - } - Path.push_back({Value ? 
IndirectLocalPathEntry::GslPointerInit - : IndirectLocalPathEntry::GslReferenceInit, - Arg, D}); - if (Arg->isGLValue()) - visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, - Visit, - /*EnableLifetimeWarnings=*/true); - else - visitLocalsRetainedByInitializer(Path, Arg, Visit, true, - /*EnableLifetimeWarnings=*/true); - Path.pop_back(); - }; - - if (auto *MCE = dyn_cast(Call)) { - const auto *MD = cast_or_null(MCE->getDirectCallee()); - if (MD && shouldTrackImplicitObjectArg(MD)) - VisitPointerArg(MD, MCE->getImplicitObjectArgument(), - !MD->getReturnType()->isReferenceType()); - return; - } else if (auto *OCE = dyn_cast(Call)) { - FunctionDecl *Callee = OCE->getDirectCallee(); - if (Callee && Callee->isCXXInstanceMember() && - shouldTrackImplicitObjectArg(cast(Callee))) - VisitPointerArg(Callee, OCE->getArg(0), - !Callee->getReturnType()->isReferenceType()); - return; - } else if (auto *CE = dyn_cast(Call)) { - FunctionDecl *Callee = CE->getDirectCallee(); - if (Callee && shouldTrackFirstArgument(Callee)) - VisitPointerArg(Callee, CE->getArg(0), - !Callee->getReturnType()->isReferenceType()); - return; - } - - if (auto *CCE = dyn_cast(Call)) { - const auto *Ctor = CCE->getConstructor(); - const CXXRecordDecl *RD = Ctor->getParent(); - if (CCE->getNumArgs() > 0 && RD->hasAttr()) - VisitPointerArg(Ctor->getParamDecl(0), CCE->getArgs()[0], true); - } -} - static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { const TypeSourceInfo *TSI = FD->getTypeSourceInfo(); if (!TSI) @@ -423,8 +363,10 @@ static bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { return false; } -static void visitLifetimeBoundArguments(IndirectLocalPath &Path, Expr *Call, - LocalVisitor Visit) { +// Visit lifetimebound or gsl-pointer arguments. 
+static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, + LocalVisitor Visit, + bool EnableLifetimeWarnings) { const FunctionDecl *Callee; ArrayRef Args; @@ -458,6 +400,34 @@ static void visitLifetimeBoundArguments(IndirectLocalPath &Path, Expr *Call, /*EnableLifetimeWarnings=*/false); Path.pop_back(); }; + auto VisitGSLPointerArg = [&](const Decl *D, Expr *Arg, bool Value) { + // We are not interested in the temporary base objects of gsl Pointers: + // Temp().ptr; // Here ptr might not dangle. + if (isa(Arg->IgnoreImpCasts())) + return; + // Once we initialized a value with a reference, it can no longer dangle. + if (!Value) { + for (const IndirectLocalPathEntry &PE : llvm::reverse(Path)) { + if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit) + continue; + if (PE.Kind == IndirectLocalPathEntry::GslPointerInit || + PE.Kind == IndirectLocalPathEntry::GslPointerAssignment) + return; + break; + } + } + Path.push_back({Value ? IndirectLocalPathEntry::GslPointerInit + : IndirectLocalPathEntry::GslReferenceInit, + Arg, D}); + if (Arg->isGLValue()) + visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, + Visit, + /*EnableLifetimeWarnings=*/true); + else + visitLocalsRetainedByInitializer(Path, Arg, Visit, true, + /*EnableLifetimeWarnings=*/true); + Path.pop_back(); + }; bool CheckCoroCall = false; if (const auto *RD = Callee->getReturnType()->getAsRecordDecl()) { @@ -478,6 +448,12 @@ static void visitLifetimeBoundArguments(IndirectLocalPath &Path, Expr *Call, CheckCoroObjArg = false; if (implicitObjectParamIsLifetimeBound(Callee) || CheckCoroObjArg) VisitLifetimeBoundArg(Callee, ObjectArg); + else if (EnableLifetimeWarnings) { + if (auto *CME = dyn_cast(Callee); + CME && shouldTrackImplicitObjectArg(CME)) + VisitGSLPointerArg(Callee, ObjectArg, + !Callee->getReturnType()->isReferenceType()); + } } for (unsigned I = 0, @@ -485,6 +461,17 @@ static void visitLifetimeBoundArguments(IndirectLocalPath &Path, Expr *Call, I != N; ++I) { 
if (CheckCoroCall || Callee->getParamDecl(I)->hasAttr()) VisitLifetimeBoundArg(Callee->getParamDecl(I), Args[I]); + else if (EnableLifetimeWarnings && I == 0) { + if (shouldTrackFirstArgument(Callee)) { + VisitGSLPointerArg(Callee, Args[0], + !Callee->getReturnType()->isReferenceType()); + } else { + if (auto *CCE = dyn_cast(Call); + CCE && CCE->getConstructor()->getParent()->hasAttr()) + VisitGSLPointerArg(CCE->getConstructor()->getParamDecl(0), Args[0], + true); + } + } } } @@ -557,11 +544,9 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path, EnableLifetimeWarnings); } - if (isa(Init)) { - if (EnableLifetimeWarnings) - handleGslAnnotatedTypes(Path, Init, Visit); - return visitLifetimeBoundArguments(Path, Init, Visit); - } + if (isa(Init)) + return visitFunctionCallArguments(Path, Init, Visit, + EnableLifetimeWarnings); switch (Init->getStmtClass()) { case Stmt::DeclRefExprClass: { @@ -835,11 +820,9 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, } } - if (isa(Init) || isa(Init)) { - if (EnableLifetimeWarnings) - handleGslAnnotatedTypes(Path, Init, Visit); - return visitLifetimeBoundArguments(Path, Init, Visit); - } + if (isa(Init) || isa(Init)) + return visitFunctionCallArguments(Path, Init, Visit, + EnableLifetimeWarnings); switch (Init->getStmtClass()) { case Stmt::UnaryOperatorClass: { diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp index 09dfb2b5d96a89..cd1904db327105 100644 --- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp +++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp @@ -479,3 +479,23 @@ void testForBug49342() { auto it = std::iter{} - 2; // Used to be false positive. } + +namespace GH93386 { +// verify no duplicated diagnostics are emitted. 
+struct [[gsl::Pointer]] S { + S(const std::vector& abc [[clang::lifetimebound]]); +}; + +S test(std::vector a) { + return S(a); // expected-warning {{address of stack memory associated with}} +} + +auto s = S(std::vector()); // expected-warning {{temporary whose address is used as value of local variable}} + +// Verify no regression on the follow case. +std::string_view test2(int i, std::optional a) { + if (i) + return std::move(*a); + return std::move(a.value()); +} +} From fd7904a07bc26950fa7735fb6871a064e3ebc836 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 23 Aug 2024 09:25:24 -0700 Subject: [PATCH 341/426] Revert "[lldb] Speculative fix for trap_frame_sym_ctx.test" This reverts commit 19d3f3417100dc99caa4394fbd26fc0c4702264e. --- lldb/test/Shell/Unwind/trap_frame_sym_ctx.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/Shell/Unwind/trap_frame_sym_ctx.test b/lldb/test/Shell/Unwind/trap_frame_sym_ctx.test index 08a26616240e68..1bf1fb1d6e85f9 100644 --- a/lldb/test/Shell/Unwind/trap_frame_sym_ctx.test +++ b/lldb/test/Shell/Unwind/trap_frame_sym_ctx.test @@ -15,7 +15,7 @@ breakpoint set -n bar process launch # CHECK: stop reason = breakpoint 1.1 -thread backtrace -u +thread backtrace # CHECK: frame #0: {{.*}}`bar # CHECK: frame #1: {{.*}}`tramp # CHECK: frame #2: {{.*}}`main From 0381e01424692a746b941e470c4cc44f6f0bf258 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 23 Aug 2024 09:37:48 -0700 Subject: [PATCH 342/426] Recommit "[RISCV] Add isel optimization for (and (sra y, c2), c1) to recover regression from #101751. (#104114)" Fixed an incorrect cast. Original message: If c1 is a shifted mask with c3 leading zeros and c4 trailing zeros. If c2 is greater than c3, we can use (srli (srai y, c2 - c3), c3 + c4) followed by a SHXADD with c4 as the X amount. Without Zba we can use (slli (srli (srai y, c2 - c3), c3 + c4), c4). 
Alive2: https://alive2.llvm.org/ce/z/AwhheR --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 52 +++++++++++++++- llvm/test/CodeGen/RISCV/rv64zba.ll | 66 +++++++++++++++++++++ 2 files changed, 116 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 11210e6cec177f..58f8dc4970282c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -1462,8 +1462,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { const uint64_t C1 = N1C->getZExtValue(); - // Turn (and (sra x, c2), c1) -> (srli (srai x, c2-c3), c3) if c1 is a mask - // with c3 leading zeros and c2 is larger than c3. if (N0.getOpcode() == ISD::SRA && isa(N0.getOperand(1)) && N0.hasOneUse()) { unsigned C2 = N0.getConstantOperandVal(1); @@ -1477,6 +1475,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { X.getOpcode() == ISD::SHL && isa(X.getOperand(1)) && X.getConstantOperandVal(1) == 32; + // Turn (and (sra x, c2), c1) -> (srli (srai x, c2-c3), c3) if c1 is a + // mask with c3 leading zeros and c2 is larger than c3. if (isMask_64(C1) && !Skip) { unsigned Leading = XLen - llvm::bit_width(C1); if (C2 > Leading) { @@ -1490,6 +1490,27 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { return; } } + + // Look for (and (sra y, c2), c1) where c1 is a shifted mask with c3 + // leading zeros and c4 trailing zeros. If c2 is greater than c3, we can + // use (slli (srli (srai y, c2 - c3), c3 + c4), c4). 
+ if (isShiftedMask_64(C1) && !Skip) { + unsigned Leading = XLen - llvm::bit_width(C1); + unsigned Trailing = llvm::countr_zero(C1); + if (C2 > Leading && Leading > 0 && Trailing > 0) { + SDNode *SRAI = CurDAG->getMachineNode( + RISCV::SRAI, DL, VT, N0.getOperand(0), + CurDAG->getTargetConstant(C2 - Leading, DL, VT)); + SDNode *SRLI = CurDAG->getMachineNode( + RISCV::SRLI, DL, VT, SDValue(SRAI, 0), + CurDAG->getTargetConstant(Leading + Trailing, DL, VT)); + SDNode *SLLI = CurDAG->getMachineNode( + RISCV::SLLI, DL, VT, SDValue(SRLI, 0), + CurDAG->getTargetConstant(Trailing, DL, VT)); + ReplaceNode(Node, SLLI); + return; + } + } } // If C1 masks off the upper bits only (but can't be formed as an @@ -3032,6 +3053,33 @@ bool RISCVDAGToDAGISel::selectSHXADDOp(SDValue N, unsigned ShAmt, return true; } } + } else if (N0.getOpcode() == ISD::SRA && N0.hasOneUse() && + isa(N0.getOperand(1))) { + uint64_t Mask = N.getConstantOperandVal(1); + unsigned C2 = N0.getConstantOperandVal(1); + + // Look for (and (sra y, c2), c1) where c1 is a shifted mask with c3 + // leading zeros and c4 trailing zeros. If c2 is greater than c3, we can + // use (srli (srai y, c2 - c3), c3 + c4) followed by a SHXADD with c4 as + // the X amount. 
+ if (isShiftedMask_64(Mask)) { + unsigned XLen = Subtarget->getXLen(); + unsigned Leading = XLen - llvm::bit_width(Mask); + unsigned Trailing = llvm::countr_zero(Mask); + if (C2 > Leading && Leading > 0 && Trailing == ShAmt) { + SDLoc DL(N); + EVT VT = N.getValueType(); + Val = SDValue(CurDAG->getMachineNode( + RISCV::SRAI, DL, VT, N0.getOperand(0), + CurDAG->getTargetConstant(C2 - Leading, DL, VT)), + 0); + Val = SDValue(CurDAG->getMachineNode( + RISCV::SRLI, DL, VT, Val, + CurDAG->getTargetConstant(Leading + ShAmt, DL, VT)), + 0); + return true; + } + } } } else if (bool LeftShift = N.getOpcode() == ISD::SHL; (LeftShift || N.getOpcode() == ISD::SRL) && diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index 87796e2c7b72e9..62595fd4a7ad69 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -2988,3 +2988,69 @@ entry: %2 = and i64 %1, 34359738360 ret i64 %2 } + +define ptr @srai_srli_sh3add(ptr %0, i64 %1) nounwind { +; RV64I-LABEL: srai_srli_sh3add: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: srai a1, a1, 32 +; RV64I-NEXT: srli a1, a1, 6 +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: srai_srli_sh3add: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: srai a1, a1, 32 +; RV64ZBA-NEXT: srli a1, a1, 6 +; RV64ZBA-NEXT: sh3add a0, a1, a0 +; RV64ZBA-NEXT: ret +entry: + %2 = ashr i64 %1, 32 + %3 = lshr i64 %2, 6 + %4 = getelementptr i64, ptr %0, i64 %3 + ret ptr %4 +} + +define ptr @srai_srli_slli(ptr %0, i64 %1) nounwind { +; CHECK-LABEL: srai_srli_slli: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: srai a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 6 +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret +entry: + %2 = ashr i64 %1, 32 + %3 = lshr i64 %2, 6 + %4 = getelementptr i128, ptr %0, i64 %3 + ret ptr %4 +} + +; Negative to make sure the peephole added for srai_srli_slli and +; srai_srli_sh3add doesn't break this. 
+define i64 @srai_andi(i64 %x) nounwind { +; CHECK-LABEL: srai_andi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: srai a0, a0, 8 +; CHECK-NEXT: andi a0, a0, -8 +; CHECK-NEXT: ret +entry: + %y = ashr i64 %x, 8 + %z = and i64 %y, -8 + ret i64 %z +} + +; Negative to make sure the peephole added for srai_srli_slli and +; srai_srli_sh3add doesn't break this. +define i64 @srai_lui_and(i64 %x) nounwind { +; CHECK-LABEL: srai_lui_and: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: srai a0, a0, 8 +; CHECK-NEXT: lui a1, 1048574 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: ret +entry: + %y = ashr i64 %x, 8 + %z = and i64 %y, -8192 + ret i64 %z +} From 3d18cea904391f510ffd754713ce4e1731845ffb Mon Sep 17 00:00:00 2001 From: Max Coplan Date: Fri, 23 Aug 2024 09:40:38 -0700 Subject: [PATCH 343/426] [libc++][regex] Add _LIBCPP_FALLTHROUGH to suppress fallthrough warning (#100821) --- libcxx/include/regex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/include/regex b/libcxx/include/regex index b8141351213212..08aebc2266f5de 100644 --- a/libcxx/include/regex +++ b/libcxx/include/regex @@ -3921,7 +3921,7 @@ _ForwardIterator basic_regex<_CharT, _Traits>::__parse_character_escape( if (__hd == -1) __throw_regex_error(); __sum = 16 * __sum + static_cast(__hd); - // fallthrough + _LIBCPP_FALLTHROUGH(); case 'x': ++__first; if (__first == __last) From 0bf5846553412978d30b84f06c6b6183890ab8e5 Mon Sep 17 00:00:00 2001 From: Daniel Sanders Date: Fri, 23 Aug 2024 09:43:36 -0700 Subject: [PATCH 344/426] InstructionSelect: Use GISelChangeObserver instead of MachineFunction::Delegate (#105725) The main difference is that it's possible for multiple change observers to be installed at the same time whereas there can only be one MachineFunction delegate installed. This allows downstream targets to continue to use observers to recursively select. 
The target in question was selecting a gMIR instruction to a machine instruction plus some gMIR around it and relying on observers to ensure it correctly selected any gMIR it created before returning to the main loop. --- .../CodeGen/GlobalISel/GISelChangeObserver.h | 14 ++++++++++++++ .../CodeGen/GlobalISel/InstructionSelector.h | 7 +++++++ .../GlobalISel/GISelChangeObserver.cpp | 10 ++++++++++ .../CodeGen/GlobalISel/InstructionSelect.cpp | 19 ++++++++++++++----- 4 files changed, 45 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GISelChangeObserver.h b/llvm/include/llvm/CodeGen/GlobalISel/GISelChangeObserver.h index 7ec5dac9a6ebaf..1167d51e88b71c 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GISelChangeObserver.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GISelChangeObserver.h @@ -138,5 +138,19 @@ class RAIIMFObsDelInstaller { ~RAIIMFObsDelInstaller() = default; }; +/// A simple RAII based Observer installer. +/// Use this in a scope to install the Observer to the MachineFunction and reset +/// it at the end of the scope. 
+class RAIITemporaryObserverInstaller { +public: + RAIITemporaryObserverInstaller(GISelObserverWrapper &Observers, + GISelChangeObserver &TemporaryObserver); + ~RAIITemporaryObserverInstaller(); + +private: + GISelObserverWrapper &Observers; + GISelChangeObserver &TemporaryObserver; +}; + } // namespace llvm #endif diff --git a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h index aaba56ee11251c..fa9ab9fd760515 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h @@ -16,6 +16,8 @@ #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h" namespace llvm { +class GISelObserverWrapper; + class InstructionSelector : public GIMatchTableExecutor { public: virtual ~InstructionSelector(); @@ -36,6 +38,11 @@ class InstructionSelector : public GIMatchTableExecutor { const TargetPassConfig *TPC = nullptr; MachineOptimizationRemarkEmitter *MORE = nullptr; + + /// Note: InstructionSelect does not track changed instructions. + /// changingInstr() and changedInstr() will never be called on these + /// observers. 
+ GISelObserverWrapper *AllObservers = nullptr; }; } // namespace llvm diff --git a/llvm/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp b/llvm/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp index 59f4d60a41d80d..836d54fa989d78 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp @@ -46,3 +46,13 @@ RAIIMFObserverInstaller::RAIIMFObserverInstaller(MachineFunction &MF, } RAIIMFObserverInstaller::~RAIIMFObserverInstaller() { MF.setObserver(nullptr); } + +RAIITemporaryObserverInstaller::RAIITemporaryObserverInstaller( + GISelObserverWrapper &Observers, GISelChangeObserver &TemporaryObserver) + : Observers(Observers), TemporaryObserver(TemporaryObserver) { + Observers.addObserver(&TemporaryObserver); +} + +RAIITemporaryObserverInstaller::~RAIITemporaryObserverInstaller() { + Observers.removeObserver(&TemporaryObserver); +} diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp index 8c0bb85fd0771c..9444ff518ca9cb 100644 --- a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp @@ -75,19 +75,25 @@ InstructionSelect::InstructionSelect(CodeGenOptLevel OL, char &PassID) /// a non-obvious limitation for selector implementers. Therefore, to allow /// deletion of arbitrary instructions, we detect this case and continue /// selection with the predecessor of the deleted instruction. 
-class InstructionSelect::MIIteratorMaintainer - : public MachineFunction::Delegate { +class InstructionSelect::MIIteratorMaintainer : public GISelChangeObserver { #ifndef NDEBUG SmallSetVector CreatedInstrs; #endif public: MachineBasicBlock::reverse_iterator MII; - void MF_HandleInsertion(MachineInstr &MI) override { + void changingInstr(MachineInstr &MI) override { + llvm_unreachable("InstructionSelect does not track changed instructions!"); + } + void changedInstr(MachineInstr &MI) override { + llvm_unreachable("InstructionSelect does not track changed instructions!"); + } + + void createdInstr(MachineInstr &MI) override { LLVM_DEBUG(dbgs() << "Creating: " << MI; CreatedInstrs.insert(&MI)); } - void MF_HandleRemoval(MachineInstr &MI) override { + void erasingInstr(MachineInstr &MI) override { LLVM_DEBUG(dbgs() << "Erasing: " << MI; CreatedInstrs.remove(&MI)); if (MII.getInstrIterator().getNodePtr() == &MI) { // If the iterator points to the MI that will be erased (i.e. the MI prior @@ -190,8 +196,11 @@ bool InstructionSelect::selectMachineFunction(MachineFunction &MF) { // GISelChangeObserver, because we do not want notifications about changed // instructions. This prevents significant compile-time regressions from // e.g. constrainOperandRegClass(). 
+ GISelObserverWrapper AllObservers; MIIteratorMaintainer MIIMaintainer; - RAIIDelegateInstaller DelInstaller(MF, &MIIMaintainer); + AllObservers.addObserver(&MIIMaintainer); + RAIIDelegateInstaller DelInstaller(MF, &AllObservers); + ISel->AllObservers = &AllObservers; for (MachineBasicBlock *MBB : post_order(&MF)) { ISel->CurMBB = MBB; From aec3ec04ac611f9a3d1e1ad075d50f62c1d1a1e2 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Fri, 23 Aug 2024 09:45:42 -0700 Subject: [PATCH 345/426] [SCCP] fix non-determinism (#105758) the visit order depended on hashing because we iterated over a SmallPtrSet --- llvm/lib/Transforms/Utils/SCCPSolver.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 40f0f04c323ddc..670d88ac7cf8fa 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/SCCPSolver.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueLattice.h" @@ -418,7 +419,7 @@ class SCCPInstVisitor : public InstVisitor { DenseMap> FnPredicateInfo; - DenseMap> AdditionalUsers; + DenseMap> AdditionalUsers; LLVMContext &Ctx; From df9767385701b6bb2ff0411ad6b407bcefbfe34c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 23 Aug 2024 18:08:33 +0100 Subject: [PATCH 346/426] [X86] Add some initial test coverage for half libcall expansion/promotion We can add additional tests in the future, but this is an initial placeholder Inspired by #105775 --- llvm/test/CodeGen/X86/fp16-libcalls.ll | 375 +++++++++++++++++++++++++ 1 file changed, 375 insertions(+) create mode 100644 llvm/test/CodeGen/X86/fp16-libcalls.ll diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll 
b/llvm/test/CodeGen/X86/fp16-libcalls.ll new file mode 100644 index 00000000000000..db3d031a8fe3fb --- /dev/null +++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll @@ -0,0 +1,375 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -enable-legalize-types-checking -mtriple=x86_64-linux-gnu -mattr=+f16c | FileCheck %s --check-prefix=F16C +; RUN: llc < %s -enable-legalize-types-checking -mtriple=x86_64-linux-gnu -mattr=+avx512fp16 | FileCheck %s --check-prefix=FP16 +; RUN: llc < %s -enable-legalize-types-checking -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -enable-legalize-types-checking -mtriple=i686-linux-gnu -mattr=sse2 | FileCheck %s --check-prefix=X86 + +; Check all soft floating point library function calls. + +define void @test_half_ceil(half %a0, ptr %p0) nounwind { +; F16C-LABEL: test_half_ceil: +; F16C: # %bb.0: +; F16C-NEXT: vpextrw $0, %xmm0, %eax +; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; F16C-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0 +; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: movw %ax, (%rdi) +; F16C-NEXT: retq +; +; FP16-LABEL: test_half_ceil: +; FP16: # %bb.0: +; FP16-NEXT: vrndscalesh $10, %xmm0, %xmm0, %xmm0 +; FP16-NEXT: vmovsh %xmm0, (%rdi) +; FP16-NEXT: retq +; +; X64-LABEL: test_half_ceil: +; X64: # %bb.0: +; X64-NEXT: pushq %rbx +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: callq __extendhfsf2@PLT +; X64-NEXT: callq ceilf@PLT +; X64-NEXT: callq __truncsfhf2@PLT +; X64-NEXT: pextrw $0, %xmm0, %eax +; X64-NEXT: movw %ax, (%rbx) +; X64-NEXT: popq %rbx +; X64-NEXT: retq +; +; X86-LABEL: test_half_ceil: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) +; X86-NEXT: calll __extendhfsf2 +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll ceilf +; 
X86-NEXT: fstps (%esp) +; X86-NEXT: calll __truncsfhf2 +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esi) +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl + %res = call half @llvm.ceil.half(half %a0) + store half %res, ptr %p0, align 2 + ret void +} + +define void @test_half_cos(half %a0, ptr %p0) nounwind { +; F16C-LABEL: test_half_cos: +; F16C: # %bb.0: +; F16C-NEXT: pushq %rbx +; F16C-NEXT: movq %rdi, %rbx +; F16C-NEXT: vpextrw $0, %xmm0, %eax +; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; F16C-NEXT: callq cosf@PLT +; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: movw %ax, (%rbx) +; F16C-NEXT: popq %rbx +; F16C-NEXT: retq +; +; FP16-LABEL: test_half_cos: +; FP16: # %bb.0: +; FP16-NEXT: pushq %rbx +; FP16-NEXT: movq %rdi, %rbx +; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; FP16-NEXT: callq cosf@PLT +; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; FP16-NEXT: vmovsh %xmm0, (%rbx) +; FP16-NEXT: popq %rbx +; FP16-NEXT: retq +; +; X64-LABEL: test_half_cos: +; X64: # %bb.0: +; X64-NEXT: pushq %rbx +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: callq __extendhfsf2@PLT +; X64-NEXT: callq cosf@PLT +; X64-NEXT: callq __truncsfhf2@PLT +; X64-NEXT: pextrw $0, %xmm0, %eax +; X64-NEXT: movw %ax, (%rbx) +; X64-NEXT: popq %rbx +; X64-NEXT: retq +; +; X86-LABEL: test_half_cos: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) +; X86-NEXT: calll __extendhfsf2 +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll cosf +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll __truncsfhf2 +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esi) +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl + %res = call half @llvm.cos.half(half %a0) + store half %res, ptr %p0, align 2 + ret void +} + +define void 
@test_half_fabs(half %a0, ptr %p0) nounwind { +; F16C-LABEL: test_half_fabs: +; F16C: # %bb.0: +; F16C-NEXT: vpextrw $0, %xmm0, %eax +; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; F16C-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: movw %ax, (%rdi) +; F16C-NEXT: retq +; +; FP16-LABEL: test_half_fabs: +; FP16: # %bb.0: +; FP16-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; FP16-NEXT: vpand %xmm1, %xmm0, %xmm0 +; FP16-NEXT: vmovsh %xmm0, (%rdi) +; FP16-NEXT: retq +; +; X64-LABEL: test_half_fabs: +; X64: # %bb.0: +; X64-NEXT: pushq %rbx +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: callq __extendhfsf2@PLT +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: callq __truncsfhf2@PLT +; X64-NEXT: pextrw $0, %xmm0, %eax +; X64-NEXT: movw %ax, (%rbx) +; X64-NEXT: popq %rbx +; X64-NEXT: retq +; +; X86-LABEL: test_half_fabs: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) +; X86-NEXT: calll __extendhfsf2 +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: movd %xmm0, (%esp) +; X86-NEXT: calll __truncsfhf2 +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esi) +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl + %res = call half @llvm.fabs.half(half %a0) + store half %res, ptr %p0, align 2 + ret void +} + +define void @test_half_pow(half %a0, half %a1, ptr %p0) nounwind { +; F16C-LABEL: test_half_pow: +; F16C: # %bb.0: +; F16C-NEXT: pushq %rbx +; F16C-NEXT: movq %rdi, %rbx +; F16C-NEXT: vpextrw $0, %xmm1, %eax +; F16C-NEXT: vpextrw $0, %xmm0, %ecx +; F16C-NEXT: vmovd %ecx, %xmm0 +; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; 
F16C-NEXT: vmovd %eax, %xmm1 +; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 +; F16C-NEXT: callq powf@PLT +; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: movw %ax, (%rbx) +; F16C-NEXT: popq %rbx +; F16C-NEXT: retq +; +; FP16-LABEL: test_half_pow: +; FP16: # %bb.0: +; FP16-NEXT: pushq %rbx +; FP16-NEXT: movq %rdi, %rbx +; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; FP16-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 +; FP16-NEXT: callq powf@PLT +; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; FP16-NEXT: vmovsh %xmm0, (%rbx) +; FP16-NEXT: popq %rbx +; FP16-NEXT: retq +; +; X64-LABEL: test_half_pow: +; X64: # %bb.0: +; X64-NEXT: pushq %rbx +; X64-NEXT: subq $16, %rsp +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: callq __extendhfsf2@PLT +; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; X64-NEXT: # xmm0 = mem[0],zero,zero,zero +; X64-NEXT: callq __extendhfsf2@PLT +; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; X64-NEXT: # xmm1 = mem[0],zero,zero,zero +; X64-NEXT: callq powf@PLT +; X64-NEXT: callq __truncsfhf2@PLT +; X64-NEXT: pextrw $0, %xmm0, %eax +; X64-NEXT: movw %ax, (%rbx) +; X64-NEXT: addq $16, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: retq +; +; X86-LABEL: test_half_pow: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) +; X86-NEXT: calll __extendhfsf2 +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) +; 
X86-NEXT: calll __extendhfsf2 +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll powf +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll __truncsfhf2 +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esi) +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl + %res = call half @llvm.pow.half(half %a0, half %a1) + store half %res, ptr %p0, align 2 + ret void +} + +define void @test_half_sin(half %a0, ptr %p0) nounwind { +; F16C-LABEL: test_half_sin: +; F16C: # %bb.0: +; F16C-NEXT: pushq %rbx +; F16C-NEXT: movq %rdi, %rbx +; F16C-NEXT: vpextrw $0, %xmm0, %eax +; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; F16C-NEXT: callq sinf@PLT +; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: movw %ax, (%rbx) +; F16C-NEXT: popq %rbx +; F16C-NEXT: retq +; +; FP16-LABEL: test_half_sin: +; FP16: # %bb.0: +; FP16-NEXT: pushq %rbx +; FP16-NEXT: movq %rdi, %rbx +; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; FP16-NEXT: callq sinf@PLT +; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; FP16-NEXT: vmovsh %xmm0, (%rbx) +; FP16-NEXT: popq %rbx +; FP16-NEXT: retq +; +; X64-LABEL: test_half_sin: +; X64: # %bb.0: +; X64-NEXT: pushq %rbx +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: callq __extendhfsf2@PLT +; X64-NEXT: callq sinf@PLT +; X64-NEXT: callq __truncsfhf2@PLT +; X64-NEXT: pextrw $0, %xmm0, %eax +; X64-NEXT: movw %ax, (%rbx) +; X64-NEXT: popq %rbx +; X64-NEXT: retq +; +; X86-LABEL: test_half_sin: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) +; X86-NEXT: calll __extendhfsf2 +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll sinf +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll __truncsfhf2 +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esi) 
+; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl + %res = call half @llvm.sin.half(half %a0) + store half %res, ptr %p0, align 2 + ret void +} + +define void @test_half_sqrt(half %a0, ptr %p0) nounwind { +; F16C-LABEL: test_half_sqrt: +; F16C: # %bb.0: +; F16C-NEXT: vpextrw $0, %xmm0, %eax +; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; F16C-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: movw %ax, (%rdi) +; F16C-NEXT: retq +; +; FP16-LABEL: test_half_sqrt: +; FP16: # %bb.0: +; FP16-NEXT: vsqrtsh %xmm0, %xmm0, %xmm0 +; FP16-NEXT: vmovsh %xmm0, (%rdi) +; FP16-NEXT: retq +; +; X64-LABEL: test_half_sqrt: +; X64: # %bb.0: +; X64-NEXT: pushq %rbx +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: callq __extendhfsf2@PLT +; X64-NEXT: sqrtss %xmm0, %xmm0 +; X64-NEXT: callq __truncsfhf2@PLT +; X64-NEXT: pextrw $0, %xmm0, %eax +; X64-NEXT: movw %ax, (%rbx) +; X64-NEXT: popq %rbx +; X64-NEXT: retq +; +; X86-LABEL: test_half_sqrt: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) +; X86-NEXT: calll __extendhfsf2 +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: sqrtss %xmm0, %xmm0 +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: calll __truncsfhf2 +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esi) +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl + %res = call half @llvm.sqrt.half(half %a0) + store half %res, ptr %p0, align 2 + ret void +} From 1821cb38995796e1b8d46357c2b26eff4ca0f88c Mon Sep 17 00:00:00 2001 From: Michael Park Date: Fri, 23 Aug 2024 10:29:33 -0700 Subject: [PATCH 347/426] [NFC] Fix an incorrect comment about operator precedence. 
(#105784) The comment talks about left-associative operators twice, when the latter mention is actually describing right-associative operators. --- clang/lib/Parse/ParseExpr.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 1405aef700bec5..64f284d78b24db 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -607,7 +607,7 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) { RHS = ExprError(); } // If this is left-associative, only parse things on the RHS that bind - // more tightly than the current operator. If it is left-associative, it + // more tightly than the current operator. If it is right-associative, it // is okay, to bind exactly as tightly. For example, compile A=B=C=D as // A=(B=(C=D)), where each paren is a level of recursion here. // The function takes ownership of the RHS. From 960a210b1f22f74ba32a04acbb5d3134d4443839 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Fri, 23 Aug 2024 10:31:43 -0700 Subject: [PATCH 348/426] [ctx_prof] Remove the dependency on the "name" GlobalVariable (#105731) We don't need that name variable for contextual instrumentation, we just use the function to get its GUID which we pass to the runtime, and rely on metadata to capture it through the various optimization passes. This change removes the need for the name global variable. 
--- llvm/include/llvm/IR/IntrinsicInst.h | 14 +++++-- .../Instrumentation/PGOCtxProfLowering.cpp | 3 +- .../Instrumentation/PGOInstrumentation.cpp | 15 +++---- .../PGOProfile/ctx-instrumentation.ll | 41 +++++++------------ .../PGOProfile/ctx-prof-use-prelink.ll | 11 ++--- 5 files changed, 39 insertions(+), 45 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index c188bec631a239..b45c89cadb0fde 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -1503,11 +1503,19 @@ class InstrProfInstBase : public IntrinsicInst { return isCounterBase(*Instr) || isMCDCBitmapBase(*Instr); return false; } - // The name of the instrumented function. + + // The name of the instrumented function, assuming it is a global variable. GlobalVariable *getName() const { - return cast( - const_cast(getArgOperand(0))->stripPointerCasts()); + return cast(getNameValue()); + } + + // The "name" operand of the profile instrumentation instruction - this is the + // operand that can be used to relate the instruction to the function it + // belonged to at instrumentation time. + Value *getNameValue() const { + return const_cast(getArgOperand(0))->stripPointerCasts(); } + // The hash of the CFG for the instrumented function. 
ConstantInt *getHash() const { return cast(const_cast(getArgOperand(1))); diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp index 9b10cbba84075a..43bebc99316e06 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp @@ -226,7 +226,8 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) { IRBuilder<> Builder(Mark); - Guid = Builder.getInt64(AssignGUIDPass::getGUID(F)); + Guid = Builder.getInt64( + AssignGUIDPass::getGUID(cast(*Mark->getNameValue()))); // The type of the context of this function is now knowable since we have // NrCallsites and NrCounters. We delcare it here because it's more // convenient - we have the Builder. diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 39cf94daab7d3b..aacfe39f16fbc4 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -464,7 +464,7 @@ struct SelectInstVisitor : public InstVisitor { VisitMode Mode = VM_counting; // Visiting mode. unsigned *CurCtrIdx = nullptr; // Pointer to current counter index. unsigned TotalNumCtrs = 0; // Total number of counters - GlobalVariable *FuncNameVar = nullptr; + GlobalValue *FuncNameVar = nullptr; uint64_t FuncHash = 0; PGOUseFunc *UseFunc = nullptr; bool HasSingleByteCoverage; @@ -482,7 +482,7 @@ struct SelectInstVisitor : public InstVisitor { // Ind is a pointer to the counter index variable; \p TotalNC // is the total number of counters; \p FNV is the pointer to the // PGO function name var; \p FHash is the function hash. 
- void instrumentSelects(unsigned *Ind, unsigned TotalNC, GlobalVariable *FNV, + void instrumentSelects(unsigned *Ind, unsigned TotalNC, GlobalValue *FNV, uint64_t FHash) { Mode = VM_instrument; CurCtrIdx = Ind; @@ -901,13 +901,14 @@ void FunctionInstrumenter::instrument() { SplitIndirectBrCriticalEdges(F, /*IgnoreBlocksWithoutPHI=*/false, BPI, BFI); } + const bool IsCtxProf = InstrumentationType == PGOInstrumentationType::CTXPROF; FuncPGOInstrumentation FuncInfo( - F, TLI, ComdatMembers, true, BPI, BFI, + F, TLI, ComdatMembers, /*CreateGlobalVar=*/!IsCtxProf, BPI, BFI, InstrumentationType == PGOInstrumentationType::CSFDO, shouldInstrumentEntryBB(), PGOBlockCoverage); - auto Name = FuncInfo.FuncNameVar; - auto CFGHash = + auto *const Name = IsCtxProf ? cast(&F) : FuncInfo.FuncNameVar; + auto *const CFGHash = ConstantInt::get(Type::getInt64Ty(M.getContext()), FuncInfo.FunctionHash); // Make sure that pointer to global is passed in with zero addrspace // This is relevant during GPU profiling @@ -929,7 +930,7 @@ void FunctionInstrumenter::instrument() { unsigned NumCounters = InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts(); - if (InstrumentationType == PGOInstrumentationType::CTXPROF) { + if (IsCtxProf) { auto *CSIntrinsic = Intrinsic::getDeclaration(&M, Intrinsic::instrprof_callsite); // We want to count the instrumentable callsites, then instrument them. 
This @@ -995,7 +996,7 @@ void FunctionInstrumenter::instrument() { } // Now instrument select instructions: - FuncInfo.SIVisitor.instrumentSelects(&I, NumCounters, FuncInfo.FuncNameVar, + FuncInfo.SIVisitor.instrumentSelects(&I, NumCounters, Name, FuncInfo.FunctionHash); assert(I == NumCounters); diff --git a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll index df4e467567c46e..c94c2b4da57a98 100644 --- a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll +++ b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll @@ -9,19 +9,6 @@ declare void @bar() ;. -; INSTRUMENT: @__profn_foo = private constant [3 x i8] c"foo" -; INSTRUMENT: @__profn_an_entrypoint = private constant [13 x i8] c"an_entrypoint" -; INSTRUMENT: @__profn_another_entrypoint_no_callees = private constant [29 x i8] c"another_entrypoint_no_callees" -; INSTRUMENT: @__profn_simple = private constant [6 x i8] c"simple" -; INSTRUMENT: @__profn_no_callsites = private constant [12 x i8] c"no_callsites" -; INSTRUMENT: @__profn_no_counters = private constant [11 x i8] c"no_counters" -;. 
-; LOWERING: @__profn_foo = private constant [3 x i8] c"foo" -; LOWERING: @__profn_an_entrypoint = private constant [13 x i8] c"an_entrypoint" -; LOWERING: @__profn_another_entrypoint_no_callees = private constant [29 x i8] c"another_entrypoint_no_callees" -; LOWERING: @__profn_simple = private constant [6 x i8] c"simple" -; LOWERING: @__profn_no_callsites = private constant [12 x i8] c"no_callsites" -; LOWERING: @__profn_no_counters = private constant [11 x i8] c"no_counters" ; LOWERING: @an_entrypoint_ctx_root = global { ptr, ptr, ptr, i8 } zeroinitializer ; LOWERING: @another_entrypoint_no_callees_ctx_root = global { ptr, ptr, ptr, i8 } zeroinitializer ; LOWERING: @__llvm_ctx_profile_callsite = external hidden thread_local global ptr @@ -30,16 +17,16 @@ declare void @bar() define void @foo(i32 %a, ptr %fct) { ; INSTRUMENT-LABEL: define void @foo( ; INSTRUMENT-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) { -; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0) +; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @foo, i64 728453322856651412, i32 2, i32 0) ; INSTRUMENT-NEXT: [[T:%.*]] = icmp eq i32 [[A]], 0 ; INSTRUMENT-NEXT: br i1 [[T]], label [[YES:%.*]], label [[NO:%.*]] ; INSTRUMENT: yes: -; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 1) -; INSTRUMENT-NEXT: call void @llvm.instrprof.callsite(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0, ptr [[FCT]]) +; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @foo, i64 728453322856651412, i32 2, i32 1) +; INSTRUMENT-NEXT: call void @llvm.instrprof.callsite(ptr @foo, i64 728453322856651412, i32 2, i32 0, ptr [[FCT]]) ; INSTRUMENT-NEXT: call void [[FCT]](i32 [[A]]) ; INSTRUMENT-NEXT: br label [[EXIT:%.*]] ; INSTRUMENT: no: -; INSTRUMENT-NEXT: call void @llvm.instrprof.callsite(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 1, ptr @bar) +; INSTRUMENT-NEXT: call void 
@llvm.instrprof.callsite(ptr @foo, i64 728453322856651412, i32 2, i32 1, ptr @bar) ; INSTRUMENT-NEXT: call void @bar() ; INSTRUMENT-NEXT: br label [[EXIT]] ; INSTRUMENT: exit: @@ -92,12 +79,12 @@ exit: define void @an_entrypoint(i32 %a) { ; INSTRUMENT-LABEL: define void @an_entrypoint( ; INSTRUMENT-SAME: i32 [[A:%.*]]) { -; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @__profn_an_entrypoint, i64 784007058953177093, i32 2, i32 0) +; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @an_entrypoint, i64 784007058953177093, i32 2, i32 0) ; INSTRUMENT-NEXT: [[T:%.*]] = icmp eq i32 [[A]], 0 ; INSTRUMENT-NEXT: br i1 [[T]], label [[YES:%.*]], label [[NO:%.*]] ; INSTRUMENT: yes: -; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @__profn_an_entrypoint, i64 784007058953177093, i32 2, i32 1) -; INSTRUMENT-NEXT: call void @llvm.instrprof.callsite(ptr @__profn_an_entrypoint, i64 784007058953177093, i32 1, i32 0, ptr @foo) +; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @an_entrypoint, i64 784007058953177093, i32 2, i32 1) +; INSTRUMENT-NEXT: call void @llvm.instrprof.callsite(ptr @an_entrypoint, i64 784007058953177093, i32 1, i32 0, ptr @foo) ; INSTRUMENT-NEXT: call void @foo(i32 1, ptr null) ; INSTRUMENT-NEXT: ret void ; INSTRUMENT: no: @@ -144,11 +131,11 @@ no: define void @another_entrypoint_no_callees(i32 %a) { ; INSTRUMENT-LABEL: define void @another_entrypoint_no_callees( ; INSTRUMENT-SAME: i32 [[A:%.*]]) { -; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @__profn_another_entrypoint_no_callees, i64 784007058953177093, i32 2, i32 0) +; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @another_entrypoint_no_callees, i64 784007058953177093, i32 2, i32 0) ; INSTRUMENT-NEXT: [[T:%.*]] = icmp eq i32 [[A]], 0 ; INSTRUMENT-NEXT: br i1 [[T]], label [[YES:%.*]], label [[NO:%.*]] ; INSTRUMENT: yes: -; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @__profn_another_entrypoint_no_callees, i64 784007058953177093, i32 
2, i32 1) +; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @another_entrypoint_no_callees, i64 784007058953177093, i32 2, i32 1) ; INSTRUMENT-NEXT: ret void ; INSTRUMENT: no: ; INSTRUMENT-NEXT: ret void @@ -184,7 +171,7 @@ no: define void @simple(i32 %a) { ; INSTRUMENT-LABEL: define void @simple( ; INSTRUMENT-SAME: i32 [[A:%.*]]) { -; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @__profn_simple, i64 742261418966908927, i32 1, i32 0) +; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @simple, i64 742261418966908927, i32 1, i32 0) ; INSTRUMENT-NEXT: ret void ; ; LOWERING-LABEL: define void @simple( @@ -202,11 +189,11 @@ define void @simple(i32 %a) { define i32 @no_callsites(i32 %a) { ; INSTRUMENT-LABEL: define i32 @no_callsites( ; INSTRUMENT-SAME: i32 [[A:%.*]]) { -; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @__profn_no_callsites, i64 784007058953177093, i32 2, i32 0) +; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @no_callsites, i64 784007058953177093, i32 2, i32 0) ; INSTRUMENT-NEXT: [[C:%.*]] = icmp eq i32 [[A]], 0 ; INSTRUMENT-NEXT: br i1 [[C]], label [[YES:%.*]], label [[NO:%.*]] ; INSTRUMENT: yes: -; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @__profn_no_callsites, i64 784007058953177093, i32 2, i32 1) +; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @no_callsites, i64 784007058953177093, i32 2, i32 1) ; INSTRUMENT-NEXT: ret i32 1 ; INSTRUMENT: no: ; INSTRUMENT-NEXT: ret i32 0 @@ -238,8 +225,8 @@ no: define void @no_counters() { ; INSTRUMENT-LABEL: define void @no_counters() { -; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @__profn_no_counters, i64 742261418966908927, i32 1, i32 0) -; INSTRUMENT-NEXT: call void @llvm.instrprof.callsite(ptr @__profn_no_counters, i64 742261418966908927, i32 1, i32 0, ptr @bar) +; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @no_counters, i64 742261418966908927, i32 1, i32 0) +; INSTRUMENT-NEXT: call void 
@llvm.instrprof.callsite(ptr @no_counters, i64 742261418966908927, i32 1, i32 0, ptr @bar) ; INSTRUMENT-NEXT: call void @bar() ; INSTRUMENT-NEXT: ret void ; diff --git a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll index cb8ab78dc0f414..7959e4d0760edb 100644 --- a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll +++ b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll @@ -7,22 +7,19 @@ declare void @bar() -;. -; CHECK: @__profn_foo = private constant [3 x i8] c"foo" -;. define void @foo(i32 %a, ptr %fct) { ; CHECK-LABEL: define void @foo( ; CHECK-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) local_unnamed_addr !guid [[META0:![0-9]+]] { -; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @foo, i64 728453322856651412, i32 2, i32 0) ; CHECK-NEXT: [[T:%.*]] = icmp eq i32 [[A]], 0 ; CHECK-NEXT: br i1 [[T]], label %[[YES:.*]], label %[[NO:.*]] ; CHECK: [[YES]]: -; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 1) -; CHECK-NEXT: call void @llvm.instrprof.callsite(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0, ptr [[FCT]]) +; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @foo, i64 728453322856651412, i32 2, i32 1) +; CHECK-NEXT: call void @llvm.instrprof.callsite(ptr @foo, i64 728453322856651412, i32 2, i32 0, ptr [[FCT]]) ; CHECK-NEXT: call void [[FCT]](i32 0) ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[NO]]: -; CHECK-NEXT: call void @llvm.instrprof.callsite(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 1, ptr @bar) +; CHECK-NEXT: call void @llvm.instrprof.callsite(ptr @foo, i64 728453322856651412, i32 2, i32 1, ptr @bar) ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: br label %[[EXIT]] ; CHECK: [[EXIT]]: From fa089efa6c70f4da8618f2f41ee9c7db86e2b0e0 Mon Sep 17 00:00:00 2001 From: Ben Langmuir Date: Fri, 23 Aug 2024 
10:32:14 -0700 Subject: [PATCH 349/426] [orc][mach-o] Unlock the JITDylib state mutex during +load (#105333) Similar to what was already done for static initializers, we need to unlock the state mutex when calling out to libobjc to run +load methods in case they cause us to reenter the runtime, which was previously deadlocking. No test for now, because we don't have any code paths in llvm-jitlink itself that could lead to this deadlock. If we interpose calls to dlopen to go back to the JIT in the future then calling dlopen from a +load is the easiest way to reproduce this. rdar://133430490 --- compiler-rt/lib/orc/macho_platform.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/compiler-rt/lib/orc/macho_platform.cpp b/compiler-rt/lib/orc/macho_platform.cpp index 340846f5f90017..9b4f6e5fd417c2 100644 --- a/compiler-rt/lib/orc/macho_platform.cpp +++ b/compiler-rt/lib/orc/macho_platform.cpp @@ -367,7 +367,9 @@ class MachOPlatformRuntimeState { static Error registerEHFrames(span EHFrameSection); static Error deregisterEHFrames(span EHFrameSection); - static Error registerObjCRegistrationObjects(JITDylibState &JDS); + static Error + registerObjCRegistrationObjects(std::unique_lock &JDStatesLock, + JITDylibState &JDS); static Error runModInits(std::unique_lock &JDStatesLock, JITDylibState &JDS); @@ -1059,7 +1061,7 @@ Error MachOPlatformRuntimeState::deregisterEHFrames( } Error MachOPlatformRuntimeState::registerObjCRegistrationObjects( - JITDylibState &JDS) { + std::unique_lock &JDStatesLock, JITDylibState &JDS) { ORC_RT_DEBUG(printdbg("Registering Objective-C / Swift metadata.\n")); std::vector RegObjBases; @@ -1074,6 +1076,9 @@ Error MachOPlatformRuntimeState::registerObjCRegistrationObjects( "Could not register Objective-C / Swift metadata: _objc_map_images / " "_objc_load_image not found"); + // Release the lock while calling out to libobjc in case +load methods cause + // reentering the orc runtime.
+ JDStatesLock.unlock(); std::vector Paths; Paths.resize(RegObjBases.size()); _objc_map_images(RegObjBases.size(), Paths.data(), @@ -1081,6 +1086,7 @@ Error MachOPlatformRuntimeState::registerObjCRegistrationObjects( for (void *RegObjBase : RegObjBases) _objc_load_image(nullptr, reinterpret_cast(RegObjBase)); + JDStatesLock.lock(); return Error::success(); } @@ -1218,7 +1224,7 @@ Error MachOPlatformRuntimeState::dlopenInitialize( } // Initialize this JITDylib. - if (auto Err = registerObjCRegistrationObjects(JDS)) + if (auto Err = registerObjCRegistrationObjects(JDStatesLock, JDS)) return Err; if (auto Err = runModInits(JDStatesLock, JDS)) return Err; From ebc4a66e9b525f7efc03053e3c7472d3e3fb0412 Mon Sep 17 00:00:00 2001 From: Joshua Batista Date: Fri, 23 Aug 2024 10:47:05 -0700 Subject: [PATCH 350/426] Implement resource binding type prefix mismatch diagnostic infrastructure (#97103) There are currently no diagnostics being emitted for when a resource is bound to a register with an incorrect binding type prefix. For example, a CBuffer type resource should be bound with a a binding type prefix of 'b', but if instead the prefix is 'u', no errors will be emitted. This PR implements such diagnostics. 
The focus of this PR is to implement both the flag setting and diagnostic emission steps specified in the relevant spec: https://github.com/microsoft/hlsl-specs/pull/230 The relevant issue is: https://github.com/llvm/llvm-project/issues/57886 This is a continuation / refresh of this PR: https://github.com/llvm/llvm-project/pull/87578 --- clang/include/clang/Basic/Attr.td | 4 +- clang/include/clang/Basic/DiagnosticGroups.td | 3 + .../clang/Basic/DiagnosticSemaKinds.td | 8 +- clang/include/clang/Parse/Parser.h | 4 +- clang/lib/Parse/ParseDecl.cpp | 3 +- clang/lib/Sema/HLSLExternalSemaSource.cpp | 4 +- clang/lib/Sema/SemaDeclAttr.cpp | 3 + clang/lib/Sema/SemaHLSL.cpp | 405 +++++++++++++++++- .../ast-dump-comment-cbuffe-tbufferr.hlsl | 4 + clang/test/AST/HLSL/cbuffer_tbuffer.hlsl | 8 +- clang/test/AST/HLSL/packoffset.hlsl | 2 + clang/test/AST/HLSL/pch_hlsl_buffer.hlsl | 4 + .../test/AST/HLSL/resource_binding_attr.hlsl | 8 +- ...a-attribute-supported-attributes-list.test | 2 +- .../ParserHLSL/hlsl_resource_class_attr.hlsl | 30 +- .../hlsl_resource_class_attr_error.hlsl | 15 +- .../SemaHLSL/resource_binding_attr_error.hlsl | 52 ++- .../resource_binding_attr_error_basic.hlsl | 39 ++ .../resource_binding_attr_error_other.hlsl | 9 + .../resource_binding_attr_error_resource.hlsl | 49 +++ ...urce_binding_attr_error_silence_diags.hlsl | 27 ++ .../resource_binding_attr_error_udt.hlsl | 128 ++++++ 22 files changed, 751 insertions(+), 60 deletions(-) create mode 100644 clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl create mode 100644 clang/test/SemaHLSL/resource_binding_attr_error_other.hlsl create mode 100644 clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl create mode 100644 clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl create mode 100644 clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 98bedfe20f5d98..a83e908899c83b 100644 ---
a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4538,7 +4538,7 @@ def HLSLSV_GroupIndex: HLSLAnnotationAttr { def HLSLResourceBinding: InheritableAttr { let Spellings = [HLSLAnnotation<"register">]; - let Subjects = SubjectList<[HLSLBufferObj, ExternalGlobalVar]>; + let Subjects = SubjectList<[HLSLBufferObj, ExternalGlobalVar], ErrorDiag>; let LangOpts = [HLSL]; let Args = [StringArgument<"Slot">, StringArgument<"Space", 1>]; let Documentation = [HLSLResourceBindingDocs]; @@ -4622,7 +4622,7 @@ def HLSLROV : InheritableAttr { def HLSLResourceClass : InheritableAttr { let Spellings = [CXX11<"hlsl", "resource_class">]; - let Subjects = SubjectList<[Struct]>; + let Subjects = SubjectList<[Field]>; let LangOpts = [HLSL]; let Args = [ EnumArgument<"ResourceClass", "llvm::hlsl::ResourceClass", diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 19c3f1e0433496..28d315f63e5c47 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1547,6 +1547,9 @@ def DXILValidation : DiagGroup<"dxil-validation">; // Warning for HLSL API availability def HLSLAvailability : DiagGroup<"hlsl-availability">; +// Warnings for legacy binding behavior +def LegacyConstantRegisterBinding : DiagGroup<"legacy-constant-register-binding">; + // Warnings and notes related to const_var_decl_type attribute checks def ReadOnlyPlacementChecks : DiagGroup<"read-only-types">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 4b6aadd635786a..ede3435d3e1b71 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12352,7 +12352,13 @@ def err_hlsl_missing_semantic_annotation : Error< def err_hlsl_init_priority_unsupported : Error< "initializer priorities are not supported in HLSL">; -def err_hlsl_unsupported_register_type : 
Error<"invalid resource class specifier '%0' used; expected 'b', 's', 't', or 'u'">; +def warn_hlsl_user_defined_type_missing_member: Warning<"binding type '%select{t|u|b|s|c}0' only applies to types containing %select{SRV resources|UAV resources|constant buffer resources|sampler state|numeric types}0">, InGroup; +def err_hlsl_binding_type_mismatch: Error<"binding type '%select{t|u|b|s|c}0' only applies to %select{SRV resources|UAV resources|constant buffer resources|sampler state|numeric variables in the global scope}0">; +def err_hlsl_binding_type_invalid: Error<"binding type '%0' is invalid">; +def err_hlsl_duplicate_register_annotation: Error<"binding type '%select{t|u|b|s|c|i}0' cannot be applied more than once">; +def warn_hlsl_register_type_c_packoffset: Warning<"binding type 'c' ignored in buffer declaration. Did you mean 'packoffset'?">, InGroup, DefaultError; +def warn_hlsl_deprecated_register_type_b: Warning<"binding type 'b' only applies to constant buffers. The 'bool constant' binding type is no longer supported">, InGroup, DefaultError; +def warn_hlsl_deprecated_register_type_i: Warning<"binding type 'i' ignored. 
The 'integer constant' binding type is no longer supported">, InGroup, DefaultError; def err_hlsl_unsupported_register_number : Error<"register number should be an integer">; def err_hlsl_expected_space : Error<"invalid space specifier '%0' used; expected 'space' followed by an integer, like space1">; def warn_hlsl_packoffset_mix : Warning<"cannot mix packoffset elements with nonpackoffset elements in a cbuffer">, diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 99a0b0200fa06f..a7513069ff5da0 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -3021,7 +3021,7 @@ class Parser : public CodeCompletionHandler { SemaCodeCompletion::AttributeCompletion::None, const IdentifierInfo *EnclosingScope = nullptr); - void MaybeParseHLSLAnnotations(Declarator &D, + bool MaybeParseHLSLAnnotations(Declarator &D, SourceLocation *EndLoc = nullptr, bool CouldBeBitField = false) { assert(getLangOpts().HLSL && "MaybeParseHLSLAnnotations is for HLSL only"); @@ -3029,7 +3029,9 @@ class Parser : public CodeCompletionHandler { ParsedAttributes Attrs(AttrFactory); ParseHLSLAnnotations(Attrs, EndLoc, CouldBeBitField); D.takeAttributes(Attrs); + return true; } + return false; } void MaybeParseHLSLAnnotations(ParsedAttributes &Attrs, diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index ed5d6ce90aa7d1..78d729c5ef7d8a 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -2326,7 +2326,8 @@ Parser::DeclGroupPtrTy Parser::ParseDeclGroup(ParsingDeclSpec &DS, } if (getLangOpts().HLSL) - MaybeParseHLSLAnnotations(D); + while (MaybeParseHLSLAnnotations(D)) + ; if (Tok.is(tok::kw_requires)) ParseTrailingRequiresClause(D); diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index 89a0e391920cc6..9aacbe4ad9548e 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -503,9 
+503,11 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { Decl = BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "RWBuffer") .addSimpleTemplateParams(*SemaPtr, {"element_type"}) .Record; + onCompletion(Decl, [this](CXXRecordDecl *Decl) { setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, - ResourceKind::TypedBuffer, /*IsROV=*/false) + ResourceKind::TypedBuffer, + /*IsROV=*/false) .addArraySubscriptOperators() .completeDefinition(); }); diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 73d11ac972b020..1e074298ac5289 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -6889,6 +6889,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_HLSLSV_GroupIndex: handleSimpleAttribute(S, D, AL); break; + case ParsedAttr::AT_HLSLGroupSharedAddressSpace: + handleSimpleAttribute(S, D, AL); + break; case ParsedAttr::AT_HLSLSV_DispatchThreadID: S.HLSL().handleSV_DispatchThreadIDAttr(D, AL); break; diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index df01549cc2eeb6..320e38b740a742 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -38,6 +38,14 @@ Decl *SemaHLSL::ActOnStartBuffer(Scope *BufferScope, bool CBuffer, HLSLBufferDecl *Result = HLSLBufferDecl::Create( getASTContext(), LexicalParent, CBuffer, KwLoc, Ident, IdentLoc, LBrace); + // if CBuffer is false, then it's a TBuffer + auto RC = CBuffer ? llvm::hlsl::ResourceClass::CBuffer + : llvm::hlsl::ResourceClass::SRV; + auto RK = CBuffer ? 
llvm::hlsl::ResourceKind::CBuffer + : llvm::hlsl::ResourceKind::TBuffer; + Result->addAttr(HLSLResourceClassAttr::CreateImplicit(getASTContext(), RC)); + Result->addAttr(HLSLResourceAttr::CreateImplicit(getASTContext(), RK)); + SemaRef.PushOnScopeChains(Result, BufferScope); SemaRef.PushDeclContext(BufferScope, Result); @@ -459,7 +467,378 @@ void SemaHLSL::handleResourceClassAttr(Decl *D, const ParsedAttr &AL) { D->addAttr(HLSLResourceClassAttr::Create(getASTContext(), RC, ArgLoc)); } -void SemaHLSL::handleResourceBindingAttr(Decl *D, const ParsedAttr &AL) { +struct RegisterBindingFlags { + bool Resource = false; + bool UDT = false; + bool Other = false; + bool Basic = false; + + bool SRV = false; + bool UAV = false; + bool CBV = false; + bool Sampler = false; + + bool ContainsNumeric = false; + bool DefaultGlobals = false; +}; + +static bool isDeclaredWithinCOrTBuffer(const Decl *TheDecl) { + return TheDecl && isa(TheDecl->getDeclContext()); +} + +// get the record decl from a var decl that we expect +// represents a resource +static CXXRecordDecl *getRecordDeclFromVarDecl(VarDecl *VD) { + const Type *Ty = VD->getType()->getPointeeOrArrayElementType(); + assert(Ty && "Resource must have an element type."); + + if (Ty->isBuiltinType()) + return nullptr; + + CXXRecordDecl *TheRecordDecl = Ty->getAsCXXRecordDecl(); + assert(TheRecordDecl && "Resource should have a resource type declaration."); + return TheRecordDecl; +} + +static void updateResourceClassFlagsFromDeclResourceClass( + RegisterBindingFlags &Flags, llvm::hlsl::ResourceClass DeclResourceClass) { + switch (DeclResourceClass) { + case llvm::hlsl::ResourceClass::SRV: + Flags.SRV = true; + break; + case llvm::hlsl::ResourceClass::UAV: + Flags.UAV = true; + break; + case llvm::hlsl::ResourceClass::CBuffer: + Flags.CBV = true; + break; + case llvm::hlsl::ResourceClass::Sampler: + Flags.Sampler = true; + break; + } +} + +template +static const T *getSpecifiedHLSLAttrFromRecordDecl(RecordDecl *TheRecordDecl) { + 
if (!TheRecordDecl) + return nullptr; + + if (TheRecordDecl->hasAttr()) + return TheRecordDecl->getAttr(); + for (auto *FD : TheRecordDecl->fields()) { + const T *Attr = FD->getAttr(); + if (Attr) + return Attr; + } + return nullptr; +} + +template +static const T *getSpecifiedHLSLAttrFromVarDecl(VarDecl *VD) { + RecordDecl *TheRecordDecl = nullptr; + if (VD) { + TheRecordDecl = getRecordDeclFromVarDecl(VD); + if (!TheRecordDecl) + return nullptr; + } + + return getSpecifiedHLSLAttrFromRecordDecl(TheRecordDecl); +} + +static void updateFlagsFromType(QualType TheQualTy, + RegisterBindingFlags &Flags); + +static void updateResourceClassFlagsFromRecordDecl(RegisterBindingFlags &Flags, + const RecordDecl *RD) { + if (!RD) + return; + + if (RD->isCompleteDefinition()) { + for (auto Field : RD->fields()) { + QualType T = Field->getType(); + updateFlagsFromType(T, Flags); + } + } +} + +static void updateFlagsFromType(QualType TheQualTy, + RegisterBindingFlags &Flags) { + // if the member's type is a numeric type, set the ContainsNumeric flag + if (TheQualTy->isIntegralOrEnumerationType() || TheQualTy->isFloatingType()) { + Flags.ContainsNumeric = true; + return; + } + + const clang::Type *TheBaseType = TheQualTy.getTypePtr(); + while (TheBaseType->isArrayType()) + TheBaseType = TheBaseType->getArrayElementTypeNoTypeQual(); + // otherwise, if the member's base type is not a record type, return + const RecordType *TheRecordTy = TheBaseType->getAs(); + if (!TheRecordTy) + return; + + RecordDecl *SubRecordDecl = TheRecordTy->getDecl(); + const HLSLResourceClassAttr *Attr = + getSpecifiedHLSLAttrFromRecordDecl(SubRecordDecl); + // find the attr if it's on the member, or on any of the member's fields + if (Attr) { + llvm::hlsl::ResourceClass DeclResourceClass = Attr->getResourceClass(); + updateResourceClassFlagsFromDeclResourceClass(Flags, DeclResourceClass); + } + + // otherwise, dig deeper and recurse into the member + else { + updateResourceClassFlagsFromRecordDecl(Flags, 
SubRecordDecl); + } +} + +static RegisterBindingFlags HLSLFillRegisterBindingFlags(Sema &S, + Decl *TheDecl) { + + // Cbuffers and Tbuffers are HLSLBufferDecl types + HLSLBufferDecl *CBufferOrTBuffer = dyn_cast(TheDecl); + // Samplers, UAVs, and SRVs are VarDecl types + VarDecl *TheVarDecl = dyn_cast(TheDecl); + + assert(((TheVarDecl && !CBufferOrTBuffer) || + (!TheVarDecl && CBufferOrTBuffer)) && + "either TheVarDecl or CBufferOrTBuffer should be set"); + + RegisterBindingFlags Flags; + + // check if the decl type is groupshared + if (TheDecl->hasAttr()) { + Flags.Other = true; + return Flags; + } + + if (!isDeclaredWithinCOrTBuffer(TheDecl)) { + // make sure the type is a basic / numeric type + if (TheVarDecl) { + QualType TheQualTy = TheVarDecl->getType(); + // a numeric variable or an array of numeric variables + // will inevitably end up in $Globals buffer + const clang::Type *TheBaseType = TheQualTy.getTypePtr(); + while (TheBaseType->isArrayType()) + TheBaseType = TheBaseType->getArrayElementTypeNoTypeQual(); + if (TheBaseType->isIntegralType(S.getASTContext()) || + TheBaseType->isFloatingType()) + Flags.DefaultGlobals = true; + } + } + + if (CBufferOrTBuffer) { + Flags.Resource = true; + if (CBufferOrTBuffer->isCBuffer()) + Flags.CBV = true; + else + Flags.SRV = true; + } else if (TheVarDecl) { + const HLSLResourceClassAttr *resClassAttr = + getSpecifiedHLSLAttrFromVarDecl(TheVarDecl); + + if (resClassAttr) { + llvm::hlsl::ResourceClass DeclResourceClass = + resClassAttr->getResourceClass(); + Flags.Resource = true; + updateResourceClassFlagsFromDeclResourceClass(Flags, DeclResourceClass); + } else { + const clang::Type *TheBaseType = TheVarDecl->getType().getTypePtr(); + while (TheBaseType->isArrayType()) + TheBaseType = TheBaseType->getArrayElementTypeNoTypeQual(); + if (TheBaseType->isArithmeticType()) + Flags.Basic = true; + else if (TheBaseType->isRecordType()) { + Flags.UDT = true; + const RecordType *TheRecordTy = TheBaseType->getAs(); + 
assert(TheRecordTy && "The Qual Type should be Record Type"); + const RecordDecl *TheRecordDecl = TheRecordTy->getDecl(); + // recurse through members, set appropriate resource class flags. + updateResourceClassFlagsFromRecordDecl(Flags, TheRecordDecl); + } else + Flags.Other = true; + } + } + return Flags; +} + +enum class RegisterType { SRV, UAV, CBuffer, Sampler, C, I, Invalid }; + +static RegisterType getRegisterType(StringRef Slot) { + switch (Slot[0]) { + case 't': + case 'T': + return RegisterType::SRV; + case 'u': + case 'U': + return RegisterType::UAV; + case 'b': + case 'B ': + return RegisterType::CBuffer; + case 's': + case 'S': + return RegisterType::Sampler; + case 'c': + case 'C': + return RegisterType::C; + case 'i': + case 'I': + return RegisterType::I; + default: + return RegisterType::Invalid; + } +} + +static void ValidateMultipleRegisterAnnotations(Sema &S, Decl *TheDecl, + RegisterType regType) { + // make sure that there are no two register annotations + // applied to the decl with the same register type + bool RegisterTypesDetected[5] = {false}; + + RegisterTypesDetected[static_cast(regType)] = true; + + // we need a static map to keep track of previous conflicts + // so that we don't emit the same error multiple times + static std::map> PreviousConflicts; + + for (auto it = TheDecl->attr_begin(); it != TheDecl->attr_end(); ++it) { + if (HLSLResourceBindingAttr *attr = + dyn_cast(*it)) { + + RegisterType otherRegType = getRegisterType(attr->getSlot()); + if (RegisterTypesDetected[static_cast(otherRegType)]) { + if (PreviousConflicts[TheDecl].count(otherRegType)) + continue; + int otherRegTypeNum = static_cast(otherRegType); + S.Diag(TheDecl->getLocation(), + diag::err_hlsl_duplicate_register_annotation) + << otherRegTypeNum; + PreviousConflicts[TheDecl].insert(otherRegType); + } else { + RegisterTypesDetected[static_cast(otherRegType)] = true; + } + } + } +} + +static std::string getHLSLResourceTypeStr(Sema &S, Decl *TheDecl) { + if (VarDecl 
*TheVarDecl = dyn_cast(TheDecl)) { + QualType TheQualTy = TheVarDecl->getType(); + PrintingPolicy PP = S.getPrintingPolicy(); + return QualType::getAsString(TheQualTy.split(), PP); + } + if (HLSLBufferDecl *CBufferOrTBuffer = dyn_cast(TheDecl)) + return CBufferOrTBuffer->isCBuffer() ? "cbuffer" : "tbuffer"; +} + +static void DiagnoseHLSLRegisterAttribute(Sema &S, SourceLocation &ArgLoc, + Decl *TheDecl, RegisterType regType) { + + // Samplers, UAVs, and SRVs are VarDecl types + VarDecl *TheVarDecl = dyn_cast(TheDecl); + // Cbuffers and Tbuffers are HLSLBufferDecl types + HLSLBufferDecl *CBufferOrTBuffer = dyn_cast(TheDecl); + + // exactly one of these two types should be set + assert(((TheVarDecl && !CBufferOrTBuffer) || + (!TheVarDecl && CBufferOrTBuffer)) && + "either TheVarDecl or CBufferOrTBuffer should be set"); + + RegisterBindingFlags Flags = HLSLFillRegisterBindingFlags(S, TheDecl); + assert((int)Flags.Other + (int)Flags.Resource + (int)Flags.Basic + + (int)Flags.UDT == + 1 && + "only one resource analysis result should be expected"); + + int regTypeNum = static_cast(regType); + + // first, if "other" is set, emit an error + if (Flags.Other) { + S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << regTypeNum; + return; + } + + // next, if multiple register annotations exist, check that none conflict. + ValidateMultipleRegisterAnnotations(S, TheDecl, regType); + + // next, if resource is set, make sure the register type in the register + // annotation is compatible with the variable's resource type. 
+ if (Flags.Resource) { + const HLSLResourceClassAttr *resClassAttr = nullptr; + if (CBufferOrTBuffer) { + resClassAttr = CBufferOrTBuffer->getAttr(); + } else if (TheVarDecl) { + resClassAttr = + getSpecifiedHLSLAttrFromVarDecl(TheVarDecl); + } + + assert(resClassAttr && + "any decl that set the resource flag on analysis should " + "have a resource class attribute attached."); + const llvm::hlsl::ResourceClass DeclResourceClass = + resClassAttr->getResourceClass(); + + // confirm that the register type is bound to its expected resource class + static RegisterType ExpectedRegisterTypesForResourceClass[] = { + RegisterType::SRV, + RegisterType::UAV, + RegisterType::CBuffer, + RegisterType::Sampler, + }; + assert((int)DeclResourceClass < + std::size(ExpectedRegisterTypesForResourceClass) && + "DeclResourceClass has unexpected value"); + + RegisterType ExpectedRegisterType = + ExpectedRegisterTypesForResourceClass[(int)DeclResourceClass]; + if (regType != ExpectedRegisterType) { + S.Diag(TheDecl->getLocation(), diag::err_hlsl_binding_type_mismatch) + << regTypeNum; + } + return; + } + + // next, handle diagnostics for when the "basic" flag is set + if (Flags.Basic) { + if (Flags.DefaultGlobals) { + if (regType == RegisterType::CBuffer) + S.Diag(ArgLoc, diag::warn_hlsl_deprecated_register_type_b); + else if (regType != RegisterType::C) + S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << regTypeNum; + return; + } + + if (regType == RegisterType::C) + S.Diag(ArgLoc, diag::warn_hlsl_register_type_c_packoffset); + else + S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << regTypeNum; + + return; + } + + // finally, we handle the udt case + if (Flags.UDT) { + const bool ExpectedRegisterTypesForUDT[] = { + Flags.SRV, Flags.UAV, Flags.CBV, Flags.Sampler, Flags.ContainsNumeric}; + assert(regTypeNum < std::size(ExpectedRegisterTypesForUDT) && + "regType has unexpected value"); + + if (!ExpectedRegisterTypesForUDT[regTypeNum]) + S.Diag(TheDecl->getLocation(), + 
diag::warn_hlsl_user_defined_type_missing_member) + << regTypeNum; + + return; + } +} + +void SemaHLSL::handleResourceBindingAttr(Decl *TheDecl, const ParsedAttr &AL) { + if (dyn_cast(TheDecl)) { + if (SemaRef.RequireCompleteType(TheDecl->getBeginLoc(), + cast(TheDecl)->getType(), + diag::err_incomplete_type)) + return; + } StringRef Space = "space0"; StringRef Slot = ""; @@ -489,17 +868,17 @@ void SemaHLSL::handleResourceBindingAttr(Decl *D, const ParsedAttr &AL) { Slot = Str; } + RegisterType regType; + // Validate. if (!Slot.empty()) { - switch (Slot[0]) { - case 'u': - case 'b': - case 's': - case 't': - break; - default: - Diag(ArgLoc, diag::err_hlsl_unsupported_register_type) - << Slot.substr(0, 1); + regType = getRegisterType(Slot); + if (regType == RegisterType::I) { + Diag(ArgLoc, diag::warn_hlsl_deprecated_register_type_i); + return; + } + if (regType == RegisterType::Invalid) { + Diag(ArgLoc, diag::err_hlsl_binding_type_invalid) << Slot.substr(0, 1); return; } @@ -522,12 +901,12 @@ void SemaHLSL::handleResourceBindingAttr(Decl *D, const ParsedAttr &AL) { return; } - // FIXME: check reg type match decl. Issue - // https://github.com/llvm/llvm-project/issues/57886. 
+ DiagnoseHLSLRegisterAttribute(SemaRef, ArgLoc, TheDecl, regType); + HLSLResourceBindingAttr *NewAttr = HLSLResourceBindingAttr::Create(getASTContext(), Slot, Space, AL); if (NewAttr) - D->addAttr(NewAttr); + TheDecl->addAttr(NewAttr); } void SemaHLSL::handleParamModifierAttr(Decl *D, const ParsedAttr &AL) { diff --git a/clang/test/AST/HLSL/ast-dump-comment-cbuffe-tbufferr.hlsl b/clang/test/AST/HLSL/ast-dump-comment-cbuffe-tbufferr.hlsl index a98dc0f4ce4312..e6a2ea7c6d2dc6 100644 --- a/clang/test/AST/HLSL/ast-dump-comment-cbuffe-tbufferr.hlsl +++ b/clang/test/AST/HLSL/ast-dump-comment-cbuffe-tbufferr.hlsl @@ -38,12 +38,16 @@ tbuffer B { } // AST:HLSLBufferDecl {{.*}}:11:1, line:20:1> line:11:9 cbuffer A +// AST-NEXT:-HLSLResourceClassAttr {{.*}} <> Implicit CBuffer +// AST-NEXT:-HLSLResourceAttr {{.*}} <> Implicit CBuffer // AST-NEXT:FullComment {{.*}} // AST-NEXT:`-ParagraphComment {{.*}} // AST-NEXT:`-TextComment {{.*}} Text=" CBuffer decl." // AST-NEXT:-VarDecl {{.*}} col:11 a 'float' // AST-NEXT:`-VarDecl {{.*}} col:9 b 'int' // AST-NEXT:HLSLBufferDecl {{.*}} line:29:9 tbuffer B +// AST-NEXT:-HLSLResourceClassAttr {{.*}} <> Implicit SRV +// AST-NEXT:-HLSLResourceAttr {{.*}} <> Implicit TBuffer // AST-NEXT:-FullComment {{.*}} // AST-NEXT: `-ParagraphComment {{.*}} // AST-NEXT: `-TextComment {{.*}} Text=" TBuffer decl." 
diff --git a/clang/test/AST/HLSL/cbuffer_tbuffer.hlsl b/clang/test/AST/HLSL/cbuffer_tbuffer.hlsl index 7204dcd16e0a92..5e558354cd3a03 100644 --- a/clang/test/AST/HLSL/cbuffer_tbuffer.hlsl +++ b/clang/test/AST/HLSL/cbuffer_tbuffer.hlsl @@ -1,12 +1,16 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump -o - %s | FileCheck %s -// CHECK:HLSLBufferDecl 0x[[CB:[0-9a-f]+]] {{.*}} line:5:9 cbuffer CB +// CHECK:HLSLBufferDecl 0x[[CB:[0-9a-f]+]] {{.*}} line:7:9 cbuffer CB +// CHECK:HLSLResourceClassAttr 0x{{[0-9a-f]+}} <> Implicit CBuffer +// CHECK-NEXT:HLSLResourceAttr {{.*}} <> Implicit CBuffer // CHECK-NEXT:VarDecl 0x[[A:[0-9a-f]+]] {{.*}} col:9 used a 'float' cbuffer CB { float a; } -// CHECK:HLSLBufferDecl 0x[[TB:[0-9a-f]+]] {{.*}} line:11:9 tbuffer TB +// CHECK:HLSLBufferDecl 0x[[TB:[0-9a-f]+]] {{.*}} line:15:9 tbuffer TB +// CHECK:HLSLResourceClassAttr 0x{{[0-9a-f]+}} <> Implicit SRV +// CHECK-NEXT:HLSLResourceAttr {{.*}} <> Implicit TBuffer // CHECK-NEXT:VarDecl 0x[[B:[0-9a-f]+]] {{.*}} col:9 used b 'float' tbuffer TB { float b; diff --git a/clang/test/AST/HLSL/packoffset.hlsl b/clang/test/AST/HLSL/packoffset.hlsl index 060288c2f7f76c..9c928bd6d922ed 100644 --- a/clang/test/AST/HLSL/packoffset.hlsl +++ b/clang/test/AST/HLSL/packoffset.hlsl @@ -4,6 +4,8 @@ // CHECK: HLSLBufferDecl {{.*}} cbuffer A cbuffer A { + // CHECK-NEXT:-HLSLResourceClassAttr {{.*}} <> Implicit CBuffer + // CHECK-NEXT:-HLSLResourceAttr {{.*}} <> Implicit CBuffer // CHECK-NEXT: VarDecl {{.*}} A1 'float4' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 0 float4 A1 : packoffset(c); diff --git a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl index e9a6ea1a16312c..281d8be8addf09 100644 --- a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl +++ b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl @@ -17,8 +17,12 @@ float foo() { } // Make sure cbuffer/tbuffer works for PCH. 
// CHECK:HLSLBufferDecl 0x{{[0-9a-f]+}} <{{.*}}:7:1, line:9:1> line:7:9 imported cbuffer A +// CHECK-NEXT:HLSLResourceClassAttr {{.*}} <> Implicit CBuffer +// CHECK-NEXT:HLSLResourceAttr {{.*}} <> Implicit CBuffer // CHECK-NEXT:`-VarDecl 0x[[A:[0-9a-f]+]] col:9 imported used a 'float' // CHECK-NEXT:HLSLBufferDecl 0x{{[0-9a-f]+}} line:11:9 imported tbuffer B +// CHECK-NEXT:HLSLResourceClassAttr {{.*}} <> Implicit SRV +// CHECK-NEXT:HLSLResourceAttr {{.*}} <> Implicit TBuffer // CHECK-NEXT:`-VarDecl 0x[[B:[0-9a-f]+]] col:9 imported used b 'float' // CHECK-NEXT:FunctionDecl 0x{{[0-9a-f]+}} line:15:7 imported foo 'float ()' // CHECK-NEXT:CompoundStmt 0x{{[0-9a-f]+}} diff --git a/clang/test/AST/HLSL/resource_binding_attr.hlsl b/clang/test/AST/HLSL/resource_binding_attr.hlsl index 71900f2dbda550..13957ad3c1fcc7 100644 --- a/clang/test/AST/HLSL/resource_binding_attr.hlsl +++ b/clang/test/AST/HLSL/resource_binding_attr.hlsl @@ -1,13 +1,17 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump -o - %s | FileCheck %s -// CHECK:HLSLBufferDecl 0x[[CB:[0-9a-f]+]] {{.*}} line:6:9 cbuffer CB +// CHECK:HLSLBufferDecl 0x[[CB:[0-9a-f]+]] {{.*}} line:8:9 cbuffer CB +// CHECK-NEXT:HLSLResourceClassAttr 0x[[CB:[0-9a-f]+]] {{.*}} Implicit CBuffer +// CHECK-NEXT:HLSLResourceAttr 0x[[CB:[0-9a-f]+]] {{.*}} Implicit CBuffer // CHECK-NEXT:HLSLResourceBindingAttr 0x{{[0-9a-f]+}} "b3" "space2" // CHECK-NEXT:VarDecl 0x[[A:[0-9a-f]+]] {{.*}} col:9 used a 'float' cbuffer CB : register(b3, space2) { float a; } -// CHECK:HLSLBufferDecl 0x[[TB:[0-9a-f]+]] {{.*}} line:13:9 tbuffer TB +// CHECK:HLSLBufferDecl 0x[[TB:[0-9a-f]+]] {{.*}} line:17:9 tbuffer TB +// CHECK-NEXT:HLSLResourceClassAttr 0x[[CB:[0-9a-f]+]] {{.*}} Implicit SRV +// CHECK-NEXT:HLSLResourceAttr 0x[[CB:[0-9a-f]+]] {{.*}} Implicit TBuffer // CHECK-NEXT:HLSLResourceBindingAttr 0x{{[0-9a-f]+}} "t2" "space1" // CHECK-NEXT:VarDecl 0x[[B:[0-9a-f]+]] {{.*}} col:9 used b 'float' tbuffer TB : register(t2, space1) { diff 
--git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index a7e425e3d5f431..5ebbd29b316bfa 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -83,7 +83,7 @@ // CHECK-NEXT: GNUInline (SubjectMatchRule_function) // CHECK-NEXT: HIPManaged (SubjectMatchRule_variable) // CHECK-NEXT: HLSLROV (SubjectMatchRule_record_not_is_union) -// CHECK-NEXT: HLSLResourceClass (SubjectMatchRule_record_not_is_union) +// CHECK-NEXT: HLSLResourceClass (SubjectMatchRule_field) // CHECK-NEXT: Hot (SubjectMatchRule_function) // CHECK-NEXT: HybridPatchable (SubjectMatchRule_function) // CHECK-NEXT: IBAction (SubjectMatchRule_objc_method_is_instance) diff --git a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl index 410b4524f1c3df..4b002e2d890093 100644 --- a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl @@ -1,31 +1,31 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} SRV -struct [[hlsl::resource_class(SRV)]] Eg1 { - int i; +// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} SRV +struct Eg1 { + [[hlsl::resource_class(SRV)]] int i; }; Eg1 e1; -// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} line:13:38 referenced struct Eg2 definition -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} UAV -struct [[hlsl::resource_class(UAV)]] Eg2 { - int i; +// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} line:13:8 referenced struct Eg2 definition +// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} UAV +struct Eg2 { + [[hlsl::resource_class(UAV)]] int i; }; Eg2 e2; -// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} line:20:42 referenced struct Eg3 definition -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} CBuffer -struct 
[[hlsl::resource_class(CBuffer)]] Eg3 { - int i; +// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} line:20:8 referenced struct Eg3 definition +// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} CBuffer +struct Eg3 { + [[hlsl::resource_class(CBuffer)]] int i; }; Eg3 e3; -// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} line:27:42 referenced struct Eg4 definition -// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} Sampler -struct [[hlsl::resource_class(Sampler)]] Eg4 { - int i; +// CHECK: -CXXRecordDecl 0x{{[0-9a-f]+}} line:27:8 referenced struct Eg4 definition +// CHECK: -HLSLResourceClassAttr 0x{{[0-9a-f]+}} Sampler +struct Eg4 { + [[hlsl::resource_class(Sampler)]] int i; }; Eg4 e4; diff --git a/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl index 00fcd769760bba..76bed2f0607830 100644 --- a/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl @@ -1,15 +1,22 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s -verify +struct Eg1 { // expected-error@+1{{'resource_class' attribute takes one argument}} -struct [[hlsl::resource_class()]] Eg1 { - int i; + [[hlsl::resource_class()]] int i; }; Eg1 e1; +struct Eg2 { // expected-warning@+1{{ResourceClass attribute argument not supported: gibberish}} -struct [[hlsl::resource_class(gibberish)]] Eg2 { - int i; + [[hlsl::resource_class(gibberish)]] int i; }; Eg2 e2; + +// expected-warning@+1{{'resource_class' attribute only applies to non-static data members}} +struct [[hlsl::resource_class(SRV)]] Eg3 { + int i; +}; + +Eg3 e3; diff --git a/clang/test/SemaHLSL/resource_binding_attr_error.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error.hlsl index 2f8aa098db701a..6a0b5956545dd8 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error.hlsl @@ -1,9 +1,15 @@ // RUN: %clang_cc1 -triple 
dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify -// expected-error@+1 {{invalid resource class specifier 'c' used; expected 'b', 's', 't', or 'u'}} -float a : register(c0, space1); +template +struct MyTemplatedSRV { + [[hlsl::resource_class(SRV)]] T x; +}; + +// valid, The register keyword in this statement isn't binding a resource, rather it is +// specifying a constant register binding offset within the $Globals cbuffer, which is legacy behavior from DX9. +float a : register(c0); -// expected-error@+1 {{invalid resource class specifier 'i' used; expected 'b', 's', 't', or 'u'}} +// expected-error@+1 {{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}} cbuffer b : register(i0) { } @@ -33,28 +39,40 @@ cbuffer C : register(b 2) {} // expected-error@+1 {{wrong argument format for hlsl attribute, use space3 instead}} cbuffer D : register(b 2, space 3) {} -// expected-warning@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} -static RWBuffer U : register(u5); +// expected-error@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} +static MyTemplatedSRV U : register(u5); + +// expected-error@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} +static float sa : register(c1); + +float x[2] : register(c2); // valid +float y[2][2] : register(c3); // valid +float z[2][2][3] : register(c4); // valid + +// expected-error@+1 {{binding type 'c' only applies to numeric variables in the global scope}} +groupshared float fa[10] : register(c5); void foo() { - // expected-warning@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} - RWBuffer U : register(u3); + // expected-error@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} + MyTemplatedSRV U : register(u3); } void foo2() { - // expected-warning@+1 {{'register' attribute only applies 
to cbuffer/tbuffer and external global variables}} - extern RWBuffer U2 : register(u5); + // expected-error@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} + extern MyTemplatedSRV U2 : register(u5); } -// FIXME: expect-error once fix https://github.com/llvm/llvm-project/issues/57886. + +// expected-error@+1 {{binding type 'u' only applies to UAV resources}} float b : register(u0, space1); -// expected-warning@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} -void bar(RWBuffer U : register(u3)) { +// expected-error@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} +void bar(MyTemplatedSRV U : register(u3)) { } -struct S { - // FIXME: generate better error when support semantic on struct field. - // See https://github.com/llvm/llvm-project/issues/57889. - // expected-warning@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} - RWBuffer U : register(u3); +struct S { + // expected-error@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} + MyTemplatedSRV U : register(u3); }; + +// expected-error@+1 {{binding type 'z' is invalid}} +MyTemplatedSRV U3 : register(z5); diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl new file mode 100644 index 00000000000000..0a547ed66af0a2 --- /dev/null +++ b/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl @@ -0,0 +1,39 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify + +// expected-error@+1{{binding type 't' only applies to SRV resources}} +float f1 : register(t0); + + +float f2 : register(c0); + +// expected-error@+1{{binding type 'b' only applies to constant buffers. 
The 'bool constant' binding type is no longer supported}} +float f3 : register(b9); + +// expected-error@+1{{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}} +float f4 : register(i9); + +// expected-error@+1{{binding type 'x' is invalid}} +float f5 : register(x9); + +cbuffer g_cbuffer1 { +// expected-error@+1{{binding type 'c' ignored in buffer declaration. Did you mean 'packoffset'?}} + float f6 : register(c2); +}; + +tbuffer g_tbuffer1 { +// expected-error@+1{{binding type 'c' ignored in buffer declaration. Did you mean 'packoffset'?}} + float f7 : register(c2); +}; + +cbuffer g_cbuffer2 { +// expected-error@+1{{binding type 'b' only applies to constant buffer resources}} + float f8 : register(b2); +}; + +tbuffer g_tbuffer2 { +// expected-error@+1{{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}} + float f9 : register(i2); +}; + +// expected-error@+1{{binding type 'c' only applies to numeric variables in the global scope}} +RWBuffer f10 : register(c3); diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_other.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_other.hlsl new file mode 100644 index 00000000000000..4c9e9a6b44c928 --- /dev/null +++ b/clang/test/SemaHLSL/resource_binding_attr_error_other.hlsl @@ -0,0 +1,9 @@ +// RUN: not %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s | FileCheck %s + +// XFAIL: * +// This expectedly fails because RayQuery is an unsupported type. +// When it becomes supported, we should expect an error due to +// the variable type being classified as "other", and according +// to the spec, err_hlsl_unsupported_register_type_and_variable_type +// should be emitted. 
+RayQuery<0> r1: register(t0); diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl new file mode 100644 index 00000000000000..c40d1d7f60b347 --- /dev/null +++ b/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl @@ -0,0 +1,49 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify + +// This test validates the diagnostics that are emitted when a variable with a "resource" type +// is bound to a register using the register annotation + + +template +struct MyTemplatedSRV { + [[hlsl::resource_class(SRV)]] T x; +}; + +struct MySRV { + [[hlsl::resource_class(SRV)]] int x; +}; + +struct MySampler { + [[hlsl::resource_class(Sampler)]] int x; +}; + +struct MyUAV { + [[hlsl::resource_class(UAV)]] int x; +}; + +struct MyCBuffer { + [[hlsl::resource_class(CBuffer)]] int x; +}; + + +// expected-error@+1 {{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}} +MySRV invalid : register(i2); + +// expected-error@+1 {{binding type 't' only applies to SRV resources}} +MyUAV a : register(t2, space1); + +// expected-error@+1 {{binding type 'u' only applies to UAV resources}} +MySampler b : register(u2, space1); + +// expected-error@+1 {{binding type 'b' only applies to constant buffer resources}} +MyTemplatedSRV c : register(b2); + +// expected-error@+1 {{binding type 's' only applies to sampler state}} +MyUAV d : register(s2, space1); + +// empty binding prefix cases: +// expected-error@+1 {{expected identifier}} +MyTemplatedSRV e: register(); + +// expected-error@+1 {{expected identifier}} +MyTemplatedSRV f: register(""); diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl new file mode 100644 index 00000000000000..e63f264452da79 --- /dev/null +++ b/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl 
@@ -0,0 +1,27 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only -Wno-legacy-constant-register-binding %s -verify + +// expected-no-diagnostics +float f2 : register(b9); + +float f3 : register(i9); + +cbuffer g_cbuffer1 { + float f4 : register(c2); +}; + + +struct Eg12{ + RWBuffer a; +}; + +Eg12 e12 : register(c9); + +Eg12 bar : register(i1); + +struct Eg7 { + struct Bar { + float f; + }; + Bar b; +}; +Eg7 e7 : register(t0); diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl new file mode 100644 index 00000000000000..f8e38b6d2851d9 --- /dev/null +++ b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl @@ -0,0 +1,128 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify + +template +struct MyTemplatedUAV { + [[hlsl::resource_class(UAV)]] T x; +}; + +struct MySRV { + [[hlsl::resource_class(SRV)]] int x; +}; + +struct MySampler { + [[hlsl::resource_class(Sampler)]] int x; +}; + +struct MyUAV { + [[hlsl::resource_class(UAV)]] int x; +}; + +struct MyCBuffer { + [[hlsl::resource_class(CBuffer)]] int x; +}; + +// Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0 +struct Eg1 { + float f; + MySRV SRVBuf; + MyUAV UAVBuf; + }; +Eg1 e1 : register(t0) : register(u0); + +// Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0. +// UAVBuf2 gets automatically assigned to u1 even though there is no explicit binding for u1. +struct Eg2 { + float f; + MySRV SRVBuf; + MyUAV UAVBuf; + MyUAV UAVBuf2; + }; +Eg2 e2 : register(t0) : register(u0); + +// Valid: Bar, the struct within Eg3, has a valid resource that can be bound to t0. 
+struct Eg3 { + struct Bar { + MyUAV a; + }; + Bar b; +}; +Eg3 e3 : register(u0); + +// Valid: the first sampler state object within 's' is bound to slot 5 +struct Eg4 { + MySampler s[3]; +}; + +Eg4 e4 : register(s5); + + +struct Eg5 { + float f; +}; +// expected-warning@+1{{binding type 't' only applies to types containing SRV resources}} +Eg5 e5 : register(t0); + +struct Eg6 { + float f; +}; +// expected-warning@+1{{binding type 'u' only applies to types containing UAV resources}} +Eg6 e6 : register(u0); + +struct Eg7 { + float f; +}; +// expected-warning@+1{{binding type 'b' only applies to types containing constant buffer resources}} +Eg7 e7 : register(b0); + +struct Eg8 { + float f; +}; +// expected-warning@+1{{binding type 's' only applies to types containing sampler state}} +Eg8 e8 : register(s0); + +struct Eg9 { + MySRV s; +}; +// expected-warning@+1{{binding type 'c' only applies to types containing numeric types}} +Eg9 e9 : register(c0); + +struct Eg10{ + // expected-error@+1{{'register' attribute only applies to cbuffer/tbuffer and external global variables}} + MyTemplatedUAV a : register(u9); +}; +Eg10 e10; + + +template +struct Eg11 { + R b; +}; +// expected-warning@+1{{binding type 'u' only applies to types containing UAV resources}} +Eg11 e11 : register(u0); +// invalid because after template expansion, there are no valid resources inside Eg11 to bind as a UAV, only an SRV + + +struct Eg12{ + MySRV s1; + MySRV s2; +}; +// expected-warning@+3{{binding type 'u' only applies to types containing UAV resources}} +// expected-warning@+2{{binding type 'u' only applies to types containing UAV resources}} +// expected-error@+1{{binding type 'u' cannot be applied more than once}} +Eg12 e12 : register(u9) : register(u10); + +struct Eg13{ + MySRV s1; + MySRV s2; +}; +// expected-warning@+4{{binding type 'u' only applies to types containing UAV resources}} +// expected-warning@+3{{binding type 'u' only applies to types containing UAV resources}} +// 
expected-warning@+2{{binding type 'u' only applies to types containing UAV resources}} +// expected-error@+1{{binding type 'u' cannot be applied more than once}} +Eg13 e13 : register(u9) : register(u10) : register(u11); + +struct Eg14{ + MyTemplatedUAV r1; +}; +// expected-warning@+1{{binding type 't' only applies to types containing SRV resources}} +Eg14 e14 : register(t9); From f607102a0d6be0e2aebc1bfaed2ed0a6ae020145 Mon Sep 17 00:00:00 2001 From: Peiming Liu Date: Fri, 23 Aug 2024 10:47:44 -0700 Subject: [PATCH 351/426] [mlir][sparse] partially support lowering sparse coiteration loops to scf.while/for. (#105565) --- .../Dialect/SparseTensor/IR/SparseTensor.h | 22 +- .../SparseTensor/IR/SparseTensorOps.td | 4 + .../SparseTensor/IR/SparseTensorDialect.cpp | 10 + .../Transforms/SparseIterationToScf.cpp | 291 +++++++++++++++++- .../Transforms/Utils/LoopEmitter.cpp | 174 ++++++----- .../Transforms/Utils/LoopEmitter.h | 11 + .../Transforms/Utils/SparseTensorIterator.h | 2 + .../sparse_kernels_to_iterator.mlir | 77 ++++- ...-sqsum.mlir => iterator-based-kernel.mlir} | 49 ++- 9 files changed, 549 insertions(+), 91 deletions(-) rename mlir/test/Integration/Dialect/SparseTensor/CPU/{iterator-based-sqsum.mlir => iterator-based-kernel.mlir} (63%) diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h index 388efd1c454b1e..fca2629d72efcf 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h @@ -96,24 +96,32 @@ class I64BitSet { return *this; } + bool isSubSetOf(const I64BitSet p) const { + I64BitSet tmp = *this; + tmp |= p; + return tmp == p; + } + // Needed by `llvm::const_set_bits_iterator_impl`. 
int find_first() const { return min(); } int find_next(unsigned prev) const { - if (prev >= max()) + if (prev >= max() - 1) return -1; - uint64_t b = storage >> (prev + 1); - if (b == 0) - return -1; + uint64_t b = storage >> (prev + static_cast(1)); + assert(b != 0); - return llvm::countr_zero(b) + prev + 1; + return llvm::countr_zero(b) + prev + static_cast(1); } bool operator[](unsigned i) const { assert(i < 64); - return (storage & (1 << i)) != 0; + return (storage & (static_cast(1) << i)) != 0; + } + unsigned min() const { + unsigned m = llvm::countr_zero(storage); + return m == 64 ? -1 : m; } - unsigned min() const { return llvm::countr_zero(storage); } unsigned max() const { return 64 - llvm::countl_zero(storage); } unsigned count() const { return llvm::popcount(storage); } bool empty() const { return storage == 0; } diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td index 2803223354d5ee..20512f972e67cd 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td @@ -1787,6 +1787,10 @@ def SparseTensor_CoIterateOp : SparseTensor_Op<"coiterate", .take_back(getRegionDefinedSpace(regionIdx).count()); } ValueRange getYieldedValues(unsigned regionIdx); + + // Returns a vector of regions that are the `sub-cases` of the given case region. + // E.g., `case %it1, _, %it3` is a subcase of `case %it1, %it2, %it3`. 
+ SmallVector getSubCasesOf(unsigned regionIdx); }]; let hasVerifier = 1; diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index a143189c301a43..16856b958d4f13 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -2745,6 +2745,16 @@ LogicalResult CoIterateOp::verifyRegions() { return success(); } +SmallVector CoIterateOp::getSubCasesOf(unsigned regionIdx) { + SmallVector ret; + I64BitSet caseBit = getRegionDefinedSpace(regionIdx); + for (Region &r : getCaseRegions()) + if (getRegionDefinedSpace(r.getRegionNumber()).isSubSetOf(caseBit)) + ret.push_back(&r); + + return ret; +} + //===----------------------------------------------------------------------===// // Sparse Tensor Dialect Setups. //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp index b1451dee738ac3..d6c0da4a9e4573 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp @@ -1,5 +1,6 @@ #include "Utils/CodegenUtils.h" +#include "Utils/LoopEmitter.h" #include "Utils/SparseTensorIterator.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" @@ -49,6 +50,144 @@ convertIteratorType(IteratorType itTp, SmallVectorImpl &fields) { return success(); } +static ValueRange +genCoIterateBranchNest(PatternRewriter &rewriter, Location loc, CoIterateOp op, + Value loopCrd, + ArrayRef> iters, + ArrayRef subCases, ArrayRef userReduc) { + if (subCases.empty()) + return userReduc; + + // The current branch that we are handling. 
+ Region *b = subCases.front(); + Value casePred = constantI1(rewriter, loc, true); + I64BitSet caseBits = op.getRegionDefinedSpace(b->getRegionNumber()); + for (unsigned i : caseBits.bits()) { + SparseIterator *it = iters[i].get(); + Value pred = rewriter.create(loc, arith::CmpIPredicate::eq, + it->getCrd(), loopCrd); + casePred = rewriter.create(loc, casePred, pred); + } + scf::IfOp ifOp = rewriter.create( + loc, ValueRange(userReduc).getTypes(), casePred, /*else=*/true); + rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front()); + + // Erase the empty block. + rewriter.eraseBlock(&ifOp.getThenRegion().front()); + // Set up block arguments: user-provided values -> loop coord -> iterators. + SmallVector blockArgs(userReduc); + blockArgs.push_back(loopCrd); + for (unsigned idx : caseBits.bits()) + llvm::append_range(blockArgs, iters[idx]->getCursor()); + + IRMapping mapping; + for (auto [from, to] : + llvm::zip_equal(b->front().getArguments(), blockArgs)) { + mapping.map(from, to); + } + + // Clone the region, we can not erase the region now because the same region + // might be a subcase for multiple lattice point. + rewriter.cloneRegionBefore(*b, ifOp.getThenRegion(), + ifOp.getThenRegion().begin(), mapping); + + // replace sparse_tensor::YieldOp -> scf::YieldOp + auto spY = cast(&ifOp.getThenRegion().front().back()); + ValueRange yields = spY.getResults(); + rewriter.eraseOp(spY); + rewriter.setInsertionPointToEnd(&ifOp.getThenRegion().front()); + rewriter.create(loc, yields); + + // Generates remaining case recursively. 
+ rewriter.setInsertionPointToStart(&ifOp.getElseRegion().front()); + ValueRange res = genCoIterateBranchNest(rewriter, loc, op, loopCrd, iters, + subCases.drop_front(), userReduc); + if (!res.empty()) + rewriter.create(loc, res); + + rewriter.setInsertionPointAfter(ifOp); + return ifOp.getResults(); +} + +static ValueRange genLoopWithIterator( + PatternRewriter &rewriter, Location loc, SparseIterator *it, + ValueRange reduc, bool iterFirst, + function_ref(PatternRewriter &rewriter, Location loc, + Region &loopBody, SparseIterator *it, + ValueRange reduc)> + bodyBuilder) { + if (it->iteratableByFor()) { + auto [lo, hi] = it->genForCond(rewriter, loc); + Value step = constantIndex(rewriter, loc, 1); + scf::ForOp forOp = rewriter.create(loc, lo, hi, step, reduc); + { + OpBuilder::InsertionGuard guard(rewriter); + // Erase the implicit yield operation created by ForOp when there is no + // yielding values. + if (!forOp.getBody()->empty()) + rewriter.eraseOp(&forOp.getBody()->front()); + assert(forOp.getBody()->empty()); + + it->linkNewScope(forOp.getInductionVar()); + rewriter.setInsertionPointToStart(forOp.getBody()); + SmallVector ret = bodyBuilder(rewriter, loc, forOp.getBodyRegion(), + it, forOp.getRegionIterArgs()); + + rewriter.setInsertionPointToEnd(forOp.getBody()); + rewriter.create(loc, ret); + } + return forOp.getResults(); + } + SmallVector ivs; + // TODO: always put iterator SSA values at the end of argument list to be + // consistent with coiterate operation. + if (!iterFirst) + llvm::append_range(ivs, it->getCursor()); + // Appends the user-provided values. + llvm::append_range(ivs, reduc); + if (iterFirst) + llvm::append_range(ivs, it->getCursor()); + + TypeRange types = ValueRange(ivs).getTypes(); + auto whileOp = rewriter.create(loc, types, ivs); + { + OpBuilder::InsertionGuard guard(rewriter); + // Generates loop conditions. 
+ SmallVector l(types.size(), loc); + Block *before = rewriter.createBlock(&whileOp.getBefore(), {}, types, l); + rewriter.setInsertionPointToStart(before); + ValueRange bArgs = before->getArguments(); + auto [whileCond, remArgs] = it->genWhileCond(rewriter, loc, bArgs); + rewriter.create(loc, whileCond, before->getArguments()); + + // Delegates loop body generation. + Region &dstRegion = whileOp.getAfter(); + Block *after = rewriter.createBlock(&dstRegion, {}, types, l); + ValueRange aArgs = whileOp.getAfterArguments(); + if (iterFirst) { + aArgs = it->linkNewScope(aArgs); + } else { + aArgs = aArgs.take_front(reduc.size()); + it->linkNewScope(aArgs.drop_front(reduc.size())); + } + + rewriter.setInsertionPointToStart(after); + SmallVector ret = bodyBuilder(rewriter, loc, dstRegion, it, aArgs); + rewriter.setInsertionPointToEnd(after); + + // Forward loops + SmallVector yields; + ValueRange nx = it->forward(rewriter, loc); + if (iterFirst) + llvm::append_range(yields, nx); + llvm::append_range(yields, ret); + if (!iterFirst) + llvm::append_range(yields, nx); + rewriter.create(loc, yields); + } + return whileOp.getResults().drop_front(it->getCursor().size()); +} + namespace { /// Sparse codegen rule for number of entries operator. @@ -136,6 +275,8 @@ class SparseIterateOpConverter : public OneToNOpConversionPattern { rewriter.replaceOp(op, forOp.getResults(), resultMapping); } else { SmallVector ivs; + // TODO: put iterator at the end of argument list to be consistent with + // coiterate operation. 
llvm::append_range(ivs, it->getCursor()); for (ValueRange inits : adaptor.getInitArgs()) llvm::append_range(ivs, inits); @@ -189,6 +330,153 @@ class SparseIterateOpConverter : public OneToNOpConversionPattern { } }; +class SparseCoIterateOpConverter + : public OneToNOpConversionPattern { + using OneToNOpConversionPattern::OneToNOpConversionPattern; + + LogicalResult + matchAndRewrite(CoIterateOp op, OpAdaptor adaptor, + OneToNPatternRewriter &rewriter) const override { + assert(op.getSpaceDim() == 1 && "Not implemented"); + Location loc = op.getLoc(); + + I64BitSet denseBits(0); + for (auto [idx, spaceTp] : llvm::enumerate(op.getIterSpaces().getTypes())) + if (all_of(cast(spaceTp).getLvlTypes(), isDenseLT)) + denseBits.set(idx); + + // If there exists a case that only contains dense spaces. I.e., case + // bits is a subset of dense bits, or when there is a full empty case (due + // to complements), we need a universal pointer to forward the coiteration + // loop. + bool needUniv = + any_of(op.getRegionDefinedSpaces(), [denseBits](I64BitSet caseBits) { + // A case for complement. + if (caseBits.count() == 0) + return true; + // An all-dense case. + return caseBits.isSubSetOf(denseBits); + }); + assert(!needUniv && "Not implemented"); + (void)needUniv; + + for (Region ®ion : op.getCaseRegions()) { + // Do a one-shot type conversion on all region blocks, since the same + // region might be used multiple time. + Block *block = ®ion.getBlocks().front(); + OneToNTypeMapping blockTypeMapping(block->getArgumentTypes()); + if (failed(typeConverter->convertSignatureArgs(block->getArgumentTypes(), + blockTypeMapping))) + return rewriter.notifyMatchFailure( + op, "failed to convert coiterate region argurment types"); + + rewriter.applySignatureConversion(block, blockTypeMapping); + } + + SmallVector spaces; + SmallVector> iters; + for (auto [spaceTp, spaceVals] : llvm::zip_equal( + op.getIterSpaces().getTypes(), adaptor.getIterSpaces())) { + // TODO: do we really need tid? 
+ spaces.push_back(SparseIterationSpace::fromValues( + cast(spaceTp), spaceVals, /*tid=*/0)); + // Extract the iterator. + iters.push_back(spaces.back().extractIterator(rewriter, loc)); + } + + auto getFilteredIters = [&iters](I64BitSet caseBits) { + // Retrives a vector of pointers to the iterators used in the case. + SmallVector validIters; + for (auto idx : caseBits.bits()) + validIters.push_back(iters[idx].get()); + return validIters; + }; + + // Get a flattened user-provided loop reduction values. + SmallVector userReduc; + for (ValueRange r : adaptor.getInitArgs()) + llvm::append_range(userReduc, r); + + // TODO: we need to sort the cases such that they appears in lexical order. + // Although sparsification always generates cases in that order, it might + // not be the case for human-written code. + + // Generates a loop sequence, one loop per case. + for (auto [r, caseBits] : + llvm::zip_equal(op.getCaseRegions(), op.getRegionDefinedSpaces())) { + assert(caseBits.count() > 0 && "Complement space not implemented"); + + // Retrives a vector of pointers to the iterators used in the case. + SmallVector validIters = getFilteredIters(caseBits); + + if (validIters.size() > 1) { + auto [loop, loopCrd] = + genCoIteration(rewriter, loc, validIters, userReduc, + /*uniIdx=*/nullptr, /*userReducFirst=*/true); + + // 1st. find all the cases that is a strict subset of the current case + // condition, for which we generate one branch per case inside the loop. + // The subcases are never empty, it must contains at least the current + // region itself. + // TODO: these cases should be sorted. + SmallVector subCases = op.getSubCasesOf(r.getRegionNumber()); + assert(!subCases.empty()); + + ValueRange res = genCoIterateBranchNest(rewriter, loc, op, loopCrd, + iters, subCases, userReduc); + + SmallVector nextIterYields(res); + // 2nd. foward the loop. 
+ for (SparseIterator *it : validIters) { + Value cmp = rewriter.create( + loc, arith::CmpIPredicate::eq, it->getCrd(), loopCrd); + it->forwardIf(rewriter, loc, cmp); + llvm::append_range(nextIterYields, it->getCursor()); + } + rewriter.create(loc, nextIterYields); + + // Exit the loop, relink the iterator SSA value. + rewriter.setInsertionPointAfter(loop); + ValueRange iterVals = loop->getResults().drop_front(userReduc.size()); + for (SparseIterator *it : validIters) + iterVals = it->linkNewScope(iterVals); + assert(iterVals.empty()); + + ValueRange curResult = loop->getResults().take_front(userReduc.size()); + userReduc.assign(curResult.begin(), curResult.end()); + } else { + // This is a simple iteration loop. + assert(caseBits.count() == 1); + + Block *block = &r.getBlocks().front(); + ValueRange curResult = genLoopWithIterator( + rewriter, loc, validIters.front(), userReduc, /*iterFirst=*/false, + /*bodyBuilder=*/ + [block](PatternRewriter &rewriter, Location loc, Region &dstRegion, + SparseIterator *it, + ValueRange reduc) -> SmallVector { + SmallVector blockArgs(reduc); + blockArgs.push_back(it->deref(rewriter, loc)); + llvm::append_range(blockArgs, it->getCursor()); + + Block *dstBlock = &dstRegion.getBlocks().front(); + rewriter.inlineBlockBefore( + block, dstBlock, rewriter.getInsertionPoint(), blockArgs); + auto yield = llvm::cast(dstBlock->back()); + SmallVector result(yield.getResults()); + rewriter.eraseOp(yield); + return result; + }); + + userReduc.assign(curResult.begin(), curResult.end()); + } + } + + rewriter.replaceOp(op, userReduc); + return success(); + } +}; + } // namespace mlir::SparseIterationTypeConverter::SparseIterationTypeConverter() { @@ -210,5 +498,6 @@ void mlir::populateLowerSparseIterationToSCFPatterns( IterateOp::getCanonicalizationPatterns(patterns, patterns.getContext()); patterns.add(converter, patterns.getContext()); + SparseIterateOpConverter, SparseCoIterateOpConverter>( + converter, patterns.getContext()); } diff --git 
a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp index efb3295fb2a4bf..cb5874ff45068e 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp @@ -524,84 +524,8 @@ std::pair LoopEmitter::emitForLoopOverTensorAtLvl( std::pair LoopEmitter::emitWhileLoopOverTensorsAtLvls( OpBuilder &builder, Location loc, ArrayRef spIters, MutableArrayRef reduc, bool needsUniv) { - // NOTE: the slice driven tensor-related reduction variable must - // appear before normal tensors. - - // The set of induction variables for the while loop. - SmallVector ivs; - - // Construct the while-loop with a parameter for each coordinate. - for (SparseIterator *it : spIters) { - ValueRange itVals = it->getCursor(); - ivs.append(itVals.begin(), itVals.end()); - } - - // The position where user-supplied reduction variable starts. - ivs.append(reduc.begin(), reduc.end()); - // Update universal index. - if (needsUniv) - ivs.push_back(loopSeqStack.back().first); - - // Ensures all operands are valid. - assert(llvm::all_of(ivs, [](Value v) { return v != nullptr; })); - TypeRange types = ValueRange(ivs).getTypes(); - auto whileOp = builder.create(loc, types, ivs); - - SmallVector locs(types.size(), loc); - Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs); - Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs); - - // Generates loop conditions. - builder.setInsertionPointToStart(before); - ValueRange bArgs = before->getArguments(); - Value whileCond = nullptr; // bool values for loop condition. - - for (SparseIterator *it : spIters) { - auto [cond, remArgs] = it->genWhileCond(builder, loc, bArgs); - whileCond = !whileCond ? cond : ANDI(whileCond, cond); - bArgs = remArgs; - } - // The remaining block arguments are user-provided reduction values and an - // optional universal index. 
Make sure their sizes match. - assert(bArgs.size() == reduc.size() + needsUniv); - builder.create(loc, whileCond, before->getArguments()); - - // Generates loop body. - builder.setInsertionPointToStart(after); - ValueRange aArgs = after->getArguments(); - // Since some LoopCondKind might need extra checks to filter out invalid - // iterations, we maintains another array to hold the iteration arguments to - // yield if the checks fails. - SmallVector nextArgs(aArgs.begin(), aArgs.end()); - - for (SparseIterator *it : spIters) { - aArgs = it->linkNewScope(aArgs); - // Dereference the iterator to cache the coordinate. - it->deref(builder, loc); - } - - // In-place update on reduction variable. - assert(aArgs.size() == reduc.size() + needsUniv); - for (unsigned i = 0, e = reduc.size(); i < e; i++) - reduc[i] = aArgs[i]; - - Value min; - // Finds the minimum coordinate - if (!needsUniv) { - for (SparseIterator *it : spIters) { - if (min) { - Value cmp = CMPI(ult, it->getCrd(), min); - min = SELECT(cmp, it->getCrd(), min); - } else { - min = it->getCrd(); - } - } - } else { - // Otherwise, universal index is the minimal pos. - min = whileOp.getAfterArguments().back(); - } - - return {whileOp, min}; + return genCoIteration(builder, loc, spIters, reduc, + needsUniv ? loopSeqStack.back().first : nullptr); } bool LoopEmitter::shouldIteratedByForLoop(ArrayRef spIters) { @@ -972,6 +896,100 @@ void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc, loopStack.pop_back(); } +//===----------------------------------------------------------------------===// +// Loop generation utils +//===----------------------------------------------------------------------===// + +std::pair sparse_tensor::genCoIteration( + OpBuilder &builder, Location loc, ArrayRef spIters, + MutableArrayRef reduc, Value uniIdx, bool userReducFirst) { + // NOTE: the slice driven tensor-related reduction variable must + // appear before normal tensors. 
+ + // The set of induction variables for the while loop. + SmallVector ivs; + + // TODO: remove the flag after full migration. Currently + // `sparse_tensor.coiterate` operation (must) put user provided reduction + // values at the front of the block list, while direct sparsification to scf + // loops put them at the end. + if (userReducFirst) + ivs.append(reduc.begin(), reduc.end()); + + // Construct the while-loop with a parameter for each coordinate. + for (SparseIterator *it : spIters) { + ValueRange itVals = it->getCursor(); + ivs.append(itVals.begin(), itVals.end()); + } + + if (!userReducFirst) + ivs.append(reduc.begin(), reduc.end()); + + // Update universal index. + if (uniIdx) + ivs.push_back(uniIdx); + + // Ensures all operands are valid. + assert(llvm::all_of(ivs, [](Value v) { return v != nullptr; })); + TypeRange types = ValueRange(ivs).getTypes(); + auto whileOp = builder.create(loc, types, ivs); + + SmallVector locs(types.size(), loc); + Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs); + Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs); + + // Generates loop conditions. + builder.setInsertionPointToStart(before); + ValueRange bArgs = before->getArguments(); + Value whileCond = nullptr; // bool values for loop condition. + + for (SparseIterator *it : spIters) { + auto [cond, remArgs] = it->genWhileCond(builder, loc, bArgs); + whileCond = !whileCond ? cond : ANDI(whileCond, cond); + bArgs = remArgs; + } + // The remaining block arguments are user-provided reduction values and an + // optional universal index. Make sure their sizes match. + assert(bArgs.size() == reduc.size() + (uniIdx ? 1 : 0)); + builder.create(loc, whileCond, before->getArguments()); + + // Generates loop body. 
+ builder.setInsertionPointToStart(after); + ValueRange aArgs = after->getArguments(); + // Since some LoopCondKind might need extra checks to filter out invalid + // iterations, we maintains another array to hold the iteration arguments to + // yield if the checks fails. + SmallVector nextArgs(aArgs.begin(), aArgs.end()); + + for (SparseIterator *it : spIters) { + aArgs = it->linkNewScope(aArgs); + // Dereference the iterator to cache the coordinate. + it->deref(builder, loc); + } + + // In-place update on reduction variable. + for (unsigned i = 0, e = reduc.size(); i < e; i++) + reduc[i] = aArgs[i]; + + Value min; + // Finds the minimum coordinate + if (!uniIdx) { + for (SparseIterator *it : spIters) { + if (min) { + Value cmp = CMPI(ult, it->getCrd(), min); + min = SELECT(cmp, it->getCrd(), min); + } else { + min = it->getCrd(); + } + } + } else { + // Otherwise, universal index is the minimal pos. + min = whileOp.getAfterArguments().back(); + } + + return {whileOp, min}; +} + #undef CMPI #undef C_IDX #undef YIELD diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h index a9eb888c8b6bec..3e61b5f27fcc2a 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h @@ -436,6 +436,17 @@ class LoopEmitter { std::vector> spIterVals; }; +// +// Utils functions to generate sparse loops. +// + +// Generate a while loop that co-iterates over a set of iterators. 
+std::pair genCoIteration(OpBuilder &builder, Location loc, + ArrayRef iters, + MutableArrayRef reduc, + Value uniIdx, + bool userReducFirst = false); + } // namespace sparse_tensor } // namespace mlir diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h index 91f363db93f1df..642cb1afa156b0 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h @@ -95,6 +95,8 @@ enum class IterKind : uint8_t { class SparseIterationSpace { public: SparseIterationSpace() = default; + SparseIterationSpace(SparseIterationSpace &) = delete; + SparseIterationSpace(SparseIterationSpace &&) = default; // Constructs a N-D iteration space. SparseIterationSpace(Location loc, OpBuilder &b, Value t, unsigned tid, diff --git a/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir b/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir index 2487156a9a2e48..f819458e038582 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s --sparse-reinterpret-map -sparsification="sparse-emit-strategy=sparse-iterator" --cse | FileCheck %s --check-prefix="ITER" - -// TODO: temporarilly disabled since there is no lowering rules from `coiterate` to `scf`. 
-// R_U_N: mlir-opt %s --sparse-reinterpret-map -sparsification="sparse-emit-strategy=sparse-iterator" --cse --sparse-space-collapse --lower-sparse-iteration-to-scf --loop-invariant-code-motion | FileCheck %s +// RUN: mlir-opt %s --sparse-reinterpret-map -sparsification="sparse-emit-strategy=sparse-iterator" --cse --sparse-space-collapse --lower-sparse-iteration-to-scf --loop-invariant-code-motion -cse --canonicalize | FileCheck %s @@ -79,6 +77,79 @@ func.func @sqsum(%arg0: tensor) -> tensor { // ITER: bufferization.to_tensor // ITER: return // ITER: } + +// CHECK-LABEL: func.func @add( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<10xi32, #sparse{{.*}}>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<10xi32, #sparse{{.*}}>) -> tensor<10xi32> { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_5:.*]] = arith.constant dense<0> : tensor<10xi32> +// CHECK: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_5]] : memref<10xi32> +// CHECK: linalg.fill ins(%[[VAL_4]] : i32) outs(%[[VAL_6]] : memref<10xi32>) +// CHECK: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<10xi32, #sparse{{.*}}> to memref +// CHECK: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<10xi32, #sparse{{.*}}> to memref +// CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_3]]] : memref +// CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_2]]] : memref +// CHECK: %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<10xi32, #sparse{{.*}}> to memref +// CHECK: %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<10xi32, #sparse{{.*}}> to memref +// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_3]]] : memref +// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_2]]] : memref +// CHECK: %[[VAL_15:.*]]:2 = scf.while (%[[VAL_16:.*]] = %[[VAL_9]], 
%[[VAL_17:.*]] = %[[VAL_13]]) : (index, index) -> (index, index) { +// CHECK: %[[VAL_18:.*]] = arith.cmpi ult, %[[VAL_16]], %[[VAL_10]] : index +// CHECK: %[[VAL_19:.*]] = arith.cmpi ult, %[[VAL_17]], %[[VAL_14]] : index +// CHECK: %[[VAL_20:.*]] = arith.andi %[[VAL_18]], %[[VAL_19]] : i1 +// CHECK: scf.condition(%[[VAL_20]]) %[[VAL_16]], %[[VAL_17]] : index, index +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_21:.*]]: index, %[[VAL_22:.*]]: index): +// CHECK: %[[VAL_23:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_21]]] : memref +// CHECK: %[[VAL_24:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_22]]] : memref +// CHECK: %[[VAL_25:.*]] = arith.cmpi ult, %[[VAL_24]], %[[VAL_23]] : index +// CHECK: %[[VAL_26:.*]] = arith.select %[[VAL_25]], %[[VAL_24]], %[[VAL_23]] : index +// CHECK: %[[VAL_27:.*]] = arith.cmpi eq, %[[VAL_23]], %[[VAL_26]] : index +// CHECK: %[[VAL_28:.*]] = arith.cmpi eq, %[[VAL_24]], %[[VAL_26]] : index +// CHECK: %[[VAL_29:.*]] = arith.andi %[[VAL_27]], %[[VAL_28]] : i1 +// CHECK: scf.if %[[VAL_29]] { +// CHECK: %[[VAL_30:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10xi32, #sparse{{.*}}> to memref +// CHECK: %[[VAL_31:.*]] = memref.load %[[VAL_30]]{{\[}}%[[VAL_21]]] : memref +// CHECK: %[[VAL_32:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<10xi32, #sparse{{.*}}> to memref +// CHECK: %[[VAL_33:.*]] = memref.load %[[VAL_32]]{{\[}}%[[VAL_22]]] : memref +// CHECK: %[[VAL_34:.*]] = arith.addi %[[VAL_31]], %[[VAL_33]] : i32 +// CHECK: memref.store %[[VAL_34]], %[[VAL_6]]{{\[}}%[[VAL_26]]] : memref<10xi32> +// CHECK: } else { +// CHECK: scf.if %[[VAL_27]] { +// CHECK: %[[VAL_35:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10xi32, #sparse{{.*}}> to memref +// CHECK: %[[VAL_36:.*]] = memref.load %[[VAL_35]]{{\[}}%[[VAL_21]]] : memref +// CHECK: memref.store %[[VAL_36]], %[[VAL_6]]{{\[}}%[[VAL_26]]] : memref<10xi32> +// CHECK: } else { +// CHECK: scf.if %[[VAL_28]] { +// CHECK: %[[VAL_37:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<10xi32, #sparse{{.*}}> 
to memref +// CHECK: %[[VAL_38:.*]] = memref.load %[[VAL_37]]{{\[}}%[[VAL_22]]] : memref +// CHECK: memref.store %[[VAL_38]], %[[VAL_6]]{{\[}}%[[VAL_26]]] : memref<10xi32> +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: %[[VAL_39:.*]] = arith.addi %[[VAL_21]], %[[VAL_2]] : index +// CHECK: %[[VAL_40:.*]] = arith.select %[[VAL_27]], %[[VAL_39]], %[[VAL_21]] : index +// CHECK: %[[VAL_41:.*]] = arith.addi %[[VAL_22]], %[[VAL_2]] : index +// CHECK: %[[VAL_42:.*]] = arith.select %[[VAL_28]], %[[VAL_41]], %[[VAL_22]] : index +// CHECK: scf.yield %[[VAL_40]], %[[VAL_42]] : index, index +// CHECK: } +// CHECK: %[[VAL_43:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10xi32, #sparse{{.*}}> to memref +// CHECK: scf.for %[[VAL_44:.*]] = %[[VAL_45:.*]]#0 to %[[VAL_10]] step %[[VAL_2]] { +// CHECK: %[[VAL_46:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_44]]] : memref +// CHECK: %[[VAL_47:.*]] = memref.load %[[VAL_43]]{{\[}}%[[VAL_44]]] : memref +// CHECK: memref.store %[[VAL_47]], %[[VAL_6]]{{\[}}%[[VAL_46]]] : memref<10xi32> +// CHECK: } +// CHECK: %[[VAL_48:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<10xi32, #sparse{{.*}}> to memref +// CHECK: scf.for %[[VAL_49:.*]] = %[[VAL_50:.*]]#1 to %[[VAL_14]] step %[[VAL_2]] { +// CHECK: %[[VAL_51:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_49]]] : memref +// CHECK: %[[VAL_52:.*]] = memref.load %[[VAL_48]]{{\[}}%[[VAL_49]]] : memref +// CHECK: memref.store %[[VAL_52]], %[[VAL_6]]{{\[}}%[[VAL_51]]] : memref<10xi32> +// CHECK: } +// CHECK: %[[VAL_53:.*]] = bufferization.to_tensor %[[VAL_6]] : memref<10xi32> +// CHECK: return %[[VAL_53]] : tensor<10xi32> +// CHECK: } func.func @add(%arg0: tensor<10xi32, #VEC>, %arg1: tensor<10xi32, #VEC>) -> tensor<10xi32> { %cst = arith.constant dense<0> : tensor<10xi32> %0 = linalg.generic { diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/iterator-based-sqsum.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/iterator-based-kernel.mlir similarity index 63% rename from 
mlir/test/Integration/Dialect/SparseTensor/CPU/iterator-based-sqsum.mlir rename to mlir/test/Integration/Dialect/SparseTensor/CPU/iterator-based-kernel.mlir index 6d03565f8f7b2a..6cca4fa86a162e 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/iterator-based-sqsum.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/iterator-based-kernel.mlir @@ -35,9 +35,13 @@ explicitVal = 1 : i32 }> -// An example of vector reductions. -module { +#VEC = #sparse_tensor.encoding<{ + map = (d0) -> (d0 : compressed) +}> + +module { + // An example of vector reductions (lowered through sparse_tensor.iterate). func.func @sqsum(%arg0: tensor<2x3x4x5xi32, #COO>) -> tensor { %cst = arith.constant dense<0> : tensor %0 = linalg.generic { @@ -55,7 +59,30 @@ module { return %0 : tensor } + // An example of vector addition (lowered through sparse_tensor.coiterate). + func.func @vec_add(%arg0: tensor<4xi32, #VEC>, %arg1: tensor<4xi32, #VEC>) -> tensor<4xi32> { + %cst = arith.constant dense<0> : tensor<4xi32> + %0 = linalg.generic { + indexing_maps = [ + affine_map<(d0) -> (d0)>, + affine_map<(d0) -> (d0)>, + affine_map<(d0) -> (d0)> + ], + iterator_types = ["parallel"] + } + ins(%arg0, %arg1 : tensor<4xi32, #VEC>, tensor<4xi32, #VEC>) + outs(%cst : tensor<4xi32>) { + ^bb0(%in1: i32, %in2: i32, %out: i32): + %2 = arith.addi %in1, %in2 : i32 + linalg.yield %2 : i32 + } -> tensor<4xi32> + return %0 : tensor<4xi32> + } + func.func @main() { + %c0 = arith.constant 0 : index + %i0 = arith.constant 0 : i32 + %cst = arith.constant sparse< [ [0, 1, 2, 3], @@ -66,15 +93,33 @@ module { [1, 1, 1, 1] > : tensor<2x3x4x5xi32> + %l = arith.constant dense< + [0, 1, 2, 3] + > : tensor<4xi32> + %r = arith.constant dense< + [1, 0, 3, 0] + > : tensor<4xi32> + %input = sparse_tensor.convert %cst : tensor<2x3x4x5xi32> to tensor<2x3x4x5xi32, #COO> %0 = call @sqsum(%input) : (tensor<2x3x4x5xi32, #COO>) -> tensor %v = tensor.extract %0[] : tensor + %lhs = sparse_tensor.convert %l : tensor<4xi32> to 
tensor<4xi32, #VEC> + %rhs = sparse_tensor.convert %r : tensor<4xi32> to tensor<4xi32, #VEC> + %add = call @vec_add(%lhs, %rhs) : (tensor<4xi32, #VEC>, tensor<4xi32, #VEC>) -> tensor<4xi32> + // CHECK: 4 vector.print %v : i32 + // CHECK-NEXT: ( 1, 1, 5, 3 ) + %vec = vector.transfer_read %add[%c0], %i0 : tensor<4xi32>, vector<4xi32> + vector.print %vec : vector<4xi32> bufferization.dealloc_tensor %input : tensor<2x3x4x5xi32, #COO> bufferization.dealloc_tensor %0 : tensor + + bufferization.dealloc_tensor %lhs : tensor<4xi32, #VEC> + bufferization.dealloc_tensor %rhs : tensor<4xi32, #VEC> + bufferization.dealloc_tensor %add : tensor<4xi32> return } } From f4cf93fb509c53771d61a973f27be9b1a90dee0a Mon Sep 17 00:00:00 2001 From: agozillon Date: Fri, 23 Aug 2024 19:48:43 +0200 Subject: [PATCH 352/426] [Flang][OpenMP] Align map clause generation and fix issue with non-shared allocations for assumed shape/size descriptor types (#97855) This PR aims to unify the map argument generation behavior across both the implicit capture (captured in a target region) and the explicit capture (process map), currently the varPtr field of the MapInfo for the same variable will be different depending on how it's captured. This PR tries to align that across the generations of MapInfoOp in the OpenMP lowering. Currently, I have opted to utilise the rawInput (input memref to a HLFIR DeclareInfoOp) as opposed to the addr field which includes more information. The side affect of this is that we have to deal with BoxTypes less often, which will result in simpler maps in these cases. The negative side affect of this is that we don't have access to the bounds information through the resulting value, however, I believe the bounds information we require in our case is still appropriately stored in the map bounds, and this seems to be the case from testing so far. 
The other fix is for cases where we end up with a BoxType argument into a function (certain assumed shape and sizes cases do this) that has no fir.ref wrapping it. As we need the Box to be a reference type to actually utilise the operation to access the base address stored inside and create the correct mappings we currently generate an intermediate allocation in these cases, and then store into it, and utilise this as the map argument, as opposed to the original. However, as we were not sharing the same intermediate allocation across all of the maps for a variable, this resulted in errors in certain cases when detatching/attatching the data e.g. via enter and exit. This PR adjusts this for cases Currently we only maintain tracking of all intermediate allocations for the current function scope, as opposed to module. Primarily as the only case I am aware of that this is required is in cases where we pass certain types of arguments to functions (so I opted to minimize the overhead of the pass for now). It could likely be extended to module scope if required if we find other cases where it's applicable and causing issues. 
--- .../include/flang/Optimizer/OpenMP/Passes.td | 2 +- flang/include/flang/Tools/CLOptions.inc | 3 +- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 16 +- flang/lib/Lower/OpenMP/ClauseProcessor.h | 10 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 152 +++++++++--------- .../Optimizer/OpenMP/MapInfoFinalization.cpp | 90 +++++++---- flang/test/Lower/OpenMP/array-bounds.f90 | 11 +- flang/test/Lower/OpenMP/common-block-map.f90 | 2 +- flang/test/Lower/OpenMP/derived-type-map.f90 | 2 +- flang/test/Lower/OpenMP/target.f90 | 23 ++- .../fortran/local-descriptor-map-regress.f90 | 75 +++++++++ 11 files changed, 244 insertions(+), 142 deletions(-) create mode 100644 offload/test/offloading/fortran/local-descriptor-map-regress.f90 diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td index 395178e26a5762..a5f98e1320c940 100644 --- a/flang/include/flang/Optimizer/OpenMP/Passes.td +++ b/flang/include/flang/Optimizer/OpenMP/Passes.td @@ -12,7 +12,7 @@ include "mlir/Pass/PassBase.td" def MapInfoFinalizationPass - : Pass<"omp-map-info-finalization"> { + : Pass<"omp-map-info-finalization", "mlir::ModuleOp"> { let summary = "expands OpenMP MapInfo operations containing descriptors"; let description = [{ Expands MapInfo operations containing descriptor types into multiple diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index 1881e23b00045a..20351dcf6d6f82 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -367,8 +367,7 @@ inline void createHLFIRToFIRPassPipeline( /// rather than the host device. 
inline void createOpenMPFIRPassPipeline( mlir::PassManager &pm, bool isTargetDevice) { - addNestedPassToAllTopLevelOperations( - pm, flangomp::createMapInfoFinalizationPass); + pm.addPass(flangomp::createMapInfoFinalizationPass()); pm.addPass(flangomp::createMarkDeclareTargetPass()); if (isTargetDevice) pm.addPass(flangomp::createFunctionFiltering()); diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index da6c21730dfba5..dd6068ba048cc9 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -960,25 +960,21 @@ bool ClauseProcessor::processMap( object.ref(), clauseLocation, asFortran, bounds, treatIndexAsSection); - auto origSymbol = converter.getSymbolAddress(*object.sym()); - mlir::Value symAddr = info.addr; - if (origSymbol && fir::isTypeWithDescriptor(origSymbol.getType())) - symAddr = origSymbol; - // Explicit map captures are captured ByRef by default, // optimisation passes may alter this to ByCopy or other capture // types to optimise + mlir::Value baseOp = info.rawInput; auto location = mlir::NameLoc::get( mlir::StringAttr::get(firOpBuilder.getContext(), asFortran.str()), - symAddr.getLoc()); + baseOp.getLoc()); mlir::omp::MapInfoOp mapOp = createMapInfoOp( - firOpBuilder, location, symAddr, + firOpBuilder, location, baseOp, /*varPtrPtr=*/mlir::Value{}, asFortran.str(), bounds, /*members=*/{}, /*membersIndex=*/mlir::DenseIntElementsAttr{}, static_cast< std::underlying_type_t>( mapTypeBits), - mlir::omp::VariableCaptureKind::ByRef, symAddr.getType()); + mlir::omp::VariableCaptureKind::ByRef, baseOp.getType()); if (object.sym()->owner().IsDerivedType()) { addChildIndexAndMapToParent(object, parentMemberIndices, mapOp, @@ -987,9 +983,9 @@ bool ClauseProcessor::processMap( result.mapVars.push_back(mapOp); ptrMapSyms->push_back(object.sym()); if (mapSymTypes) - mapSymTypes->push_back(symAddr.getType()); + mapSymTypes->push_back(baseOp.getType()); if 
(mapSymLocs) - mapSymLocs->push_back(symAddr.getLoc()); + mapSymLocs->push_back(baseOp.getLoc()); } } }); diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index ea4db3e6db0cce..4a90f667c7248b 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -211,22 +211,18 @@ bool ClauseProcessor::processMotionClauses(lower::StatementContext &stmtCtx, object.ref(), clauseLocation, asFortran, bounds, treatIndexAsSection); - auto origSymbol = converter.getSymbolAddress(*object.sym()); - mlir::Value symAddr = info.addr; - if (origSymbol && fir::isTypeWithDescriptor(origSymbol.getType())) - symAddr = origSymbol; - // Explicit map captures are captured ByRef by default, // optimisation passes may alter this to ByCopy or other capture // types to optimise + mlir::Value baseOp = info.rawInput; mlir::omp::MapInfoOp mapOp = createMapInfoOp( - firOpBuilder, clauseLocation, symAddr, + firOpBuilder, clauseLocation, baseOp, /*varPtrPtr=*/mlir::Value{}, asFortran.str(), bounds, /*members=*/{}, /*membersIndex=*/mlir::DenseIntElementsAttr{}, static_cast< std::underlying_type_t>( mapTypeBits), - mlir::omp::VariableCaptureKind::ByRef, symAddr.getType()); + mlir::omp::VariableCaptureKind::ByRef, baseOp.getType()); if (object.sym()->owner().IsDerivedType()) { addChildIndexAndMapToParent(object, parentMemberIndices, mapOp, diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index d614db8b68ef65..d22680407b209c 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1698,6 +1698,12 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, if (dsp.getAllSymbolsToPrivatize().contains(&sym)) return; + // Structure component symbols don't have bindings, and can only be + // explicitly mapped individually. If a member is captured implicitly + // we map the entirety of the derived type when we find its symbol. 
+ if (sym.owner().IsDerivedType()) + return; + // if the symbol is part of an already mapped common block, do not make a // map for it. if (const Fortran::semantics::Symbol *common = @@ -1705,85 +1711,85 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, if (llvm::is_contained(mapSyms, common)) return; - if (!llvm::is_contained(mapSyms, &sym)) { - mlir::Value baseOp = converter.getSymbolAddress(sym); - if (!baseOp) - if (const auto *details = - sym.template detailsIf()) { - baseOp = converter.getSymbolAddress(details->symbol()); - converter.copySymbolBinding(details->symbol(), sym); - } + // If we come across a symbol without a symbol address, we + // return as we cannot process it, this is intended as a + // catch all early exit for symbols that do not have a + // corresponding extended value. Such as subroutines, + // interfaces and named blocks. + if (!converter.getSymbolAddress(sym)) + return; - if (baseOp) { - llvm::SmallVector bounds; - std::stringstream name; - fir::ExtendedValue dataExv = converter.getSymbolExtendedValue(sym); - name << sym.name().ToString(); - - lower::AddrAndBoundsInfo info = getDataOperandBaseAddr( - converter, firOpBuilder, sym, converter.getCurrentLocation()); - if (mlir::isa( - fir::unwrapRefType(info.addr.getType()))) - bounds = lower::genBoundsOpsFromBox( - firOpBuilder, converter.getCurrentLocation(), dataExv, info); - if (mlir::isa( - fir::unwrapRefType(info.addr.getType()))) { - bool dataExvIsAssumedSize = - semantics::IsAssumedSizeArray(sym.GetUltimate()); - bounds = lower::genBaseBoundsOps( - firOpBuilder, converter.getCurrentLocation(), dataExv, - dataExvIsAssumedSize); - } + if (!llvm::is_contained(mapSyms, &sym)) { + if (const auto *details = + sym.template detailsIf()) + converter.copySymbolBinding(details->symbol(), sym); + llvm::SmallVector bounds; + std::stringstream name; + fir::ExtendedValue dataExv = converter.getSymbolExtendedValue(sym); + name << sym.name().ToString(); + + 
lower::AddrAndBoundsInfo info = getDataOperandBaseAddr( + converter, firOpBuilder, sym, converter.getCurrentLocation()); + mlir::Value baseOp = info.rawInput; + if (mlir::isa(fir::unwrapRefType(baseOp.getType()))) + bounds = lower::genBoundsOpsFromBox( + firOpBuilder, converter.getCurrentLocation(), dataExv, info); + if (mlir::isa(fir::unwrapRefType(baseOp.getType()))) { + bool dataExvIsAssumedSize = + semantics::IsAssumedSizeArray(sym.GetUltimate()); + bounds = lower::genBaseBoundsOps( + firOpBuilder, converter.getCurrentLocation(), dataExv, + dataExvIsAssumedSize); + } - llvm::omp::OpenMPOffloadMappingFlags mapFlag = - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; - mlir::omp::VariableCaptureKind captureKind = - mlir::omp::VariableCaptureKind::ByRef; - - mlir::Type eleType = baseOp.getType(); - if (auto refType = mlir::dyn_cast(baseOp.getType())) - eleType = refType.getElementType(); - - // If a variable is specified in declare target link and if device - // type is not specified as `nohost`, it needs to be mapped tofrom - mlir::ModuleOp mod = firOpBuilder.getModule(); - mlir::Operation *op = mod.lookupSymbol(converter.mangleName(sym)); - auto declareTargetOp = - llvm::dyn_cast_if_present(op); - if (declareTargetOp && declareTargetOp.isDeclareTarget()) { - if (declareTargetOp.getDeclareTargetCaptureClause() == - mlir::omp::DeclareTargetCaptureClause::link && - declareTargetOp.getDeclareTargetDeviceType() != - mlir::omp::DeclareTargetDeviceType::nohost) { - mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; - mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; - } - } else if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) { - captureKind = mlir::omp::VariableCaptureKind::ByCopy; - } else if (!fir::isa_builtin_cptr_type(eleType)) { + llvm::omp::OpenMPOffloadMappingFlags mapFlag = + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; + mlir::omp::VariableCaptureKind captureKind = + mlir::omp::VariableCaptureKind::ByRef; + + 
mlir::Type eleType = baseOp.getType(); + if (auto refType = mlir::dyn_cast(baseOp.getType())) + eleType = refType.getElementType(); + + // If a variable is specified in declare target link and if device + // type is not specified as `nohost`, it needs to be mapped tofrom + mlir::ModuleOp mod = firOpBuilder.getModule(); + mlir::Operation *op = mod.lookupSymbol(converter.mangleName(sym)); + auto declareTargetOp = + llvm::dyn_cast_if_present(op); + if (declareTargetOp && declareTargetOp.isDeclareTarget()) { + if (declareTargetOp.getDeclareTargetCaptureClause() == + mlir::omp::DeclareTargetCaptureClause::link && + declareTargetOp.getDeclareTargetDeviceType() != + mlir::omp::DeclareTargetDeviceType::nohost) { mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; } - auto location = - mlir::NameLoc::get(mlir::StringAttr::get(firOpBuilder.getContext(), - sym.name().ToString()), - baseOp.getLoc()); - mlir::Value mapOp = createMapInfoOp( - firOpBuilder, location, baseOp, /*varPtrPtr=*/mlir::Value{}, - name.str(), bounds, /*members=*/{}, - /*membersIndex=*/mlir::DenseIntElementsAttr{}, - static_cast< - std::underlying_type_t>( - mapFlag), - captureKind, baseOp.getType()); - - clauseOps.mapVars.push_back(mapOp); - mapSyms.push_back(&sym); - mapLocs.push_back(baseOp.getLoc()); - mapTypes.push_back(baseOp.getType()); + } else if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) { + captureKind = mlir::omp::VariableCaptureKind::ByCopy; + } else if (!fir::isa_builtin_cptr_type(eleType)) { + mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; } + auto location = + mlir::NameLoc::get(mlir::StringAttr::get(firOpBuilder.getContext(), + sym.name().ToString()), + baseOp.getLoc()); + mlir::Value mapOp = createMapInfoOp( + firOpBuilder, location, baseOp, /*varPtrPtr=*/mlir::Value{}, + name.str(), bounds, /*members=*/{}, + 
/*membersIndex=*/mlir::DenseIntElementsAttr{}, + static_cast< + std::underlying_type_t>( + mapFlag), + captureKind, baseOp.getType()); + + clauseOps.mapVars.push_back(mapOp); + mapSyms.push_back(&sym); + mapLocs.push_back(baseOp.getLoc()); + mapTypes.push_back(baseOp.getType()); } }; lower::pft::visitAllSymbols(eval, captureImplicitMap); diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index 6e9cd03dca8f3f..04a11a52dbd040 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -50,6 +50,14 @@ class MapInfoFinalizationPass : public flangomp::impl::MapInfoFinalizationPassBase< MapInfoFinalizationPass> { + /// Tracks any intermediate function/subroutine local allocations we + /// generate for the descriptors of box type dummy arguments, so that + /// we can retrieve it for subsequent reuses within the functions + /// scope + std::map + localBoxAllocas; + void genDescriptorMemberMaps(mlir::omp::MapInfoOp op, fir::FirOpBuilder &builder, mlir::Operation *target) { @@ -74,14 +82,26 @@ class MapInfoFinalizationPass // perform an alloca and then store to it and retrieve the data from the new // alloca. 
if (mlir::isa(descriptor.getType())) { - mlir::OpBuilder::InsertPoint insPt = builder.saveInsertionPoint(); - mlir::Block *allocaBlock = builder.getAllocaBlock(); - assert(allocaBlock && "No alloca block found for this top level op"); - builder.setInsertionPointToStart(allocaBlock); - auto alloca = builder.create(loc, descriptor.getType()); - builder.restoreInsertionPoint(insPt); - builder.create(loc, descriptor, alloca); - descriptor = alloca; + // If we have already created a local allocation for this BoxType, + // we must be sure to re-use it so that we end up with the same + // allocations being utilised for the same descriptor across all map uses, + // this prevents runtime issues such as not appropriately releasing or + // deleting all mapped data. + auto find = localBoxAllocas.find(descriptor.getAsOpaquePointer()); + if (find != localBoxAllocas.end()) { + builder.create(loc, descriptor, find->second); + descriptor = find->second; + } else { + mlir::OpBuilder::InsertPoint insPt = builder.saveInsertionPoint(); + mlir::Block *allocaBlock = builder.getAllocaBlock(); + assert(allocaBlock && "No alloca block found for this top level op"); + builder.setInsertionPointToStart(allocaBlock); + auto alloca = builder.create(loc, descriptor.getType()); + builder.restoreInsertionPoint(insPt); + builder.create(loc, descriptor, alloca); + localBoxAllocas[descriptor.getAsOpaquePointer()] = alloca; + descriptor = alloca; + } } mlir::Value baseAddrAddr = builder.create( @@ -234,27 +254,41 @@ class MapInfoFinalizationPass fir::KindMapping kindMap = fir::getKindMapping(module); fir::FirOpBuilder builder{module, std::move(kindMap)}; - getOperation()->walk([&](mlir::omp::MapInfoOp op) { - // TODO: Currently only supports a single user for the MapInfoOp, this - // is fine for the moment as the Fortran Frontend will generate a - // new MapInfoOp per Target operation for the moment. 
However, when/if - // we optimise/cleanup the IR, it likely isn't too difficult to - // extend this function, it would require some modification to create a - // single new MapInfoOp per new MapInfoOp generated and share it across - // all users appropriately, making sure to only add a single member link - // per new generation for the original originating descriptor MapInfoOp. - assert(llvm::hasSingleElement(op->getUsers()) && - "MapInfoFinalization currently only supports single users " - "of a MapInfoOp"); + // We wish to maintain some function level scope (currently + // just local function scope variables used to load and store box + // variables into so we can access their base address, an + // quirk of box_offset requires us to have an in memory box, but Fortran + // in certain cases does not provide this) whilst not subjecting + // ourselves to the possibility of race conditions while this pass + // undergoes frequent re-iteration for the near future. So we loop + // over function in the module and then map.info inside of those. + getOperation()->walk([&](mlir::func::FuncOp func) { + // clear all local allocations we made for any boxes in any prior + // iterations from previous function scopes. + localBoxAllocas.clear(); - if (!op.getMembers().empty()) { - addImplicitMembersToTarget(op, builder, *op->getUsers().begin()); - } else if (fir::isTypeWithDescriptor(op.getVarType()) || - mlir::isa_and_present( - op.getVarPtr().getDefiningOp())) { - builder.setInsertionPoint(op); - genDescriptorMemberMaps(op, builder, *op->getUsers().begin()); - } + func->walk([&](mlir::omp::MapInfoOp op) { + // TODO: Currently only supports a single user for the MapInfoOp, this + // is fine for the moment as the Fortran Frontend will generate a + // new MapInfoOp per Target operation for the moment. 
However, when/if + // we optimise/cleanup the IR, it likely isn't too difficult to + // extend this function, it would require some modification to create a + // single new MapInfoOp per new MapInfoOp generated and share it across + // all users appropriately, making sure to only add a single member link + // per new generation for the original originating descriptor MapInfoOp. + assert(llvm::hasSingleElement(op->getUsers()) && + "OMPMapInfoFinalization currently only supports single users " + "of a MapInfoOp"); + + if (!op.getMembers().empty()) { + addImplicitMembersToTarget(op, builder, *op->getUsers().begin()); + } else if (fir::isTypeWithDescriptor(op.getVarType()) || + mlir::isa_and_present( + op.getVarPtr().getDefiningOp())) { + builder.setInsertionPoint(op); + genDescriptorMemberMaps(op, builder, *op->getUsers().begin()); + } + }); }); } }; diff --git a/flang/test/Lower/OpenMP/array-bounds.f90 b/flang/test/Lower/OpenMP/array-bounds.f90 index f235d5041ab26a..9e59d71f560fc8 100644 --- a/flang/test/Lower/OpenMP/array-bounds.f90 +++ b/flang/test/Lower/OpenMP/array-bounds.f90 @@ -15,12 +15,12 @@ !HOST: %[[C2:.*]] = arith.constant 1 : index !HOST: %[[C3:.*]] = arith.constant 4 : index !HOST: %[[BOUNDS0:.*]] = omp.map.bounds lower_bound(%[[C2]] : index) upper_bound(%[[C3]] : index) extent(%[[C10]] : index) stride(%[[C1]] : index) start_idx(%[[C1]] : index) -!HOST: %[[MAP0:.*]] = omp.map.info var_ptr(%[[READ_DECL]]#0 : !fir.ref>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS0]]) -> !fir.ref> {name = "sp_read(2:5)"} +!HOST: %[[MAP0:.*]] = omp.map.info var_ptr(%[[READ_DECL]]#1 : !fir.ref>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS0]]) -> !fir.ref> {name = "sp_read(2:5)"} !HOST: %[[C4:.*]] = arith.constant 1 : index !HOST: %[[C5:.*]] = arith.constant 1 : index !HOST: %[[C6:.*]] = arith.constant 4 : index !HOST: %[[BOUNDS1:.*]] = omp.map.bounds lower_bound(%[[C5]] : index) upper_bound(%[[C6]] : index) 
extent(%[[C10_0]] : index) stride(%[[C4]] : index) start_idx(%[[C4]] : index) -!HOST: %[[MAP1:.*]] = omp.map.info var_ptr(%[[WRITE_DECL]]#0 : !fir.ref>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS1]]) -> !fir.ref> {name = "sp_write(2:5)"} +!HOST: %[[MAP1:.*]] = omp.map.info var_ptr(%[[WRITE_DECL]]#1 : !fir.ref>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS1]]) -> !fir.ref> {name = "sp_write(2:5)"} !HOST: omp.target map_entries(%[[MAP0]] -> %{{.*}}, %[[MAP1]] -> %{{.*}}, {{.*}} -> {{.*}} : !fir.ref>, !fir.ref>, !fir.ref) { subroutine read_write_section() @@ -67,7 +67,6 @@ end subroutine assumed_shape_array !HOST-LABEL: func.func @_QMassumed_array_routinesPassumed_size_array( !HOST-SAME: %[[ARG0:.*]]: !fir.ref> {fir.bindc_name = "arr_read_write"}) { -!HOST: %[[INTERMEDIATE_ALLOCA:.*]] = fir.alloca !fir.box> !HOST: %[[ARG0_SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> !HOST: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]](%[[ARG0_SHAPE]]) dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEarr_read_write"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.box>, !fir.ref>) !HOST: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEi"} @@ -75,10 +74,8 @@ end subroutine assumed_shape_array !HOST: %[[C4_1:.*]] = arith.subi %c4, %c1{{.*}} : index !HOST: %[[EXT:.*]] = arith.addi %[[C4_1]], %c1{{.*}} : index !HOST: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%c1{{.*}} : index) upper_bound(%c4{{.*}} : index) extent(%[[EXT]] : index) stride(%[[DIMS0]]#2 : index) start_idx(%c1{{.*}} : index) {stride_in_bytes = true} -!HOST: %[[VAR_PTR_PTR:.*]] = fir.box_offset %[[INTERMEDIATE_ALLOCA]] base_addr : (!fir.ref>>) -> !fir.llvm_ptr>> -!HOST: %[[MAP_INFO_MEMBER:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref>>, !fir.array) var_ptr_ptr(%[[VAR_PTR_PTR]] : !fir.llvm_ptr>>) 
map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr>> {name = ""} -!HOST: %[[MAP:.*]] = omp.map.info var_ptr(%[[INTERMEDIATE_ALLOCA]] : !fir.ref>>, !fir.box>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_INFO_MEMBER]] : [0] : !fir.llvm_ptr>>) -> !fir.ref> {name = "arr_read_write(2:5)"} -!HOST: omp.target map_entries(%[[MAP_INFO_MEMBER]] -> %{{.*}}, %[[MAP]] -> %{{.*}}, {{.*}} -> {{.*}} : !fir.llvm_ptr>>, !fir.ref>, !fir.ref) { +!HOST: %[[MAP:.*]] = omp.map.info var_ptr(%[[ARG0_DECL]]#1 : !fir.ref>, !fir.array) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} +!HOST: omp.target map_entries(%[[MAP]] -> %{{.*}}, {{.*}} -> {{.*}} : !fir.ref>, !fir.ref) { subroutine assumed_size_array(arr_read_write) integer, intent(inout) :: arr_read_write(*) diff --git a/flang/test/Lower/OpenMP/common-block-map.f90 b/flang/test/Lower/OpenMP/common-block-map.f90 index 5033129683a8eb..0c423efd5eef49 100644 --- a/flang/test/Lower/OpenMP/common-block-map.f90 +++ b/flang/test/Lower/OpenMP/common-block-map.f90 @@ -40,7 +40,7 @@ subroutine map_full_block !CHECK: %[[COORD:.*]] = fir.coordinate_of %[[CB_CONV]], %[[INDEX]] : (!fir.ref>, index) -> !fir.ref !CHECK: %[[CONV:.*]] = fir.convert %[[COORD]] : (!fir.ref) -> !fir.ref !CHECK: %[[CB_MEMBER_2:.*]]:2 = hlfir.declare %[[CONV]] {uniq_name = "_QFmap_mix_of_membersEvar2"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[MAP_EXP:.*]] = omp.map.info var_ptr(%[[CB_MEMBER_2]]#0 : !fir.ref, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "var2"} +!CHECK: %[[MAP_EXP:.*]] = omp.map.info var_ptr(%[[CB_MEMBER_2]]#1 : !fir.ref, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "var2"} !CHECK: %[[MAP_IMP:.*]] = omp.map.info var_ptr(%[[CB_MEMBER_1]]#1 : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = "var1"} !CHECK: omp.target map_entries(%[[MAP_EXP]] -> %[[ARG_EXP:.*]], %[[MAP_IMP]] -> %[[ARG_IMP:.*]] : 
!fir.ref, !fir.ref) { !CHECK: ^bb0(%[[ARG_EXP]]: !fir.ref, %[[ARG_IMP]]: !fir.ref): diff --git a/flang/test/Lower/OpenMP/derived-type-map.f90 b/flang/test/Lower/OpenMP/derived-type-map.f90 index 6121b450f06206..30b89e90470b0a 100644 --- a/flang/test/Lower/OpenMP/derived-type-map.f90 +++ b/flang/test/Lower/OpenMP/derived-type-map.f90 @@ -21,7 +21,7 @@ end subroutine mapType_derived_implicit !CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.type<_QFmaptype_derived_explicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}> {bindc_name = "scalar_arr", uniq_name = "_QFmaptype_derived_explicitEscalar_arr"} !CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFmaptype_derived_explicitEscalar_arr"} : (!fir.ref,int:i32}>>) -> (!fir.ref,int:i32}>>, !fir.ref,int:i32}>>) -!CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[DECLARE]]#0 : !fir.ref,int:i32}>>, !fir.type<_QFmaptype_derived_explicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>) map_clauses(tofrom) capture(ByRef) -> !fir.ref,int:i32}>> {name = "scalar_arr"} +!CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[DECLARE]]#1 : !fir.ref,int:i32}>>, !fir.type<_QFmaptype_derived_explicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}>) map_clauses(tofrom) capture(ByRef) -> !fir.ref,int:i32}>> {name = "scalar_arr"} !CHECK: omp.target map_entries(%[[MAP]] -> %[[ARG0:.*]] : !fir.ref,int:i32}>>) { !CHECK: ^bb0(%[[ARG0]]: !fir.ref,int:i32}>>): subroutine mapType_derived_explicit diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 9b92293cbf92f0..1d5ab6942dfa33 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -189,7 +189,7 @@ subroutine omp_target_update_depend !$omp end task !CHECK: %[[BOUNDS:.*]] = omp.map.bounds - !CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[A]]#0 : !fir.ref>, !fir.array<1024xi32>) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} + !CHECK: %[[MAP:.*]] = 
omp.map.info var_ptr(%[[A]]#1 : !fir.ref>, !fir.array<1024xi32>) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} !CHECK: omp.target_update depend(taskdependin -> %[[A]]#1 : !fir.ref>) map_entries(%[[MAP]] : !fir.ref>) !$omp target update to(a) depend(in:a) end subroutine omp_target_update_depend @@ -205,7 +205,7 @@ subroutine omp_target_update_to !CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) !CHECK-DAG: %[[BOUNDS:.*]] = omp.map.bounds - !CHECK: %[[TO_MAP:.*]] = omp.map.info var_ptr(%[[A_DECL]]#0 : !fir.ref>, !fir.array<1024xi32>) + !CHECK: %[[TO_MAP:.*]] = omp.map.info var_ptr(%[[A_DECL]]#1 : !fir.ref>, !fir.array<1024xi32>) !CHECK-SAME: map_clauses(to) capture(ByRef) !CHECK-SAME: bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} @@ -224,7 +224,7 @@ subroutine omp_target_update_from !CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) !CHECK-DAG: %[[BOUNDS:.*]] = omp.map.bounds - !CHECK: %[[FROM_MAP:.*]] = omp.map.info var_ptr(%[[A_DECL]]#0 : !fir.ref>, !fir.array<1024xi32>) + !CHECK: %[[FROM_MAP:.*]] = omp.map.info var_ptr(%[[A_DECL]]#1 : !fir.ref>, !fir.array<1024xi32>) !CHECK-SAME: map_clauses(from) capture(ByRef) !CHECK-SAME: bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} @@ -292,7 +292,7 @@ subroutine omp_target_data !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[VAL_0]](%{{.*}}) {uniq_name = "_QFomp_target_dataEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) integer :: a(1024) !CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) - !CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[A_DECL]]#0 : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} + !CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[A_DECL]]#1 : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} !CHECK: omp.target_data map_entries(%[[MAP]] : !fir.ref>) 
{ !$omp target data map(tofrom: a) !CHECK: %[[C10:.*]] = arith.constant 10 : i32 @@ -314,14 +314,14 @@ subroutine omp_target_data_mt !CHECK: %[[VAR_B:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "b", uniq_name = "_QFomp_target_data_mtEb"} !CHECK: %[[VAR_B_DECL:.*]]:2 = hlfir.declare %[[VAR_B]](%{{.*}}) {uniq_name = "_QFomp_target_data_mtEb"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) !CHECK: %[[BOUNDS_A:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) - !CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAR_A_DECL]]#0 : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_A]]) -> !fir.ref> {name = "a"} + !CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAR_A_DECL]]#1 : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_A]]) -> !fir.ref> {name = "a"} !CHECK: omp.target_data map_entries(%[[MAP_A]] : !fir.ref>) { !$omp target data map(a) !CHECK: omp.terminator !$omp end target data !CHECK: } !CHECK: %[[BOUNDS_B:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) - !CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%[[VAR_B_DECL]]#0 : !fir.ref>, !fir.array<1024xi32>) map_clauses(always, from) capture(ByRef) bounds(%[[BOUNDS_B]]) -> !fir.ref> {name = "b"} + !CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%[[VAR_B_DECL]]#1 : !fir.ref>, !fir.array<1024xi32>) map_clauses(always, from) capture(ByRef) bounds(%[[BOUNDS_B]]) -> !fir.ref> {name = "b"} !CHECK: omp.target_data map_entries(%[[MAP_B]] : !fir.ref>) { !$omp target data map(always, from : b) !CHECK: omp.terminator @@ -338,7 +338,7 @@ subroutine omp_target !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFomp_targetEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) integer :: a(1024) !CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) 
start_idx({{.*}}) - !CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[VAL_1]]#0 : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} + !CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[VAL_1]]#1 : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} !CHECK: omp.target map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !fir.ref>) { !CHECK: ^bb0(%[[ARG_0]]: !fir.ref>): !$omp target map(tofrom: a) @@ -372,7 +372,7 @@ subroutine omp_target_depend !CHECK: %[[LBOUND_A:.*]] = arith.constant 0 : index !CHECK: %[[UBOUND_A:.*]] = arith.subi %c1024, %c1 : index !CHECK: %[[BOUNDS_A:.*]] = omp.map.bounds lower_bound(%[[LBOUND_A]] : index) upper_bound(%[[UBOUND_A]] : index) extent(%[[EXTENT_A]] : index) stride(%[[STRIDE_A]] : index) start_idx(%[[STRIDE_A]] : index) - !CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[A]]#0 : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_A]]) -> !fir.ref> {name = "a"} + !CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[A]]#1 : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_A]]) -> !fir.ref> {name = "a"} !CHECK: omp.target depend(taskdependin -> %[[A]]#1 : !fir.ref>) map_entries(%[[MAP_A]] -> %[[BB0_ARG:.*]] : !fir.ref>) { !$omp target map(tofrom: a) depend(in: a) a(1) = 10 @@ -455,12 +455,11 @@ subroutine omp_target_implicit_bounds(n) !CHECK: %[[VAL_8:.*]] = fir.alloca !fir.array, %[[VAL_7]] {bindc_name = "a", uniq_name = "_QFomp_target_implicit_boundsEa"} !CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> !CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]](%[[VAL_9]]) {uniq_name = "_QFomp_target_implicit_boundsEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) - !CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %{{[0-9]+}}#0, %c0{{.*}} : (!fir.box>, index) -> (index, index, index) - !CHECK: %[[UB:.*]] = arith.subi %[[DIMS0]]#1, %c1{{.*}} : index + 
!CHECK: %[[UB:.*]] = arith.subi %[[VAL_7]], %c1{{.*}} : index integer :: n integer :: a(n) - !CHECK: %[[VAL_14:.*]] = omp.map.bounds lower_bound(%c0{{.*}} : index) upper_bound(%[[UB]] : index) extent(%[[DIMS0]]#1 : index) stride(%[[DIMS0]]#2 : index) start_idx(%c1{{.*}} : index) {stride_in_bytes = true} + !CHECK: %[[VAL_14:.*]] = omp.map.bounds lower_bound(%c0{{.*}} : index) upper_bound(%[[UB]] : index) extent(%[[VAL_7]] : index) stride(%c1{{.*}} : index) start_idx(%c1{{.*}} : index) !CHECK: %[[VAL_15:.*]] = omp.map.info var_ptr(%[[VAL_10]]#1 : !fir.ref>, !fir.array) map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[VAL_14]]) -> !fir.ref> {name = "a"} !CHECK: %[[VAL_16:.*]] = omp.map.info var_ptr(%[[VAL_COPY]] : !fir.ref, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref {name = ""} !CHECK: omp.target map_entries(%[[VAL_15]] -> %[[VAL_17:.*]], %[[VAL_16]] -> %[[VAL_18:.*]] : !fir.ref>, !fir.ref) { @@ -587,7 +586,7 @@ subroutine omp_target_parallel_do !CHECK: %[[C0:.*]] = arith.constant 0 : index !CHECK: %[[SUB:.*]] = arith.subi %[[C1024]], %[[C1]] : index !CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%[[C0]] : index) upper_bound(%[[SUB]] : index) extent(%[[C1024]] : index) stride(%[[C1]] : index) start_idx(%[[C1]] : index) - !CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[VAL_0_DECL]]#0 : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} + !CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[VAL_0_DECL]]#1 : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} !CHECK: omp.target map_entries(%[[MAP]] -> %[[ARG_0:.*]], %{{.*}} -> %{{.*}} : !fir.ref>, !fir.ref) { !CHECK: ^bb0(%[[ARG_0]]: !fir.ref>, %{{.*}}: !fir.ref): !CHECK: %[[VAL_0_DECL:.*]]:2 = hlfir.declare %[[ARG_0]](%{{.*}}) {uniq_name = "_QFomp_target_parallel_doEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) diff --git 
a/offload/test/offloading/fortran/local-descriptor-map-regress.f90 b/offload/test/offloading/fortran/local-descriptor-map-regress.f90 new file mode 100644 index 00000000000000..5f628b8ad8c7af --- /dev/null +++ b/offload/test/offloading/fortran/local-descriptor-map-regress.f90 @@ -0,0 +1,75 @@ +! Small regression test that checks that we do not cause +! a runtime map error in cases where we are required to +! allocate a local variable for the fortran descriptor +! to store into and then load from it, done so by +! re-using the temporary local variable across all +! maps related to the mapped variable and associated +! local variable to make sure that each map does as +! it's intended to do with the original data. This +! prevents blobs of local descriptor data remaining +! attatched on device long after it's supposed to, +! which can cause weird map issues later in susbequent +! function invocations. However, it doesn't avoid a user +! shooting themselves in the foot by mapping data via enter +! and then not providing a corresponding exit. +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +subroutine launchOne(n1, n2, ret) + implicit none + real, intent(out) :: ret + real(4), dimension(n1,n2) :: sbuf31 + integer :: n1,n2 +!$omp target enter data map(alloc:sbuf31) + +!$omp target + sbuf31(2, 2) = 10 +!$omp end target + +!$omp target update from(sbuf31) + + ret = sbuf31(2, 2) + +!$omp target exit data map(delete:sbuf31) +end subroutine launchOne + +subroutine launchTwo(N, ret) + implicit none + real, intent(inout) :: ret + integer :: N + real(4), dimension(N) :: p + +!$omp target enter data map(to:p) + +!$omp target + p(8) = 20 +!$omp end target + +!$omp target update from(p) + +ret = ret + p(8) + +! intentional non-deletion, can trigger an illegal map +! issue in cases where the local map we store and load +! from for the variable is different across all maps. +! Not too sure why this is the thing that triggers the +! 
problem in general. It seems like it would be an issue +! made apparent with and without this statement commented, +! especially as the issue occurs with the enter and not the +! corresponding exit (from the runtime trace at least). +!!$omp target exit data map(delete:p) +end subroutine launchTwo + +program reproducer + implicit none + integer :: N = 10 + integer :: nr = 10, nt = 10 + real :: output = 0 + + call launchOne(nr, nt, output) + call launchTwo(N, output) + + print *, output +end program reproducer + +! CHECK: 30 From d86349cf40196bc7f52e3f294ed2afafacadf1f5 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Fri, 23 Aug 2024 20:00:48 +0200 Subject: [PATCH 353/426] Fix some warnings in SemaHLSL.cpp. --- clang/lib/Sema/SemaHLSL.cpp | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 320e38b740a742..17cb47f80590d9 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -673,7 +673,7 @@ static RegisterType getRegisterType(StringRef Slot) { case 'U': return RegisterType::UAV; case 'b': - case 'B ': + case 'B': return RegisterType::CBuffer; case 's': case 'S': @@ -721,16 +721,6 @@ static void ValidateMultipleRegisterAnnotations(Sema &S, Decl *TheDecl, } } -static std::string getHLSLResourceTypeStr(Sema &S, Decl *TheDecl) { - if (VarDecl *TheVarDecl = dyn_cast(TheDecl)) { - QualType TheQualTy = TheVarDecl->getType(); - PrintingPolicy PP = S.getPrintingPolicy(); - return QualType::getAsString(TheQualTy.split(), PP); - } - if (HLSLBufferDecl *CBufferOrTBuffer = dyn_cast(TheDecl)) - return CBufferOrTBuffer->isCBuffer() ? 
"cbuffer" : "tbuffer"; -} - static void DiagnoseHLSLRegisterAttribute(Sema &S, SourceLocation &ArgLoc, Decl *TheDecl, RegisterType regType) { @@ -785,7 +775,7 @@ static void DiagnoseHLSLRegisterAttribute(Sema &S, SourceLocation &ArgLoc, RegisterType::CBuffer, RegisterType::Sampler, }; - assert((int)DeclResourceClass < + assert((size_t)DeclResourceClass < std::size(ExpectedRegisterTypesForResourceClass) && "DeclResourceClass has unexpected value"); @@ -820,7 +810,7 @@ static void DiagnoseHLSLRegisterAttribute(Sema &S, SourceLocation &ArgLoc, if (Flags.UDT) { const bool ExpectedRegisterTypesForUDT[] = { Flags.SRV, Flags.UAV, Flags.CBV, Flags.Sampler, Flags.ContainsNumeric}; - assert(regTypeNum < std::size(ExpectedRegisterTypesForUDT) && + assert((size_t)regTypeNum < std::size(ExpectedRegisterTypesForUDT) && "regType has unexpected value"); if (!ExpectedRegisterTypesForUDT[regTypeNum]) From b7c1be1a7f49539ea644ff3fd8b55f237e37b35e Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 23 Aug 2024 09:55:03 -0700 Subject: [PATCH 354/426] Revert "Revert "[lldb] Speculative fix for trap_frame_sym_ctx.test"" This reverts commit fd7904a07bc26950fa7735fb6871a064e3ebc836. 
--- lldb/test/Shell/Unwind/trap_frame_sym_ctx.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/Shell/Unwind/trap_frame_sym_ctx.test b/lldb/test/Shell/Unwind/trap_frame_sym_ctx.test index 1bf1fb1d6e85f9..08a26616240e68 100644 --- a/lldb/test/Shell/Unwind/trap_frame_sym_ctx.test +++ b/lldb/test/Shell/Unwind/trap_frame_sym_ctx.test @@ -15,7 +15,7 @@ breakpoint set -n bar process launch # CHECK: stop reason = breakpoint 1.1 -thread backtrace +thread backtrace -u # CHECK: frame #0: {{.*}}`bar # CHECK: frame #1: {{.*}}`tramp # CHECK: frame #2: {{.*}}`main From 3c0fba4f2471cacb27d787c7d8f54f21d9dcafae Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 23 Aug 2024 09:55:47 -0700 Subject: [PATCH 355/426] Revert "Revert "[lldb] Extend frame recognizers to hide frames from backtraces (#104523)"" This reverts commit 547917aebd1e79a8929b53f0ddf3b5185ee4df74. --- lldb/bindings/python/python-wrapper.swig | 18 +++- lldb/include/lldb/API/SBFrame.h | 4 + .../lldb/Interpreter/ScriptInterpreter.h | 5 ++ lldb/include/lldb/Target/StackFrame.h | 36 ++++---- lldb/include/lldb/Target/StackFrameList.h | 2 +- .../lldb/Target/StackFrameRecognizer.h | 21 +++-- lldb/include/lldb/Target/Thread.h | 4 +- lldb/source/API/SBFrame.cpp | 15 +++- lldb/source/API/SBThread.cpp | 3 +- lldb/source/Commands/CommandCompletions.cpp | 4 +- lldb/source/Commands/CommandObjectFrame.cpp | 24 ++++++ lldb/source/Commands/CommandObjectMemory.cpp | 3 +- lldb/source/Commands/CommandObjectThread.cpp | 19 ++++- lldb/source/Commands/Options.td | 2 + lldb/source/Core/Debugger.cpp | 3 +- .../source/Interpreter/CommandInterpreter.cpp | 9 +- .../CPlusPlus/CPPLanguageRuntime.cpp | 44 +++++++++- .../Python/SWIGPythonBridge.h | 3 + .../Python/ScriptInterpreterPython.cpp | 29 +++++++ .../Python/ScriptInterpreterPythonImpl.h | 3 + lldb/source/Target/Process.cpp | 7 +- lldb/source/Target/StackFrame.cpp | 26 ++++-- lldb/source/Target/StackFrameList.cpp | 8 +- 
lldb/source/Target/StackFrameRecognizer.cpp | 29 +++++-- lldb/source/Target/Thread.cpp | 12 +-- lldb/source/Target/ThreadPlanStepOut.cpp | 2 +- .../frame/recognizer/TestFrameRecognizer.py | 40 +++++++++ .../test/API/commands/frame/recognizer/main.m | 21 ++--- .../commands/frame/recognizer/recognizer.py | 5 ++ .../lang/cpp/std-function-recognizer/Makefile | 4 + .../TestStdFunctionRecognizer.py | 84 +++++++++++++++++++ .../lang/cpp/std-function-recognizer/main.cpp | 10 +++ 32 files changed, 424 insertions(+), 75 deletions(-) create mode 100644 lldb/test/API/lang/cpp/std-function-recognizer/Makefile create mode 100644 lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py create mode 100644 lldb/test/API/lang/cpp/std-function-recognizer/main.cpp diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index 8f050643fa68b3..2ce42e3e017d5b 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -813,7 +813,7 @@ PythonObject lldb_private::python::SWIGBridge::LLDBSWIGPython_CreateFrameRecogni } PyObject *lldb_private::python::SWIGBridge::LLDBSwigPython_GetRecognizedArguments( - PyObject * implementor, const lldb::StackFrameSP &frame_sp) { + PyObject *implementor, const lldb::StackFrameSP &frame_sp) { static char callee_name[] = "get_recognized_arguments"; PythonObject arg = SWIGBridge::ToSWIGWrapper(frame_sp); @@ -824,6 +824,22 @@ PyObject *lldb_private::python::SWIGBridge::LLDBSwigPython_GetRecognizedArgument return result; } +bool lldb_private::python::SWIGBridge::LLDBSwigPython_ShouldHide( + PyObject *implementor, const lldb::StackFrameSP &frame_sp) { + static char callee_name[] = "should_hide"; + + PythonObject arg = SWIGBridge::ToSWIGWrapper(frame_sp); + + PythonString str(callee_name); + + PyObject *result = + PyObject_CallMethodObjArgs(implementor, str.get(), arg.get(), NULL); + bool ret_val = result ? 
PyObject_IsTrue(result) : false; + Py_XDECREF(result); + + return result; +} + void *lldb_private::python::SWIGBridge::LLDBSWIGPython_GetDynamicSetting( void *module, const char *setting, const lldb::TargetSP &target_sp) { if (!module || !setting) diff --git a/lldb/include/lldb/API/SBFrame.h b/lldb/include/lldb/API/SBFrame.h index 821ff3cf7ce519..e0d15c3ecc5b1c 100644 --- a/lldb/include/lldb/API/SBFrame.h +++ b/lldb/include/lldb/API/SBFrame.h @@ -104,6 +104,10 @@ class LLDB_API SBFrame { bool IsArtificial() const; + /// Return whether a frame recognizer decided this frame should not + /// be displayes in backtraces etc. + bool IsHidden() const; + /// The version that doesn't supply a 'use_dynamic' value will use the /// target's default. lldb::SBValue EvaluateExpression(const char *expr); diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h index 05f0d7f0955f3e..89a480a28880aa 100644 --- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h +++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h @@ -252,6 +252,11 @@ class ScriptInterpreter : public PluginInterface { return lldb::ValueObjectListSP(); } + virtual bool ShouldHide(const StructuredData::ObjectSP &implementor, + lldb::StackFrameSP frame_sp) { + return false; + } + virtual StructuredData::GenericSP CreateScriptedBreakpointResolver(const char *class_name, const StructuredDataImpl &args_data, diff --git a/lldb/include/lldb/Target/StackFrame.h b/lldb/include/lldb/Target/StackFrame.h index 52f0a1ee662176..e4d17847763acf 100644 --- a/lldb/include/lldb/Target/StackFrame.h +++ b/lldb/include/lldb/Target/StackFrame.h @@ -407,6 +407,11 @@ class StackFrame : public ExecutionContextScope, /// may have limited support for inspecting variables. bool IsArtificial() const; + /// Query whether this frame should be hidden from backtraces. Frame + /// recognizers can customize this behavior and hide distracting + /// system implementation details this way. 
+ bool IsHidden(); + /// Query this frame to find what frame it is in this Thread's /// StackFrameList. /// @@ -518,33 +523,36 @@ class StackFrame : public ExecutionContextScope, bool HasCachedData() const; private: - // For StackFrame only + /// For StackFrame only. + /// \{ lldb::ThreadWP m_thread_wp; uint32_t m_frame_index; uint32_t m_concrete_frame_index; lldb::RegisterContextSP m_reg_context_sp; StackID m_id; - Address m_frame_code_addr; // The frame code address (might not be the same as - // the actual PC for inlined frames) as a - // section/offset address + /// \} + + /// The frame code address (might not be the same as the actual PC + /// for inlined frames) as a section/offset address. + Address m_frame_code_addr; SymbolContext m_sc; Flags m_flags; Scalar m_frame_base; Status m_frame_base_error; - bool m_cfa_is_valid; // Does this frame have a CFA? Different from CFA == - // LLDB_INVALID_ADDRESS + uint16_t m_frame_recognizer_generation; + /// Does this frame have a CFA? Different from CFA == LLDB_INVALID_ADDRESS. + bool m_cfa_is_valid; Kind m_stack_frame_kind; - // Whether this frame behaves like the zeroth frame, in the sense - // that its pc value might not immediately follow a call (and thus might - // be the first address of its function). True for actual frame zero as - // well as any other frame with the same trait. + /// Whether this frame behaves like the zeroth frame, in the sense + /// that its pc value might not immediately follow a call (and thus might + /// be the first address of its function). True for actual frame zero as + /// well as any other frame with the same trait. bool m_behaves_like_zeroth_frame; lldb::VariableListSP m_variable_list_sp; - ValueObjectList m_variable_list_value_objects; // Value objects for each - // variable in - // m_variable_list_sp - lldb::RecognizedStackFrameSP m_recognized_frame_sp; + /// Value objects for each variable in m_variable_list_sp. 
+ ValueObjectList m_variable_list_value_objects; + std::optional m_recognized_frame_sp; StreamString m_disassembly; std::recursive_mutex m_mutex; diff --git a/lldb/include/lldb/Target/StackFrameList.h b/lldb/include/lldb/Target/StackFrameList.h index 88e211ff692bd9..7d0e7a5b9a71b2 100644 --- a/lldb/include/lldb/Target/StackFrameList.h +++ b/lldb/include/lldb/Target/StackFrameList.h @@ -91,7 +91,7 @@ class StackFrameList { size_t GetStatus(Stream &strm, uint32_t first_frame, uint32_t num_frames, bool show_frame_info, uint32_t num_frames_with_source, - bool show_unique = false, + bool show_unique = false, bool show_hidden = false, const char *frame_marker = nullptr); protected: diff --git a/lldb/include/lldb/Target/StackFrameRecognizer.h b/lldb/include/lldb/Target/StackFrameRecognizer.h index e9ac2750192ef6..8acebc12c4b1dc 100644 --- a/lldb/include/lldb/Target/StackFrameRecognizer.h +++ b/lldb/include/lldb/Target/StackFrameRecognizer.h @@ -17,6 +17,7 @@ #include "lldb/lldb-private-forward.h" #include "lldb/lldb-public.h" +#include #include #include #include @@ -28,20 +29,23 @@ namespace lldb_private { /// This class provides extra information about a stack frame that was /// provided by a specific stack frame recognizer. Right now, this class only /// holds recognized arguments (via GetRecognizedArguments). - class RecognizedStackFrame : public std::enable_shared_from_this { public: + virtual ~RecognizedStackFrame() = default; + virtual lldb::ValueObjectListSP GetRecognizedArguments() { return m_arguments; } virtual lldb::ValueObjectSP GetExceptionObject() { return lldb::ValueObjectSP(); } - virtual lldb::StackFrameSP GetMostRelevantFrame() { return nullptr; }; - virtual ~RecognizedStackFrame() = default; + virtual lldb::StackFrameSP GetMostRelevantFrame() { return nullptr; } std::string GetStopDescription() { return m_stop_desc; } + /// Controls whether this frame should be filtered out when + /// displaying backtraces, for example. 
+ virtual bool ShouldHide() { return false; } protected: lldb::ValueObjectListSP m_arguments; @@ -53,7 +57,6 @@ class RecognizedStackFrame /// A base class for frame recognizers. Subclasses (actual frame recognizers) /// should implement RecognizeFrame to provide a RecognizedStackFrame for a /// given stack frame. - class StackFrameRecognizer : public std::enable_shared_from_this { public: @@ -73,10 +76,10 @@ class StackFrameRecognizer /// Python implementation for frame recognizers. An instance of this class /// tracks a particular Python classobject, which will be asked to recognize /// stack frames. - class ScriptedStackFrameRecognizer : public StackFrameRecognizer { lldb_private::ScriptInterpreter *m_interpreter; lldb_private::StructuredData::ObjectSP m_python_object_sp; + std::string m_python_class; public: @@ -123,8 +126,14 @@ class StackFrameRecognizerManager { lldb::StackFrameRecognizerSP GetRecognizerForFrame(lldb::StackFrameSP frame); lldb::RecognizedStackFrameSP RecognizeFrame(lldb::StackFrameSP frame); + /// Returns a number that changes whenever the list of recognizers + /// has been modified. + uint16_t GetGeneration() const { return m_generation; } private: + /// Increase the generation counter. + void BumpGeneration(); + struct RegisteredEntry { uint32_t recognizer_id; lldb::StackFrameRecognizerSP recognizer; @@ -137,6 +146,7 @@ class StackFrameRecognizerManager { }; std::deque m_recognizers; + uint16_t m_generation; }; /// \class ValueObjectRecognizerSynthesizedValue @@ -144,7 +154,6 @@ class StackFrameRecognizerManager { /// ValueObject subclass that presents the passed ValueObject as a recognized /// value with the specified ValueType. Frame recognizers should return /// instances of this class as the returned objects in GetRecognizedArguments(). 
- class ValueObjectRecognizerSynthesizedValue : public ValueObject { public: static lldb::ValueObjectSP Create(ValueObject &parent, lldb::ValueType type) { diff --git a/lldb/include/lldb/Target/Thread.h b/lldb/include/lldb/Target/Thread.h index aacc59c292ec79..38b65b2bc58490 100644 --- a/lldb/include/lldb/Target/Thread.h +++ b/lldb/include/lldb/Target/Thread.h @@ -1128,11 +1128,11 @@ class Thread : public std::enable_shared_from_this, size_t GetStatus(Stream &strm, uint32_t start_frame, uint32_t num_frames, uint32_t num_frames_with_source, bool stop_format, - bool only_stacks = false); + bool show_hidden, bool only_stacks = false); size_t GetStackFrameStatus(Stream &strm, uint32_t first_frame, uint32_t num_frames, bool show_frame_info, - uint32_t num_frames_with_source); + uint32_t num_frames_with_source, bool show_hidden); // We need a way to verify that even though we have a thread in a shared // pointer that the object itself is still valid. Currently this won't be the diff --git a/lldb/source/API/SBFrame.cpp b/lldb/source/API/SBFrame.cpp index 47fc88625e30c5..2689ecb2ab7bc7 100644 --- a/lldb/source/API/SBFrame.cpp +++ b/lldb/source/API/SBFrame.cpp @@ -1195,13 +1195,24 @@ bool SBFrame::IsArtificial() const { std::unique_lock lock; ExecutionContext exe_ctx(m_opaque_sp.get(), lock); - StackFrame *frame = exe_ctx.GetFramePtr(); - if (frame) + if (StackFrame *frame = exe_ctx.GetFramePtr()) return frame->IsArtificial(); return false; } +bool SBFrame::IsHidden() const { + LLDB_INSTRUMENT_VA(this); + + std::unique_lock lock; + ExecutionContext exe_ctx(m_opaque_sp.get(), lock); + + if (StackFrame *frame = exe_ctx.GetFramePtr()) + return frame->IsHidden(); + + return false; +} + const char *SBFrame::GetFunctionName() { LLDB_INSTRUMENT_VA(this); diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index 786f62bd66d520..140a2920f05673 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -1208,7 +1208,8 @@ bool 
SBThread::GetStatus(SBStream &status) const { ExecutionContext exe_ctx(m_opaque_sp.get(), lock); if (exe_ctx.HasThreadScope()) { - exe_ctx.GetThreadPtr()->GetStatus(strm, 0, 1, 1, true); + exe_ctx.GetThreadPtr()->GetStatus(strm, 0, 1, 1, true, + /*show_hidden=*/true); } else strm.PutCString("No status"); diff --git a/lldb/source/Commands/CommandCompletions.cpp b/lldb/source/Commands/CommandCompletions.cpp index 54f4b368166492..216aaf9abce6cf 100644 --- a/lldb/source/Commands/CommandCompletions.cpp +++ b/lldb/source/Commands/CommandCompletions.cpp @@ -791,7 +791,7 @@ void CommandCompletions::ThreadIndexes(CommandInterpreter &interpreter, lldb::ThreadSP thread_sp; for (uint32_t idx = 0; (thread_sp = threads.GetThreadAtIndex(idx)); ++idx) { StreamString strm; - thread_sp->GetStatus(strm, 0, 1, 1, true); + thread_sp->GetStatus(strm, 0, 1, 1, true, /*show_hidden*/ true); request.TryCompleteCurrentArg(std::to_string(thread_sp->GetIndexID()), strm.GetString()); } @@ -835,7 +835,7 @@ void CommandCompletions::ThreadIDs(CommandInterpreter &interpreter, lldb::ThreadSP thread_sp; for (uint32_t idx = 0; (thread_sp = threads.GetThreadAtIndex(idx)); ++idx) { StreamString strm; - thread_sp->GetStatus(strm, 0, 1, 1, true); + thread_sp->GetStatus(strm, 0, 1, 1, true, /*show_hidden*/ true); request.TryCompleteCurrentArg(std::to_string(thread_sp->GetID()), strm.GetString()); } diff --git a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp index 29e460fe3885ff..46c75e3dd159c0 100644 --- a/lldb/source/Commands/CommandObjectFrame.cpp +++ b/lldb/source/Commands/CommandObjectFrame.cpp @@ -278,6 +278,30 @@ class CommandObjectFrameSelect : public CommandObjectParsed { if (frame_idx == UINT32_MAX) frame_idx = 0; + // If moving up/down by one, skip over hidden frames. 
+ if (*m_options.relative_frame_offset == 1 || + *m_options.relative_frame_offset == -1) { + uint32_t candidate_idx = frame_idx; + const unsigned max_depth = 12; + for (unsigned num_try = 0; num_try < max_depth; ++num_try) { + if (candidate_idx == 0 && *m_options.relative_frame_offset == -1) { + candidate_idx = UINT32_MAX; + break; + } + candidate_idx += *m_options.relative_frame_offset; + if (auto candidate_sp = thread->GetStackFrameAtIndex(candidate_idx)) { + if (candidate_sp->IsHidden()) + continue; + // Now candidate_idx is the first non-hidden frame. + break; + } + candidate_idx = UINT32_MAX; + break; + }; + if (candidate_idx != UINT32_MAX) + m_options.relative_frame_offset = candidate_idx - frame_idx; + } + if (*m_options.relative_frame_offset < 0) { if (static_cast(frame_idx) >= -*m_options.relative_frame_offset) diff --git a/lldb/source/Commands/CommandObjectMemory.cpp b/lldb/source/Commands/CommandObjectMemory.cpp index 137b1ad981073c..baf5d9196e553e 100644 --- a/lldb/source/Commands/CommandObjectMemory.cpp +++ b/lldb/source/Commands/CommandObjectMemory.cpp @@ -1570,7 +1570,8 @@ class CommandObjectMemoryHistory : public CommandObjectParsed { const bool stop_format = false; for (auto thread : thread_list) { - thread->GetStatus(*output_stream, 0, UINT32_MAX, 0, stop_format); + thread->GetStatus(*output_stream, 0, UINT32_MAX, 0, stop_format, + /*should_filter*/ false); } result.SetStatus(eReturnStatusSuccessFinishResult); diff --git a/lldb/source/Commands/CommandObjectThread.cpp b/lldb/source/Commands/CommandObjectThread.cpp index 605f872a9f45e1..6a89c163f37d51 100644 --- a/lldb/source/Commands/CommandObjectThread.cpp +++ b/lldb/source/Commands/CommandObjectThread.cpp @@ -89,6 +89,9 @@ class CommandObjectThreadBacktrace : public CommandObjectIterateOverThreads { "invalid boolean value for option '%c': %s", short_option, option_arg.data()); } break; + case 'u': + m_filtered_backtrace = false; + break; default: llvm_unreachable("Unimplemented option"); } @@ 
-99,6 +102,7 @@ class CommandObjectThreadBacktrace : public CommandObjectIterateOverThreads { m_count = UINT32_MAX; m_start = 0; m_extended_backtrace = false; + m_filtered_backtrace = true; } llvm::ArrayRef GetDefinitions() override { @@ -109,6 +113,7 @@ class CommandObjectThreadBacktrace : public CommandObjectIterateOverThreads { uint32_t m_count; uint32_t m_start; bool m_extended_backtrace; + bool m_filtered_backtrace; }; CommandObjectThreadBacktrace(CommandInterpreter &interpreter) @@ -121,7 +126,10 @@ class CommandObjectThreadBacktrace : public CommandObjectIterateOverThreads { "call stacks.\n" "Use 'settings set frame-format' to customize the printing of " "frames in the backtrace and 'settings set thread-format' to " - "customize the thread header.", + "customize the thread header.\n" + "Customizable frame recognizers may filter out less interesting " + "frames, which results in gaps in the numbering. " + "Use '-u' to see all frames.", nullptr, eCommandRequiresProcess | eCommandRequiresThread | eCommandTryTargetAPILock | eCommandProcessMustBeLaunched | @@ -199,7 +207,8 @@ class CommandObjectThreadBacktrace : public CommandObjectIterateOverThreads { strm.PutChar('\n'); if (ext_thread_sp->GetStatus(strm, m_options.m_start, m_options.m_count, - num_frames_with_source, stop_format)) { + num_frames_with_source, stop_format, + !m_options.m_filtered_backtrace)) { DoExtendedBacktrace(ext_thread_sp.get(), result); } } @@ -228,7 +237,8 @@ class CommandObjectThreadBacktrace : public CommandObjectIterateOverThreads { const uint32_t num_frames_with_source = 0; const bool stop_format = true; if (!thread->GetStatus(strm, m_options.m_start, m_options.m_count, - num_frames_with_source, stop_format, only_stacks)) { + num_frames_with_source, stop_format, + !m_options.m_filtered_backtrace, only_stacks)) { result.AppendErrorWithFormat( "error displaying backtrace for thread: \"0x%4.4x\"\n", thread->GetIndexID()); @@ -1392,7 +1402,8 @@ class CommandObjectThreadException : public 
CommandObjectIterateOverThreads { const uint32_t num_frames_with_source = 0; const bool stop_format = false; exception_thread_sp->GetStatus(strm, 0, UINT32_MAX, - num_frames_with_source, stop_format); + num_frames_with_source, stop_format, + /*filtered*/ false); } return true; diff --git a/lldb/source/Commands/Options.td b/lldb/source/Commands/Options.td index f050cd2ebb5ae0..9c4dbed6939ba9 100644 --- a/lldb/source/Commands/Options.td +++ b/lldb/source/Commands/Options.td @@ -1048,6 +1048,8 @@ let Command = "thread backtrace" in { Arg<"FrameIndex">, Desc<"Frame in which to start the backtrace">; def thread_backtrace_extended : Option<"extended", "e">, Group<1>, Arg<"Boolean">, Desc<"Show the extended backtrace, if available">; + def thread_backtrace_unfiltered : Option<"unfiltered", "u">, Group<1>, + Desc<"Filter out frames according to installed frame recognizers">; } let Command = "thread step scope" in { diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 309e01e456580c..67f01707a2afee 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -1869,7 +1869,8 @@ void Debugger::HandleThreadEvent(const EventSP &event_sp) { ThreadSP thread_sp( Thread::ThreadEventData::GetThreadFromEvent(event_sp.get())); if (thread_sp) { - thread_sp->GetStatus(*GetAsyncOutputStream(), 0, 1, 1, stop_format); + thread_sp->GetStatus(*GetAsyncOutputStream(), 0, 1, 1, stop_format, + /*show_hidden*/ true); } } } diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index e45112530404b8..87298803e8415a 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -835,11 +835,12 @@ void CommandInterpreter::LoadCommandDictionary() { std::unique_ptr bt_regex_cmd_up( new CommandObjectRegexCommand( *this, "_regexp-bt", - "Show backtrace of the current thread's call stack. Any numeric " - "argument displays at most that many frames. 
The argument 'all' " - "displays all threads. Use 'settings set frame-format' to customize " + "Show backtrace of the current thread's call stack. Any numeric " + "argument displays at most that many frames. The argument 'all' " + "displays all threads. Use 'settings set frame-format' to customize " "the printing of individual frames and 'settings set thread-format' " - "to customize the thread header.", + "to customize the thread header. Frame recognizers may filter the" + "list. Use 'thread backtrace -u (--unfiltered)' to see them all.", "bt [ | all]", 0, false)); if (bt_regex_cmd_up) { // accept but don't document "bt -c " -- before bt was a regex diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp index c7202a47d0157e..c60200ab186d09 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp @@ -26,6 +26,7 @@ #include "lldb/Target/RegisterContext.h" #include "lldb/Target/SectionLoadList.h" #include "lldb/Target/StackFrame.h" +#include "lldb/Target/StackFrameRecognizer.h" #include "lldb/Target/ThreadPlanRunToAddress.h" #include "lldb/Target/ThreadPlanStepInRange.h" #include "lldb/Utility/Timer.h" @@ -40,8 +41,49 @@ static ConstString g_coro_frame = ConstString("__coro_frame"); char CPPLanguageRuntime::ID = 0; +/// A frame recognizer that is installed to hide libc++ implementation +/// details from the backtrace. +class LibCXXFrameRecognizer : public StackFrameRecognizer { + RegularExpression m_hidden_function_regex; + RecognizedStackFrameSP m_hidden_frame; + + struct LibCXXHiddenFrame : public RecognizedStackFrame { + bool ShouldHide() override { return true; } + }; + +public: + LibCXXFrameRecognizer() + : m_hidden_function_regex( + R"(^std::__1::(__function.*::operator\(\)|__invoke))" + R"((\[.*\])?)" // ABI tag. + R"(( const)?$)"), // const. 
+ m_hidden_frame(new LibCXXHiddenFrame()) {} + + std::string GetName() override { return "libc++ frame recognizer"; } + + lldb::RecognizedStackFrameSP + RecognizeFrame(lldb::StackFrameSP frame_sp) override { + if (!frame_sp) + return {}; + const auto &sc = frame_sp->GetSymbolContext(lldb::eSymbolContextFunction); + if (!sc.function) + return {}; + + if (m_hidden_function_regex.Execute(sc.function->GetNameNoArguments())) + return m_hidden_frame; + + return {}; + } +}; + CPPLanguageRuntime::CPPLanguageRuntime(Process *process) - : LanguageRuntime(process) {} + : LanguageRuntime(process) { + if (process) + process->GetTarget().GetFrameRecognizerManager().AddRecognizer( + StackFrameRecognizerSP(new LibCXXFrameRecognizer()), {}, + std::make_shared("^std::__1::"), + /*first_instruction_only*/ false); +} bool CPPLanguageRuntime::IsAllowedRuntimeValue(ConstString name) { return name == g_this || name == g_promise || name == g_coro_frame; diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h index 3026b6113ae8f3..5351c1a698b4a7 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h @@ -227,6 +227,9 @@ class SWIGBridge { LLDBSwigPython_GetRecognizedArguments(PyObject *implementor, const lldb::StackFrameSP &frame_sp); + static bool LLDBSwigPython_ShouldHide(PyObject *implementor, + const lldb::StackFrameSP &frame_sp); + static bool LLDBSWIGPythonRunScriptKeywordProcess( const char *python_function_name, const char *session_dictionary_name, const lldb::ProcessSP &process, std::string &output); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index 335c482f8495ad..2a94f110910400 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ 
b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -1524,6 +1524,35 @@ lldb::ValueObjectListSP ScriptInterpreterPythonImpl::GetRecognizedArguments( return ValueObjectListSP(); } +bool ScriptInterpreterPythonImpl::ShouldHide( + const StructuredData::ObjectSP &os_plugin_object_sp, + lldb::StackFrameSP frame_sp) { + Locker py_lock(this, Locker::AcquireLock | Locker::NoSTDIN, Locker::FreeLock); + + if (!os_plugin_object_sp) + return false; + + StructuredData::Generic *generic = os_plugin_object_sp->GetAsGeneric(); + if (!generic) + return false; + + PythonObject implementor(PyRefType::Borrowed, + (PyObject *)generic->GetValue()); + + if (!implementor.IsAllocated()) + return false; + + bool result = + SWIGBridge::LLDBSwigPython_ShouldHide(implementor.get(), frame_sp); + + // if it fails, print the error but otherwise go on + if (PyErr_Occurred()) { + PyErr_Print(); + PyErr_Clear(); + } + return result; +} + ScriptedProcessInterfaceUP ScriptInterpreterPythonImpl::CreateScriptedProcessInterface() { return std::make_unique(*this); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h index c2024efb395d70..85d79955e45efc 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h @@ -107,6 +107,9 @@ class ScriptInterpreterPythonImpl : public ScriptInterpreterPython { GetRecognizedArguments(const StructuredData::ObjectSP &implementor, lldb::StackFrameSP frame_sp) override; + bool ShouldHide(const StructuredData::ObjectSP &implementor, + lldb::StackFrameSP frame_sp) override; + lldb::ScriptedProcessInterfaceUP CreateScriptedProcessInterface() override; lldb::ScriptedThreadInterfaceSP CreateScriptedThreadInterface() override; diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 
3c9247fdbbbc96..b2a0f13b9a1549 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -5545,7 +5545,8 @@ Process::RunThreadPlan(ExecutionContext &exe_ctx, // Print a backtrace into the log so we can figure out where we are: StreamString s; s.PutCString("Thread state after unsuccessful completion: \n"); - thread->GetStackFrameStatus(s, 0, UINT32_MAX, true, UINT32_MAX); + thread->GetStackFrameStatus(s, 0, UINT32_MAX, true, UINT32_MAX, + /*show_hidden*/ true); log->PutString(s.GetString()); } // Restore the thread state if we are going to discard the plan execution. @@ -5819,8 +5820,8 @@ size_t Process::GetThreadStatus(Stream &strm, continue; } thread_sp->GetStatus(strm, start_frame, num_frames, - num_frames_with_source, - stop_format); + num_frames_with_source, stop_format, + /*show_hidden*/ num_frames <= 1); ++num_thread_infos_dumped; } else { Log *log = GetLog(LLDBLog::Process); diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index 3a2b4d05b28810..0ebaf555f86beb 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1198,6 +1198,12 @@ bool StackFrame::IsArtificial() const { return m_stack_frame_kind == StackFrame::Kind::Artificial; } +bool StackFrame::IsHidden() { + if (auto recognized_frame_sp = GetRecognizedFrame()) + return recognized_frame_sp->ShouldHide(); + return false; +} + SourceLanguage StackFrame::GetLanguage() { CompileUnit *cu = GetSymbolContext(eSymbolContextCompUnit).comp_unit; if (cu) @@ -1971,12 +1977,16 @@ bool StackFrame::GetStatus(Stream &strm, bool show_frame_info, bool show_source, } RecognizedStackFrameSP StackFrame::GetRecognizedFrame() { - if (!m_recognized_frame_sp) { - m_recognized_frame_sp = GetThread() - ->GetProcess() - ->GetTarget() - .GetFrameRecognizerManager() - .RecognizeFrame(CalculateStackFrame()); - } - return m_recognized_frame_sp; + auto process = GetThread()->GetProcess(); + if (!process) + return {}; + // If recognizer list 
has been modified, discard cache. + auto &manager = process->GetTarget().GetFrameRecognizerManager(); + auto new_generation = manager.GetGeneration(); + if (m_frame_recognizer_generation != new_generation) + m_recognized_frame_sp.reset(); + m_frame_recognizer_generation = new_generation; + if (!m_recognized_frame_sp.has_value()) + m_recognized_frame_sp = manager.RecognizeFrame(CalculateStackFrame()); + return m_recognized_frame_sp.value(); } diff --git a/lldb/source/Target/StackFrameList.cpp b/lldb/source/Target/StackFrameList.cpp index 0cf9ce1bf043f5..7808bd3674ab19 100644 --- a/lldb/source/Target/StackFrameList.cpp +++ b/lldb/source/Target/StackFrameList.cpp @@ -924,7 +924,7 @@ StackFrameList::GetStackFrameSPForStackFramePtr(StackFrame *stack_frame_ptr) { size_t StackFrameList::GetStatus(Stream &strm, uint32_t first_frame, uint32_t num_frames, bool show_frame_info, uint32_t num_frames_with_source, - bool show_unique, + bool show_unique, bool show_hidden, const char *selected_frame_marker) { size_t num_frames_displayed = 0; @@ -951,7 +951,6 @@ size_t StackFrameList::GetStatus(Stream &strm, uint32_t first_frame, unselected_marker = buffer.c_str(); } const char *marker = nullptr; - for (frame_idx = first_frame; frame_idx < last_frame; ++frame_idx) { frame_sp = GetFrameAtIndex(frame_idx); if (!frame_sp) @@ -963,6 +962,11 @@ size_t StackFrameList::GetStatus(Stream &strm, uint32_t first_frame, else marker = unselected_marker; } + + // Hide uninteresting frames unless it's the selected frame. + if (!show_hidden && frame_sp != selected_frame_sp && frame_sp->IsHidden()) + continue; + // Check for interruption here. 
If we're fetching arguments, this loop // can go slowly: Debugger &dbg = m_thread.GetProcess()->GetTarget().GetDebugger(); diff --git a/lldb/source/Target/StackFrameRecognizer.cpp b/lldb/source/Target/StackFrameRecognizer.cpp index 0ccb1ae9c031e3..44411afc65dda9 100644 --- a/lldb/source/Target/StackFrameRecognizer.cpp +++ b/lldb/source/Target/StackFrameRecognizer.cpp @@ -17,10 +17,14 @@ using namespace lldb; using namespace lldb_private; class ScriptedRecognizedStackFrame : public RecognizedStackFrame { + bool m_hidden; + public: - ScriptedRecognizedStackFrame(ValueObjectListSP args) { - m_arguments = args; + ScriptedRecognizedStackFrame(ValueObjectListSP args, bool hidden) + : m_hidden(hidden) { + m_arguments = std::move(args); } + bool ShouldHide() override { return m_hidden; } }; ScriptedStackFrameRecognizer::ScriptedStackFrameRecognizer( @@ -38,13 +42,22 @@ ScriptedStackFrameRecognizer::RecognizeFrame(lldb::StackFrameSP frame) { ValueObjectListSP args = m_interpreter->GetRecognizedArguments(m_python_object_sp, frame); auto args_synthesized = ValueObjectListSP(new ValueObjectList()); - for (const auto &o : args->GetObjects()) { - args_synthesized->Append(ValueObjectRecognizerSynthesizedValue::Create( - *o, eValueTypeVariableArgument)); + if (args) { + for (const auto &o : args->GetObjects()) + args_synthesized->Append(ValueObjectRecognizerSynthesizedValue::Create( + *o, eValueTypeVariableArgument)); } + bool hidden = m_interpreter->ShouldHide(m_python_object_sp, frame); + return RecognizedStackFrameSP( - new ScriptedRecognizedStackFrame(args_synthesized)); + new ScriptedRecognizedStackFrame(args_synthesized, hidden)); +} + +void StackFrameRecognizerManager::BumpGeneration() { + uint32_t n = m_generation; + n = (n + 1) & ((1 << 16) - 1); + m_generation = n; } void StackFrameRecognizerManager::AddRecognizer( @@ -53,6 +66,7 @@ void StackFrameRecognizerManager::AddRecognizer( m_recognizers.push_front({(uint32_t)m_recognizers.size(), recognizer, false, module, 
RegularExpressionSP(), symbols, RegularExpressionSP(), first_instruction_only}); + BumpGeneration(); } void StackFrameRecognizerManager::AddRecognizer( @@ -61,6 +75,7 @@ void StackFrameRecognizerManager::AddRecognizer( m_recognizers.push_front({(uint32_t)m_recognizers.size(), recognizer, true, ConstString(), module, std::vector(), symbol, first_instruction_only}); + BumpGeneration(); } void StackFrameRecognizerManager::ForEach( @@ -97,10 +112,12 @@ bool StackFrameRecognizerManager::RemoveRecognizerWithID( if (found == m_recognizers.end()) return false; m_recognizers.erase(found); + BumpGeneration(); return true; } void StackFrameRecognizerManager::RemoveAllRecognizers() { + BumpGeneration(); m_recognizers.clear(); } diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index 74d1a268c6dffb..fcf0f4e2519085 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -1748,7 +1748,7 @@ std::string Thread::RunModeAsString(lldb::RunMode mode) { size_t Thread::GetStatus(Stream &strm, uint32_t start_frame, uint32_t num_frames, uint32_t num_frames_with_source, - bool stop_format, bool only_stacks) { + bool stop_format, bool show_hidden, bool only_stacks) { if (!only_stacks) { ExecutionContext exe_ctx(shared_from_this()); @@ -1795,7 +1795,7 @@ size_t Thread::GetStatus(Stream &strm, uint32_t start_frame, num_frames_shown = GetStackFrameList()->GetStatus( strm, start_frame, num_frames, show_frame_info, num_frames_with_source, - show_frame_unique, selected_frame_marker); + show_frame_unique, show_hidden, selected_frame_marker); if (num_frames == 1) strm.IndentLess(); strm.IndentLess(); @@ -1893,9 +1893,11 @@ bool Thread::GetDescription(Stream &strm, lldb::DescriptionLevel level, size_t Thread::GetStackFrameStatus(Stream &strm, uint32_t first_frame, uint32_t num_frames, bool show_frame_info, - uint32_t num_frames_with_source) { - return GetStackFrameList()->GetStatus( - strm, first_frame, num_frames, show_frame_info, 
num_frames_with_source); + uint32_t num_frames_with_source, + bool show_hidden) { + return GetStackFrameList()->GetStatus(strm, first_frame, num_frames, + show_frame_info, num_frames_with_source, + /*show_unique*/ false, show_hidden); } Unwind &Thread::GetUnwinder() { diff --git a/lldb/source/Target/ThreadPlanStepOut.cpp b/lldb/source/Target/ThreadPlanStepOut.cpp index 0a1e2ae605efcf..8ca1dbc2fe4c46 100644 --- a/lldb/source/Target/ThreadPlanStepOut.cpp +++ b/lldb/source/Target/ThreadPlanStepOut.cpp @@ -58,7 +58,7 @@ ThreadPlanStepOut::ThreadPlanStepOut( return; // we can't do anything here. ValidatePlan() will return false. // While stepping out, behave as-if artificial frames are not present. - while (return_frame_sp->IsArtificial()) { + while (return_frame_sp->IsArtificial() || return_frame_sp->IsHidden()) { m_stepped_past_frames.push_back(return_frame_sp); ++return_frame_index; diff --git a/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py b/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py index eea0aafce6e25e..6174ac61a709dd 100644 --- a/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py +++ b/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py @@ -162,6 +162,46 @@ def test_frame_recognizer_1(self): substrs=['*a = 78']) """ + @skipUnlessDarwin + def test_frame_recognizer_hiding(self): + self.build() + + target, process, thread, _ = lldbutil.run_to_name_breakpoint(self, "nested") + frame = thread.GetSelectedFrame() + + # Sanity check. + self.expect( + "thread backtrace", patterns=["frame.*nested", "frame.*baz", "frame.*main"] + ) + + self.expect("frame recognizer clear") + self.expect( + "command script import " + + os.path.join(self.getSourceDir(), "recognizer.py") + ) + + self.expect( + "frame recognizer add -l recognizer.BazFrameRecognizer -f false -s a.out -n baz" + ) + + self.expect( + "frame recognizer list", + substrs=["0: recognizer.BazFrameRecognizer"], + ) + + # Now main should be hidden. 
+ self.expect("thread backtrace", matching=False, patterns=["frame.*baz"]) + self.assertFalse(frame.IsHidden()) + frame = thread.SetSelectedFrame(1) + self.assertIn("baz", frame.name) + self.assertTrue(frame.IsHidden()) + + # Test StepOut. + frame = thread.SetSelectedFrame(0) + thread.StepOut() + frame = thread.GetSelectedFrame() + self.assertIn("main", frame.name) + @skipUnlessDarwin def test_frame_recognizer_multi_symbol(self): self.build() diff --git a/lldb/test/API/commands/frame/recognizer/main.m b/lldb/test/API/commands/frame/recognizer/main.m index 6546692bba772e..74d219f1fff4c5 100644 --- a/lldb/test/API/commands/frame/recognizer/main.m +++ b/lldb/test/API/commands/frame/recognizer/main.m @@ -1,16 +1,17 @@ #import -void foo(int a, int b) -{ - printf("%d %d\n", a, b); -} +void foo(int a, int b) { printf("%d %d\n", a, b); } void bar(int *ptr) { printf("%d\n", *ptr); } -int main (int argc, const char * argv[]) -{ - foo(42, 56); - int i = 78; - bar(&i); - return 0; +void nested(int *ptr) { bar(ptr); } + +void baz(int *ptr) { nested(ptr); } + +int main(int argc, const char *argv[]) { + foo(42, 56); + int i = 78; + bar(&i); + baz(&i); + return 0; } diff --git a/lldb/test/API/commands/frame/recognizer/recognizer.py b/lldb/test/API/commands/frame/recognizer/recognizer.py index 1a2a2d5c265070..98666b720b1e2b 100644 --- a/lldb/test/API/commands/frame/recognizer/recognizer.py +++ b/lldb/test/API/commands/frame/recognizer/recognizer.py @@ -36,3 +36,8 @@ def get_recognized_arguments(self, frame): class MyOtherFrameRecognizer(object): def get_recognized_arguments(self, frame): return [] + + +class BazFrameRecognizer(object): + def should_hide(self, frame): + return "baz" in frame.name diff --git a/lldb/test/API/lang/cpp/std-function-recognizer/Makefile b/lldb/test/API/lang/cpp/std-function-recognizer/Makefile new file mode 100644 index 00000000000000..ab034edd121f9f --- /dev/null +++ b/lldb/test/API/lang/cpp/std-function-recognizer/Makefile @@ -0,0 +1,4 @@ +CXX_SOURCES 
:= main.cpp +USE_LIBCPP := 1 + +include Makefile.rules diff --git a/lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py b/lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py new file mode 100644 index 00000000000000..30fe3ecb1e4bf4 --- /dev/null +++ b/lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py @@ -0,0 +1,84 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class LibCxxStdFunctionRecognizerTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + @add_test_categories(["libc++"]) + def test_backtrace(self): + """Test that std::function implementation details are hidden in bt""" + self.build() + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "// break here", lldb.SBFileSpec("main.cpp") + ) + # Filtered. + self.expect( + "thread backtrace", + ordered=True, + substrs=["frame", "foo", "frame", "main"], + ) + self.expect( + "thread backtrace", matching=False, patterns=["frame.*std::__1::__function"] + ) + # Unfiltered. 
+ self.expect( + "thread backtrace -u", + ordered=True, + patterns=["frame.*foo", "frame.*std::__1::__function", "frame.*main"], + ) + self.expect( + "thread backtrace --unfiltered", + ordered=True, + patterns=["frame.*foo", "frame.*std::__1::__function", "frame.*main"], + ) + + @add_test_categories(["libc++"]) + def test_up_down(self): + """Test that std::function implementation details are skipped""" + self.build() + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "// break here", lldb.SBFileSpec("main.cpp") + ) + frame = thread.GetSelectedFrame() + # up + self.assertIn("foo", frame.GetFunctionName()) + start_idx = frame.GetFrameID() + i = 0 + while i < thread.GetNumFrames(): + self.expect("up") + frame = thread.GetSelectedFrame() + if frame.GetFunctionName() == "main": + break + end_idx = frame.GetFrameID() + self.assertLess(i, end_idx - start_idx, "skipped frames") + + # Back down again. + start_idx = frame.GetFrameID() + for i in range(1, thread.GetNumFrames()): + self.expect("down") + frame = thread.GetSelectedFrame() + if "foo" in frame.GetFunctionName(): + break + end_idx = frame.GetFrameID() + self.assertLess(i, start_idx - end_idx, "skipped frames") + + @add_test_categories(["libc++"]) + def test_api(self): + """Test that std::function implementation details are skipped""" + self.build() + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "// break here", lldb.SBFileSpec("main.cpp") + ) + frame = thread.GetSelectedFrame() + num_hidden = 0 + for i in range(1, thread.GetNumFrames()): + thread.SetSelectedFrame(i) + frame = thread.GetSelectedFrame() + if frame.IsHidden(): + num_hidden += 1 + + self.assertGreater(num_hidden, 0) + self.assertLess(num_hidden, thread.GetNumFrames()) diff --git a/lldb/test/API/lang/cpp/std-function-recognizer/main.cpp b/lldb/test/API/lang/cpp/std-function-recognizer/main.cpp new file mode 100644 index 00000000000000..8cf4eaa2e51929 --- /dev/null +++ 
b/lldb/test/API/lang/cpp/std-function-recognizer/main.cpp @@ -0,0 +1,10 @@ +#include + +int foo(int x, int y) { + return x * y; // break here +} + +int main(int argc, char *argv[]) { + std::function fn = foo; + return fn(argc, 1); +} From 9e9e8238df63b9f10c6635d3f16d8a0fbc7f00c4 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 23 Aug 2024 09:56:06 -0700 Subject: [PATCH 356/426] Revert "Revert "[lldb-dap] Mark hidden frames as "subtle" (#105457)"" This reverts commit aa70f83e660453c006193aab7ba67c94db236948. --- .../lldb-dap/stackTrace/subtleFrames/Makefile | 3 ++ .../subtleFrames/TestDAP_subtleFrames.py | 29 +++++++++++++++++++ .../lldb-dap/stackTrace/subtleFrames/main.cpp | 13 +++++++++ lldb/tools/lldb-dap/JSONUtils.cpp | 3 ++ 4 files changed, 48 insertions(+) create mode 100644 lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/Makefile create mode 100644 lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/TestDAP_subtleFrames.py create mode 100644 lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/main.cpp diff --git a/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/Makefile b/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/Makefile new file mode 100644 index 00000000000000..99998b20bcb050 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/TestDAP_subtleFrames.py b/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/TestDAP_subtleFrames.py new file mode 100644 index 00000000000000..1e41e841e39bc8 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/TestDAP_subtleFrames.py @@ -0,0 +1,29 @@ +""" +Test lldb-dap stack trace response +""" + + +import dap_server +from lldbsuite.test.decorators import * + +import lldbdap_testcase +from lldbsuite.test.lldbtest import * + + +class TestDAP_subtleFrames(lldbdap_testcase.DAPTestCaseBase): + 
@add_test_categories(["libc++"]) + def test_subtleFrames(self): + """ + Internal stack frames (such as the ones used by `std::function`) are marked as "subtle". + """ + program = self.getBuildArtifact("a.out") + self.build_and_launch(program) + source = "main.cpp" + self.set_source_breakpoints(source, [line_number(source, "BREAK HERE")]) + self.continue_to_next_stop() + + frames = self.get_stackFrames() + for f in frames: + if "__function" in f["name"]: + self.assertEqual(f["presentationHint"], "subtle") + self.assertTrue(any(f.get("presentationHint") == "subtle" for f in frames)) diff --git a/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/main.cpp b/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/main.cpp new file mode 100644 index 00000000000000..71944528441e38 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/stackTrace/subtleFrames/main.cpp @@ -0,0 +1,13 @@ +#include +#include + +void greet() { + // BREAK HERE + std::cout << "Hello\n"; +} + +int main() { + std::function func{greet}; + func(); + return 0; +} diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index a8b85f55939e17..c080fd395b7288 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -763,6 +763,9 @@ llvm::json::Value CreateStackFrame(lldb::SBFrame &frame) { object.try_emplace("instructionPointerReference", formatted_addr); } + if (frame.IsArtificial() || frame.IsHidden()) + object.try_emplace("presentationHint", "subtle"); + return llvm::json::Value(std::move(object)); } From ad7577524286ae6070dc7f18bde35cf050d31e5e Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 23 Aug 2024 09:56:09 -0700 Subject: [PATCH 357/426] Revert "Revert "[lldb][swig] Use the correct variable in the return statement"" This reverts commit 7323e7eee3a819e9a2d8ec29f00d362bcad87731. 
--- lldb/bindings/python/python-wrapper.swig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index 2ce42e3e017d5b..360c392235a866 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -837,7 +837,7 @@ bool lldb_private::python::SWIGBridge::LLDBSwigPython_ShouldHide( bool ret_val = result ? PyObject_IsTrue(result) : false; Py_XDECREF(result); - return result; + return ret_val; } void *lldb_private::python::SWIGBridge::LLDBSWIGPython_GetDynamicSetting( From 11d2de436cbab8667fe1f99d7b538e6fb555b4d7 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 23 Aug 2024 09:57:45 -0700 Subject: [PATCH 358/426] [lldb] Fix uninitialized variable --- lldb/include/lldb/Target/StackFrame.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/include/lldb/Target/StackFrame.h b/lldb/include/lldb/Target/StackFrame.h index e4d17847763acf..5cc0fccee03b8f 100644 --- a/lldb/include/lldb/Target/StackFrame.h +++ b/lldb/include/lldb/Target/StackFrame.h @@ -539,7 +539,7 @@ class StackFrame : public ExecutionContextScope, Flags m_flags; Scalar m_frame_base; Status m_frame_base_error; - uint16_t m_frame_recognizer_generation; + uint16_t m_frame_recognizer_generation = 0; /// Does this frame have a CFA? Different from CFA == LLDB_INVALID_ADDRESS. bool m_cfa_is_valid; Kind m_stack_frame_kind; From a968ae6873d4dba50dabaa321fe05d3ccc9f38c8 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 23 Aug 2024 11:14:50 -0700 Subject: [PATCH 359/426] [TableGen] Refactor SequenceToOffsetTable class (#104986) - Replace use of std::isalnum/ispunct with StringExtras version to avoid possibly locale dependent behavior. - Remove `static` from printChar (do its deduplicated when linking). - Use range based for loops and structured bindings. - No need to use `llvm::` for code in llvm namespace. 
--- .../TableGen/Basic/SequenceToOffsetTable.h | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/llvm/utils/TableGen/Basic/SequenceToOffsetTable.h b/llvm/utils/TableGen/Basic/SequenceToOffsetTable.h index 09100b39650d81..497e74afc18ec9 100644 --- a/llvm/utils/TableGen/Basic/SequenceToOffsetTable.h +++ b/llvm/utils/TableGen/Basic/SequenceToOffsetTable.h @@ -15,20 +15,20 @@ #ifndef LLVM_UTILS_TABLEGEN_BASIC_SEQUENCETOOFFSETTABLE_H #define LLVM_UTILS_TABLEGEN_BASIC_SEQUENCETOOFFSETTABLE_H +#include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" #include #include -#include #include #include namespace llvm { -extern llvm::cl::opt EmitLongStrLiterals; +extern cl::opt EmitLongStrLiterals; -static inline void printChar(raw_ostream &OS, char C) { +inline void printChar(raw_ostream &OS, char C) { unsigned char UC(C); - if (isalnum(UC) || ispunct(UC)) { + if (isAlnum(UC) || isPunct(UC)) { OS << '\''; if (C == '\\' || C == '\'') OS << '\\'; @@ -126,7 +126,7 @@ class SequenceToOffsetTable { /// initializer, where each element is a C string literal terminated by /// `\0`. 
Falls back to emitting a comma-separated integer list if /// `EmitLongStrLiterals` is false - void emitStringLiteralDef(raw_ostream &OS, const llvm::Twine &Decl) const { + void emitStringLiteralDef(raw_ostream &OS, const Twine &Decl) const { assert(Entries && "Call layout() before emitStringLiteralDef()"); if (!EmitLongStrLiterals) { OS << Decl << " = {\n"; @@ -140,9 +140,9 @@ class SequenceToOffsetTable { << "#pragma GCC diagnostic ignored \"-Woverlength-strings\"\n" << "#endif\n" << Decl << " = {\n"; - for (auto I : Seqs) { - OS << " /* " << I.second << " */ \""; - OS.write_escaped(I.first); + for (const auto &[Seq, Offset] : Seqs) { + OS << " /* " << Offset << " */ \""; + OS.write_escaped(Seq); OS << "\\0\"\n"; } OS << "};\n" @@ -156,13 +156,10 @@ class SequenceToOffsetTable { void emit(raw_ostream &OS, void (*Print)(raw_ostream &, ElemT), const char *Term = "0") const { assert((empty() || Entries) && "Call layout() before emit()"); - for (typename SeqMap::const_iterator I = Seqs.begin(), E = Seqs.end(); - I != E; ++I) { - OS << " /* " << I->second << " */ "; - for (typename SeqT::const_iterator SI = I->first.begin(), - SE = I->first.end(); - SI != SE; ++SI) { - Print(OS, *SI); + for (const auto &[Seq, Offset] : Seqs) { + OS << " /* " << Offset << " */ "; + for (const ElemT &Element : Seq) { + Print(OS, Element); OS << ", "; } OS << Term << ",\n"; From 71867042041ebb02c2865ed7c9b908e691b31a91 Mon Sep 17 00:00:00 2001 From: Peiming Liu Date: Fri, 23 Aug 2024 11:21:44 -0700 Subject: [PATCH 360/426] [mlir][sparse] refactoring sparse_tensor.iterate lowering pattern implementation. 
(#105566) --- .../Transforms/SparseIterationToScf.cpp | 118 ++++++------------ 1 file changed, 36 insertions(+), 82 deletions(-) diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp index d6c0da4a9e4573..f7fcabb0220b50 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp @@ -244,88 +244,41 @@ class SparseIterateOpConverter : public OneToNOpConversionPattern { std::unique_ptr it = iterSpace.extractIterator(rewriter, loc); - if (it->iteratableByFor()) { - auto [lo, hi] = it->genForCond(rewriter, loc); - Value step = constantIndex(rewriter, loc, 1); - SmallVector ivs; - for (ValueRange inits : adaptor.getInitArgs()) - llvm::append_range(ivs, inits); - scf::ForOp forOp = rewriter.create(loc, lo, hi, step, ivs); - - Block *loopBody = op.getBody(); - OneToNTypeMapping bodyTypeMapping(loopBody->getArgumentTypes()); - if (failed(typeConverter->convertSignatureArgs( - loopBody->getArgumentTypes(), bodyTypeMapping))) - return failure(); - rewriter.applySignatureConversion(loopBody, bodyTypeMapping); - - rewriter.eraseBlock(forOp.getBody()); - Region &dstRegion = forOp.getRegion(); - rewriter.inlineRegionBefore(op.getRegion(), dstRegion, dstRegion.end()); - - auto yieldOp = - llvm::cast(forOp.getBody()->getTerminator()); - - rewriter.setInsertionPointToEnd(forOp.getBody()); - // replace sparse_tensor.yield with scf.yield. - rewriter.create(loc, yieldOp.getResults()); - rewriter.eraseOp(yieldOp); - - const OneToNTypeMapping &resultMapping = adaptor.getResultMapping(); - rewriter.replaceOp(op, forOp.getResults(), resultMapping); - } else { - SmallVector ivs; - // TODO: put iterator at the end of argument list to be consistent with - // coiterate operation. 
- llvm::append_range(ivs, it->getCursor()); - for (ValueRange inits : adaptor.getInitArgs()) - llvm::append_range(ivs, inits); - - assert(llvm::all_of(ivs, [](Value v) { return v != nullptr; })); - - TypeRange types = ValueRange(ivs).getTypes(); - auto whileOp = rewriter.create(loc, types, ivs); - SmallVector l(types.size(), op.getIterator().getLoc()); - - // Generates loop conditions. - Block *before = rewriter.createBlock(&whileOp.getBefore(), {}, types, l); - rewriter.setInsertionPointToStart(before); - ValueRange bArgs = before->getArguments(); - auto [whileCond, remArgs] = it->genWhileCond(rewriter, loc, bArgs); - assert(remArgs.size() == adaptor.getInitArgs().size()); - rewriter.create(loc, whileCond, before->getArguments()); - - // Generates loop body. - Block *loopBody = op.getBody(); - OneToNTypeMapping bodyTypeMapping(loopBody->getArgumentTypes()); - if (failed(typeConverter->convertSignatureArgs( - loopBody->getArgumentTypes(), bodyTypeMapping))) - return failure(); - rewriter.applySignatureConversion(loopBody, bodyTypeMapping); - - Region &dstRegion = whileOp.getAfter(); - // TODO: handle uses of coordinate! - rewriter.inlineRegionBefore(op.getRegion(), dstRegion, dstRegion.end()); - ValueRange aArgs = whileOp.getAfterArguments(); - auto yieldOp = llvm::cast( - whileOp.getAfterBody()->getTerminator()); - - rewriter.setInsertionPointToEnd(whileOp.getAfterBody()); + SmallVector ivs; + for (ValueRange inits : adaptor.getInitArgs()) + llvm::append_range(ivs, inits); + + // Type conversion on iterate op block. 
+ OneToNTypeMapping blockTypeMapping(op.getBody()->getArgumentTypes()); + if (failed(typeConverter->convertSignatureArgs( + op.getBody()->getArgumentTypes(), blockTypeMapping))) + return rewriter.notifyMatchFailure( + op, "failed to convert iterate region argurment types"); + rewriter.applySignatureConversion(op.getBody(), blockTypeMapping); + + Block *block = op.getBody(); + ValueRange ret = genLoopWithIterator( + rewriter, loc, it.get(), ivs, /*iterFirst=*/true, + [block](PatternRewriter &rewriter, Location loc, Region &loopBody, + SparseIterator *it, ValueRange reduc) -> SmallVector { + SmallVector blockArgs(it->getCursor()); + // TODO: Also appends coordinates if used. + // blockArgs.push_back(it->deref(rewriter, loc)); + llvm::append_range(blockArgs, reduc); + + Block *dstBlock = &loopBody.getBlocks().front(); + rewriter.inlineBlockBefore(block, dstBlock, dstBlock->end(), + blockArgs); + auto yield = llvm::cast(dstBlock->back()); + // We can not use ValueRange as the operation holding the values will + // be destoryed. + SmallVector result(yield.getResults()); + rewriter.eraseOp(yield); + return result; + }); - aArgs = it->linkNewScope(aArgs); - ValueRange nx = it->forward(rewriter, loc); - SmallVector yields; - llvm::append_range(yields, nx); - llvm::append_range(yields, yieldOp.getResults()); - - // replace sparse_tensor.yield with scf.yield. 
- rewriter.eraseOp(yieldOp); - rewriter.create(loc, yields); - const OneToNTypeMapping &resultMapping = adaptor.getResultMapping(); - rewriter.replaceOp( - op, whileOp.getResults().drop_front(it->getCursor().size()), - resultMapping); - } + const OneToNTypeMapping &resultMapping = adaptor.getResultMapping(); + rewriter.replaceOp(op, ret, resultMapping); return success(); } }; @@ -366,9 +319,10 @@ class SparseCoIterateOpConverter Block *block = ®ion.getBlocks().front(); OneToNTypeMapping blockTypeMapping(block->getArgumentTypes()); if (failed(typeConverter->convertSignatureArgs(block->getArgumentTypes(), - blockTypeMapping))) + blockTypeMapping))) { return rewriter.notifyMatchFailure( op, "failed to convert coiterate region argurment types"); + } rewriter.applySignatureConversion(block, blockTypeMapping); } From 8f08b75ce4af9dc72fb560033db14891ac01a682 Mon Sep 17 00:00:00 2001 From: smanna12 Date: Fri, 23 Aug 2024 11:23:25 -0700 Subject: [PATCH 361/426] [Clang] Assert non-null enum definition in CGDebugInfo::CreateTypeDefinition(const EnumType*) (#105556) This commit adds an assert to check for a non-null enum definition in CGDebugInfo::CreateTypeDefinition(const EnumType*), ensuring precondition validity. 
Previous discussion on https://github.com/llvm/llvm-project/pull/97105 --- clang/lib/CodeGen/CGDebugInfo.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 7ad3088f0ab756..dc83d596e3cb06 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -3561,6 +3561,7 @@ llvm::DIType *CGDebugInfo::CreateTypeDefinition(const EnumType *Ty) { SmallVector Enumerators; ED = ED->getDefinition(); + assert(ED && "An enumeration definition is required"); for (const auto *Enum : ED->enumerators()) { Enumerators.push_back( DBuilder.createEnumerator(Enum->getName(), Enum->getInitVal())); From 57b89fdd8af0a230ff270d6f018c0ca6b8562d71 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Fri, 23 Aug 2024 11:27:25 -0700 Subject: [PATCH 362/426] [flang][runtime] Add FLANG_RUNTIME_NO_REAL_3 flag to build (#105856) Allow a runtime build to disable SELECTED_REAL_KIND from returning kind 3 (16-bit truncated form of 32-bit IEEE-754 floating point, a/k/a "brain float" or bfloat16). --- flang/runtime/numeric.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp index 28687b1971b7ed..40bacf07157a27 100644 --- a/flang/runtime/numeric.cpp +++ b/flang/runtime/numeric.cpp @@ -142,6 +142,11 @@ inline RT_API_ATTRS CppTypeFor SelectedRealKind( #else constexpr bool hasReal2{false}; #endif +#ifndef FLANG_RUNTIME_NO_REAL_3 + constexpr bool hasReal3{true}; +#else + constexpr bool hasReal3{false}; +#endif #if defined LDBL_MANT_DIG == 64 && !defined FLANG_RUNTIME_NO_REAL_10 constexpr bool hasReal10{true}; #else @@ -171,9 +176,9 @@ inline RT_API_ATTRS CppTypeFor SelectedRealKind( } if (r <= 4) { - kind = kind < 2 ? 2 : kind; + kind = kind < 2 ? (hasReal2 ? 2 : 4) : kind; } else if (r <= 37) { - kind = kind < 3 ? (p == 3 ? 4 : 3) : kind; + kind = kind < 3 ? (hasReal3 && p != 3 ? 
3 : 4) : kind; } else if (r <= 307) { kind = kind < 8 ? 8 : kind; } else if (hasReal10 && r <= 4931) { From caa844e67cbb5e4f5f20a237d713a227ce65b5b9 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Fri, 23 Aug 2024 20:29:19 +0200 Subject: [PATCH 363/426] [LLD][COFF] Add support for CHPE redirection metadata. (#105739) This is part of CHPE metadata containing a sorted list of x86_64 export thunks RVAs and RVAs of ARM64EC functions associated with them. It's stored in a dedicated .a64xrm section. --- lld/COFF/Chunks.cpp | 13 +++++++++ lld/COFF/Chunks.h | 11 ++++++++ lld/COFF/Driver.cpp | 2 ++ lld/COFF/Writer.cpp | 26 +++++++++++++++++- lld/test/COFF/Inputs/loadconfig-arm64ec.s | 4 +-- lld/test/COFF/arm64ec-export-thunks.test | 29 +++++++++++++++++++-- lld/test/COFF/arm64ec-patchable-thunks.test | 6 ++++- 7 files changed, 85 insertions(+), 6 deletions(-) diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index be44950a1720e3..4e3a564ebacd87 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -1078,4 +1078,17 @@ void ECExportThunkChunk::writeTo(uint8_t *buf) const { write32le(buf + 10, target->getRVA() - rva - 14); } +size_t CHPERedirectionChunk::getSize() const { + return exportThunks.size() * sizeof(chpe_redirection_entry); +} + +void CHPERedirectionChunk::writeTo(uint8_t *buf) const { + auto entries = reinterpret_cast(buf); + + for (uint32_t i = 0; i < exportThunks.size(); i++) { + entries[i].Source = exportThunks[i].first->getRVA(); + entries[i].Destination = exportThunks[i].second->getRVA(); + } +} + } // namespace lld::coff diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h index 5443d4619a977e..015df41b04c67d 100644 --- a/lld/COFF/Chunks.h +++ b/lld/COFF/Chunks.h @@ -749,6 +749,17 @@ class ECCodeMapChunk : public NonSectionChunk { std::vector ↦ }; +class CHPERedirectionChunk : public NonSectionChunk { +public: + CHPERedirectionChunk(std::vector> &exportThunks) + : exportThunks(exportThunks) {} + size_t getSize() const override; + void 
writeTo(uint8_t *buf) const override; + +private: + std::vector> &exportThunks; +}; + static const uint8_t ECExportThunkCode[] = { 0x48, 0x8b, 0xc4, // movq %rsp, %rax 0x48, 0x89, 0x58, 0x20, // movq %rbx, 0x20(%rax) diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index c09c91fe4b1719..472f5074ba8b8c 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -2440,6 +2440,8 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { if (isArm64EC(config->machine)) { ctx.symtab.addAbsolute("__arm64x_extra_rfe_table", 0); ctx.symtab.addAbsolute("__arm64x_extra_rfe_table_size", 0); + ctx.symtab.addAbsolute("__arm64x_redirection_metadata", 0); + ctx.symtab.addAbsolute("__arm64x_redirection_metadata_count", 0); ctx.symtab.addAbsolute("__hybrid_code_map", 0); ctx.symtab.addAbsolute("__hybrid_code_map_count", 0); } diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 776595d98c391d..358d16fe330cea 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -298,6 +298,9 @@ class Writer { CVDebugRecordChunk *buildId = nullptr; ArrayRef sectionTable; + // List of Arm64EC export thunks. 
+ std::vector> exportThunks; + uint64_t fileSize; uint32_t pointerToSymbolTable = 0; uint64_t sizeOfImage; @@ -312,6 +315,7 @@ class Writer { OutputSection *idataSec; OutputSection *edataSec; OutputSection *didatSec; + OutputSection *a64xrmSec; OutputSection *rsrcSec; OutputSection *relocSec; OutputSection *ctorsSec; @@ -995,6 +999,8 @@ void Writer::createSections() { idataSec = createSection(".idata", data | r); edataSec = createSection(".edata", data | r); didatSec = createSection(".didat", data | r); + if (isArm64EC(ctx.config.machine)) + a64xrmSec = createSection(".a64xrm", data | r); rsrcSec = createSection(".rsrc", data | r); relocSec = createSection(".reloc", data | discardable | r); ctorsSec = createSection(".ctors", data | r | w); @@ -2053,8 +2059,10 @@ void Writer::createECChunks() { auto sym = dyn_cast(s); if (!sym || !sym->getChunk()) continue; - if (auto thunk = dyn_cast(sym->getChunk())) + if (auto thunk = dyn_cast(sym->getChunk())) { hexpthkSec->addChunk(thunk); + exportThunks.push_back({thunk, thunk->target}); + } } auto codeMapChunk = make(codeMap); @@ -2062,6 +2070,13 @@ void Writer::createECChunks() { Symbol *codeMapSym = ctx.symtab.findUnderscore("__hybrid_code_map"); replaceSymbol(codeMapSym, codeMapSym->getName(), codeMapChunk); + + CHPERedirectionChunk *entryPoints = make(exportThunks); + a64xrmSec->addChunk(entryPoints); + Symbol *entryPointsSym = + ctx.symtab.findUnderscore("__arm64x_redirection_metadata"); + replaceSymbol(entryPointsSym, entryPointsSym->getName(), + entryPoints); } // MinGW specific. 
Gather all relocations that are imported from a DLL even @@ -2154,6 +2169,11 @@ void Writer::setECSymbols() { if (!isArm64EC(ctx.config.machine)) return; + llvm::stable_sort(exportThunks, [](const std::pair &a, + const std::pair &b) { + return a.first->getRVA() < b.first->getRVA(); + }); + Symbol *rfeTableSym = ctx.symtab.findUnderscore("__arm64x_extra_rfe_table"); replaceSymbol(rfeTableSym, "__arm64x_extra_rfe_table", pdata.first); @@ -2165,6 +2185,10 @@ void Writer::setECSymbols() { ->setVA(pdata.last->getRVA() + pdata.last->getSize() - pdata.first->getRVA()); } + + Symbol *entryPointCountSym = + ctx.symtab.findUnderscore("__arm64x_redirection_metadata_count"); + cast(entryPointCountSym)->setVA(exportThunks.size()); } // Write section contents to a mmap'ed file. diff --git a/lld/test/COFF/Inputs/loadconfig-arm64ec.s b/lld/test/COFF/Inputs/loadconfig-arm64ec.s index a270d281095dd6..62a6d0cab642e9 100644 --- a/lld/test/COFF/Inputs/loadconfig-arm64ec.s +++ b/lld/test/COFF/Inputs/loadconfig-arm64ec.s @@ -67,7 +67,7 @@ __chpe_metadata: .rva __hybrid_code_map .word __hybrid_code_map_count .word 0 // __x64_code_ranges_to_entry_points - .word 0 //__arm64x_redirection_metadata + .rva __arm64x_redirection_metadata .rva __os_arm64x_dispatch_call_no_redirect .rva __os_arm64x_dispatch_ret .rva __os_arm64x_check_call @@ -76,7 +76,7 @@ __chpe_metadata: .word 0 // __arm64x_native_entrypoint .word 0 // __hybrid_auxiliary_iat .word 0 // __x64_code_ranges_to_entry_points_count - .word 0 // __arm64x_redirection_metadata_count + .word __arm64x_redirection_metadata_count .rva __os_arm64x_get_x64_information .rva __os_arm64x_set_x64_information .rva __arm64x_extra_rfe_table diff --git a/lld/test/COFF/arm64ec-export-thunks.test b/lld/test/COFF/arm64ec-export-thunks.test index 6ed0514d4b17f3..2e4cfd6203b751 100644 --- a/lld/test/COFF/arm64ec-export-thunks.test +++ b/lld/test/COFF/arm64ec-export-thunks.test @@ -49,7 +49,7 @@ EXP-DISASM-NEXT: 18000301f: cc int3 RUN: llvm-objdump -p 
exports.dll | FileCheck -check-prefix=EXP-EXPORT %s EXP-EXPORT: Ordinal RVA Name EXP-EXPORT-NEXT: 1 0x3010 arm64ec_func -EXP-EXPORT-NEXT: 2 0x6000 data_sym +EXP-EXPORT-NEXT: 2 0x7000 data_sym EXP-EXPORT-NEXT: 3 0x3000 func EXP-EXPORT-NEXT: 4 0x2000 x86_64_func @@ -58,9 +58,30 @@ EXP-CHPE: CodeMap [ EXP-CHPE-NEXT: 0x1000 - 0x100C ARM64EC EXP-CHPE-NEXT: 0x2000 - 0x3020 X64 EXP-CHPE-NEXT: ] +EXP-CHPE-NEXT: CodeRangesToEntryPoints: 0 +EXP-CHPE-NEXT: RedirectionMetadata [ +EXP-CHPE-NEXT: 0x3000 -> 0x1000 +EXP-CHPE-NEXT: 0x3010 -> 0x1000 +EXP-CHPE-NEXT: ] + +RUN: llvm-readobj --sections exports.dll | FileCheck --check-prefix=A64XRM %s + +A64XRM: Name: .a64xrm (2E 61 36 34 78 72 6D 00) +A64XRM-NEXT: VirtualSize: 0x10 +A64XRM-NEXT: VirtualAddress: 0x6000 +A64XRM-NEXT: RawDataSize: 512 +A64XRM-NEXT: PointerToRawData: +A64XRM-NEXT: PointerToRelocations: 0x0 +A64XRM-NEXT: PointerToLineNumbers: 0x0 +A64XRM-NEXT: RelocationCount: 0 +A64XRM-NEXT: LineNumberCount: 0 +A64XRM-NEXT: Characteristics [ (0x40000040) +A64XRM-NEXT: IMAGE_SCN_CNT_INITIALIZED_DATA (0x40) +A64XRM-NEXT: IMAGE_SCN_MEM_READ (0x40000000) +A64XRM-NEXT: ] RUN: llvm-objdump -s --section=.test exports.dll | FileCheck --check-prefix=EXP-DATA %s -EXP-DATA: 180006000 00300000 10300000 +EXP-DATA: 180007000 00300000 10300000 RUN: lld-link -out:exports2.dll -machine:arm64ec antidep-func.obj x86_64-func.obj loadconfig-arm64ec.obj \ RUN: arm64ec-data.obj -dll -noentry -export:arm64ec_func -export:func=arm64ec_func \ @@ -100,6 +121,10 @@ ENTRY-CHPE: CodeMap [ ENTRY-CHPE-NEXT: 0x1000 - 0x100C ARM64EC ENTRY-CHPE-NEXT: 0x2000 - 0x2010 X64 ENTRY-CHPE-NEXT: ] +ENTRY-CHPE-NEXT: CodeRangesToEntryPoints: 0 +ENTRY-CHPE-NEXT: RedirectionMetadata [ +ENTRY-CHPE-NEXT: 0x2000 -> 0x1000 +ENTRY-CHPE-NEXT: ] Test exporting data symbol as a function: diff --git a/lld/test/COFF/arm64ec-patchable-thunks.test b/lld/test/COFF/arm64ec-patchable-thunks.test index cccd42eebfd367..044f3c7cebdf8e 100644 --- 
a/lld/test/COFF/arm64ec-patchable-thunks.test +++ b/lld/test/COFF/arm64ec-patchable-thunks.test @@ -27,13 +27,17 @@ PATCH-DISASM-NEXT: 18000200e: cc int3 PATCH-DISASM-NEXT: 18000200f: cc int3 RUN: llvm-readobj --hex-dump=.test test.dll | FileCheck -check-prefix=RVA %s -RVA: 0x180005000 00200000 +RVA: 0x180006000 00200000 RUN: llvm-readobj --coff-load-config test.dll | FileCheck -check-prefix=PATCH-CHPE %s PATCH-CHPE: CodeMap [ PATCH-CHPE-NEXT: 0x1000 - 0x1008 ARM64EC PATCH-CHPE-NEXT: 0x2000 - 0x2010 X64 PATCH-CHPE-NEXT: ] +PATCH-CHPE-NEXT: CodeRangesToEntryPoints: 0 +PATCH-CHPE-NEXT: RedirectionMetadata [ +PATCH-CHPE-NEXT: 0x2000 -> 0x1000 +PATCH-CHPE-NEXT: ] RUN: lld-link -out:test2.dll -machine:arm64ec arm64ec-alias.obj test-sec.obj loadconfig-arm64ec.obj -dll -noentry From ceb587a16cc2f5d61dc3299d2e54d6c17be14e4a Mon Sep 17 00:00:00 2001 From: Austin Kerbow Date: Fri, 23 Aug 2024 11:51:37 -0700 Subject: [PATCH 364/426] [AMDGPU] Fix crash in allowsMisalignedMemoryAccesses with i1 (#105794) --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 ++- llvm/test/CodeGen/AMDGPU/load-local-i1.ll | 13 ++++++++++++ .../AMDGPU/load-i1-misaligned.ll | 20 +++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/load-i1-misaligned.ll diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ecd4451c504727..1437f3d58b5e79 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1695,7 +1695,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4)) return false; - Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment. + Align RequiredAlignment( + PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment. 
if (Subtarget->hasLDSMisalignedBug() && Size > 32 && Alignment < RequiredAlignment) return false; diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i1.ll b/llvm/test/CodeGen/AMDGPU/load-local-i1.ll index 578170941efaaa..43d102e4655b23 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i1.ll @@ -462,4 +462,17 @@ define amdgpu_kernel void @local_sextload_v64i1_to_v64i64(ptr addrspace(3) %out, ret void } +; FUNC-LABEL: {{^}}local_load_i1_misaligned: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 +define amdgpu_kernel void @local_load_i1_misaligned(ptr addrspace(3) %in, ptr addrspace (3) %out) #0 { + %in.gep.1 = getelementptr i1, ptr addrspace(3) %in, i32 1 + %load.1 = load <16 x i1>, ptr addrspace(3) %in.gep.1, align 4 + %load.2 = load <8 x i1>, ptr addrspace(3) %in, align 1 + %out.gep.1 = getelementptr i1, ptr addrspace(3) %out, i32 16 + store <16 x i1> %load.1, ptr addrspace(3) %out + store <8 x i1> %load.2, ptr addrspace(3) %out.gep.1 + ret void +} + attributes #0 = { nounwind } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/load-i1-misaligned.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/load-i1-misaligned.ll new file mode 100644 index 00000000000000..6f3d2cb69090eb --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/load-i1-misaligned.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-amd-amdhsa --mcpu=gfx940 -passes=load-store-vectorizer -S -o - %s | FileCheck %s + +; Don't crash when checking for misaligned accesses with sub-byte size. 
+ +define void @misaligned_access_i1(ptr addrspace(3) %in) #0 { +; CHECK-LABEL: define void @misaligned_access_i1( +; CHECK-SAME: ptr addrspace(3) [[IN:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[IN_GEP_1:%.*]] = getelementptr i1, ptr addrspace(3) [[IN]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i1>, ptr addrspace(3) [[IN_GEP_1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i1>, ptr addrspace(3) [[IN]], align 1 +; CHECK-NEXT: ret void +; + %in.gep.1 = getelementptr i1, ptr addrspace(3) %in, i32 1 + + %1 = load <16 x i1>, ptr addrspace(3) %in.gep.1, align 4 + %2 = load <8 x i1>, ptr addrspace(3) %in, align 1 + ret void +} + From 00620abc7f6bdd824e033744f84408c98decd95c Mon Sep 17 00:00:00 2001 From: MaheshRavishankar <1663364+MaheshRavishankar@users.noreply.github.com> Date: Fri, 23 Aug 2024 12:05:52 -0700 Subject: [PATCH 365/426] [mlir][SCF] Allow canonicalization of zero-trip count `scf.forall` with empty mapping. (#105793) Current folding of one-trip count loop does not kick in with an empty mapping. Enable this for empty mapping. Signed-off-by: MaheshRavishankar --- mlir/lib/Dialect/SCF/IR/SCF.cpp | 13 ++++++----- mlir/test/Dialect/SCF/canonicalize.mlir | 27 ++++++++++++++++++++++ mlir/test/Dialect/Tensor/canonicalize.mlir | 21 ----------------- 3 files changed, 34 insertions(+), 27 deletions(-) diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index e92d9503372cdf..bfa7db84bd9af7 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -1700,7 +1700,7 @@ struct ForallOpSingleOrZeroIterationDimsFolder LogicalResult matchAndRewrite(ForallOp op, PatternRewriter &rewriter) const override { // Do not fold dimensions if they are mapped to processing units. 
- if (op.getMapping().has_value()) + if (op.getMapping().has_value() && !op.getMapping()->empty()) return failure(); Location loc = op.getLoc(); @@ -1729,11 +1729,6 @@ struct ForallOpSingleOrZeroIterationDimsFolder newMixedUpperBounds.push_back(ub); newMixedSteps.push_back(step); } - // Exit if none of the loop dimensions perform a single iteration. - if (newMixedLowerBounds.size() == static_cast(op.getRank())) { - return rewriter.notifyMatchFailure( - op, "no dimensions have 0 or 1 iterations"); - } // All of the loop dimensions perform a single iteration. Inline loop body. if (newMixedLowerBounds.empty()) { @@ -1741,6 +1736,12 @@ struct ForallOpSingleOrZeroIterationDimsFolder return success(); } + // Exit if none of the loop dimensions perform a single iteration. + if (newMixedLowerBounds.size() == static_cast(op.getRank())) { + return rewriter.notifyMatchFailure( + op, "no dimensions have 0 or 1 iterations"); + } + // Replace the loop by a lower-dimensional loop. ForallOp newOp; newOp = rewriter.create(loc, newMixedLowerBounds, diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir index 268946803de7a5..c68369a8e4fce7 100644 --- a/mlir/test/Dialect/SCF/canonicalize.mlir +++ b/mlir/test/Dialect/SCF/canonicalize.mlir @@ -1635,6 +1635,33 @@ func.func @do_not_inline_distributed_forall_loop( // ----- +func.func @inline_empty_loop_with_empty_mapping( + %in: tensor<16xf32>) -> tensor<16xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<16xf32> + %1 = scf.forall () in () shared_outs (%out_ = %0) -> (tensor<16xf32>) { + %slice = tensor.extract_slice %out_[0] [16] [1] + : tensor<16xf32> to tensor<16xf32> + %generic = linalg.generic { + indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], + iterator_types = ["parallel"]} + ins(%slice : tensor<16xf32>) outs(%0 : tensor<16xf32>) { + ^bb0(%b0 : f32, %b1 : f32): + %2 = arith.addf %b0, %b0 : f32 + linalg.yield %2 : f32 + } -> tensor<16xf32> 
+ scf.forall.in_parallel { + tensor.parallel_insert_slice %generic into %out_[0] [16] [1] + : tensor<16xf32> into tensor<16xf32> + } + }{ mapping = [] } + return %1 : tensor<16xf32> +} +// CHECK-LABEL: func @inline_empty_loop_with_empty_mapping +// CHECK-NOT: scf.forall + +// ----- + func.func @collapse_one_dim_parallel(%in: tensor<8x8xf32>) -> tensor<8x8xf32> { %c8 = arith.constant 8 : index %c0 = arith.constant 0 : index diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index 4b8efde78cc23c..458ff51be7462e 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -2076,27 +2076,6 @@ func.func @canonicalize_parallel_insert_slice_indices( // ----- -// CHECK-LABEL: func.func @dont_fold_parallel_insert_slice( -// CHECK-SAME: %[[arg0:[0-9a-z]*]]: tensor<1x5xf32>, -// CHECK-SAME: %[[arg1:[0-9a-z]*]]: tensor<1x5xf32>) -func.func @dont_fold_parallel_insert_slice( - %arg0 : tensor<1x5xf32>, %arg1: tensor<1x5xf32>) -> tensor<1x5xf32> -{ - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - // CHECK: scf.forall () in () shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor<1x5xf32>) { - // CHECK-NEXT: scf.forall.in_parallel { - // CHECK-NEXT: tensor.parallel_insert_slice %[[arg0]] into %[[o]][0, 0] [1, 5] [1, 1] : tensor<1x5xf32> into tensor<1x5xf32> - %2 = scf.forall () in () shared_outs(%o = %arg1) -> (tensor<1x5xf32>) { - scf.forall.in_parallel { - tensor.parallel_insert_slice %arg0 into %o[%c0, %c0] [1, 5] [%c1, %c1] : tensor<1x5xf32> into tensor<1x5xf32> - } - } - return %2 : tensor<1x5xf32> -} - -// ----- - // CHECK-LABEL: func.func @fold_insert_slice_after_extract_slice // CHECK-SAME: (%[[INPUT:.+]]: tensor<1x2x2x4xf32>) func.func @fold_insert_slice_after_extract_slice(%input: tensor<1x2x2x4xf32>) -> tensor<1x2x2x4xf32> { From 782bc4f669d3c2b52d1c9db121dea6a545216149 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Fri, 23 Aug 2024 12:06:53 -0700 Subject: 
[PATCH 366/426] [DXIL][Analysis] Uniquify duplicate resources in DXILResourceAnalysis If a resources is used multiple times, we should only have one resource record for it. This comes up most prominantly with arrays of resources like so: ```hlsl RWBuffer BufferArray[10] : register(u0, space4); RWBuffer B1 = BufferArray[0]; RWBuffer B2 = BufferArray[SomeIndex]; RWBuffer B3 = BufferArray[3]; ``` In this case, there's only one resource, but we'll generate 3 different `dx.handle.fromBinding` calls to access different slices. Note that this adds some API that won't be used until #104447 later in the stack. Trying to avoid that results in unnecessary churn. Fixes #105143 Pull Request: https://github.com/llvm/llvm-project/pull/105602 --- llvm/include/llvm/Analysis/DXILResource.h | 61 +++++- llvm/lib/Analysis/DXILResource.cpp | 173 +++++++++++------- .../DXILResource/buffer-frombinding.ll | 164 +++++++++-------- 3 files changed, 251 insertions(+), 147 deletions(-) diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h index 14cf03c9a3acee..d9b4e968fe3e97 100644 --- a/llvm/include/llvm/Analysis/DXILResource.h +++ b/llvm/include/llvm/Analysis/DXILResource.h @@ -25,6 +25,7 @@ class Value; namespace dxil { class ResourceInfo { +public: struct ResourceBinding { uint32_t RecordID; uint32_t Space; @@ -38,6 +39,10 @@ class ResourceInfo { bool operator!=(const ResourceBinding &RHS) const { return !(*this == RHS); } + bool operator<(const ResourceBinding &RHS) const { + return std::tie(RecordID, Space, LowerBound, Size) < + std::tie(RHS.RecordID, RHS.Space, RHS.LowerBound, RHS.Size); + } }; struct UAVInfo { @@ -50,6 +55,10 @@ class ResourceInfo { std::tie(RHS.GloballyCoherent, RHS.HasCounter, RHS.IsROV); } bool operator!=(const UAVInfo &RHS) const { return !(*this == RHS); } + bool operator<(const UAVInfo &RHS) const { + return std::tie(GloballyCoherent, HasCounter, IsROV) < + std::tie(RHS.GloballyCoherent, RHS.HasCounter, RHS.IsROV); + } 
}; struct StructInfo { @@ -64,6 +73,9 @@ class ResourceInfo { return std::tie(Stride, AlignLog2) == std::tie(RHS.Stride, RHS.AlignLog2); } bool operator!=(const StructInfo &RHS) const { return !(*this == RHS); } + bool operator<(const StructInfo &RHS) const { + return std::tie(Stride, AlignLog2) < std::tie(RHS.Stride, RHS.AlignLog2); + } }; struct TypedInfo { @@ -75,6 +87,10 @@ class ResourceInfo { std::tie(RHS.ElementTy, RHS.ElementCount); } bool operator!=(const TypedInfo &RHS) const { return !(*this == RHS); } + bool operator<(const TypedInfo &RHS) const { + return std::tie(ElementTy, ElementCount) < + std::tie(RHS.ElementTy, RHS.ElementCount); + } }; struct MSInfo { @@ -82,6 +98,7 @@ class ResourceInfo { bool operator==(const MSInfo &RHS) const { return Count == RHS.Count; } bool operator!=(const MSInfo &RHS) const { return !(*this == RHS); } + bool operator<(const MSInfo &RHS) const { return Count < RHS.Count; } }; struct FeedbackInfo { @@ -89,8 +106,10 @@ class ResourceInfo { bool operator==(const FeedbackInfo &RHS) const { return Type == RHS.Type; } bool operator!=(const FeedbackInfo &RHS) const { return !(*this == RHS); } + bool operator<(const FeedbackInfo &RHS) const { return Type < RHS.Type; } }; +private: // Universal properties. 
Value *Symbol; StringRef Name; @@ -138,6 +157,7 @@ class ResourceInfo { Binding.LowerBound = LowerBound; Binding.Size = Size; } + const ResourceBinding &getBinding() const { return Binding; } void setUAV(bool GloballyCoherent, bool HasCounter, bool IsROV) { assert(isUAV() && "Not a UAV"); UAVFlags.GloballyCoherent = GloballyCoherent; @@ -168,7 +188,11 @@ class ResourceInfo { MultiSample.Count = Count; } + dxil::ResourceClass getResourceClass() const { return RC; } + bool operator==(const ResourceInfo &RHS) const; + bool operator!=(const ResourceInfo &RHS) const { return !(*this == RHS); } + bool operator<(const ResourceInfo &RHS) const; static ResourceInfo SRV(Value *Symbol, StringRef Name, dxil::ElementType ElementTy, uint32_t ElementCount, @@ -216,7 +240,6 @@ class ResourceInfo { MDTuple *getAsMetadata(LLVMContext &Ctx) const; - ResourceBinding getBinding() const { return Binding; } std::pair getAnnotateProps() const; void print(raw_ostream &OS) const; @@ -224,7 +247,41 @@ class ResourceInfo { } // namespace dxil -using DXILResourceMap = MapVector; +class DXILResourceMap { + SmallVector Resources; + DenseMap CallMap; + unsigned FirstUAV = 0; + unsigned FirstCBuffer = 0; + unsigned FirstSampler = 0; + +public: + using iterator = SmallVector::iterator; + using const_iterator = SmallVector::const_iterator; + + DXILResourceMap( + SmallVectorImpl> &&CIToRI); + + iterator begin() { return Resources.begin(); } + const_iterator begin() const { return Resources.begin(); } + iterator end() { return Resources.end(); } + const_iterator end() const { return Resources.end(); } + + bool empty() const { return Resources.empty(); } + + iterator find(const CallInst *Key) { + auto Pos = CallMap.find(Key); + return Pos == CallMap.end() ? Resources.end() + : (Resources.begin() + Pos->second); + } + + const_iterator find(const CallInst *Key) const { + auto Pos = CallMap.find(Key); + return Pos == CallMap.end() ? 
Resources.end() + : (Resources.begin() + Pos->second); + } + + void print(raw_ostream &OS) const; +}; class DXILResourceAnalysis : public AnalysisInfoMixin { friend AnalysisInfoMixin; diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp index 1b5b051c9db29e..2802480481690d 100644 --- a/llvm/lib/Analysis/DXILResource.cpp +++ b/llvm/lib/Analysis/DXILResource.cpp @@ -335,27 +335,45 @@ bool ResourceInfo::operator==(const ResourceInfo &RHS) const { if (std::tie(Symbol, Name, Binding, RC, Kind) != std::tie(RHS.Symbol, RHS.Name, RHS.Binding, RHS.RC, RHS.Kind)) return false; - if (isCBuffer()) - return CBufferSize == RHS.CBufferSize; - if (isSampler()) - return SamplerTy == RHS.SamplerTy; - if (isUAV() && UAVFlags != RHS.UAVFlags) + if (isCBuffer() && RHS.isCBuffer() && CBufferSize != RHS.CBufferSize) return false; - - if (isStruct()) - return Struct == RHS.Struct; - if (isFeedback()) - return Feedback == RHS.Feedback; - if (isTyped() && Typed != RHS.Typed) + if (isSampler() && RHS.isSampler() && SamplerTy != RHS.SamplerTy) + return false; + if (isUAV() && RHS.isUAV() && UAVFlags != RHS.UAVFlags) + return false; + if (isStruct() && RHS.isStruct() && Struct != RHS.Struct) + return false; + if (isFeedback() && RHS.isFeedback() && Feedback != RHS.Feedback) + return false; + if (isTyped() && RHS.isTyped() && Typed != RHS.Typed) + return false; + if (isMultiSample() && RHS.isMultiSample() && MultiSample != RHS.MultiSample) return false; - - if (isMultiSample()) - return MultiSample == RHS.MultiSample; - - assert((Kind == ResourceKind::RawBuffer) && "Unhandled resource kind"); return true; } +bool ResourceInfo::operator<(const ResourceInfo &RHS) const { + // Skip the symbol to avoid non-determinism, and the name to keep a consistent + // ordering even when we strip reflection data. 
+ if (std::tie(Binding, RC, Kind) < std::tie(RHS.Binding, RHS.RC, RHS.Kind)) + return true; + if (isCBuffer() && RHS.isCBuffer() && CBufferSize < RHS.CBufferSize) + return true; + if (isSampler() && RHS.isSampler() && SamplerTy < RHS.SamplerTy) + return true; + if (isUAV() && RHS.isUAV() && UAVFlags < RHS.UAVFlags) + return true; + if (isStruct() && RHS.isStruct() && Struct < RHS.Struct) + return true; + if (isFeedback() && RHS.isFeedback() && Feedback < RHS.Feedback) + return true; + if (isTyped() && RHS.isTyped() && Typed < RHS.Typed) + return true; + if (isMultiSample() && RHS.isMultiSample() && MultiSample < RHS.MultiSample) + return true; + return false; +} + MDTuple *ResourceInfo::getAsMetadata(LLVMContext &Ctx) const { SmallVector MDVals; @@ -534,18 +552,10 @@ namespace { class ResourceMapper { Module &M; LLVMContext &Context; - DXILResourceMap &Resources; - - // In DXC, Record ID is unique per resource type. Match that. - uint32_t NextUAV = 0; - uint32_t NextSRV = 0; - uint32_t NextCBuf = 0; - uint32_t NextSmp = 0; + SmallVector> Resources; public: - ResourceMapper(Module &M, - MapVector &Resources) - : M(M), Context(M.getContext()), Resources(Resources) {} + ResourceMapper(Module &M) : M(M), Context(M.getContext()) {} void diagnoseHandle(CallInst *CI, const Twine &Msg, DiagnosticSeverity Severity = DS_Error) { @@ -585,13 +595,11 @@ class ResourceMapper { // TODO: We don't actually keep track of the name right now... StringRef Name = ""; - auto [It, Success] = Resources.try_emplace(CI, RC, Kind, Symbol, Name); - assert(Success && "Mapping the same CallInst again?"); - (void)Success; - // We grab a pointer into the map's storage, which isn't generally safe. - // Since we're just using this to fill in the info the map won't mutate and - // the pointer stays valid for as long as we need it to. - ResourceInfo *RI = &(It->second); + // Note that we return a pointer into the vector's storage. 
This is okay as + // long as we don't add more elements until we're done with the pointer. + auto &Pair = + Resources.emplace_back(CI, ResourceInfo{RC, Kind, Symbol, Name}); + ResourceInfo *RI = &Pair.second; if (RI->isUAV()) // TODO: We need analysis for GloballyCoherent and HasCounter @@ -658,27 +666,18 @@ class ResourceMapper { if (!RI) return nullptr; - uint32_t NextID; - if (RI->isCBuffer()) - NextID = NextCBuf++; - else if (RI->isSampler()) - NextID = NextSmp++; - else if (RI->isUAV()) - NextID = NextUAV++; - else - NextID = NextSRV++; - uint32_t Space = cast(CI->getArgOperand(0))->getZExtValue(); uint32_t LowerBound = cast(CI->getArgOperand(1))->getZExtValue(); uint32_t Size = cast(CI->getArgOperand(2))->getZExtValue(); - RI->bind(NextID, Space, LowerBound, Size); + // We use a binding ID of zero for now - these will be filled in later. + RI->bind(0U, Space, LowerBound, Size); return RI; } - void mapResources() { + DXILResourceMap mapResources() { for (Function &F : M.functions()) { if (!F.isDeclaration()) continue; @@ -697,11 +696,68 @@ class ResourceMapper { break; } } + + return DXILResourceMap(std::move(Resources)); } }; } // namespace +DXILResourceMap::DXILResourceMap( + SmallVectorImpl> &&CIToRI) { + if (CIToRI.empty()) + return; + + llvm::stable_sort(CIToRI, [](auto &LHS, auto &RHS) { + // Sort by resource class first for grouping purposes, and then by the rest + // of the fields so that we can remove duplicates. + ResourceClass LRC = LHS.second.getResourceClass(); + ResourceClass RRC = RHS.second.getResourceClass(); + return std::tie(LRC, LHS.second) < std::tie(RRC, RHS.second); + }); + for (auto [CI, RI] : CIToRI) { + if (Resources.empty() || RI != Resources.back()) + Resources.push_back(RI); + CallMap[CI] = Resources.size() - 1; + } + + unsigned Size = Resources.size(); + // In DXC, Record ID is unique per resource type. Match that. 
+ FirstUAV = FirstCBuffer = FirstSampler = Size; + uint32_t NextID = 0; + for (unsigned I = 0, E = Size; I != E; ++I) { + ResourceInfo &RI = Resources[I]; + if (RI.isUAV() && FirstUAV == Size) { + FirstUAV = I; + NextID = 0; + } else if (RI.isCBuffer() && FirstCBuffer == Size) { + FirstCBuffer = I; + NextID = 0; + } else if (RI.isSampler() && FirstSampler == Size) { + FirstSampler = I; + NextID = 0; + } + + // Adjust the resource binding to use the next ID. + const ResourceInfo::ResourceBinding &Binding = RI.getBinding(); + RI.bind(NextID++, Binding.Space, Binding.LowerBound, Binding.Size); + } +} + +void DXILResourceMap::print(raw_ostream &OS) const { + for (unsigned I = 0, E = Resources.size(); I != E; ++I) { + OS << "Binding " << I << ":\n"; + Resources[I].print(OS); + OS << "\n"; + } + + for (const auto &[CI, Index] : CallMap) { + OS << "Call bound to " << Index << ":"; + CI->print(OS); + OS << "\n"; + } +} + //===----------------------------------------------------------------------===// // DXILResourceAnalysis and DXILResourcePrinterPass @@ -710,24 +766,14 @@ AnalysisKey DXILResourceAnalysis::Key; DXILResourceMap DXILResourceAnalysis::run(Module &M, ModuleAnalysisManager &AM) { - DXILResourceMap Data; - ResourceMapper(M, Data).mapResources(); + DXILResourceMap Data = ResourceMapper(M).mapResources(); return Data; } PreservedAnalyses DXILResourcePrinterPass::run(Module &M, ModuleAnalysisManager &AM) { - DXILResourceMap &Data = - AM.getResult(M); - - for (const auto &[Handle, Info] : Data) { - OS << "Binding for "; - Handle->print(OS); - OS << "\n"; - Info.print(OS); - OS << "\n"; - } - + DXILResourceMap &DRM = AM.getResult(M); + DRM.print(OS); return PreservedAnalyses::all(); } @@ -745,8 +791,7 @@ void DXILResourceWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { } bool DXILResourceWrapperPass::runOnModule(Module &M) { - ResourceMap.reset(new DXILResourceMap()); - ResourceMapper(M, *ResourceMap).mapResources(); + ResourceMap.reset(new 
DXILResourceMap(ResourceMapper(M).mapResources())); return false; } @@ -757,13 +802,7 @@ void DXILResourceWrapperPass::print(raw_ostream &OS, const Module *) const { OS << "No resource map has been built!\n"; return; } - for (const auto &[Handle, Info] : *ResourceMap) { - OS << "Binding for "; - Handle->print(OS); - OS << "\n"; - Info.print(OS); - OS << "\n"; - } + ResourceMap->print(OS); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll index 65802c6d1ff87a..b26a185423597d 100644 --- a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll +++ b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll @@ -3,55 +3,48 @@ @G = external constant <4 x float>, align 4 define void @test_typedbuffer() { - ; RWBuffer Buf : register(u5, space3) - %typed0 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) - @llvm.dx.handle.fromBinding.tdx.TypedBuffer_f32_1_0( - i32 3, i32 5, i32 1, i32 0, i1 false) - ; CHECK: Binding for %typed0 + ; ByteAddressBuffer Buf : register(t8, space1) + %srv0 = call target("dx.RawBuffer", i8, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_i8_0_0t( + i32 1, i32 8, i32 1, i32 0, i1 false) + ; CHECK: Binding [[SRV0:[0-9]+]]: ; CHECK: Symbol: ptr undef ; CHECK: Name: "" ; CHECK: Binding: ; CHECK: Record ID: 0 - ; CHECK: Space: 3 - ; CHECK: Lower Bound: 5 + ; CHECK: Space: 1 + ; CHECK: Lower Bound: 8 ; CHECK: Size: 1 - ; CHECK: Class: UAV - ; CHECK: Kind: TypedBuffer - ; CHECK: Globally Coherent: 0 - ; CHECK: HasCounter: 0 - ; CHECK: IsROV: 0 - ; CHECK: Element Type: f32 - ; CHECK: Element Count: 4 + ; CHECK: Class: SRV + ; CHECK: Kind: RawBuffer - ; RWBuffer Buf : register(u7, space2) - %typed1 = call target("dx.TypedBuffer", i32, 1, 0, 1) - @llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_1_0t( - i32 2, i32 7, i32 1, i32 0, i1 false) - ; CHECK: Binding for %typed1 + ; struct S { float4 a; uint4 b; }; + ; StructuredBuffer Buf : 
register(t2, space4) + %srv1 = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0t( + i32 4, i32 2, i32 1, i32 0, i1 false) + ; CHECK: Binding [[SRV1:[0-9]+]]: ; CHECK: Symbol: ptr undef ; CHECK: Name: "" ; CHECK: Binding: ; CHECK: Record ID: 1 - ; CHECK: Space: 2 - ; CHECK: Lower Bound: 7 + ; CHECK: Space: 4 + ; CHECK: Lower Bound: 2 ; CHECK: Size: 1 - ; CHECK: Class: UAV - ; CHECK: Kind: TypedBuffer - ; CHECK: Globally Coherent: 0 - ; CHECK: HasCounter: 0 - ; CHECK: IsROV: 0 - ; CHECK: Element Type: i32 - ; CHECK: Element Count: 1 + ; CHECK: Class: SRV + ; CHECK: Kind: StructuredBuffer + ; CHECK: Buffer Stride: 32 + ; CHECK: Alignment: 4 ; Buffer Buf[24] : register(t3, space5) - %typed2 = call target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) + %srv2 = call target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_0_0t( i32 5, i32 3, i32 24, i32 0, i1 false) - ; CHECK: Binding for %typed2 + ; CHECK: Binding [[SRV2:[0-9]+]]: ; CHECK: Symbol: ptr undef ; CHECK: Name: "" ; CHECK: Binding: - ; CHECK: Record ID: 0 + ; CHECK: Record ID: 2 ; CHECK: Space: 5 ; CHECK: Lower Bound: 3 ; CHECK: Size: 24 @@ -60,67 +53,82 @@ define void @test_typedbuffer() { ; CHECK: Element Type: u32 ; CHECK: Element Count: 4 - ret void -} + ; RWBuffer Buf : register(u7, space2) + %uav0 = call target("dx.TypedBuffer", i32, 1, 0, 1) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_1_0t( + i32 2, i32 7, i32 1, i32 0, i1 false) + ; CHECK: Binding [[UAV0:[0-9]+]]: + ; CHECK: Symbol: ptr undef + ; CHECK: Name: "" + ; CHECK: Binding: + ; CHECK: Record ID: 0 + ; CHECK: Space: 2 + ; CHECK: Lower Bound: 7 + ; CHECK: Size: 1 + ; CHECK: Class: UAV + ; CHECK: Kind: TypedBuffer + ; CHECK: Globally Coherent: 0 + ; CHECK: HasCounter: 0 + ; CHECK: IsROV: 0 + ; CHECK: Element Type: i32 + ; CHECK: Element Count: 1 -define void @test_structbuffer() { - ; struct S { float4 a; uint4 b; }; - ; StructuredBuffer 
Buf : register(t2, space4) - %struct0 = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0) - @llvm.dx.handle.fromBinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0t( - i32 4, i32 2, i32 1, i32 0, i1 false) - ; CHECK: Binding for %struct0 + ; RWBuffer Buf : register(u5, space3) + %uav1 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_f32_1_0( + i32 3, i32 5, i32 1, i32 0, i1 false) + ; CHECK: Binding [[UAV1:[0-9]+]]: ; CHECK: Symbol: ptr undef ; CHECK: Name: "" ; CHECK: Binding: ; CHECK: Record ID: 1 - ; CHECK: Space: 4 - ; CHECK: Lower Bound: 2 + ; CHECK: Space: 3 + ; CHECK: Lower Bound: 5 ; CHECK: Size: 1 - ; CHECK: Class: SRV - ; CHECK: Kind: StructuredBuffer - ; CHECK: Buffer Stride: 32 - ; CHECK: Alignment: 4 - - ret void -} + ; CHECK: Class: UAV + ; CHECK: Kind: TypedBuffer + ; CHECK: Globally Coherent: 0 + ; CHECK: HasCounter: 0 + ; CHECK: IsROV: 0 + ; CHECK: Element Type: f32 + ; CHECK: Element Count: 4 -define void @test_bytebuffer() { - ; ByteAddressBuffer Buf : register(t8, space1) - %byteaddr0 = call target("dx.RawBuffer", i8, 0, 0) - @llvm.dx.handle.fromBinding.tdx.RawBuffer_i8_0_0t( - i32 1, i32 8, i32 1, i32 0, i1 false) - ; CHECK: Binding for %byteaddr0 + ; RWBuffer BufferArray[10] : register(u0, space4) + ; RWBuffer Buf = BufferArray[0] + %uav2_1 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_f32_1_0( + i32 4, i32 0, i32 10, i32 0, i1 false) + ; RWBuffer Buf = BufferArray[5] + %uav2_2 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_f32_1_0( + i32 4, i32 0, i32 10, i32 5, i1 false) + ; CHECK: Binding [[UAV2:[0-9]+]]: ; CHECK: Symbol: ptr undef ; CHECK: Name: "" ; CHECK: Binding: ; CHECK: Record ID: 2 - ; CHECK: Space: 1 - ; CHECK: Lower Bound: 8 - ; CHECK: Size: 1 - ; CHECK: Class: SRV - ; CHECK: Kind: RawBuffer + ; CHECK: Space: 4 + ; CHECK: Lower Bound: 0 + ; CHECK: Size: 10 + ; CHECK: Class: UAV 
+ ; CHECK: Kind: TypedBuffer + ; CHECK: Globally Coherent: 0 + ; CHECK: HasCounter: 0 + ; CHECK: IsROV: 0 + ; CHECK: Element Type: f32 + ; CHECK: Element Count: 4 + + ; CHECK-NOT: Binding {{[0-9]+}}: ret void } -; Note: We need declarations for each handle.fromBinding in the same -; order as they appear in source to ensure that we can put our CHECK -; lines along side the thing they're checking. -declare target("dx.TypedBuffer", <4 x float>, 1, 0, 0) - @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0t( - i32, i32, i32, i32, i1) #0 -declare target("dx.TypedBuffer", i32, 1, 0, 1) - @llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_1_0_1t( - i32, i32, i32, i32, i1) #0 -declare target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) - @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4i32_0_0_0t( - i32, i32, i32, i32, i1) #0 -declare target("dx.RawBuffer", { <4 x float>, <4 x i32> }, 0, 0) - @llvm.dx.handle.fromBinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0t( - i32, i32, i32, i32, i1) #0 -declare target("dx.RawBuffer", i8, 0, 0) - @llvm.dx.handle.fromBinding.tdx.RawBuffer_i8_0_0t( - i32, i32, i32, i32, i1) #0 +; CHECK-DAG: Call bound to [[SRV0]]: %srv0 = +; CHECK-DAG: Call bound to [[SRV1]]: %srv1 = +; CHECK-DAG: Call bound to [[SRV2]]: %srv2 = +; CHECK-DAG: Call bound to [[UAV0]]: %uav0 = +; CHECK-DAG: Call bound to [[UAV1]]: %uav1 = +; CHECK-DAG: Call bound to [[UAV2]]: %uav2_1 = +; CHECK-DAG: Call bound to [[UAV2]]: %uav2_2 = attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } From a0fac6f2d868316a88aa5b62963e26dca9bfa372 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 23 Aug 2024 12:09:19 -0700 Subject: [PATCH 367/426] [lldb] Add missing initialization (NFC) --- lldb/include/lldb/Target/StackFrameRecognizer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/include/lldb/Target/StackFrameRecognizer.h b/lldb/include/lldb/Target/StackFrameRecognizer.h index 8acebc12c4b1dc..2f5c5caa6a4561 100644 --- 
a/lldb/include/lldb/Target/StackFrameRecognizer.h +++ b/lldb/include/lldb/Target/StackFrameRecognizer.h @@ -146,7 +146,7 @@ class StackFrameRecognizerManager { }; std::deque m_recognizers; - uint16_t m_generation; + uint16_t m_generation = 0; }; /// \class ValueObjectRecognizerSynthesizedValue From 52a7116f5c6ada234f47f7794aaf501a3692b997 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Fri, 23 Aug 2024 21:17:38 +0200 Subject: [PATCH 368/426] [LLD][COFF] Add support for CHPE code ranges metadata. (#105741) This is part of CHPE metadata containing a sorted list of x86_64 export thunks RVAs and sizes. --- lld/COFF/Chunks.cpp | 16 ++++++++++++++++ lld/COFF/Chunks.h | 11 +++++++++++ lld/COFF/Driver.cpp | 2 ++ lld/COFF/Writer.cpp | 10 ++++++++++ lld/test/COFF/Inputs/loadconfig-arm64ec.s | 4 ++-- lld/test/COFF/arm64ec-export-thunks.test | 9 +++++++-- lld/test/COFF/arm64ec-patchable-thunks.test | 4 +++- 7 files changed, 51 insertions(+), 5 deletions(-) diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index 4e3a564ebacd87..72a9ad05ca11c1 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -1078,6 +1078,22 @@ void ECExportThunkChunk::writeTo(uint8_t *buf) const { write32le(buf + 10, target->getRVA() - rva - 14); } +size_t CHPECodeRangesChunk::getSize() const { + return exportThunks.size() * sizeof(chpe_code_range_entry); +} + +void CHPECodeRangesChunk::writeTo(uint8_t *buf) const { + auto ranges = reinterpret_cast(buf); + + for (uint32_t i = 0; i < exportThunks.size(); i++) { + Chunk *thunk = exportThunks[i].first; + uint32_t start = thunk->getRVA(); + ranges[i].StartRva = start; + ranges[i].EndRva = start + thunk->getSize(); + ranges[i].EntryPoint = start; + } +} + size_t CHPERedirectionChunk::getSize() const { return exportThunks.size() * sizeof(chpe_redirection_entry); } diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h index 015df41b04c67d..fe202008971a54 100644 --- a/lld/COFF/Chunks.h +++ b/lld/COFF/Chunks.h @@ -749,6 +749,17 @@ class ECCodeMapChunk : 
public NonSectionChunk { std::vector ↦ }; +class CHPECodeRangesChunk : public NonSectionChunk { +public: + CHPECodeRangesChunk(std::vector> &exportThunks) + : exportThunks(exportThunks) {} + size_t getSize() const override; + void writeTo(uint8_t *buf) const override; + +private: + std::vector> &exportThunks; +}; + class CHPERedirectionChunk : public NonSectionChunk { public: CHPERedirectionChunk(std::vector> &exportThunks) diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 472f5074ba8b8c..3ef9fa3f65c6a6 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -2444,6 +2444,8 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { ctx.symtab.addAbsolute("__arm64x_redirection_metadata_count", 0); ctx.symtab.addAbsolute("__hybrid_code_map", 0); ctx.symtab.addAbsolute("__hybrid_code_map_count", 0); + ctx.symtab.addAbsolute("__x64_code_ranges_to_entry_points", 0); + ctx.symtab.addAbsolute("__x64_code_ranges_to_entry_points_count", 0); } if (config->pseudoRelocs) { diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 358d16fe330cea..0360e186ecf0cf 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -2071,6 +2071,12 @@ void Writer::createECChunks() { replaceSymbol(codeMapSym, codeMapSym->getName(), codeMapChunk); + CHPECodeRangesChunk *ranges = make(exportThunks); + rdataSec->addChunk(ranges); + Symbol *rangesSym = + ctx.symtab.findUnderscore("__x64_code_ranges_to_entry_points"); + replaceSymbol(rangesSym, rangesSym->getName(), ranges); + CHPERedirectionChunk *entryPoints = make(exportThunks); a64xrmSec->addChunk(entryPoints); Symbol *entryPointsSym = @@ -2186,6 +2192,10 @@ void Writer::setECSymbols() { pdata.first->getRVA()); } + Symbol *rangesCountSym = + ctx.symtab.findUnderscore("__x64_code_ranges_to_entry_points_count"); + cast(rangesCountSym)->setVA(exportThunks.size()); + Symbol *entryPointCountSym = ctx.symtab.findUnderscore("__arm64x_redirection_metadata_count"); cast(entryPointCountSym)->setVA(exportThunks.size()); diff --git 
a/lld/test/COFF/Inputs/loadconfig-arm64ec.s b/lld/test/COFF/Inputs/loadconfig-arm64ec.s index 62a6d0cab642e9..78e7fba43a0a4d 100644 --- a/lld/test/COFF/Inputs/loadconfig-arm64ec.s +++ b/lld/test/COFF/Inputs/loadconfig-arm64ec.s @@ -66,7 +66,7 @@ __chpe_metadata: .word 1 .rva __hybrid_code_map .word __hybrid_code_map_count - .word 0 // __x64_code_ranges_to_entry_points + .rva __x64_code_ranges_to_entry_points .rva __arm64x_redirection_metadata .rva __os_arm64x_dispatch_call_no_redirect .rva __os_arm64x_dispatch_ret @@ -75,7 +75,7 @@ __chpe_metadata: .rva __os_arm64x_check_icall_cfg .word 0 // __arm64x_native_entrypoint .word 0 // __hybrid_auxiliary_iat - .word 0 // __x64_code_ranges_to_entry_points_count + .word __x64_code_ranges_to_entry_points_count .word __arm64x_redirection_metadata_count .rva __os_arm64x_get_x64_information .rva __os_arm64x_set_x64_information diff --git a/lld/test/COFF/arm64ec-export-thunks.test b/lld/test/COFF/arm64ec-export-thunks.test index 2e4cfd6203b751..809fac1f24a7dc 100644 --- a/lld/test/COFF/arm64ec-export-thunks.test +++ b/lld/test/COFF/arm64ec-export-thunks.test @@ -58,7 +58,10 @@ EXP-CHPE: CodeMap [ EXP-CHPE-NEXT: 0x1000 - 0x100C ARM64EC EXP-CHPE-NEXT: 0x2000 - 0x3020 X64 EXP-CHPE-NEXT: ] -EXP-CHPE-NEXT: CodeRangesToEntryPoints: 0 +EXP-CHPE-NEXT: CodeRangesToEntryPoints [ +EXP-CHPE-NEXT: 0x3000 - 0x3010 -> 0x3000 +EXP-CHPE-NEXT: 0x3010 - 0x3020 -> 0x3010 +EXP-CHPE-NEXT: ] EXP-CHPE-NEXT: RedirectionMetadata [ EXP-CHPE-NEXT: 0x3000 -> 0x1000 EXP-CHPE-NEXT: 0x3010 -> 0x1000 @@ -121,7 +124,9 @@ ENTRY-CHPE: CodeMap [ ENTRY-CHPE-NEXT: 0x1000 - 0x100C ARM64EC ENTRY-CHPE-NEXT: 0x2000 - 0x2010 X64 ENTRY-CHPE-NEXT: ] -ENTRY-CHPE-NEXT: CodeRangesToEntryPoints: 0 +ENTRY-CHPE-NEXT: CodeRangesToEntryPoints [ +ENTRY-CHPE-NEXT: 0x2000 - 0x2010 -> 0x2000 +ENTRY-CHPE-NEXT: ] ENTRY-CHPE-NEXT: RedirectionMetadata [ ENTRY-CHPE-NEXT: 0x2000 -> 0x1000 ENTRY-CHPE-NEXT: ] diff --git a/lld/test/COFF/arm64ec-patchable-thunks.test 
b/lld/test/COFF/arm64ec-patchable-thunks.test index 044f3c7cebdf8e..5cebe7cc27ad63 100644 --- a/lld/test/COFF/arm64ec-patchable-thunks.test +++ b/lld/test/COFF/arm64ec-patchable-thunks.test @@ -34,7 +34,9 @@ PATCH-CHPE: CodeMap [ PATCH-CHPE-NEXT: 0x1000 - 0x1008 ARM64EC PATCH-CHPE-NEXT: 0x2000 - 0x2010 X64 PATCH-CHPE-NEXT: ] -PATCH-CHPE-NEXT: CodeRangesToEntryPoints: 0 +PATCH-CHPE-NEXT: CodeRangesToEntryPoints [ +PATCH-CHPE-NEXT: 0x2000 - 0x2010 -> 0x2000 +PATCH-CHPE-NEXT: ] PATCH-CHPE-NEXT: RedirectionMetadata [ PATCH-CHPE-NEXT: 0x2000 -> 0x1000 PATCH-CHPE-NEXT: ] From c505ce9df7006edabf402a73782121c44b697289 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Fri, 23 Aug 2024 15:38:21 -0400 Subject: [PATCH 369/426] Deprecate -fheinous-gnu-extensions; introduce a new warning flag (#105821) The new warning flag is `-Winvalid-gnu-asm-cast`, which is enabled by default and is a downgradable diagnostic which defaults to an error. This language dialect flag only controls whether a single diagnostic is emitted as a warning or as an error, and has never been expanded to include other behaviors. Given the rather perjorative name, it's better for us to just expose a diagnostic flag for the one warning in question and let the user elect to do `-Wno-error=` if they need to. There's not a lot of use of the language dialect flag in the wild, but there is some use of it. For the time being, this aliases the -f flag to `-Wno-error=invalid-gnu-asm-cast`, but the -f flag can eventually be removed. 
--- clang/docs/ReleaseNotes.rst | 9 +++++++++ clang/include/clang/Basic/DiagnosticSemaKinds.td | 8 ++------ clang/include/clang/Basic/LangOptions.def | 1 - clang/include/clang/Driver/Options.td | 10 ++++++++-- clang/lib/Sema/SemaStmtAsm.cpp | 9 ++------- clang/test/Analysis/asm.cpp | 2 +- clang/test/Analysis/cfg.c | 2 +- clang/test/Analysis/cfg.cpp | 4 ++-- clang/test/Driver/heinous-gnu-extensions.c | 5 +++++ clang/test/Misc/warning-flags.c | 3 +-- clang/test/Sema/heinous-extensions-off.c | 13 ++++++------- clang/test/Sema/heinous-extensions-on.c | 6 +++--- 12 files changed, 40 insertions(+), 32 deletions(-) create mode 100644 clang/test/Driver/heinous-gnu-extensions.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 17a707102d041f..798f59009af3c3 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -174,6 +174,10 @@ New Compiler Flags Deprecated Compiler Flags ------------------------- +- ``-fheinous-gnu-extensions`` is deprecated; it is now equivalent to + specifying ``-Wno-error=invalid-gnu-asm-cast`` and may be removed in the + future. + Modified Compiler Flags ----------------------- @@ -238,6 +242,11 @@ Improvements to Clang's diagnostics - Improved diagnostic when trying to befriend a concept. (#GH45182). +- Added the ``-Winvalid-gnu-asm-cast`` diagnostic group to control warnings + about use of "noop" casts for lvalues (a GNU extension). This diagnostic is + a warning which defaults to being an error, is enabled by default, and is + also controlled by the now-deprecated ``-fheinous-gnu-extensions`` flag. 
+ Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index ede3435d3e1b71..edf22b909c4d57 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -9365,9 +9365,6 @@ let CategoryName = "Inline Assembly Issue" in { "invalid input size for constraint '%0'">; def err_asm_invalid_output_size : Error< "invalid output size for constraint '%0'">; - def err_invalid_asm_cast_lvalue : Error< - "invalid use of a cast in a inline asm context requiring an lvalue: " - "remove the cast or build with -fheinous-gnu-extensions">; def err_invalid_asm_value_for_constraint : Error <"value '%0' out of range for constraint '%1'">; def err_asm_non_addr_value_in_memory_constraint : Error < @@ -9381,9 +9378,8 @@ let CategoryName = "Inline Assembly Issue" in { def warn_asm_label_on_auto_decl : Warning< "ignored asm label '%0' on automatic variable">; def warn_invalid_asm_cast_lvalue : Warning< - "invalid use of a cast in an inline asm context requiring an lvalue: " - "accepted due to -fheinous-gnu-extensions, but clang may remove support " - "for this in the future">; + "invalid use of a cast in an inline asm context requiring an lvalue">, + InGroup>, DefaultError; def warn_asm_mismatched_size_modifier : Warning< "value size does not match register size specified by the constraint " "and modifier">, diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index d454a7ff2f8cf4..956d9a2d2434c4 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -190,7 +190,6 @@ LANGOPT(POSIXThreads , 1, 0, "POSIX thread support") LANGOPT(Blocks , 1, 0, "blocks extension to C") BENIGN_LANGOPT(EmitAllDecls , 1, 0, "emitting all declarations") LANGOPT(MathErrno , 1, 1, "errno in math functions") 
-BENIGN_LANGOPT(HeinousExtensions , 1, 0, "extensions that we really don't like and may be ripped out at any time") LANGOPT(Modules , 1, 0, "modules semantics") COMPATIBLE_LANGOPT(CPlusPlusModules, 1, 0, "C++ modules syntax") LANGOPT(SkipODRCheckInGMF, 1, 0, "Skip ODR checks for decls in the global module fragment") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 7e40e99e9ba252..4bf604d46a0f70 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1011,6 +1011,8 @@ def Wwrite_strings : Flag<["-"], "Wwrite-strings">, Group, Flags<[HelpHidden]>, Visibility<[ClangOption, CC1Option]>; def Wno_write_strings : Flag<["-"], "Wno-write-strings">, Group, Flags<[HelpHidden]>, Visibility<[ClangOption, CC1Option]>; +def Winvalid_gnu_asm_cast : Flag<["-"], "Winvalid-gnu-asm-cast">, Group, + Flags<[HelpHidden]>, Visibility<[ClangOption, CC1Option]>; def W_Joined : Joined<["-"], "W">, Group, Visibility<[ClangOption, CC1Option, CLOption, DXCOption, FC1Option, FlangOption]>, MetaVarName<"">, HelpText<"Enable the specified warning">; @@ -2761,9 +2763,13 @@ defm gnu89_inline : BoolFOption<"gnu89-inline", NegFlag>, ShouldParseIf; def fgnu_runtime : Flag<["-"], "fgnu-runtime">, Group, HelpText<"Generate output compatible with the standard GNU Objective-C runtime">; +// This used to be a standalone flag but is now mapped to +// -Wno-error=invalid-gnu-asm-cast, which is the only thing the flag used to +// control. 
def fheinous_gnu_extensions : Flag<["-"], "fheinous-gnu-extensions">, - Visibility<[ClangOption, CC1Option]>, - MarshallingInfoFlag>; + Alias, AliasArgs<["no-error=invalid-gnu-asm-cast"]>, + HelpText<"(Deprecated) Controls whether '-Winvalid-gnu-asm-cast' defaults to " + "an error or a warning">; def filelist : Separate<["-"], "filelist">, Flags<[LinkerInput]>, Group; def : Flag<["-"], "findirect-virtual-calls">, Alias; diff --git a/clang/lib/Sema/SemaStmtAsm.cpp b/clang/lib/Sema/SemaStmtAsm.cpp index 32d42f3c3f3bb7..245969a03777e9 100644 --- a/clang/lib/Sema/SemaStmtAsm.cpp +++ b/clang/lib/Sema/SemaStmtAsm.cpp @@ -71,13 +71,8 @@ static void removeLValueToRValueCast(Expr *E) { /// and fix the argument with removing LValueToRValue cast from the expression. static void emitAndFixInvalidAsmCastLValue(const Expr *LVal, Expr *BadArgument, Sema &S) { - if (!S.getLangOpts().HeinousExtensions) { - S.Diag(LVal->getBeginLoc(), diag::err_invalid_asm_cast_lvalue) - << BadArgument->getSourceRange(); - } else { - S.Diag(LVal->getBeginLoc(), diag::warn_invalid_asm_cast_lvalue) - << BadArgument->getSourceRange(); - } + S.Diag(LVal->getBeginLoc(), diag::warn_invalid_asm_cast_lvalue) + << BadArgument->getSourceRange(); removeLValueToRValueCast(BadArgument); } diff --git a/clang/test/Analysis/asm.cpp b/clang/test/Analysis/asm.cpp index 3181aea870c8aa..b17ab04994d249 100644 --- a/clang/test/Analysis/asm.cpp +++ b/clang/test/Analysis/asm.cpp @@ -1,5 +1,5 @@ // RUN: %clang_analyze_cc1 -triple=x86_64-unknown-unknown \ -// RUN: -analyzer-checker debug.ExprInspection,core -fheinous-gnu-extensions -w %s -verify +// RUN: -analyzer-checker debug.ExprInspection,core -Wno-error=invalid-gnu-asm-cast -w %s -verify int clang_analyzer_eval(int); diff --git a/clang/test/Analysis/cfg.c b/clang/test/Analysis/cfg.c index fc2523859e49b4..e21f6109dbd597 100644 --- a/clang/test/Analysis/cfg.c +++ b/clang/test/Analysis/cfg.c @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=debug.DumpCFG 
-triple x86_64-apple-darwin12 -fheinous-gnu-extensions %s > %t 2>&1 +// RUN: %clang_analyze_cc1 -analyzer-checker=debug.DumpCFG -triple x86_64-apple-darwin12 -Wno-error=invalid-gnu-asm-cast %s > %t 2>&1 // RUN: FileCheck --input-file=%t --check-prefix=CHECK %s // This file is the C version of cfg.cpp. diff --git a/clang/test/Analysis/cfg.cpp b/clang/test/Analysis/cfg.cpp index dadf157be1a54d..44a89df28e3b29 100644 --- a/clang/test/Analysis/cfg.cpp +++ b/clang/test/Analysis/cfg.cpp @@ -1,6 +1,6 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=debug.DumpCFG -triple x86_64-apple-darwin12 -fheinous-gnu-extensions -std=c++11 -analyzer-config cfg-rich-constructors=false %s > %t 2>&1 +// RUN: %clang_analyze_cc1 -analyzer-checker=debug.DumpCFG -triple x86_64-apple-darwin12 -Wno-error=invalid-gnu-asm-cast -std=c++11 -analyzer-config cfg-rich-constructors=false %s > %t 2>&1 // RUN: FileCheck --input-file=%t -check-prefixes=CHECK,WARNINGS %s -// RUN: %clang_analyze_cc1 -analyzer-checker=debug.DumpCFG -triple x86_64-apple-darwin12 -fheinous-gnu-extensions -std=c++11 -analyzer-config cfg-rich-constructors=true %s > %t 2>&1 +// RUN: %clang_analyze_cc1 -analyzer-checker=debug.DumpCFG -triple x86_64-apple-darwin12 -Wno-error=invalid-gnu-asm-cast -std=c++11 -analyzer-config cfg-rich-constructors=true %s > %t 2>&1 // RUN: FileCheck --input-file=%t -check-prefixes=CHECK,ANALYZER %s // This file tests how we construct two different flavors of the Clang CFG - diff --git a/clang/test/Driver/heinous-gnu-extensions.c b/clang/test/Driver/heinous-gnu-extensions.c new file mode 100644 index 00000000000000..e05dd7feb9ed8d --- /dev/null +++ b/clang/test/Driver/heinous-gnu-extensions.c @@ -0,0 +1,5 @@ +// RUN: %clang -### -fsyntax-only -fheinous-gnu-extensions %s 2>&1 | FileCheck %s + +// CHECK: -Wno-error=invalid-gnu-asm-cast + +int main(void) {} diff --git a/clang/test/Misc/warning-flags.c b/clang/test/Misc/warning-flags.c index 35543e6a49ffda..e4e16f074cef33 100644 --- 
a/clang/test/Misc/warning-flags.c +++ b/clang/test/Misc/warning-flags.c @@ -18,7 +18,7 @@ This test serves two purposes: The list of warnings below should NEVER grow. It should gradually shrink to 0. -CHECK: Warnings without flags (64): +CHECK: Warnings without flags (63): CHECK-NEXT: ext_expected_semi_decl_list CHECK-NEXT: ext_missing_whitespace_after_macro_name @@ -55,7 +55,6 @@ CHECK-NEXT: warn_fe_macro_contains_embedded_newline CHECK-NEXT: warn_ignoring_ftabstop_value CHECK-NEXT: warn_implements_nscopying CHECK-NEXT: warn_incompatible_qualified_id -CHECK-NEXT: warn_invalid_asm_cast_lvalue CHECK-NEXT: warn_invalid_cpu_supports CHECK-NEXT: warn_maynot_respond CHECK-NEXT: warn_method_param_redefinition diff --git a/clang/test/Sema/heinous-extensions-off.c b/clang/test/Sema/heinous-extensions-off.c index beaf2dcbccaf96..6515879be2405d 100644 --- a/clang/test/Sema/heinous-extensions-off.c +++ b/clang/test/Sema/heinous-extensions-off.c @@ -1,10 +1,9 @@ // RUN: %clang_cc1 %s -verify -int foo(void) { - int a; - // PR3788 - asm("nop" : : "m"((int)(a))); // expected-error {{cast in a inline asm context requiring an lvalue}} - // PR3794 - asm("nop" : "=r"((unsigned)a)); // expected-error {{cast in a inline asm context requiring an lvalue}} +void foo(void) { + int a; + // PR3788 + asm("nop" : : "m"((int)(a))); // expected-error {{invalid use of a cast in an inline asm context requiring an lvalue}} + // PR3794 + asm("nop" : "=r"((unsigned)a)); // expected-error {{invalid use of a cast in an inline asm context requiring an lvalue}} } - diff --git a/clang/test/Sema/heinous-extensions-on.c b/clang/test/Sema/heinous-extensions-on.c index 9a348d8dfd572d..79c8fe14eefd3d 100644 --- a/clang/test/Sema/heinous-extensions-on.c +++ b/clang/test/Sema/heinous-extensions-on.c @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 %s -verify -fheinous-gnu-extensions +// RUN: %clang_cc1 %s -verify -Wno-error=invalid-gnu-asm-cast void foo(void) { int a; // PR3788 - asm("nop" : : "m"((int)(a))); // 
expected-warning {{cast in an inline asm context requiring an lvalue}} + asm("nop" : : "m"((int)(a))); // expected-warning {{invalid use of a cast in an inline asm context requiring an lvalue}} // PR3794 - asm("nop" : "=r"((unsigned)a)); // expected-warning {{cast in an inline asm context requiring an lvalue}} + asm("nop" : "=r"((unsigned)a)); // expected-warning {{invalid use of a cast in an inline asm context requiring an lvalue}} } From a74f0ab50bcb9d4b848ac8552051434bd00172dc Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Fri, 23 Aug 2024 15:44:46 -0400 Subject: [PATCH 370/426] Fix rowspan formatting; NFC --- clang/www/c_status.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/www/c_status.html b/clang/www/c_status.html index 1a0f320de04e83..91638331be877a 100644 --- a/clang/www/c_status.html +++ b/clang/www/c_status.html @@ -208,7 +208,7 @@

C23 implementation status

Clang 9 - TS 18661 Integration + TS 18661 Integration N2314 From aa61925eace86602ce1da00bda4a993719061df2 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Fri, 23 Aug 2024 12:58:12 -0700 Subject: [PATCH 371/426] [DirectX] Lower `@llvm.dx.handle.fromBinding` to DXIL ops The `@llvm.dx.handle.fromBinding` intrinsic is lowered either to the `CreateHandle` op or a pair of `CreateHandleFromBinding` and `AnnotateHandle` ops, depending on the DXIL version. Regardless of the DXIL version we need to emit metadata about the binding, but that's left to a separate change. These DXIL ops all need to return the `%dx.types.Handle` type, but the llvm intrinsic returns a target extension type. To facilitate changing the type of the operation and all of its users, we introduce `%llvm.dx.cast.handle`, which can cast between the two handle representations. Pull Request: https://github.com/llvm/llvm-project/pull/104251 --- llvm/docs/DirectX/DXILResources.rst | 6 +- llvm/include/llvm/IR/IntrinsicsDirectX.td | 3 + llvm/lib/Target/DirectX/DXIL.td | 24 +++ llvm/lib/Target/DirectX/DXILOpBuilder.cpp | 44 +++++ llvm/lib/Target/DirectX/DXILOpBuilder.h | 11 ++ llvm/lib/Target/DirectX/DXILOpLowering.cpp | 169 +++++++++++++++++- llvm/test/CodeGen/DirectX/CreateHandle.ll | 53 ++++++ .../DirectX/CreateHandleFromBinding.ll | 58 ++++++ 8 files changed, 360 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/CreateHandle.ll create mode 100644 llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst index aef88bc43b224d..a6ec80ce4329b2 100644 --- a/llvm/docs/DirectX/DXILResources.rst +++ b/llvm/docs/DirectX/DXILResources.rst @@ -162,6 +162,10 @@ the subsequent ``dx.op.annotateHandle`` operation in. Note that we don't have an analogue for `dx.op.createHandle`_, since ``dx.op.createHandleFromBinding`` subsumes it. 
+For simplicity of lowering, we match DXIL in using an index from the beginning +of the binding space rather than an index from the lower bound of the binding +itself. + .. _dx.op.createHandle: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#resource-handles .. list-table:: ``@llvm.dx.handle.fromBinding`` @@ -190,7 +194,7 @@ subsumes it. * - ``%index`` - 4 - ``i32`` - - Index of the resource to access. + - Index from the beginning of the binding space to access. * - ``%non-uniform`` - 5 - i1 diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index e959e70dc1cd4f..32af50b25f3904 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -30,6 +30,9 @@ def int_dx_handle_fromBinding [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty], [IntrNoMem]>; +// Cast between target extension handle types and dxil-style opaque handles +def int_dx_cast_handle : Intrinsic<[llvm_any_ty], [llvm_any_ty]>; + def int_dx_all : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>; def int_dx_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>; def int_dx_clamp : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index c4b278c109dbb9..83ea36ca048ad2 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -42,6 +42,8 @@ def FloatTy : DXILOpParamType; def DoubleTy : DXILOpParamType; def ResRetTy : DXILOpParamType; def HandleTy : DXILOpParamType; +def ResBindTy : DXILOpParamType; +def ResPropsTy : DXILOpParamType; class DXILOpClass; @@ -683,6 +685,14 @@ def Dot4 : DXILOp<56, dot4> { let attributes = [Attributes]; } +def CreateHandle : DXILOp<57, createHandle> { + let Doc = "creates the handle to a resource"; + // ResourceClass, RangeID, Index, NonUniform + let arguments = [Int8Ty, Int32Ty, Int32Ty, Int1Ty]; 
+ let result = HandleTy; + let stages = [Stages, Stages]; +} + def ThreadId : DXILOp<93, threadId> { let Doc = "Reads the thread ID"; let LLVMIntrinsic = int_dx_thread_id; @@ -722,3 +732,17 @@ def FlattenedThreadIdInGroup : DXILOp<96, flattenedThreadIdInGroup> { let stages = [Stages]; let attributes = [Attributes]; } + +def AnnotateHandle : DXILOp<217, annotateHandle> { + let Doc = "annotate handle with resource properties"; + let arguments = [HandleTy, ResPropsTy]; + let result = HandleTy; + let stages = [Stages]; +} + +def CreateHandleFromBinding : DXILOp<218, createHandleFromBinding> { + let Doc = "create resource handle from binding"; + let arguments = [ResBindTy, Int32Ty, Int1Ty]; + let result = HandleTy; + let stages = [Stages]; +} diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp index 8e26483d675c89..ab3ea61d05fc45 100644 --- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp +++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp @@ -208,6 +208,23 @@ static StructType *getHandleType(LLVMContext &Ctx) { Ctx); } +static StructType *getResBindType(LLVMContext &Context) { + if (auto *ST = StructType::getTypeByName(Context, "dx.types.ResBind")) + return ST; + Type *Int32Ty = Type::getInt32Ty(Context); + Type *Int8Ty = Type::getInt8Ty(Context); + return StructType::create({Int32Ty, Int32Ty, Int32Ty, Int8Ty}, + "dx.types.ResBind"); +} + +static StructType *getResPropsType(LLVMContext &Context) { + if (auto *ST = + StructType::getTypeByName(Context, "dx.types.ResourceProperties")) + return ST; + Type *Int32Ty = Type::getInt32Ty(Context); + return StructType::create({Int32Ty, Int32Ty}, "dx.types.ResourceProperties"); +} + static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx, Type *OverloadTy) { switch (Kind) { @@ -235,6 +252,10 @@ static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx, return getResRetType(OverloadTy, Ctx); case OpParamType::HandleTy: return getHandleType(Ctx); + case 
OpParamType::ResBindTy: + return getResBindType(Ctx); + case OpParamType::ResPropsTy: + return getResPropsType(Ctx); } llvm_unreachable("Invalid parameter kind"); return nullptr; @@ -430,6 +451,29 @@ CallInst *DXILOpBuilder::createOp(dxil::OpCode OpCode, ArrayRef Args, return *Result; } +StructType *DXILOpBuilder::getHandleType() { + return ::getHandleType(IRB.getContext()); +} + +Constant *DXILOpBuilder::getResBind(uint32_t LowerBound, uint32_t UpperBound, + uint32_t SpaceID, dxil::ResourceClass RC) { + Type *Int32Ty = IRB.getInt32Ty(); + Type *Int8Ty = IRB.getInt8Ty(); + return ConstantStruct::get( + getResBindType(IRB.getContext()), + {ConstantInt::get(Int32Ty, LowerBound), + ConstantInt::get(Int32Ty, UpperBound), + ConstantInt::get(Int32Ty, SpaceID), + ConstantInt::get(Int8Ty, llvm::to_underlying(RC))}); +} + +Constant *DXILOpBuilder::getResProps(uint32_t Word0, uint32_t Word1) { + Type *Int32Ty = IRB.getInt32Ty(); + return ConstantStruct::get( + getResPropsType(IRB.getContext()), + {ConstantInt::get(Int32Ty, Word0), ConstantInt::get(Int32Ty, Word1)}); +} + const char *DXILOpBuilder::getOpCodeName(dxil::OpCode DXILOp) { return ::getOpCodeName(DXILOp); } diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.h b/llvm/lib/Target/DirectX/DXILOpBuilder.h index 483d5ddc8b6197..4a55a8ac9eadb5 100644 --- a/llvm/lib/Target/DirectX/DXILOpBuilder.h +++ b/llvm/lib/Target/DirectX/DXILOpBuilder.h @@ -15,6 +15,7 @@ #include "DXILConstants.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/Support/DXILABI.h" #include "llvm/Support/Error.h" #include "llvm/TargetParser/Triple.h" @@ -22,6 +23,7 @@ namespace llvm { class Module; class IRBuilderBase; class CallInst; +class Constant; class Value; class Type; class FunctionType; @@ -44,6 +46,15 @@ class DXILOpBuilder { Expected tryCreateOp(dxil::OpCode Op, ArrayRef Args, Type *RetTy = nullptr); + /// Get the `%dx.types.Handle` type. 
+ StructType *getHandleType(); + + /// Get a constant `%dx.types.ResBind` value. + Constant *getResBind(uint32_t LowerBound, uint32_t UpperBound, + uint32_t SpaceID, dxil::ResourceClass RC); + /// Get a constant `%dx.types.ResourceProperties` value. + Constant *getResProps(uint32_t Word0, uint32_t Word1); + /// Return the name of the given opcode. static const char *getOpCodeName(dxil::OpCode DXILOp); diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index fb708a61dd318d..1f6d37087bc9f4 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -12,6 +12,7 @@ #include "DXILOpBuilder.h" #include "DirectX.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/DXILResource.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" @@ -20,6 +21,7 @@ #include "llvm/IR/IntrinsicsDirectX.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" @@ -74,9 +76,11 @@ namespace { class OpLowerer { Module &M; DXILOpBuilder OpBuilder; + DXILResourceMap &DRM; + SmallVector CleanupCasts; public: - OpLowerer(Module &M) : M(M), OpBuilder(M) {} + OpLowerer(Module &M, DXILResourceMap &DRM) : M(M), OpBuilder(M), DRM(DRM) {} void replaceFunction(Function &F, llvm::function_ref ReplaceCall) { @@ -119,6 +123,142 @@ class OpLowerer { }); } + /// Create a cast between a `target("dx")` type and `dx.types.Handle`, which + /// is intended to be removed by the end of lowering. This is used to allow + /// lowering of ops which need to change their return or argument types in a + /// piecemeal way - we can add the casts in to avoid updating all of the uses + /// or defs, and by the end all of the casts will be redundant. 
+ Value *createTmpHandleCast(Value *V, Type *Ty) { + Function *CastFn = Intrinsic::getDeclaration(&M, Intrinsic::dx_cast_handle, + {Ty, V->getType()}); + CallInst *Cast = OpBuilder.getIRB().CreateCall(CastFn, {V}); + CleanupCasts.push_back(Cast); + return Cast; + } + + void cleanupHandleCasts() { + SmallVector ToRemove; + SmallVector CastFns; + + for (CallInst *Cast : CleanupCasts) { + // These casts were only put in to ease the move from `target("dx")` types + // to `dx.types.Handle in a piecemeal way. At this point, all of the + // non-cast uses should now be `dx.types.Handle`, and remaining casts + // should all form pairs to and from the now unused `target("dx")` type. + CastFns.push_back(Cast->getCalledFunction()); + + // If the cast is not to `dx.types.Handle`, it should be the first part of + // the pair. Keep track so we can remove it once it has no more uses. + if (Cast->getType() != OpBuilder.getHandleType()) { + ToRemove.push_back(Cast); + continue; + } + // Otherwise, we're the second handle in a pair. Forward the arguments and + // remove the (second) cast. + CallInst *Def = cast(Cast->getOperand(0)); + assert(Def->getIntrinsicID() == Intrinsic::dx_cast_handle && + "Unbalanced pair of temporary handle casts"); + Cast->replaceAllUsesWith(Def->getOperand(0)); + Cast->eraseFromParent(); + } + for (CallInst *Cast : ToRemove) { + assert(Cast->user_empty() && "Temporary handle cast still has users"); + Cast->eraseFromParent(); + } + + // Deduplicate the cast functions so that we only erase each one once. 
+ llvm::sort(CastFns); + CastFns.erase(llvm::unique(CastFns), CastFns.end()); + for (Function *F : CastFns) + F->eraseFromParent(); + + CleanupCasts.clear(); + } + + void lowerToCreateHandle(Function &F) { + IRBuilder<> &IRB = OpBuilder.getIRB(); + Type *Int8Ty = IRB.getInt8Ty(); + Type *Int32Ty = IRB.getInt32Ty(); + + replaceFunction(F, [&](CallInst *CI) -> Error { + IRB.SetInsertPoint(CI); + + auto *It = DRM.find(CI); + assert(It != DRM.end() && "Resource not in map?"); + dxil::ResourceInfo &RI = *It; + const auto &Binding = RI.getBinding(); + + std::array Args{ + ConstantInt::get(Int8Ty, llvm::to_underlying(RI.getResourceClass())), + ConstantInt::get(Int32Ty, Binding.RecordID), CI->getArgOperand(3), + CI->getArgOperand(4)}; + Expected OpCall = + OpBuilder.tryCreateOp(OpCode::CreateHandle, Args); + if (Error E = OpCall.takeError()) + return E; + + Value *Cast = createTmpHandleCast(*OpCall, CI->getType()); + + CI->replaceAllUsesWith(Cast); + CI->eraseFromParent(); + return Error::success(); + }); + } + + void lowerToBindAndAnnotateHandle(Function &F) { + IRBuilder<> &IRB = OpBuilder.getIRB(); + + replaceFunction(F, [&](CallInst *CI) -> Error { + IRB.SetInsertPoint(CI); + + auto *It = DRM.find(CI); + assert(It != DRM.end() && "Resource not in map?"); + dxil::ResourceInfo &RI = *It; + + const auto &Binding = RI.getBinding(); + std::pair Props = RI.getAnnotateProps(); + + // For `CreateHandleFromBinding` we need the upper bound rather than the + // size, so we need to be careful about the difference for "unbounded". + uint32_t Unbounded = std::numeric_limits::max(); + uint32_t UpperBound = Binding.Size == Unbounded + ? 
Unbounded + : Binding.LowerBound + Binding.Size - 1; + Constant *ResBind = OpBuilder.getResBind( + Binding.LowerBound, UpperBound, Binding.Space, RI.getResourceClass()); + std::array BindArgs{ResBind, CI->getArgOperand(3), + CI->getArgOperand(4)}; + Expected OpBind = + OpBuilder.tryCreateOp(OpCode::CreateHandleFromBinding, BindArgs); + if (Error E = OpBind.takeError()) + return E; + + std::array AnnotateArgs{ + *OpBind, OpBuilder.getResProps(Props.first, Props.second)}; + Expected OpAnnotate = + OpBuilder.tryCreateOp(OpCode::AnnotateHandle, AnnotateArgs); + if (Error E = OpAnnotate.takeError()) + return E; + + Value *Cast = createTmpHandleCast(*OpAnnotate, CI->getType()); + + CI->replaceAllUsesWith(Cast); + CI->eraseFromParent(); + + return Error::success(); + }); + } + + /// Lower `dx.handle.fromBinding` intrinsics depending on the shader model and + /// taking into account binding information from DXILResourceAnalysis. + void lowerHandleFromBinding(Function &F) { + Triple TT(Triple(M.getTargetTriple())); + if (TT.getDXILVersion() < VersionTuple(1, 6)) + lowerToCreateHandle(F); + else + lowerToBindAndAnnotateHandle(F); + } + bool lowerIntrinsics() { bool Updated = false; @@ -134,33 +274,47 @@ class OpLowerer { replaceFunctionWithOp(F, OpCode); \ break; #include "DXILOperation.inc" + case Intrinsic::dx_handle_fromBinding: + lowerHandleFromBinding(F); } Updated = true; } + if (Updated) + cleanupHandleCasts(); + return Updated; } }; } // namespace -PreservedAnalyses DXILOpLowering::run(Module &M, ModuleAnalysisManager &) { - if (OpLowerer(M).lowerIntrinsics()) - return PreservedAnalyses::none(); - return PreservedAnalyses::all(); +PreservedAnalyses DXILOpLowering::run(Module &M, ModuleAnalysisManager &MAM) { + DXILResourceMap &DRM = MAM.getResult(M); + + bool MadeChanges = OpLowerer(M, DRM).lowerIntrinsics(); + if (!MadeChanges) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve(); + return PA; } namespace { class DXILOpLoweringLegacy : public 
ModulePass { public: bool runOnModule(Module &M) override { - return OpLowerer(M).lowerIntrinsics(); + DXILResourceMap &DRM = + getAnalysis().getResourceMap(); + + return OpLowerer(M, DRM).lowerIntrinsics(); } StringRef getPassName() const override { return "DXIL Op Lowering"; } DXILOpLoweringLegacy() : ModulePass(ID) {} static char ID; // Pass identification. void getAnalysisUsage(llvm::AnalysisUsage &AU) const override { - // Specify the passes that your pass depends on AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); } }; char DXILOpLoweringLegacy::ID = 0; @@ -168,6 +322,7 @@ char DXILOpLoweringLegacy::ID = 0; INITIALIZE_PASS_BEGIN(DXILOpLoweringLegacy, DEBUG_TYPE, "DXIL Op Lowering", false, false) +INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass) INITIALIZE_PASS_END(DXILOpLoweringLegacy, DEBUG_TYPE, "DXIL Op Lowering", false, false) diff --git a/llvm/test/CodeGen/DirectX/CreateHandle.ll b/llvm/test/CodeGen/DirectX/CreateHandle.ll new file mode 100644 index 00000000000000..13d59c6caf6c95 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/CreateHandle.ll @@ -0,0 +1,53 @@ +; RUN: opt -S -dxil-op-lower %s | FileCheck %s + +target triple = "dxil-pc-shadermodel6.0-compute" + +declare i32 @some_val(); + +define void @test_buffers() { + ; RWBuffer Buf : register(u5, space3) + %typed0 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0( + i32 3, i32 5, i32 1, i32 4, i1 false) + ; CHECK: call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 1, i32 4, i1 false) + ; CHECK-NOT: @llvm.dx.cast.handle + + ; RWBuffer Buf : register(u7, space2) + %typed1 = call target("dx.TypedBuffer", i32, 1, 0, 1) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_1_0_1t( + i32 2, i32 7, i32 1, i32 6, i1 false) + ; CHECK: call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 6, i1 false) + + ; Buffer Buf[24] : register(t3, space5) + ; Buffer typed2 = Buf[4] + ; Note that the index below is 3 + 4 = 7 + %typed2 
= call target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_0_0_0t( + i32 5, i32 3, i32 24, i32 7, i1 false) + ; CHECK: call %dx.types.Handle @dx.op.createHandle(i32 57, i8 0, i32 3, i32 7, i1 false) + + ; struct S { float4 a; uint4 b; }; + ; StructuredBuffer Buf : register(t2, space4) + %struct0 = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0t( + i32 4, i32 2, i32 1, i32 10, i1 true) + ; CHECK: call %dx.types.Handle @dx.op.createHandle(i32 57, i8 0, i32 2, i32 10, i1 true) + + ; ByteAddressBuffer Buf : register(t8, space1) + %byteaddr0 = call target("dx.RawBuffer", i8, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_i8_0_0t( + i32 1, i32 8, i32 1, i32 12, i1 false) + ; CHECK: call %dx.types.Handle @dx.op.createHandle(i32 57, i8 0, i32 1, i32 12, i1 false) + + ; Buffer Buf[] : register(t0) + ; Buffer typed3 = Buf[ix] + %typed3_ix = call i32 @some_val() + %typed3 = call target("dx.TypedBuffer", <4 x float>, 0, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_0_0_0t( + i32 0, i32 0, i32 -1, i32 %typed3_ix, i1 false) + ; CHECK: call %dx.types.Handle @dx.op.createHandle(i32 57, i8 0, i32 0, i32 %typed3_ix, i1 false) + + ret void +} + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll new file mode 100644 index 00000000000000..e78a0bf02e4ae3 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll @@ -0,0 +1,58 @@ +; RUN: opt -S -dxil-op-lower %s | FileCheck %s + +target triple = "dxil-pc-shadermodel6.6-compute" + +declare i32 @some_val(); + +define void @test_bindings() { + ; RWBuffer Buf : register(u5, space3) + %typed0 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0( + i32 3, i32 5, i32 1, i32 4, i1 false) + ; 
CHECK: [[BUF0:%[0-9]*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 5, i32 5, i32 3, i8 1 }, i32 4, i1 false) + ; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BUF0]], %dx.types.ResourceProperties { i32 4106, i32 1033 }) + + ; RWBuffer Buf : register(u7, space2) + %typed1 = call target("dx.TypedBuffer", i32, 1, 0, 1) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_1_0_0t( + i32 2, i32 7, i32 1, i32 6, i1 false) + ; CHECK: [[BUF1:%[0-9]*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 7, i32 7, i32 2, i8 1 }, i32 6, i1 false) + ; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BUF1]], %dx.types.ResourceProperties { i32 4106, i32 260 }) + + ; Buffer Buf[24] : register(t3, space5) + ; Buffer typed2 = Buf[4] + ; Note that the index below is 3 + 4 = 7 + %typed2 = call target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_0_0_0t( + i32 5, i32 3, i32 24, i32 7, i1 false) + ; CHECK: [[BUF2:%[0-9]*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 3, i32 26, i32 5, i8 0 }, i32 7, i1 false) + ; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BUF2]], %dx.types.ResourceProperties { i32 10, i32 1029 }) + + ; struct S { float4 a; uint4 b; }; + ; StructuredBuffer Buf : register(t2, space4) + %struct0 = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0t( + i32 4, i32 2, i32 1, i32 10, i1 true) + ; CHECK: [[BUF3:%[0-9]*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 2, i32 2, i32 4, i8 0 }, i32 10, i1 true) + ; CHECK: = call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BUF3]], %dx.types.ResourceProperties { i32 1036, i32 32 }) + + ; ByteAddressBuffer Buf : register(t8, space1) + 
%byteaddr0 = call target("dx.RawBuffer", i8, 0, 0) + @llvm.dx.handle.fromBinding.tdx.RawBuffer_i8_0_0t( + i32 1, i32 8, i32 1, i32 12, i1 false) + ; CHECK: [[BUF4:%[0-9]*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 8, i32 8, i32 1, i8 0 }, i32 12, i1 false) + ; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BUF4]], %dx.types.ResourceProperties { i32 11, i32 0 }) + + ; Buffer Buf[] : register(t0) + ; Buffer typed3 = Buf[ix] + %typed3_ix = call i32 @some_val() + %typed3 = call target("dx.TypedBuffer", <4 x float>, 0, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_0_0_0t( + i32 0, i32 0, i32 -1, i32 %typed3_ix, i1 false) + ; CHECK: [[BUF5:%[0-9]*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 0, i32 -1, i32 0, i8 0 }, i32 %typed3_ix, i1 false) + ; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BUF5]], %dx.types.ResourceProperties { i32 10, i32 1033 }) + + ret void +} + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } From 8b4147d14c460f8886e882db48361d4c101917d7 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Fri, 23 Aug 2024 13:09:31 -0700 Subject: [PATCH 372/426] [GDBRemote] Fix processing of comma-separated memory region entries (#105873) The existing algorithm was performing the following comparisons for an `aaa,bbb,ccc,ddd`: aaa\0bbb,ccc,ddd == "stack" aaa\0bbb\0ccc,ddd == "stack" aaa\0bbb\0ccc\0ddd == "stack" Which wouldn't work. This commit just dispatches to a known algorithm implementation. 
--- .../gdb-remote/GDBRemoteCommunicationClient.cpp | 12 ++---------- .../gdb-remote/GDBRemoteCommunicationClientTest.cpp | 7 +++++-- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index 83ba27783da471..d7a0baa488edc5 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -1632,17 +1632,9 @@ Status GDBRemoteCommunicationClient::GetMemoryRegionInfo( } } } else if (name == "type") { - std::string comma_sep_str = value.str(); - size_t comma_pos; - while ((comma_pos = comma_sep_str.find(',')) != std::string::npos) { - comma_sep_str[comma_pos] = '\0'; - if (comma_sep_str == "stack") { + for (llvm::StringRef entry : llvm::split(value, ',')) { + if (entry == "stack") region_info.SetIsStackMemory(MemoryRegionInfo::eYes); - } - } - // handle final (or only) type of "stack" - if (comma_sep_str == "stack") { - region_info.SetIsStackMemory(MemoryRegionInfo::eYes); } } else if (name == "error") { StringExtractorGDBRemote error_extractor(value); diff --git a/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp b/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp index 11e14f9472164d..18020c8e43fe06 100644 --- a/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp +++ b/lldb/unittests/Process/gdb-remote/GDBRemoteCommunicationClientTest.cpp @@ -343,24 +343,27 @@ TEST_F(GDBRemoteCommunicationClientTest, GetMemoryRegionInfo) { EXPECT_EQ(MemoryRegionInfo::eYes, region_info.GetExecutable()); EXPECT_EQ("/foo/bar.so", region_info.GetName().GetStringRef()); EXPECT_EQ(MemoryRegionInfo::eDontKnow, region_info.GetMemoryTagged()); + EXPECT_EQ(MemoryRegionInfo::eDontKnow, region_info.IsStackMemory()); result = std::async(std::launch::async, [&] { return 
client.GetMemoryRegionInfo(addr, region_info); }); HandlePacket(server, "qMemoryRegionInfo:a000", - "start:a000;size:2000;flags:;"); + "start:a000;size:2000;flags:;type:stack;"); EXPECT_TRUE(result.get().Success()); EXPECT_EQ(MemoryRegionInfo::eNo, region_info.GetMemoryTagged()); + EXPECT_EQ(MemoryRegionInfo::eYes, region_info.IsStackMemory()); result = std::async(std::launch::async, [&] { return client.GetMemoryRegionInfo(addr, region_info); }); HandlePacket(server, "qMemoryRegionInfo:a000", - "start:a000;size:2000;flags: mt zz mt ;"); + "start:a000;size:2000;flags: mt zz mt ;type:ha,ha,stack;"); EXPECT_TRUE(result.get().Success()); EXPECT_EQ(MemoryRegionInfo::eYes, region_info.GetMemoryTagged()); + EXPECT_EQ(MemoryRegionInfo::eYes, region_info.IsStackMemory()); } TEST_F(GDBRemoteCommunicationClientTest, GetMemoryRegionInfoInvalidResponse) { From a2a5508bdae7d115b6c3ace461beb7a987a44407 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Fri, 23 Aug 2024 13:13:41 -0700 Subject: [PATCH 373/426] [nfc][mlgo] Incrementally update DominatorTreeAnalysis in FunctionPropertiesAnalysis (#104867) We need the dominator tree analysis for loop info analysis, which we need to get features like most nested loop and number of top level loops. Invalidating and recomputing these from scratch after each successful inlining can sometimes lead to lengthy compile times. We don't need to recompute from scratch, though, since we have some boundary information about where the changes to the CFG happen; moreover, for dom tree, the API supports incrementally updating the analysis result. This change addresses the dom tree part. The loop info is still recomputed from scratch. This does reduce the compile time quite significantly already, though (~5x in a specific case) The loop info change might be more involved and would follow in a subsequent PR. 
--- .../Analysis/FunctionPropertiesAnalysis.h | 6 ++ .../Analysis/FunctionPropertiesAnalysis.cpp | 58 ++++++++++++++++++- llvm/lib/Analysis/MLInlineAdvisor.cpp | 1 - 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h index ee447d3e4ebb6a..af72f6e0f90b11 100644 --- a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h +++ b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h @@ -15,6 +15,7 @@ #define LLVM_ANALYSIS_FUNCTIONPROPERTIESANALYSIS_H #include "llvm/ADT/DenseSet.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/PassManager.h" namespace llvm { @@ -186,7 +187,12 @@ class FunctionPropertiesUpdater { static bool isUpdateValid(Function &F, const FunctionPropertiesInfo &FPI, FunctionAnalysisManager &FAM); + DominatorTree &getUpdatedDominatorTree(FunctionAnalysisManager &FAM) const; + DenseSet Successors; + + // Edges we might potentially need to remove from the dominator tree. + SmallVector DomTreeUpdates; }; } // namespace llvm #endif // LLVM_ANALYSIS_FUNCTIONPROPERTIESANALYSIS_H diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp index 6d6ec6c7b1cc76..479cfc58ab38f5 100644 --- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp +++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp @@ -326,6 +326,14 @@ FunctionPropertiesUpdater::FunctionPropertiesUpdater( // with the CB BB ('Entry') between which the inlined callee will be pasted. Successors.insert(succ_begin(&CallSiteBB), succ_end(&CallSiteBB)); + // the outcome of the inlining may be that some edges get lost (DCEd BBs + // because inlining brought some constant, for example). We don't know which + // edges will be removed, so we list all of them as potentially removable. 
+ for (auto *Succ : successors(&CallSiteBB)) + DomTreeUpdates.emplace_back(DominatorTree::UpdateKind::Delete, + const_cast(&CallSiteBB), + const_cast(Succ)); + // Inlining only handles invoke and calls. If this is an invoke, and inlining // it pulls another invoke, the original landing pad may get split, so as to // share its content with other potential users. So the edge up to which we @@ -336,6 +344,11 @@ FunctionPropertiesUpdater::FunctionPropertiesUpdater( if (const auto *II = dyn_cast(&CB)) { const auto *UnwindDest = II->getUnwindDest(); Successors.insert(succ_begin(UnwindDest), succ_end(UnwindDest)); + // Same idea as above, we pretend we lose all these edges. + for (auto *Succ : successors(UnwindDest)) + DomTreeUpdates.emplace_back(DominatorTree::UpdateKind::Delete, + const_cast(UnwindDest), + const_cast(Succ)); } // Exclude the CallSiteBB, if it happens to be its own successor (1-BB loop). @@ -356,6 +369,45 @@ FunctionPropertiesUpdater::FunctionPropertiesUpdater( FPI.updateForBB(*BB, -1); } +DominatorTree &FunctionPropertiesUpdater::getUpdatedDominatorTree( + FunctionAnalysisManager &FAM) const { + auto &DT = + FAM.getResult(const_cast(Caller)); + + SetVector NewSucc; + NewSucc.insert(succ_begin(&CallSiteBB), succ_end(&CallSiteBB)); + + // tell the DomTree about the new edges + std::deque Worklist; + Worklist.push_back(&CallSiteBB); + + // Build the list of edges to actually remove. Those are those edges in the + // DomTreeUpdates that cannot be found in the CFG anymore. 
+ SmallVector FinalDomTreeUpdates; + while (!Worklist.empty()) { + auto *BB = Worklist.front(); + Worklist.pop_front(); + assert(DT.getNode(BB)); + + for (auto *Succ : NewSucc) { + if (!DT.getNode(Succ)) + Worklist.push_back(Succ); + FinalDomTreeUpdates.push_back({DominatorTree::UpdateKind::Insert, + const_cast(BB), + const_cast(Succ)}); + } + } + for (auto &Upd : DomTreeUpdates) + if (!llvm::is_contained(successors(Upd.getFrom()), Upd.getTo())) + FinalDomTreeUpdates.push_back(Upd); + + DT.applyUpdates(FinalDomTreeUpdates); +#ifdef EXPENSIVE_CHECKS + assert(DT.verify(DominatorTree::VerificationLevel::Full)); +#endif + return DT; +} + void FunctionPropertiesUpdater::finish(FunctionAnalysisManager &FAM) const { // Update feature values from the BBs that were copied from the callee, or // might have been modified because of inlining. The latter have been @@ -383,8 +435,7 @@ void FunctionPropertiesUpdater::finish(FunctionAnalysisManager &FAM) const { // remove E. SetVector Reinclude; SetVector Unreachable; - const auto &DT = - FAM.getResult(const_cast(Caller)); + auto &DT = getUpdatedDominatorTree(FAM); if (&CallSiteBB != &*Caller.begin()) Reinclude.insert(&*Caller.begin()); @@ -427,6 +478,9 @@ void FunctionPropertiesUpdater::finish(FunctionAnalysisManager &FAM) const { const auto &LI = FAM.getResult(const_cast(Caller)); FPI.updateAggregateStats(Caller, LI); +#ifdef EXPENSIVE_CHECKS + assert(isUpdateValid(Caller, FPI, FAM)); +#endif } bool FunctionPropertiesUpdater::isUpdateValid(Function &F, diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp index b59aa4810005bc..8bb5efcf1b2ecb 100644 --- a/llvm/lib/Analysis/MLInlineAdvisor.cpp +++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp @@ -288,7 +288,6 @@ void MLInlineAdvisor::onSuccessfulInlining(const MLInlineAdvice &Advice, { PreservedAnalyses PA = PreservedAnalyses::all(); PA.abandon(); - PA.abandon(); PA.abandon(); FAM.invalidate(*Caller, PA); } From 
4dbaef6d5ea71fb183114a82da4028960906c42b Mon Sep 17 00:00:00 2001 From: MaheshRavishankar <1663364+MaheshRavishankar@users.noreply.github.com> Date: Fri, 23 Aug 2024 13:43:33 -0700 Subject: [PATCH 374/426] [mlir][Linalg] Avoid doing op replacement in `linalg::dropUnitDims`. (#105749) It is better to do the replacement in the caller. This avoids the footgun if the caller needs the original operation. Instead return the produced operation and replacement values. Signed-off-by: MaheshRavishankar --- .../mlir/Dialect/Linalg/Transforms/Transforms.h | 9 +++++++-- .../Dialect/Linalg/Transforms/DropUnitDims.cpp | 16 +++++++++++----- .../Dialect/Linalg/TestLinalgDropUnitDims.cpp | 8 +++++++- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index bee3452ebb685f..0208f854f799ec 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -488,8 +488,13 @@ struct ControlDropUnitDims { return SmallVector{}; }; }; -LogicalResult dropUnitDims(RewriterBase &rewriter, GenericOp genericOp, - const ControlDropUnitDims &options); +struct DropUnitDimsResult { + linalg::GenericOp resultOp; + SmallVector replacements; +}; +FailureOr dropUnitDims(RewriterBase &rewriter, + GenericOp genericOp, + const ControlDropUnitDims &options); /// Fuse two `linalg.generic` operations that have a producer-consumer /// relationship captured through `fusedOperand`. 
The method expects diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp index 36f8696bf1b274..88ef82fb38d67b 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @@ -386,8 +386,9 @@ static UnitExtentReplacementInfo dropUnitExtentFromOperandMetadata( return info; } -LogicalResult linalg::dropUnitDims(RewriterBase &rewriter, GenericOp genericOp, - const ControlDropUnitDims &options) { +FailureOr +linalg::dropUnitDims(RewriterBase &rewriter, GenericOp genericOp, + const ControlDropUnitDims &options) { SmallVector indexingMaps = genericOp.getIndexingMapsArray(); if (indexingMaps.empty()) return failure(); @@ -545,8 +546,7 @@ LogicalResult linalg::dropUnitDims(RewriterBase &rewriter, GenericOp genericOp, resultReplacements.push_back(expandedValue); } - rewriter.replaceOp(genericOp, resultReplacements); - return success(); + return DropUnitDimsResult{replacementOp, resultReplacements}; } namespace { @@ -557,7 +557,13 @@ struct DropUnitDims : public OpRewritePattern { LogicalResult matchAndRewrite(GenericOp genericOp, PatternRewriter &rewriter) const override { - return dropUnitDims(rewriter, genericOp, options); + FailureOr result = + dropUnitDims(rewriter, genericOp, options); + if (failed(result)) { + return failure(); + } + rewriter.replaceOp(genericOp, result->replacements); + return success(); } private: diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgDropUnitDims.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgDropUnitDims.cpp index 85a6d5f9d9215c..402ce154c0848e 100644 --- a/mlir/test/lib/Dialect/Linalg/TestLinalgDropUnitDims.cpp +++ b/mlir/test/lib/Dialect/Linalg/TestLinalgDropUnitDims.cpp @@ -25,7 +25,13 @@ LogicalResult dropOutermostUnitDims(RewriterBase &rewriter, linalg::GenericOp genericOp) { linalg::ControlDropUnitDims options; options.controlFn = [](Operation *op) { return SmallVector{0}; }; - return 
linalg::dropUnitDims(rewriter, genericOp, options); + FailureOr result = + linalg::dropUnitDims(rewriter, genericOp, options); + if (failed(result)) { + return failure(); + } + rewriter.replaceOp(genericOp, result->replacements); + return success(); } struct TestLinalgDropUnitDims From d7073c527457dc0a71126381afb3c6f0efa1821c Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Aug 2024 14:03:10 -0700 Subject: [PATCH 375/426] [mlir][Transforms] Dialect conversion: Make materializations optional (#104668) This commit makes source/target/argument materializations (via the `TypeConverter` API) optional. By default (`ConversionConfig::buildMaterializations = true`), the dialect conversion infrastructure tries to legalize all unresolved materializations right after the main transformation process has succeeded. If at least one unresolved materialization fails to resolve, the dialect conversion fails. (With an error message such as `failed to legalize unresolved materialization ...`.) Automatic materializations through the `TypeConverter` API can now be deactivated. In that case, every unresolved materialization will show up as a `builtin.unrealized_conversion_cast` op in the output IR. There used to be a complex and error-prone analysis in the dialect conversion that predicted the future uses of unresolved materializations. Based on that logic, some casts (that were deemed to unnecessary) were folded. This analysis was needed because folding happened at a point of time when some IR changes (e.g., op replacements) had not materialized yet. This commit removes that analysis. Any folding of cast ops now happens after all other IR changes have been materialized and the uses can directly be queried from the IR. This simplifies the analysis significantly. And certain helper data structures such as `inverseMapping` are no longer needed for the analysis. The folding itself is done by `reconcileUnrealizedCasts` (which also exists as a standalone pass). 
After casts have been folded, the remaining casts are materialized through the `TypeConverter`, as usual. This last step can be deactivated in the `ConversionConfig`. `ConversionConfig::buildMaterializations = false` can be used to debug error messages such as `failed to legalize unresolved materialization ...`. (It is also useful in case automatic materializations are not needed.) The materializations that failed to resolve can then be seen as `builtin.unrealized_conversion_cast` ops in the resulting IR. (This is better than running with `-debug`, because `-debug` shows IR where some IR changes have not been materialized yet.) --- .../mlir/Transforms/DialectConversion.h | 11 + .../Transforms/Utils/DialectConversion.cpp | 393 +++++------------- .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 5 +- .../Transforms/finalizing-bufferize.mlir | 1 + .../test-legalize-type-conversion.mlir | 6 +- 5 files changed, 118 insertions(+), 298 deletions(-) diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 60113bdef16a23..5f680e8eca7559 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -1124,6 +1124,17 @@ struct ConversionConfig { // already been modified) and iterators into past IR state cannot be // represented at the moment. RewriterBase::Listener *listener = nullptr; + + /// If set to "true", the dialect conversion attempts to build source/target/ + /// argument materializations through the type converter API in lieu of + /// builtin.unrealized_conversion_cast ops. The conversion process fails if + /// at least one materialization could not be built. + /// + /// If set to "false", the dialect conversion does not build any + /// custom materializations and instead inserts + /// builtin.unrealized_conversion_cast ops to ensure that the resulting IR + /// is valid. 
+ bool buildMaterializations = true; }; //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index adf012a261cb7e..4058ed39621198 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -702,14 +702,12 @@ class UnresolvedMaterializationRewrite : public OperationRewrite { return rewrite->getKind() == Kind::UnresolvedMaterialization; } + void rollback() override; + UnrealizedConversionCastOp getOperation() const { return cast(op); } - void rollback() override; - - void cleanup(RewriterBase &rewriter) override; - /// Return the type converter of this materialization (which may be null). const TypeConverter *getConverter() const { return converterAndKind.getPointer(); @@ -766,7 +764,7 @@ namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { explicit ConversionPatternRewriterImpl(MLIRContext *ctx, const ConversionConfig &config) - : context(ctx), config(config) {} + : context(ctx), eraseRewriter(ctx), config(config) {} //===--------------------------------------------------------------------===// // State Management @@ -834,6 +832,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { //===--------------------------------------------------------------------===// // Materializations //===--------------------------------------------------------------------===// + /// Build an unresolved materialization operation given an output type and set /// of input operands. Value buildUnresolvedMaterialization(MaterializationKind kind, @@ -882,7 +881,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Erase the given op (unless it was already erased). 
void eraseOp(Operation *op) override { - if (erased.contains(op)) + if (wasErased(op)) return; op->dropAllUses(); RewriterBase::eraseOp(op); @@ -890,17 +889,24 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Erase the given block (unless it was already erased). void eraseBlock(Block *block) override { - if (erased.contains(block)) + if (wasErased(block)) return; assert(block->empty() && "expected empty block"); block->dropAllDefinedValueUses(); RewriterBase::eraseBlock(block); } + bool wasErased(void *ptr) const { return erased.contains(ptr); } + + bool wasErased(OperationRewrite *rewrite) const { + return wasErased(rewrite->getOperation()); + } + void notifyOperationErased(Operation *op) override { erased.insert(op); } void notifyBlockErased(Block *block) override { erased.insert(block); } + private: /// Pointers to all erased operations and blocks. DenseSet erased; }; @@ -912,6 +918,11 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// MLIR context. MLIRContext *context; + /// A rewriter that keeps track of ops/block that were already erased and + /// skips duplicate op/block erasures. This rewriter is used during the + /// "cleanup" phase. + SingleEraseRewriter eraseRewriter; + // Mapping between replaced values that differ in type. This happens when // replacing a value with one of a different type. ConversionValueMapping mapping; @@ -1058,10 +1069,6 @@ void UnresolvedMaterializationRewrite::rollback() { op->erase(); } -void UnresolvedMaterializationRewrite::cleanup(RewriterBase &rewriter) { - rewriter.eraseOp(op); -} - void ConversionPatternRewriterImpl::applyRewrites() { // Commit all rewrites. IRRewriter rewriter(context, config.listener); @@ -1069,7 +1076,6 @@ void ConversionPatternRewriterImpl::applyRewrites() { rewrite->commit(rewriter); // Clean up all rewrites. 
- SingleEraseRewriter eraseRewriter(context); for (auto &rewrite : rewrites) rewrite->cleanup(eraseRewriter); } @@ -2354,12 +2360,6 @@ struct OperationConverter { ConversionPatternRewriterImpl &rewriterImpl, DenseMap> &inverseMapping); - /// Legalize any unresolved type materializations. - LogicalResult legalizeUnresolvedMaterializations( - ConversionPatternRewriter &rewriter, - ConversionPatternRewriterImpl &rewriterImpl, - DenseMap> &inverseMapping); - /// Legalize an operation result that was marked as "erased". LogicalResult legalizeErasedResult(Operation *op, OpResult result, @@ -2406,6 +2406,56 @@ LogicalResult OperationConverter::convert(ConversionPatternRewriter &rewriter, return success(); } +static LogicalResult +legalizeUnresolvedMaterialization(RewriterBase &rewriter, + UnresolvedMaterializationRewrite *rewrite) { + UnrealizedConversionCastOp op = rewrite->getOperation(); + assert(!op.use_empty() && + "expected that dead materializations have already been DCE'd"); + Operation::operand_range inputOperands = op.getOperands(); + Type outputType = op.getResultTypes()[0]; + + // Try to materialize the conversion. + if (const TypeConverter *converter = rewrite->getConverter()) { + rewriter.setInsertionPoint(op); + Value newMaterialization; + switch (rewrite->getMaterializationKind()) { + case MaterializationKind::Argument: + // Try to materialize an argument conversion. + newMaterialization = converter->materializeArgumentConversion( + rewriter, op->getLoc(), outputType, inputOperands); + if (newMaterialization) + break; + // If an argument materialization failed, fallback to trying a target + // materialization. 
+ [[fallthrough]]; + case MaterializationKind::Target: + newMaterialization = converter->materializeTargetConversion( + rewriter, op->getLoc(), outputType, inputOperands); + break; + case MaterializationKind::Source: + newMaterialization = converter->materializeSourceConversion( + rewriter, op->getLoc(), outputType, inputOperands); + break; + } + if (newMaterialization) { + assert(newMaterialization.getType() == outputType && + "materialization callback produced value of incorrect type"); + rewriter.replaceOp(op, newMaterialization); + return success(); + } + } + + InFlightDiagnostic diag = op->emitError() + << "failed to legalize unresolved materialization " + "from (" + << inputOperands.getTypes() << ") to " << outputType + << " that remained live after conversion"; + diag.attachNote(op->getUsers().begin()->getLoc()) + << "see existing live user here: " << *op->getUsers().begin(); + return failure(); +} + LogicalResult OperationConverter::convertOperations(ArrayRef ops) { if (ops.empty()) return success(); @@ -2447,6 +2497,37 @@ LogicalResult OperationConverter::convertOperations(ArrayRef ops) { } else { rewriterImpl.applyRewrites(); } + + // Gather all unresolved materializations. + SmallVector allCastOps; + DenseMap rewriteMap; + for (std::unique_ptr &rewrite : rewriterImpl.rewrites) { + auto *mat = dyn_cast(rewrite.get()); + if (!mat) + continue; + if (rewriterImpl.eraseRewriter.wasErased(mat)) + continue; + allCastOps.push_back(mat->getOperation()); + rewriteMap[mat->getOperation()] = mat; + } + + // Reconcile all UnrealizedConversionCastOps that were inserted by the + // dialect conversion frameworks. (Not the one that were inserted by + // patterns.) + SmallVector remainingCastOps; + reconcileUnrealizedCasts(allCastOps, &remainingCastOps); + + // Try to legalize all unresolved materializations. 
+ if (config.buildMaterializations) { + IRRewriter rewriter(rewriterImpl.context, config.listener); + for (UnrealizedConversionCastOp castOp : remainingCastOps) { + auto it = rewriteMap.find(castOp.getOperation()); + assert(it != rewriteMap.end() && "inconsistent state"); + if (failed(legalizeUnresolvedMaterialization(rewriter, it->second))) + return failure(); + } + } + return success(); } @@ -2460,9 +2541,6 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { if (failed(legalizeConvertedOpResultTypes(rewriter, rewriterImpl, inverseMapping))) return failure(); - if (failed(legalizeUnresolvedMaterializations(rewriter, rewriterImpl, - inverseMapping))) - return failure(); return success(); } @@ -2578,279 +2656,6 @@ LogicalResult OperationConverter::legalizeConvertedArgumentTypes( return success(); } -/// Replace the results of a materialization operation with the given values. -static void -replaceMaterialization(ConversionPatternRewriterImpl &rewriterImpl, - ResultRange matResults, ValueRange values, - DenseMap> &inverseMapping) { - matResults.replaceAllUsesWith(values); - - // For each of the materialization results, update the inverse mappings to - // point to the replacement values. - for (auto [matResult, newValue] : llvm::zip(matResults, values)) { - auto inverseMapIt = inverseMapping.find(matResult); - if (inverseMapIt == inverseMapping.end()) - continue; - - // Update the reverse mapping, or remove the mapping if we couldn't update - // it. Not being able to update signals that the mapping would have become - // circular (i.e. %foo -> newValue -> %foo), which may occur as values are - // propagated through temporary materializations. We simply drop the - // mapping, and let the post-conversion replacement logic handle updating - // uses. 
- for (Value inverseMapVal : inverseMapIt->second) - if (!rewriterImpl.mapping.tryMap(inverseMapVal, newValue)) - rewriterImpl.mapping.erase(inverseMapVal); - } -} - -/// Compute all of the unresolved materializations that will persist beyond the -/// conversion process, and require inserting a proper user materialization for. -static void computeNecessaryMaterializations( - DenseMap - &materializationOps, - ConversionPatternRewriter &rewriter, - ConversionPatternRewriterImpl &rewriterImpl, - DenseMap> &inverseMapping, - SetVector &necessaryMaterializations) { - // Helper function to check if the given value or a not yet materialized - // replacement of the given value is live. - // Note: `inverseMapping` maps from replaced values to original values. - auto isLive = [&](Value value) { - auto findFn = [&](Operation *user) { - auto matIt = materializationOps.find(user); - if (matIt != materializationOps.end()) - return !necessaryMaterializations.count(matIt->second); - return rewriterImpl.isOpIgnored(user); - }; - // A worklist is needed because a value may have gone through a chain of - // replacements and each of the replaced values may have live users. - SmallVector worklist; - worklist.push_back(value); - while (!worklist.empty()) { - Value next = worklist.pop_back_val(); - if (llvm::find_if_not(next.getUsers(), findFn) != next.user_end()) - return true; - // This value may be replacing another value that has a live user. - llvm::append_range(worklist, inverseMapping.lookup(next)); - } - return false; - }; - - llvm::unique_function lookupRemappedValue = - [&](Value invalidRoot, Value value, Type type) { - // Check to see if the input operation was remapped to a variant of the - // output. - Value remappedValue = rewriterImpl.mapping.lookupOrDefault(value, type); - if (remappedValue.getType() == type && remappedValue != invalidRoot) - return remappedValue; - - // Check to see if the input is a materialization operation that - // provides an inverse conversion. 
We just check blindly for - // UnrealizedConversionCastOp here, but it has no effect on correctness. - auto inputCastOp = value.getDefiningOp(); - if (inputCastOp && inputCastOp->getNumOperands() == 1) - return lookupRemappedValue(invalidRoot, inputCastOp->getOperand(0), - type); - - return Value(); - }; - - SetVector worklist; - for (auto &rewrite : rewriterImpl.rewrites) { - auto *mat = dyn_cast(rewrite.get()); - if (!mat) - continue; - materializationOps.try_emplace(mat->getOperation(), mat); - worklist.insert(mat); - } - while (!worklist.empty()) { - UnresolvedMaterializationRewrite *mat = worklist.pop_back_val(); - UnrealizedConversionCastOp op = mat->getOperation(); - - // We currently only handle target materializations here. - assert(op->getNumResults() == 1 && "unexpected materialization type"); - OpResult opResult = op->getOpResult(0); - Type outputType = opResult.getType(); - Operation::operand_range inputOperands = op.getOperands(); - - // Try to forward propagate operands for user conversion casts that result - // in the input types of the current cast. - for (Operation *user : llvm::make_early_inc_range(opResult.getUsers())) { - auto castOp = dyn_cast(user); - if (!castOp) - continue; - if (castOp->getResultTypes() == inputOperands.getTypes()) { - replaceMaterialization(rewriterImpl, user->getResults(), inputOperands, - inverseMapping); - necessaryMaterializations.remove(materializationOps.lookup(user)); - } - } - - // Try to avoid materializing a resolved materialization if possible. - // Handle the case of a 1-1 materialization. - if (inputOperands.size() == 1) { - // Check to see if the input operation was remapped to a variant of the - // output. 
- Value remappedValue = - lookupRemappedValue(opResult, inputOperands[0], outputType); - if (remappedValue && remappedValue != opResult) { - replaceMaterialization(rewriterImpl, opResult, remappedValue, - inverseMapping); - necessaryMaterializations.remove(mat); - continue; - } - } else { - // TODO: Avoid materializing other types of conversions here. - } - - // If the materialization does not have any live users, we don't need to - // generate a user materialization for it. - bool isMaterializationLive = isLive(opResult); - if (!isMaterializationLive) - continue; - if (!necessaryMaterializations.insert(mat)) - continue; - - // Reprocess input materializations to see if they have an updated status. - for (Value input : inputOperands) { - if (auto parentOp = input.getDefiningOp()) { - if (auto *mat = materializationOps.lookup(parentOp)) - worklist.insert(mat); - } - } - } -} - -/// Legalize the given unresolved materialization. Returns success if the -/// materialization was legalized, failure otherise. -static LogicalResult legalizeUnresolvedMaterialization( - UnresolvedMaterializationRewrite &mat, - DenseMap - &materializationOps, - ConversionPatternRewriter &rewriter, - ConversionPatternRewriterImpl &rewriterImpl, - DenseMap> &inverseMapping) { - auto findLiveUser = [&](auto &&users) { - auto liveUserIt = llvm::find_if_not( - users, [&](Operation *user) { return rewriterImpl.isOpIgnored(user); }); - return liveUserIt == users.end() ? nullptr : *liveUserIt; - }; - - llvm::unique_function lookupRemappedValue = - [&](Value value, Type type) { - // Check to see if the input operation was remapped to a variant of the - // output. - Value remappedValue = rewriterImpl.mapping.lookupOrDefault(value, type); - if (remappedValue.getType() == type) - return remappedValue; - return Value(); - }; - - UnrealizedConversionCastOp op = mat.getOperation(); - if (!rewriterImpl.ignoredOps.insert(op)) - return success(); - - // We currently only handle target materializations here. 
- OpResult opResult = op->getOpResult(0); - Operation::operand_range inputOperands = op.getOperands(); - Type outputType = opResult.getType(); - - // If any input to this materialization is another materialization, resolve - // the input first. - for (Value value : op->getOperands()) { - auto valueCast = value.getDefiningOp(); - if (!valueCast) - continue; - - auto matIt = materializationOps.find(valueCast); - if (matIt != materializationOps.end()) - if (failed(legalizeUnresolvedMaterialization( - *matIt->second, materializationOps, rewriter, rewriterImpl, - inverseMapping))) - return failure(); - } - - // Perform a last ditch attempt to avoid materializing a resolved - // materialization if possible. - // Handle the case of a 1-1 materialization. - if (inputOperands.size() == 1) { - // Check to see if the input operation was remapped to a variant of the - // output. - Value remappedValue = lookupRemappedValue(inputOperands[0], outputType); - if (remappedValue && remappedValue != opResult) { - replaceMaterialization(rewriterImpl, opResult, remappedValue, - inverseMapping); - return success(); - } - } else { - // TODO: Avoid materializing other types of conversions here. - } - - // Try to materialize the conversion. - if (const TypeConverter *converter = mat.getConverter()) { - rewriter.setInsertionPoint(op); - Value newMaterialization; - switch (mat.getMaterializationKind()) { - case MaterializationKind::Argument: - // Try to materialize an argument conversion. - newMaterialization = converter->materializeArgumentConversion( - rewriter, op->getLoc(), outputType, inputOperands); - if (newMaterialization) - break; - // If an argument materialization failed, fallback to trying a target - // materialization. 
- [[fallthrough]]; - case MaterializationKind::Target: - newMaterialization = converter->materializeTargetConversion( - rewriter, op->getLoc(), outputType, inputOperands); - break; - case MaterializationKind::Source: - newMaterialization = converter->materializeSourceConversion( - rewriter, op->getLoc(), outputType, inputOperands); - break; - } - if (newMaterialization) { - assert(newMaterialization.getType() == outputType && - "materialization callback produced value of incorrect type"); - replaceMaterialization(rewriterImpl, opResult, newMaterialization, - inverseMapping); - return success(); - } - } - - InFlightDiagnostic diag = op->emitError() - << "failed to legalize unresolved materialization " - "from (" - << inputOperands.getTypes() << ") to " << outputType - << " that remained live after conversion"; - if (Operation *liveUser = findLiveUser(op->getUsers())) { - diag.attachNote(liveUser->getLoc()) - << "see existing live user here: " << *liveUser; - } - return failure(); -} - -LogicalResult OperationConverter::legalizeUnresolvedMaterializations( - ConversionPatternRewriter &rewriter, - ConversionPatternRewriterImpl &rewriterImpl, - DenseMap> &inverseMapping) { - // As an initial step, compute all of the inserted materializations that we - // expect to persist beyond the conversion process. - DenseMap materializationOps; - SetVector necessaryMaterializations; - computeNecessaryMaterializations(materializationOps, rewriter, rewriterImpl, - inverseMapping, necessaryMaterializations); - - // Once computed, legalize any necessary materializations. 
- for (auto *mat : necessaryMaterializations) { - if (failed(legalizeUnresolvedMaterialization( - *mat, materializationOps, rewriter, rewriterImpl, inverseMapping))) - return failure(); - } - return success(); -} - LogicalResult OperationConverter::legalizeErasedResult( Operation *op, OpResult result, ConversionPatternRewriterImpl &rewriterImpl) { diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 156a8a468d5b42..75362378daaaaa 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -1286,7 +1286,6 @@ func.func @warpgroup_matrix_multiply_m128n128k64( // CHECK-DAG: %[[S0:.+]] = builtin.unrealized_conversion_cast %[[arg0]] : !nvgpu.warpgroup.descriptor> to i64 // CHECK-DAG: %[[S1:.+]] = builtin.unrealized_conversion_cast %[[arg1]] : !nvgpu.warpgroup.descriptor> to i64 -// CHECK-DAG: %[[S2:.+]] = builtin.unrealized_conversion_cast %[[arg2]] : memref<128x128xf32, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[S3:.+]] = llvm.mlir.constant(0.000000e+00 : f32) : f32 // CHECK: %[[S4:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: %[[S5:.+]] = llvm.extractvalue %[[S4]][0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> @@ -1299,8 +1298,8 @@ func.func @warpgroup_matrix_multiply_m128n128k64( // CHECK: %[[S136:.+]] = llvm.insertvalue %[[S134]], %[[S135]][1] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: nvvm.wgmma.fence.aligned // CHECK: %[[S137:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S138:.+]] = llvm.extractvalue %136[0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %[[S0]], %1, %[[S138]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> +// CHECK: %[[S138:.+]] = llvm.extractvalue %{{.*}}[0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> +// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %[[S0]], %[[S1]], %[[S138]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> // CHECK: nvvm.wgmma.mma_async // CHECK: nvvm.wgmma.mma_async // CHECK: %[[S154:.+]] = nvvm.wgmma.mma_async diff --git a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir index a192434c5accf8..ab18ce05e355d3 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir @@ -80,6 +80,7 @@ func.func @no_layout_to_dyn_layout_cast(%m: memref) -> memref // expected-error @+1 {{failed to legalize unresolved materialization from ('memref') to 'memref>' that remained live after conversion}} %1 = bufferization.to_memref %0 : memref> + // expected-note @below{{see existing live user here}} return %1 : memref> } diff --git 
a/mlir/test/Transforms/test-legalize-type-conversion.mlir b/mlir/test/Transforms/test-legalize-type-conversion.mlir index cf2c9f6a8ec441..f130adff42f8cd 100644 --- a/mlir/test/Transforms/test-legalize-type-conversion.mlir +++ b/mlir/test/Transforms/test-legalize-type-conversion.mlir @@ -4,6 +4,7 @@ func.func @test_invalid_arg_materialization( // expected-error@below {{failed to legalize unresolved materialization from () to 'i16' that remained live after conversion}} %arg0: i16) { + // expected-note@below{{see existing live user here}} "foo.return"(%arg0) : (i16) -> () } @@ -22,6 +23,7 @@ func.func @test_valid_arg_materialization(%arg0: i64) { func.func @test_invalid_result_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.type_producer"() : () -> f16 + // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -30,6 +32,7 @@ func.func @test_invalid_result_materialization() { func.func @test_invalid_result_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.type_producer"() : () -> f16 + // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -49,6 +52,7 @@ func.func @test_transitive_use_materialization() { func.func @test_transitive_use_invalid_materialization() { // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}} %result = "test.another_type_producer"() : () -> f16 + // expected-note@below{{see existing live user here}} "foo.return"(%result) : (f16) -> () } @@ -99,9 +103,9 @@ func.func @test_block_argument_not_converted() { func.func @test_signature_conversion_no_converter() { "test.signature_conversion_no_converter"() ({ // expected-error@below {{failed to legalize unresolved 
materialization from ('f64') to 'f32' that remained live after conversion}} - // expected-note@below {{see existing live user here}} ^bb0(%arg0: f32): "test.type_consumer"(%arg0) : (f32) -> () + // expected-note@below{{see existing live user here}} "test.return"(%arg0) : (f32) -> () }) : () -> () return From 64afbf0cbe2e7b77cc0e139cb9ccd086a7f9b930 Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Fri, 23 Aug 2024 14:09:44 -0700 Subject: [PATCH 376/426] [rtsan][compiler-rt] Prevent UB hang in rtsan lock unit tests (#104733) It is undefined behavior to lock or unlock an uninitialized lock, and unlock a lock which isn't locked. Introduce a fixture to set up and tear down the locks where appropriate, and separates them into two tests (realtime death and non realtime survival) so each test is guaranteed a fresh lock. --- .../rtsan/tests/rtsan_test_interceptors.cpp | 130 ++++++++++++++---- 1 file changed, 107 insertions(+), 23 deletions(-) diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp index f5b016089087df..8861104068c8e9 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp @@ -328,26 +328,64 @@ TEST(TestRtsanInterceptors, PthreadCreateDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } -TEST(TestRtsanInterceptors, PthreadMutexLockDiesWhenRealtime) { - auto Func = []() { - pthread_mutex_t mutex{}; +class PthreadMutexLockTest : public ::testing::Test { +protected: + void SetUp() override { + pthread_mutex_init(&mutex, nullptr); + is_locked = false; + } + + void TearDown() override { + if (is_locked) + Unlock(); + + pthread_mutex_destroy(&mutex); + } + + void Lock() { + ASSERT_TRUE(!is_locked); pthread_mutex_lock(&mutex); - }; + is_locked = true; + } + + void Unlock() { + ASSERT_TRUE(is_locked); + pthread_mutex_unlock(&mutex); + is_locked = false; + } + +private: + pthread_mutex_t mutex; + bool is_locked; +}; + 
+TEST_F(PthreadMutexLockTest, PthreadMutexLockDiesWhenRealtime) { + auto Func = [this]() { Lock(); }; ExpectRealtimeDeath(Func, "pthread_mutex_lock"); +} + +TEST_F(PthreadMutexLockTest, PthreadMutexLockSurvivesWhenNotRealtime) { + auto Func = [this]() { Lock(); }; + ExpectNonRealtimeSurvival(Func); } -TEST(TestRtsanInterceptors, PthreadMutexUnlockDiesWhenRealtime) { - auto Func = []() { - pthread_mutex_t mutex{}; - pthread_mutex_unlock(&mutex); - }; +TEST_F(PthreadMutexLockTest, PthreadMutexUnlockDiesWhenRealtime) { + Lock(); + auto Func = [this]() { Unlock(); }; ExpectRealtimeDeath(Func, "pthread_mutex_unlock"); ExpectNonRealtimeSurvival(Func); } +TEST_F(PthreadMutexLockTest, PthreadMutexUnlockSurvivesWhenNotRealtime) { + Lock(); + auto Func = [this]() { Unlock(); }; + + ExpectNonRealtimeSurvival(Func); +} + TEST(TestRtsanInterceptors, PthreadMutexJoinDiesWhenRealtime) { auto Func = []() { pthread_t thread{}; @@ -431,30 +469,76 @@ TEST(TestRtsanInterceptors, PthreadCondWaitDiesWhenRealtime) { pthread_mutex_destroy(&mutex); } -TEST(TestRtsanInterceptors, PthreadRwlockRdlockDiesWhenRealtime) { - auto Func = []() { - pthread_rwlock_t rw_lock; +class PthreadRwlockTest : public ::testing::Test { +protected: + void SetUp() override { + pthread_rwlock_init(&rw_lock, nullptr); + is_locked = false; + } + + void TearDown() override { + if (is_locked) + Unlock(); + + pthread_rwlock_destroy(&rw_lock); + } + + void RdLock() { + ASSERT_TRUE(!is_locked); pthread_rwlock_rdlock(&rw_lock); - }; + is_locked = true; + } + + void WrLock() { + ASSERT_TRUE(!is_locked); + pthread_rwlock_wrlock(&rw_lock); + is_locked = true; + } + + void Unlock() { + ASSERT_TRUE(is_locked); + pthread_rwlock_unlock(&rw_lock); + is_locked = false; + } + +private: + pthread_rwlock_t rw_lock; + bool is_locked; +}; + +TEST_F(PthreadRwlockTest, PthreadRwlockRdlockDiesWhenRealtime) { + auto Func = [this]() { RdLock(); }; ExpectRealtimeDeath(Func, "pthread_rwlock_rdlock"); +} + +TEST_F(PthreadRwlockTest, 
PthreadRwlockRdlockSurvivesWhenNonRealtime) { + auto Func = [this]() { RdLock(); }; ExpectNonRealtimeSurvival(Func); } -TEST(TestRtsanInterceptors, PthreadRwlockUnlockDiesWhenRealtime) { - auto Func = []() { - pthread_rwlock_t rw_lock; - pthread_rwlock_unlock(&rw_lock); - }; +TEST_F(PthreadRwlockTest, PthreadRwlockUnlockDiesWhenRealtime) { + RdLock(); + + auto Func = [this]() { Unlock(); }; ExpectRealtimeDeath(Func, "pthread_rwlock_unlock"); +} + +TEST_F(PthreadRwlockTest, PthreadRwlockUnlockSurvivesWhenNonRealtime) { + RdLock(); + + auto Func = [this]() { Unlock(); }; ExpectNonRealtimeSurvival(Func); } -TEST(TestRtsanInterceptors, PthreadRwlockWrlockDiesWhenRealtime) { - auto Func = []() { - pthread_rwlock_t rw_lock; - pthread_rwlock_wrlock(&rw_lock); - }; +TEST_F(PthreadRwlockTest, PthreadRwlockWrlockDiesWhenRealtime) { + auto Func = [this]() { WrLock(); }; + ExpectRealtimeDeath(Func, "pthread_rwlock_wrlock"); +} + +TEST_F(PthreadRwlockTest, PthreadRwlockWrlockSurvivesWhenNonRealtime) { + auto Func = [this]() { WrLock(); }; + ExpectNonRealtimeSurvival(Func); } From 3b703d479ff37883242acc20fed317ed8a5466dc Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 23 Aug 2024 14:19:48 -0700 Subject: [PATCH 377/426] [Bitcode] Use DenseSet instead of std::set (NFC) (#105851) DefOrUseGUIDs is used only for membership checking purposes. We don't need std::set's strengths like iterators staying valid or the ability to traverse in a sorted order. While I am at it, this patch replaces count with contains for slightly increased readability. 
--- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 03d0537291dada..20737c0812cf86 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -4628,7 +4628,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { NameVals.clear(); }; - std::set DefOrUseGUIDs; + DenseSet DefOrUseGUIDs; forEachSummary([&](GVInfo I, bool IsAliasee) { GlobalValueSummary *S = I.second; assert(S); @@ -4777,7 +4777,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { if (!Index.cfiFunctionDefs().empty()) { for (auto &S : Index.cfiFunctionDefs()) { - if (DefOrUseGUIDs.count( + if (DefOrUseGUIDs.contains( GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(S)))) { NameVals.push_back(StrtabBuilder.add(S)); NameVals.push_back(S.size()); @@ -4791,7 +4791,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { if (!Index.cfiFunctionDecls().empty()) { for (auto &S : Index.cfiFunctionDecls()) { - if (DefOrUseGUIDs.count( + if (DefOrUseGUIDs.contains( GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(S)))) { NameVals.push_back(StrtabBuilder.add(S)); NameVals.push_back(S.size()); From da6f42325175bdf1652c296136d9883e1100f86c Mon Sep 17 00:00:00 2001 From: Volodymyr Vasylkun Date: Fri, 23 Aug 2024 22:31:03 +0100 Subject: [PATCH 378/426] [InstCombine] Fold `(x < y) ? -1 : zext(x > y)` and `(x > y) ? 1 : sext(x < y)` to `ucmp/scmp(x, y)` (#105272) This patch expands already existing funcionality to include these two additional folds, which are nearly identical to the ones already implemented. 
Proofs: https://alive2.llvm.org/ce/z/Xy7s4j --- .../InstCombine/InstCombineSelect.cpp | 17 ++++++-- llvm/test/Transforms/InstCombine/scmp.ll | 28 +++++++++++++ .../Transforms/InstCombine/select-select.ll | 42 +++++-------------- llvm/test/Transforms/InstCombine/ucmp.ll | 32 +++++++++++++- 4 files changed, 82 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 1f6d5759883fd0..18ffc209f259e0 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3560,7 +3560,9 @@ static Instruction *foldBitCeil(SelectInst &SI, IRBuilderBase &Builder) { // This function tries to fold the following operations: // (x < y) ? -1 : zext(x != y) +// (x < y) ? -1 : zext(x > y) // (x > y) ? 1 : sext(x != y) +// (x > y) ? 1 : sext(x < y) // Into ucmp/scmp(x, y), where signedness is determined by the signedness // of the comparison in the original sequence. Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) { @@ -3589,16 +3591,23 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) { ICmpInst::isSigned(Pred) ? Intrinsic::scmp : Intrinsic::ucmp; bool Replace = false; + ICmpInst::Predicate ExtendedCmpPredicate; // (x < y) ? -1 : zext(x != y) + // (x < y) ? -1 : zext(x > y) if (ICmpInst::isLT(Pred) && match(TV, m_AllOnes()) && - match(FV, m_ZExt(m_c_SpecificICmp(ICmpInst::ICMP_NE, m_Specific(LHS), - m_Specific(RHS))))) + match(FV, m_ZExt(m_c_ICmp(ExtendedCmpPredicate, m_Specific(LHS), + m_Specific(RHS)))) && + (ExtendedCmpPredicate == ICmpInst::ICMP_NE || + ICmpInst::getSwappedPredicate(ExtendedCmpPredicate) == Pred)) Replace = true; // (x > y) ? 1 : sext(x != y) + // (x > y) ? 
1 : sext(x < y) if (ICmpInst::isGT(Pred) && match(TV, m_One()) && - match(FV, m_SExt(m_c_SpecificICmp(ICmpInst::ICMP_NE, m_Specific(LHS), - m_Specific(RHS))))) + match(FV, m_SExt(m_c_ICmp(ExtendedCmpPredicate, m_Specific(LHS), + m_Specific(RHS)))) && + (ExtendedCmpPredicate == ICmpInst::ICMP_NE || + ICmpInst::getSwappedPredicate(ExtendedCmpPredicate) == Pred)) Replace = true; if (Replace) diff --git a/llvm/test/Transforms/InstCombine/scmp.ll b/llvm/test/Transforms/InstCombine/scmp.ll index a3334599a67f1c..123bc647462337 100644 --- a/llvm/test/Transforms/InstCombine/scmp.ll +++ b/llvm/test/Transforms/InstCombine/scmp.ll @@ -223,6 +223,20 @@ define i8 @scmp_from_select_lt(i32 %x, i32 %y) { ret i8 %r } +; Fold (x s< y) ? -1 : zext(x s> y) into scmp(x, y) +define i8 @scmp_from_select_lt_and_gt(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @scmp_from_select_lt_and_gt( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: ret i8 [[R]] +; + %gt_bool = icmp sgt i32 %x, %y + %gt = zext i1 %gt_bool to i8 + %lt = icmp slt i32 %x, %y + %r = select i1 %lt, i8 -1, i8 %gt + ret i8 %r +} + ; Vector version define <4 x i8> @scmp_from_select_vec_lt(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: define <4 x i8> @scmp_from_select_vec_lt( @@ -315,3 +329,17 @@ define i8 @scmp_of_sub_and_zero_neg3(i32 %x, i32 %y) { %r = call i8 @llvm.ucmp(i32 %diff, i32 0) ret i8 %r } + +; Fold (x s> y) ? 
1 : sext(x s< y) +define i8 @scmp_from_select_gt_and_lt(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @scmp_from_select_gt_and_lt( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: ret i8 [[R]] +; + %lt_bool = icmp slt i32 %x, %y + %lt = sext i1 %lt_bool to i8 + %gt = icmp sgt i32 %x, %y + %r = select i1 %gt, i8 1, i8 %lt + ret i8 %r +} diff --git a/llvm/test/Transforms/InstCombine/select-select.ll b/llvm/test/Transforms/InstCombine/select-select.ll index 5460ba1bc55838..1feae5ab504dcf 100644 --- a/llvm/test/Transforms/InstCombine/select-select.ll +++ b/llvm/test/Transforms/InstCombine/select-select.ll @@ -18,9 +18,9 @@ define float @foo1(float %a) { define float @foo2(float %a) { ; CHECK-LABEL: @foo2( -; CHECK-NEXT: [[B:%.*]] = fcmp ule float [[C:%.*]], 0.000000e+00 -; CHECK-NEXT: [[D:%.*]] = fcmp olt float [[C]], 1.000000e+00 -; CHECK-NEXT: [[E:%.*]] = select i1 [[D]], float [[C]], float 1.000000e+00 +; CHECK-NEXT: [[B:%.*]] = fcmp ule float [[A:%.*]], 0.000000e+00 +; CHECK-NEXT: [[TMP1:%.*]] = fcmp olt float [[A]], 1.000000e+00 +; CHECK-NEXT: [[E:%.*]] = select i1 [[TMP1]], float [[A]], float 1.000000e+00 ; CHECK-NEXT: [[F:%.*]] = select i1 [[B]], float 0.000000e+00, float [[E]] ; CHECK-NEXT: ret float [[F]] ; @@ -330,10 +330,7 @@ define i8 @strong_order_cmp_eq_ugt(i32 %a, i32 %b) { define i8 @strong_order_cmp_slt_sgt(i32 %a, i32 %b) { ; CHECK-LABEL: @strong_order_cmp_slt_sgt( -; CHECK-NEXT: [[CMP_LT:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[SEXT:%.*]] = sext i1 [[CMP_LT]] to i8 -; CHECK-NEXT: [[CMP_GT:%.*]] = icmp sgt i32 [[A]], [[B]] -; CHECK-NEXT: [[SEL_GT:%.*]] = select i1 [[CMP_GT]], i8 1, i8 [[SEXT]] +; CHECK-NEXT: [[SEL_GT:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A:%.*]], i32 [[B:%.*]]) ; CHECK-NEXT: ret i8 [[SEL_GT]] ; %cmp.lt = icmp slt i32 %a, %b @@ -345,10 +342,7 @@ define i8 @strong_order_cmp_slt_sgt(i32 %a, i32 %b) { define i8 
@strong_order_cmp_ult_ugt(i32 %a, i32 %b) { ; CHECK-LABEL: @strong_order_cmp_ult_ugt( -; CHECK-NEXT: [[CMP_LT:%.*]] = icmp ult i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[SEXT:%.*]] = sext i1 [[CMP_LT]] to i8 -; CHECK-NEXT: [[CMP_GT:%.*]] = icmp ugt i32 [[A]], [[B]] -; CHECK-NEXT: [[SEL_GT:%.*]] = select i1 [[CMP_GT]], i8 1, i8 [[SEXT]] +; CHECK-NEXT: [[SEL_GT:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A:%.*]], i32 [[B:%.*]]) ; CHECK-NEXT: ret i8 [[SEL_GT]] ; %cmp.lt = icmp ult i32 %a, %b @@ -360,10 +354,7 @@ define i8 @strong_order_cmp_ult_ugt(i32 %a, i32 %b) { define i8 @strong_order_cmp_sgt_slt(i32 %a, i32 %b) { ; CHECK-LABEL: @strong_order_cmp_sgt_slt( -; CHECK-NEXT: [[CMP_GT:%.*]] = icmp sgt i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP_GT]] to i8 -; CHECK-NEXT: [[CMP_LT:%.*]] = icmp slt i32 [[A]], [[B]] -; CHECK-NEXT: [[SEL_LT:%.*]] = select i1 [[CMP_LT]], i8 -1, i8 [[ZEXT]] +; CHECK-NEXT: [[SEL_LT:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A:%.*]], i32 [[B:%.*]]) ; CHECK-NEXT: ret i8 [[SEL_LT]] ; %cmp.gt = icmp sgt i32 %a, %b @@ -375,10 +366,7 @@ define i8 @strong_order_cmp_sgt_slt(i32 %a, i32 %b) { define i8 @strong_order_cmp_ugt_ult(i32 %a, i32 %b) { ; CHECK-LABEL: @strong_order_cmp_ugt_ult( -; CHECK-NEXT: [[CMP_GT:%.*]] = icmp ugt i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP_GT]] to i8 -; CHECK-NEXT: [[CMP_LT:%.*]] = icmp ult i32 [[A]], [[B]] -; CHECK-NEXT: [[SEL_LT:%.*]] = select i1 [[CMP_LT]], i8 -1, i8 [[ZEXT]] +; CHECK-NEXT: [[SEL_LT:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A:%.*]], i32 [[B:%.*]]) ; CHECK-NEXT: ret i8 [[SEL_LT]] ; %cmp.gt = icmp ugt i32 %a, %b @@ -460,8 +448,7 @@ define i8 @strong_order_cmp_ugt_ult_zext_not_oneuse(i32 %a, i32 %b) { ; CHECK-NEXT: [[CMP_GT:%.*]] = icmp ugt i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP_GT]] to i8 ; CHECK-NEXT: call void @use8(i8 [[ZEXT]]) -; CHECK-NEXT: [[CMP_LT:%.*]] = icmp ult i32 [[A]], [[B]] -; CHECK-NEXT: [[SEL_LT:%.*]] = select i1 
[[CMP_LT]], i8 -1, i8 [[ZEXT]] +; CHECK-NEXT: [[SEL_LT:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[A]], i32 [[B]]) ; CHECK-NEXT: ret i8 [[SEL_LT]] ; %cmp.gt = icmp ugt i32 %a, %b @@ -477,8 +464,7 @@ define i8 @strong_order_cmp_slt_sgt_sext_not_oneuse(i32 %a, i32 %b) { ; CHECK-NEXT: [[CMP_LT:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[SEXT:%.*]] = sext i1 [[CMP_LT]] to i8 ; CHECK-NEXT: call void @use8(i8 [[SEXT]]) -; CHECK-NEXT: [[CMP_GT:%.*]] = icmp sgt i32 [[A]], [[B]] -; CHECK-NEXT: [[SEL_GT:%.*]] = select i1 [[CMP_GT]], i8 1, i8 [[SEXT]] +; CHECK-NEXT: [[SEL_GT:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[A]], i32 [[B]]) ; CHECK-NEXT: ret i8 [[SEL_GT]] ; %cmp.lt = icmp slt i32 %a, %b @@ -491,10 +477,7 @@ define i8 @strong_order_cmp_slt_sgt_sext_not_oneuse(i32 %a, i32 %b) { define <2 x i8> @strong_order_cmp_ugt_ult_vector(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: @strong_order_cmp_ugt_ult_vector( -; CHECK-NEXT: [[CMP_GT:%.*]] = icmp ugt <2 x i32> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i1> [[CMP_GT]] to <2 x i8> -; CHECK-NEXT: [[CMP_LT:%.*]] = icmp ult <2 x i32> [[A]], [[B]] -; CHECK-NEXT: [[SEL_LT:%.*]] = select <2 x i1> [[CMP_LT]], <2 x i8> , <2 x i8> [[ZEXT]] +; CHECK-NEXT: [[SEL_LT:%.*]] = call <2 x i8> @llvm.ucmp.v2i8.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) ; CHECK-NEXT: ret <2 x i8> [[SEL_LT]] ; %cmp.gt = icmp ugt <2 x i32> %a, %b @@ -506,10 +489,7 @@ define <2 x i8> @strong_order_cmp_ugt_ult_vector(<2 x i32> %a, <2 x i32> %b) { define <2 x i8> @strong_order_cmp_ugt_ult_vector_poison(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: @strong_order_cmp_ugt_ult_vector_poison( -; CHECK-NEXT: [[CMP_GT:%.*]] = icmp ugt <2 x i32> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i1> [[CMP_GT]] to <2 x i8> -; CHECK-NEXT: [[CMP_LT:%.*]] = icmp ult <2 x i32> [[A]], [[B]] -; CHECK-NEXT: [[SEL_LT:%.*]] = select <2 x i1> [[CMP_LT]], <2 x i8> , <2 x i8> [[ZEXT]] +; CHECK-NEXT: [[SEL_LT:%.*]] = call <2 x i8> 
@llvm.ucmp.v2i8.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) ; CHECK-NEXT: ret <2 x i8> [[SEL_LT]] ; %cmp.gt = icmp ugt <2 x i32> %a, %b diff --git a/llvm/test/Transforms/InstCombine/ucmp.ll b/llvm/test/Transforms/InstCombine/ucmp.ll index ad8a57825253b0..13755f13bb0a11 100644 --- a/llvm/test/Transforms/InstCombine/ucmp.ll +++ b/llvm/test/Transforms/InstCombine/ucmp.ll @@ -222,6 +222,20 @@ define i8 @ucmp_from_select_lt(i32 %x, i32 %y) { ret i8 %r } +; Fold (x u< y) ? -1 : zext(x u> y) into ucmp(x, y) +define i8 @ucmp_from_select_lt_and_gt(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @ucmp_from_select_lt_and_gt( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: ret i8 [[R]] +; + %gt_bool = icmp ugt i32 %x, %y + %gt = zext i1 %gt_bool to i8 + %lt = icmp ult i32 %x, %y + %r = select i1 %lt, i8 -1, i8 %gt + ret i8 %r +} + ; Vector version define <4 x i8> @ucmp_from_select_vec_lt(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: define <4 x i8> @ucmp_from_select_vec_lt( @@ -349,13 +363,13 @@ define i8 @ucmp_from_select_le_neg1(i32 %x, i32 %y) { ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[NE_BOOL:%.*]] = icmp ult i32 [[X]], [[Y]] ; CHECK-NEXT: [[NE:%.*]] = sext i1 [[NE_BOOL]] to i8 -; CHECK-NEXT: [[LE_NOT:%.*]] = icmp ugt i32 [[X]], [[Y]] +; CHECK-NEXT: [[LE_NOT:%.*]] = icmp ult i32 [[X]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[LE_NOT]], i8 1, i8 [[NE]] ; CHECK-NEXT: ret i8 [[R]] ; %ne_bool = icmp ult i32 %x, %y %ne = sext i1 %ne_bool to i8 - %le = icmp ule i32 %x, %y + %le = icmp uge i32 %x, %y %r = select i1 %le, i8 %ne, i8 1 ret i8 %r } @@ -513,3 +527,17 @@ define i8 @ucmp_from_select_ge_neg4(i32 %x, i32 %y) { %r = select i1 %ge, i8 %ne, i8 3 ret i8 %r } + +; Fold (x > y) ? 
1 : sext(x < y) +define i8 @ucmp_from_select_gt_and_lt(i32 %x, i32 %y) { +; CHECK-LABEL: define i8 @ucmp_from_select_gt_and_lt( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: ret i8 [[R]] +; + %lt_bool = icmp ult i32 %x, %y + %lt = sext i1 %lt_bool to i8 + %gt = icmp ugt i32 %x, %y + %r = select i1 %gt, i8 1, i8 %lt + ret i8 %r +} From 283dff4593dbbd68594606cda9fbd3631e6648dc Mon Sep 17 00:00:00 2001 From: pokeslow <69726511+cseslowpoke@users.noreply.github.com> Date: Sat, 24 Aug 2024 05:32:31 +0800 Subject: [PATCH 379/426] [compiler-rt][nsan] Add support for nan detection (#101531) Add support for nan detection. #100305 --- compiler-rt/lib/nsan/nsan.cpp | 26 +++++++++++++ compiler-rt/lib/nsan/nsan_flags.inc | 2 + compiler-rt/test/nsan/nan.cpp | 25 ++++++++++++ compiler-rt/test/nsan/softmax.cpp | 54 ++++++++++++++++++++++++++ compiler-rt/test/nsan/vec_sqrt.cpp | 34 ++++++++++++++++ compiler-rt/test/nsan/vec_sqrt_ext.cpp | 25 ++++++++++++ 6 files changed, 166 insertions(+) create mode 100644 compiler-rt/test/nsan/nan.cpp create mode 100644 compiler-rt/test/nsan/softmax.cpp create mode 100644 compiler-rt/test/nsan/vec_sqrt.cpp create mode 100644 compiler-rt/test/nsan/vec_sqrt_ext.cpp diff --git a/compiler-rt/lib/nsan/nsan.cpp b/compiler-rt/lib/nsan/nsan.cpp index bfa55c317cfe79..5bb0cf2b694d5d 100644 --- a/compiler-rt/lib/nsan/nsan.cpp +++ b/compiler-rt/lib/nsan/nsan.cpp @@ -445,6 +445,32 @@ int32_t checkFT(const FT value, ShadowFT Shadow, CheckTypeT CheckType, const InternalFT check_value = value; const InternalFT check_shadow = Shadow; + // We only check for NaNs in the value, not the shadow. + if (flags().check_nan && isnan(check_value)) { + GET_CALLER_PC_BP; + BufferedStackTrace stack; + stack.Unwind(pc, bp, nullptr, false); + if (GetSuppressionForStack(&stack, CheckKind::Consistency)) { + // FIXME: optionally print. + return flags().resume_after_suppression ? 
kResumeFromValue + : kContinueWithShadow; + } + Decorator D; + Printf("%s", D.Warning()); + Printf("WARNING: NumericalStabilitySanitizer: NaN detected\n"); + Printf("%s", D.Default()); + stack.Print(); + if (flags().halt_on_error) { + if (common_flags()->abort_on_error) + Printf("ABORTING\n"); + else + Printf("Exiting\n"); + Die(); + } + // Performing other tests for NaN values is meaningless when dealing with numbers. + return kResumeFromValue; + } + // See this article for an interesting discussion of how to compare floats: // https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ static constexpr const FT Eps = FTInfo::kEpsilon; diff --git a/compiler-rt/lib/nsan/nsan_flags.inc b/compiler-rt/lib/nsan/nsan_flags.inc index 658cd5b3b01bf4..7c9e579d91fc33 100644 --- a/compiler-rt/lib/nsan/nsan_flags.inc +++ b/compiler-rt/lib/nsan/nsan_flags.inc @@ -48,3 +48,5 @@ NSAN_FLAG(bool, enable_loadtracking_stats, false, "due to invalid or unknown types.") NSAN_FLAG(bool, poison_in_free, true, "") NSAN_FLAG(bool, print_stats_on_exit, false, "If true, print stats on exit.") +NSAN_FLAG(bool, check_nan, false, + "If true, check the floating-point number is nan") \ No newline at end of file diff --git a/compiler-rt/test/nsan/nan.cpp b/compiler-rt/test/nsan/nan.cpp new file mode 100644 index 00000000000000..59fc391a3e0a6b --- /dev/null +++ b/compiler-rt/test/nsan/nan.cpp @@ -0,0 +1,25 @@ +// RUN: %clangxx_nsan -O0 -g %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s + +// RUN: %clangxx_nsan -O3 -g %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s + +// RUN: %clangxx_nsan -O0 -g %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=1 not %run %t + +#include +#include + +// This function returns a NaN value for triggering the NaN detection. 
+__attribute__((noinline)) float ReturnNaN(float p, float q) { + float ret = p / q; + return ret; + // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected +} + +int main() { + float val = ReturnNaN(0., 0.); + printf("%f\n", val); + // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected + return 0; +} diff --git a/compiler-rt/test/nsan/softmax.cpp b/compiler-rt/test/nsan/softmax.cpp new file mode 100644 index 00000000000000..29eaa2f9607a20 --- /dev/null +++ b/compiler-rt/test/nsan/softmax.cpp @@ -0,0 +1,54 @@ +// RUN: %clangxx_nsan -O0 -g -DSOFTMAX=softmax %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0,log2_max_relative_error=19 %run %t 2>&1 | FileCheck %s + +// RUN: %clangxx_nsan -O3 -g -DSOFTMAX=softmax %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0,log2_max_relative_error=19 %run %t 2>&1 | FileCheck %s + +// RUN: %clangxx_nsan -O0 -g -DSOFTMAX=stable_softmax %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=1,log2_max_relative_error=19 %run %t + +// RUN: %clangxx_nsan -O3 -g -DSOFTMAX=stable_softmax %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=1,log2_max_relative_error=19 %run %t + +#include +#include +#include +#include + +// unstable softmax +template +__attribute__((noinline)) void softmax(std::vector &values) { + T sum_exp = 0.0; + for (auto &i: values) { + i = std::exp(i); + sum_exp += i; + } + for (auto &i: values) { + i /= sum_exp; + } +} + +// use max value to avoid overflow +// \sigma_i exp(x_i) / \sum_j exp(x_j) = \sigma_i exp(x_i - max(x)) / \sum_j exp(x_j - max(x)) +template +__attribute__((noinline)) void stable_softmax(std::vector &values) { + T sum_exp = 0.0; + T max_values = *std::max_element(values.begin(), values.end()); + for (auto &i: values) { + i = std::exp(i - max_values); + sum_exp += i; + } + for (auto &i:values) { + i /= sum_exp; + } +} + +int main() { + std::vector data = {1000, 1001, 1002}; + SOFTMAX(data); + for (auto i: data) { + printf("%f", i); + // CHECK: WARNING: 
NumericalStabilitySanitizer: NaN detected + } + return 0; +} \ No newline at end of file diff --git a/compiler-rt/test/nsan/vec_sqrt.cpp b/compiler-rt/test/nsan/vec_sqrt.cpp new file mode 100644 index 00000000000000..d1ef0487858506 --- /dev/null +++ b/compiler-rt/test/nsan/vec_sqrt.cpp @@ -0,0 +1,34 @@ +// RUN: %clangxx_nsan -O0 -g -mavx %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s +// RUN: %clangxx_nsan -O3 -g -mavx %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s + +#include +#include +#include + +void simd_sqrt(const float *input, float *output, size_t size) { + size_t i = 0; + for (; i + 7 < size; i += 8) { + __m256 vec = _mm256_loadu_ps(&input[i]); + __m256 result = _mm256_sqrt_ps(vec); + _mm256_storeu_ps(&output[i], result); + } + for (; i < size; ++i) { + output[i] = std::sqrt(input[i]); + // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected + } +} + +int main() { + float input[] = {1.0, 2.0, -3.0, 4.0, 5.0, 6.0, 7.0, + 8.0, 9.0, -10.0, 11.0, 12.0, 13.0, 14.0, + 15.0, -16.0, 17.0, -18.0, -19.0, -20.0}; + float output[20]; + simd_sqrt(input, output, 20); + for (int i = 0; i < 20; ++i) { + std::cout << output[i] << std::endl; + // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected + } + return 0; +} \ No newline at end of file diff --git a/compiler-rt/test/nsan/vec_sqrt_ext.cpp b/compiler-rt/test/nsan/vec_sqrt_ext.cpp new file mode 100644 index 00000000000000..b39ce4b99bcab6 --- /dev/null +++ b/compiler-rt/test/nsan/vec_sqrt_ext.cpp @@ -0,0 +1,25 @@ +// RUN: %clangxx_nsan -O0 -g -mavx %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s +// RUN: %clangxx_nsan -O3 -g -mavx %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s +#include +#include + +typedef float v8sf __attribute__ ((vector_size(32))); + +v8sf simd_sqrt(v8sf a) { + return __builtin_elementwise_sqrt(a); + // CHECK: WARNING: 
NumericalStabilitySanitizer: NaN detected +} + +int main() { + v8sf a = {-1.0, -2.0, -3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; + a = simd_sqrt(a); + + // This prevents DCE. + for (size_t i = 0; i < 8; ++i) { + std::cout << a[i] << std::endl; + // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected + } + return 0; +} \ No newline at end of file From b48ef8d8d4fac69a9763945a6019dc59ad21ca28 Mon Sep 17 00:00:00 2001 From: Peiming Liu Date: Fri, 23 Aug 2024 14:49:00 -0700 Subject: [PATCH 380/426] [mlir][sparse] unify block arguments order between iterate/coiterate operations. (#105567) --- .../SparseTensor/IR/SparseTensorOps.td | 7 ++-- .../SparseTensor/IR/SparseTensorDialect.cpp | 31 ++++++++-------- .../Transforms/SparseIterationToScf.cpp | 36 ++++++------------- 3 files changed, 31 insertions(+), 43 deletions(-) diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td index 20512f972e67cd..96a61419a541f7 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td @@ -1644,7 +1644,7 @@ def IterateOp : SparseTensor_Op<"iterate", return getIterSpace().getType().getSpaceDim(); } BlockArgument getIterator() { - return getRegion().getArguments().front(); + return getRegion().getArguments().back(); } std::optional getLvlCrd(Level lvl) { if (getCrdUsedLvls()[lvl]) { @@ -1654,9 +1654,8 @@ def IterateOp : SparseTensor_Op<"iterate", return std::nullopt; } Block::BlockArgListType getCrds() { - // The first block argument is iterator, the remaining arguments are - // referenced coordinates. - return getRegion().getArguments().slice(1, getCrdUsedLvls().count()); + // User-provided iteration arguments -> coords -> iterator. 
+ return getRegion().getArguments().slice(getNumRegionIterArgs(), getCrdUsedLvls().count()); } unsigned getNumRegionIterArgs() { return getRegion().getArguments().size() - 1 - getCrdUsedLvls().count(); diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index 16856b958d4f13..b21bc1a93036c4 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -2228,9 +2228,10 @@ parseSparseIterateLoop(OpAsmParser &parser, OperationState &state, parser.getNameLoc(), "mismatch in number of sparse iterators and sparse spaces"); - if (failed(parseUsedCoordList(parser, state, blockArgs))) + SmallVector coords; + if (failed(parseUsedCoordList(parser, state, coords))) return failure(); - size_t numCrds = blockArgs.size(); + size_t numCrds = coords.size(); // Parse "iter_args(%arg = %init, ...)" bool hasIterArgs = succeeded(parser.parseOptionalKeyword("iter_args")); @@ -2238,6 +2239,8 @@ parseSparseIterateLoop(OpAsmParser &parser, OperationState &state, if (parser.parseAssignmentList(blockArgs, initArgs)) return failure(); + blockArgs.append(coords); + SmallVector iterSpaceTps; // parse ": sparse_tensor.iter_space -> ret" if (parser.parseColon() || parser.parseTypeList(iterSpaceTps)) @@ -2267,7 +2270,7 @@ parseSparseIterateLoop(OpAsmParser &parser, OperationState &state, if (hasIterArgs) { // Strip off leading args that used for coordinates. 
- MutableArrayRef args = MutableArrayRef(blockArgs).drop_front(numCrds); + MutableArrayRef args = MutableArrayRef(blockArgs).drop_back(numCrds); if (args.size() != initArgs.size() || args.size() != state.types.size()) { return parser.emitError( parser.getNameLoc(), @@ -2448,18 +2451,18 @@ void IterateOp::build(OpBuilder &builder, OperationState &odsState, odsState.addTypes(initArgs.getTypes()); Block *bodyBlock = builder.createBlock(bodyRegion); - // First argument, sparse iterator - bodyBlock->addArgument( - llvm::cast(iterSpace.getType()).getIteratorType(), - odsState.location); + // Starts with a list of user-provided loop arguments. + for (Value v : initArgs) + bodyBlock->addArgument(v.getType(), v.getLoc()); - // Followed by a list of used coordinates. + // Follows by a list of used coordinates. for (unsigned i = 0, e = crdUsedLvls.count(); i < e; i++) bodyBlock->addArgument(builder.getIndexType(), odsState.location); - // Followed by a list of user-provided loop arguments. - for (Value v : initArgs) - bodyBlock->addArgument(v.getType(), v.getLoc()); + // Ends with sparse iterator + bodyBlock->addArgument( + llvm::cast(iterSpace.getType()).getIteratorType(), + odsState.location); } ParseResult IterateOp::parse(OpAsmParser &parser, OperationState &result) { @@ -2473,9 +2476,9 @@ ParseResult IterateOp::parse(OpAsmParser &parser, OperationState &result) { return parser.emitError(parser.getNameLoc(), "expected only one iterator/iteration space"); - iters.append(iterArgs); + iterArgs.append(iters); Region *body = result.addRegion(); - if (parser.parseRegion(*body, iters)) + if (parser.parseRegion(*body, iterArgs)) return failure(); IterateOp::ensureTerminator(*body, parser.getBuilder(), result.location); @@ -2580,7 +2583,7 @@ MutableArrayRef IterateOp::getInitsMutable() { } Block::BlockArgListType IterateOp::getRegionIterArgs() { - return getRegion().getArguments().take_back(getNumRegionIterArgs()); + return 
getRegion().getArguments().take_front(getNumRegionIterArgs()); } std::optional> IterateOp::getYieldedValuesMutable() { diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp index f7fcabb0220b50..71a229bea990c0 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp @@ -111,7 +111,7 @@ genCoIterateBranchNest(PatternRewriter &rewriter, Location loc, CoIterateOp op, static ValueRange genLoopWithIterator( PatternRewriter &rewriter, Location loc, SparseIterator *it, - ValueRange reduc, bool iterFirst, + ValueRange reduc, function_ref(PatternRewriter &rewriter, Location loc, Region &loopBody, SparseIterator *it, ValueRange reduc)> @@ -138,15 +138,9 @@ static ValueRange genLoopWithIterator( } return forOp.getResults(); } - SmallVector ivs; - // TODO: always put iterator SSA values at the end of argument list to be - // consistent with coiterate operation. - if (!iterFirst) - llvm::append_range(ivs, it->getCursor()); - // Appends the user-provided values. 
- llvm::append_range(ivs, reduc); - if (iterFirst) - llvm::append_range(ivs, it->getCursor()); + + SmallVector ivs(reduc); + llvm::append_range(ivs, it->getCursor()); TypeRange types = ValueRange(ivs).getTypes(); auto whileOp = rewriter.create(loc, types, ivs); @@ -164,12 +158,8 @@ static ValueRange genLoopWithIterator( Region &dstRegion = whileOp.getAfter(); Block *after = rewriter.createBlock(&dstRegion, {}, types, l); ValueRange aArgs = whileOp.getAfterArguments(); - if (iterFirst) { - aArgs = it->linkNewScope(aArgs); - } else { - aArgs = aArgs.take_front(reduc.size()); - it->linkNewScope(aArgs.drop_front(reduc.size())); - } + it->linkNewScope(aArgs.drop_front(reduc.size())); + aArgs = aArgs.take_front(reduc.size()); rewriter.setInsertionPointToStart(after); SmallVector ret = bodyBuilder(rewriter, loc, dstRegion, it, aArgs); @@ -177,12 +167,8 @@ static ValueRange genLoopWithIterator( // Forward loops SmallVector yields; - ValueRange nx = it->forward(rewriter, loc); - if (iterFirst) - llvm::append_range(yields, nx); llvm::append_range(yields, ret); - if (!iterFirst) - llvm::append_range(yields, nx); + llvm::append_range(yields, it->forward(rewriter, loc)); rewriter.create(loc, yields); } return whileOp.getResults().drop_front(it->getCursor().size()); @@ -258,13 +244,13 @@ class SparseIterateOpConverter : public OneToNOpConversionPattern { Block *block = op.getBody(); ValueRange ret = genLoopWithIterator( - rewriter, loc, it.get(), ivs, /*iterFirst=*/true, + rewriter, loc, it.get(), ivs, [block](PatternRewriter &rewriter, Location loc, Region &loopBody, SparseIterator *it, ValueRange reduc) -> SmallVector { - SmallVector blockArgs(it->getCursor()); + SmallVector blockArgs(reduc); // TODO: Also appends coordinates if used. 
// blockArgs.push_back(it->deref(rewriter, loc)); - llvm::append_range(blockArgs, reduc); + llvm::append_range(blockArgs, it->getCursor()); Block *dstBlock = &loopBody.getBlocks().front(); rewriter.inlineBlockBefore(block, dstBlock, dstBlock->end(), @@ -404,7 +390,7 @@ class SparseCoIterateOpConverter Block *block = &r.getBlocks().front(); ValueRange curResult = genLoopWithIterator( - rewriter, loc, validIters.front(), userReduc, /*iterFirst=*/false, + rewriter, loc, validIters.front(), userReduc, /*bodyBuilder=*/ [block](PatternRewriter &rewriter, Location loc, Region &dstRegion, SparseIterator *it, From 3e763db81607c71d0bb2eb4c01721ac6965d8de7 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Fri, 23 Aug 2024 14:55:40 -0700 Subject: [PATCH 381/426] [SPIRV] Fix return type mismatch for createSPIRVEmitNonSemanticDIPass (#105889) The declaration in SPIRV.h had this returning a `MachineFunctionPass *`, but the implementation returned a `FunctionPass *`. This showed up as a build error on windows, but it was clearly a mistake regardless. I also updated the pass to include SPIRV.h rather than using its own declarations for pass initialization, as this results in better errors for this kind of typo. 
Fixes a build break after #97558 --- llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp index cc506356e39043..b37c7c1a6ee044 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp @@ -1,5 +1,6 @@ #include "MCTargetDesc/SPIRVBaseInfo.h" #include "MCTargetDesc/SPIRVMCTargetDesc.h" +#include "SPIRV.h" #include "SPIRVGlobalRegistry.h" #include "SPIRVRegisterInfo.h" #include "SPIRVTargetMachine.h" @@ -33,12 +34,6 @@ struct SPIRVEmitNonSemanticDI : public MachineFunctionPass { bool IsGlobalDIEmitted = false; bool emitGlobalDI(MachineFunction &MF); }; - -void initializeSPIRVEmitNonSemanticDIPass(PassRegistry &); - -FunctionPass *createSPIRVEmitNonSemanticDIPass(SPIRVTargetMachine *TM) { - return new SPIRVEmitNonSemanticDI(TM); -} } // namespace llvm using namespace llvm; @@ -48,6 +43,11 @@ INITIALIZE_PASS(SPIRVEmitNonSemanticDI, DEBUG_TYPE, char SPIRVEmitNonSemanticDI::ID = 0; +MachineFunctionPass * +llvm::createSPIRVEmitNonSemanticDIPass(SPIRVTargetMachine *TM) { + return new SPIRVEmitNonSemanticDI(TM); +} + SPIRVEmitNonSemanticDI::SPIRVEmitNonSemanticDI(SPIRVTargetMachine *TM) : MachineFunctionPass(ID), TM(TM) { initializeSPIRVEmitNonSemanticDIPass(*PassRegistry::getPassRegistry()); From 10407be542aeb2b59477b167bbba3716538dc722 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 23 Aug 2024 15:06:45 -0700 Subject: [PATCH 382/426] "Reland "[asan] Remove debug tracing from `report_globals` (#104404)" (#105895) Reland #104404. In addition to #104404 it raises required verbosity for stack tracing on global registration. It confuses a symbolizer test on Darwin. This reverts commit 6a8f73803a32db75d22490d341bf8744722a9025. 
--- compiler-rt/lib/asan/asan_flags.inc | 7 ++----- compiler-rt/lib/asan/asan_globals.cpp | 19 ++++++++----------- .../Linux/initialization-nobug-lld.cpp | 2 +- .../Linux/odr_indicator_unregister.cpp | 2 +- .../asan/TestCases/Linux/odr_indicators.cpp | 4 ++-- .../TestCases/Windows/dll_global_dead_strip.c | 4 ++-- ...eport_globals_symbolization_at_startup.cpp | 2 +- .../TestCases/Windows/global_dead_strip.c | 4 ++-- .../Windows/report_globals_vs_freelibrary.cpp | 2 +- .../asan/TestCases/initialization-nobug.cpp | 8 ++++---- 10 files changed, 24 insertions(+), 30 deletions(-) diff --git a/compiler-rt/lib/asan/asan_flags.inc b/compiler-rt/lib/asan/asan_flags.inc index fad1577d912a5e..5e0ced9706e664 100644 --- a/compiler-rt/lib/asan/asan_flags.inc +++ b/compiler-rt/lib/asan/asan_flags.inc @@ -36,11 +36,8 @@ ASAN_FLAG(int, max_redzone, 2048, ASAN_FLAG( bool, debug, false, "If set, prints some debugging information and does additional checks.") -ASAN_FLAG( - int, report_globals, 1, - "Controls the way to handle globals (0 - don't detect buffer overflow on " - "globals, 1 - detect buffer overflow, 2 - print data about registered " - "globals).") +ASAN_FLAG(bool, report_globals, true, + "If set, detect and report errors on globals .") ASAN_FLAG(bool, check_initialization_order, false, "If set, attempts to catch initialization order issues.") ASAN_FLAG( diff --git a/compiler-rt/lib/asan/asan_globals.cpp b/compiler-rt/lib/asan/asan_globals.cpp index c83b782cb85f89..bf0edce937f06e 100644 --- a/compiler-rt/lib/asan/asan_globals.cpp +++ b/compiler-rt/lib/asan/asan_globals.cpp @@ -22,6 +22,7 @@ #include "asan_thread.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_dense_map.h" +#include "sanitizer_common/sanitizer_internal_defs.h" #include "sanitizer_common/sanitizer_list.h" #include "sanitizer_common/sanitizer_mutex.h" #include "sanitizer_common/sanitizer_placement_new.h" @@ -179,7 +180,7 @@ int GetGlobalsForAddress(uptr addr, Global 
*globals, u32 *reg_sites, int res = 0; for (const auto &l : list_of_all_globals) { const Global &g = *l.g; - if (flags()->report_globals >= 2) + if (UNLIKELY(common_flags()->verbosity >= 3)) ReportGlobal(g, "Search"); if (IsAddressNearGlobal(addr, g)) { internal_memcpy(&globals[res], &g, sizeof(g)); @@ -270,7 +271,7 @@ static inline bool UseODRIndicator(const Global *g) { // so we store the globals in a map. static void RegisterGlobal(const Global *g) SANITIZER_REQUIRES(mu_for_globals) { CHECK(AsanInited()); - if (flags()->report_globals >= 2) + if (UNLIKELY(common_flags()->verbosity >= 3)) ReportGlobal(*g, "Added"); CHECK(flags()->report_globals); CHECK(AddrIsInMem(g->beg)); @@ -307,7 +308,7 @@ static void RegisterGlobal(const Global *g) SANITIZER_REQUIRES(mu_for_globals) { static void UnregisterGlobal(const Global *g) SANITIZER_REQUIRES(mu_for_globals) { CHECK(AsanInited()); - if (flags()->report_globals >= 2) + if (UNLIKELY(common_flags()->verbosity >= 3)) ReportGlobal(*g, "Removed"); CHECK(flags()->report_globals); CHECK(AddrIsInMem(g->beg)); @@ -438,7 +439,7 @@ void __asan_register_globals(__asan_global *globals, uptr n) { } GlobalRegistrationSite site = {stack_id, &globals[0], &globals[n - 1]}; global_registration_site_vector->push_back(site); - if (flags()->report_globals >= 2) { + if (UNLIKELY(common_flags()->verbosity >= 4)) { PRINT_CURRENT_STACK(); Printf("=== ID %d; %p %p\n", stack_id, (void *)&globals[0], (void *)&globals[n - 1]); @@ -497,9 +498,7 @@ void __asan_before_dynamic_init(const char *module_name) { Lock lock(&mu_for_globals); if (current_dynamic_init_module_name == module_name) return; - if (flags()->report_globals >= 3) - Printf("DynInitPoison module: %s\n", module_name); - + VPrintf(2, "DynInitPoison module: %s\n", module_name); if (current_dynamic_init_module_name == nullptr) { // First call, poison all globals from other modules. 
DynInitGlobals().forEach([&](auto &kv) { @@ -545,8 +544,7 @@ static void UnpoisonBeforeMain(void) { return; allow_after_dynamic_init = true; } - if (flags()->report_globals >= 3) - Printf("UnpoisonBeforeMain\n"); + VPrintf(2, "UnpoisonBeforeMain\n"); __asan_after_dynamic_init(); } @@ -570,8 +568,7 @@ void __asan_after_dynamic_init() { if (!current_dynamic_init_module_name) return; - if (flags()->report_globals >= 3) - Printf("DynInitUnpoison\n"); + VPrintf(2, "DynInitUnpoison\n"); DynInitGlobals().forEach([&](auto &kv) { UnpoisonDynamicGlobals(kv.second, /*mark_initialized=*/false); diff --git a/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp b/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp index 5cec029811cbc8..ef82c7a29575eb 100644 --- a/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx_asan -O3 %S/../initialization-nobug.cpp %S/../Helpers/initialization-nobug-extra.cpp -fuse-ld=lld -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInit" +// RUN: %clangxx_asan -O3 %S/../initialization-nobug.cpp %S/../Helpers/initialization-nobug-extra.cpp -fuse-ld=lld -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInit" // Same as initialization-nobug.cpp, but with lld we expect just one // `DynInitUnpoison` executed after `AfterDynamicInit` at the end. 
diff --git a/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp b/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp index 0f2ed6597154bb..b75f5be101ef8a 100644 --- a/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp @@ -4,7 +4,7 @@ // RUN: %clangxx_asan -g -O0 -DSHARED_LIB -DSIZE=1 %s -fPIC -shared -o %t-so-1.so // RUN: %clangxx_asan -g -O0 -DSHARED_LIB -DSIZE=2 %s -fPIC -shared -o %t-so-2.so // RUN: %clangxx_asan -g -O0 %s %libdl -Wl,--export-dynamic -o %t -// RUN: %env_asan_opts=report_globals=2:detect_odr_violation=1 %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=report_globals=1:detect_odr_violation=1:verbosity=3 %run %t 2>&1 | FileCheck %s // FIXME: Checks do not match on Android. // UNSUPPORTED: android diff --git a/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp b/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp index 8af3ec09be78c4..f28a9f6d07386d 100644 --- a/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp @@ -1,8 +1,8 @@ // RUN: %clangxx_asan -fno-sanitize-address-use-odr-indicator -fPIC %s -o %t -// RUN: %env_asan_opts=report_globals=2 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR0 +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR0 // RUN: %clangxx_asan -fsanitize-address-use-odr-indicator -fPIC %s -o %t -// RUN: %env_asan_opts=report_globals=2 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR1 +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR1 #include diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c b/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c index a0c96622efeea4..e5bd27bdf65fdf 100644 --- 
a/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c +++ b/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c @@ -1,11 +1,11 @@ // RUN: %clang_cl_asan %Od %p/dll_host.cpp %Fe%t // // RUN: %clang_cl_nocxx_asan %Gw %LD %Od %s %Fe%t.dll -// RUN: %env_asan_opts=report_globals=2 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=NOSTRIP +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=NOSTRIP // RUN: %clang_cl_nocxx_asan %Gw %LD -O2 %s %Fe%t.dll \ // RUN: %if target={{.*-windows-gnu}} %{ -Wl,--gc-sections %} \ // RUN: %else %{ -link -opt:ref %} -// RUN: %env_asan_opts=report_globals=2 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=STRIP +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=STRIP #include diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp index 06a632e6708b1e..c74b66f2b43b3e 100644 --- a/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp +++ b/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cl_asan %LD %Od -DDLL %s %Fe%t.dll \ // RUN: %if target={{.*-windows-gnu}} %{ -Wl,--out-implib,%t.lib %} // RUN: %clang_cl_asan %Od -DEXE %s %t.lib %Fe%te.exe -// RUN: %env_asan_opts=report_globals=2 %run %te.exe 2>&1 | FileCheck %s +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %te.exe 2>&1 | FileCheck %s // FIXME: Currently, the MT runtime build crashes on startup due to dbghelp.dll // initialization failure. 
diff --git a/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c b/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c index 0e15120a46f776..7f2405fdfc8364 100644 --- a/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c +++ b/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c @@ -1,9 +1,9 @@ // RUN: %clang_cl_nocxx_asan %Gw %Od %s %Fe%t.exe -// RUN: %env_asan_opts=report_globals=2 %t.exe 2>&1 | FileCheck %s --check-prefix=NOSTRIP +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %t.exe 2>&1 | FileCheck %s --check-prefix=NOSTRIP // RUN: %clang_cl_nocxx_asan %Gw -O2 %s %Fe%t.exe \ // RUN: %if target={{.*-windows-gnu}} %{ -Wl,--gc-sections %} \ // RUN: %else %{ -link -opt:ref %} -// RUN: %env_asan_opts=report_globals=2 %t.exe 2>&1 | FileCheck %s --check-prefix=STRIP +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %t.exe 2>&1 | FileCheck %s --check-prefix=STRIP #include int dead_global = 42; diff --git a/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp b/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp index 7cad3f39be1ec2..34ce18e146d677 100644 --- a/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp +++ b/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp @@ -1,6 +1,6 @@ // RUN: %clang_cl_asan %LD %Od -DDLL %s %Fe%t.dll // RUN: %clang_cl_asan %Od -DEXE %s %Fe%te.exe -// RUN: %env_asan_opts=report_globals=2 %run %te.exe %t.dll 2>&1 | FileCheck %s +// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %te.exe %t.dll 2>&1 | FileCheck %s #include #include diff --git a/compiler-rt/test/asan/TestCases/initialization-nobug.cpp b/compiler-rt/test/asan/TestCases/initialization-nobug.cpp index f66d501124bc48..61328b9de28ae6 100644 --- a/compiler-rt/test/asan/TestCases/initialization-nobug.cpp +++ b/compiler-rt/test/asan/TestCases/initialization-nobug.cpp @@ -1,10 +1,10 @@ // A collection of various initializers which shouldn't trip up 
initialization // order checking. If successful, this will just return 0. -// RUN: %clangxx_asan -O0 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" -// RUN: %clangxx_asan -O1 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" -// RUN: %clangxx_asan -O2 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" -// RUN: %clangxx_asan -O3 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O0 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O1 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O2 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O3 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" // Simple access: // Make sure that accessing a global in the same TU is safe From 91e57c6fa80dee935a9080f27c4d9b7971b347d5 Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Fri, 23 Aug 2024 
19:10:04 -0400 Subject: [PATCH 383/426] [mlir][tensor] Add TilingInterface support for fusing tensor.pad (#105892) This adds implementations for the two TilingInterface methods required for fusion to `tensor.pad`: `getIterationDomainTileFromResultTile` and `generateResultTileValue`, allowing fusion of pad with a tiled consumer. --- .../Tensor/IR/TensorTilingInterfaceImpl.cpp | 17 ++++++++ mlir/test/Dialect/Tensor/tiling.mlir | 41 +++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp index dec678de6d1c27..f35a9cd4cb9275 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp @@ -67,6 +67,23 @@ struct PadOpTiling : public TilingInterface::ExternalModel { resultSizes.assign(sizes.begin(), sizes.end()); return success(); } + + LogicalResult getIterationDomainTileFromResultTile( + Operation *op, OpBuilder &b, unsigned resultNumber, + ArrayRef offsets, ArrayRef sizes, + SmallVectorImpl &iterDomainOffsets, + SmallVectorImpl &iterDomainSizes) const { + iterDomainOffsets.assign(offsets.begin(), offsets.end()); + iterDomainSizes.assign(sizes.begin(), sizes.end()); + return success(); + } + + FailureOr + generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber, + ArrayRef offsets, + ArrayRef sizes) const { + return getTiledImplementation(op, b, offsets, sizes); + } }; template diff --git a/mlir/test/Dialect/Tensor/tiling.mlir b/mlir/test/Dialect/Tensor/tiling.mlir index e02ab06a9d5337..193fbe93e0f9ee 100644 --- a/mlir/test/Dialect/Tensor/tiling.mlir +++ b/mlir/test/Dialect/Tensor/tiling.mlir @@ -116,6 +116,47 @@ module attributes {transform.with_named_sequence} { // ----- +// CHECK-LABEL: func @fuse_static_pad_tensor_3_4( +// CHECK-SAME: %[[IN:.*]]: tensor<7x9xf32> +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = 
arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[C15:.*]] = arith.constant 15 : index +// CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[RESULT:.*]] = scf.for {{.*}} = %[[C0]] to %[[C15]] step %[[C2]] +// CHECK: scf.for {{.*}} = %[[C0]] to %[[C16]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] = +// CHECK: %[[SWAP_RESULT:.*]] = scf.if +// CHECK: tensor.generate +// CHECK: else +// CHECK: %[[SLICE:.*]] = tensor.extract_slice %[[IN]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1] +// CHECK: %[[PAD:.*]] = tensor.pad %[[SLICE]] +// CHECK: %[[COPY:.*]] = linalg.copy ins(%[[SWAP_RESULT:.*]] +// CHECK: tensor.insert_slice %[[COPY]] into %[[INNER_OUT]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1] +// CHECK: return %[[RESULT]] + +func.func @fuse_static_pad_tensor_3_4(%input_tensor: tensor<7x9xf32>, + %pad_value: f32) -> tensor<15x16xf32> { + %0 = tensor.pad %input_tensor low[3, 4] high[5, 3] { + ^bb0(%arg1: index, %arg2: index): + tensor.yield %pad_value : f32 + } : tensor<7x9xf32> to tensor<15x16xf32> + %empty = tensor.empty() : tensor<15x16xf32> + %1 = linalg.copy ins(%0 : tensor<15x16xf32>) outs(%empty : tensor<15x16xf32>) -> tensor<15x16xf32> + return %1 : tensor<15x16xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { + %copy = transform.structured.match ops{["linalg.copy"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %a, %b, %c = transform.structured.fuse %copy [2, 3] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + // CHECK-LABEL: func @static_pad_tensor_0_3( // CHECK-SAME: %[[IN:.*]]: tensor<7x9xf32> // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index From cdd11d694a406a98a16d6265168ee2fbe1b6a87c Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Fri, 23 Aug 2024 16:07:45 -0700 Subject: [PATCH 384/426] Fix bot failures after PR 
#104867 An assert was left over after addressing feedback. In the process of fixing, realized the way I addressed the feedback was also incomplete. --- .../Analysis/FunctionPropertiesAnalysis.cpp | 29 +++++-------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp index 479cfc58ab38f5..07906fa1aa6c65 100644 --- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp +++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp @@ -374,32 +374,17 @@ DominatorTree &FunctionPropertiesUpdater::getUpdatedDominatorTree( auto &DT = FAM.getResult(const_cast(Caller)); - SetVector NewSucc; - NewSucc.insert(succ_begin(&CallSiteBB), succ_end(&CallSiteBB)); - - // tell the DomTree about the new edges - std::deque Worklist; - Worklist.push_back(&CallSiteBB); - - // Build the list of edges to actually remove. Those are those edges in the - // DomTreeUpdates that cannot be found in the CFG anymore. 
SmallVector FinalDomTreeUpdates; - while (!Worklist.empty()) { - auto *BB = Worklist.front(); - Worklist.pop_front(); - assert(DT.getNode(BB)); - for (auto *Succ : NewSucc) { - if (!DT.getNode(Succ)) - Worklist.push_back(Succ); + for (auto &Upd : DomTreeUpdates) + FinalDomTreeUpdates.push_back(Upd); + + DenseSet Inserted; + for (auto *Succ : successors(&CallSiteBB)) + if (Inserted.insert(Succ).second) FinalDomTreeUpdates.push_back({DominatorTree::UpdateKind::Insert, - const_cast(BB), + const_cast(&CallSiteBB), const_cast(Succ)}); - } - } - for (auto &Upd : DomTreeUpdates) - if (!llvm::is_contained(successors(Upd.getFrom()), Upd.getTo())) - FinalDomTreeUpdates.push_back(Upd); DT.applyUpdates(FinalDomTreeUpdates); #ifdef EXPENSIVE_CHECKS From ca53611c905f82628ab2e40185307995b552e14d Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 23 Aug 2024 16:56:27 -0700 Subject: [PATCH 385/426] [llvm] Use range-based for loops (NFC) (#105861) --- llvm/include/llvm/IR/ModuleSummaryIndex.h | 12 ++++++------ llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 6 +++--- llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp | 2 +- llvm/lib/IR/AsmWriter.cpp | 8 ++++---- llvm/tools/llvm-profgen/ProfiledBinary.cpp | 4 ++-- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index 00934cc1ce6f2d..e3455f02878f03 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -1808,9 +1808,9 @@ class ModuleSummaryIndex { /// the ThinLTO backends. 
TypeIdSummary &getOrInsertTypeIdSummary(StringRef TypeId) { auto TidIter = TypeIdMap.equal_range(GlobalValue::getGUID(TypeId)); - for (auto It = TidIter.first; It != TidIter.second; ++It) - if (It->second.first == TypeId) - return It->second.second; + for (auto &[GUID, TypeIdPair] : make_range(TidIter)) + if (TypeIdPair.first == TypeId) + return TypeIdPair.second; auto It = TypeIdMap.insert( {GlobalValue::getGUID(TypeId), {std::string(TypeId), TypeIdSummary()}}); return It->second.second; @@ -1820,9 +1820,9 @@ class ModuleSummaryIndex { /// summary map) or null (if not present). This may be used when importing. const TypeIdSummary *getTypeIdSummary(StringRef TypeId) const { auto TidIter = TypeIdMap.equal_range(GlobalValue::getGUID(TypeId)); - for (auto It = TidIter.first; It != TidIter.second; ++It) - if (It->second.first == TypeId) - return &It->second.second; + for (const auto &[GUID, TypeIdPair] : make_range(TidIter)) + if (TypeIdPair.first == TypeId) + return &TypeIdPair.second; return nullptr; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 20737c0812cf86..e4b4339b5e5b19 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -4807,9 +4807,9 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { // corresponding type id records. 
for (auto &T : ReferencedTypeIds) { auto TidIter = Index.typeIds().equal_range(T); - for (auto It = TidIter.first; It != TidIter.second; ++It) { - writeTypeIdSummaryRecord(NameVals, StrtabBuilder, It->second.first, - It->second.second); + for (const auto &[GUID, TypeIdPair] : make_range(TidIter)) { + writeTypeIdSummaryRecord(NameVals, StrtabBuilder, TypeIdPair.first, + TypeIdPair.second); Stream.EmitRecord(bitc::FS_TYPE_ID, NameVals); NameVals.clear(); } diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp index e146fb7e576819..2959d3261bea71 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp @@ -1312,7 +1312,7 @@ void VarLocBasedLDV::cleanupEntryValueTransfers( return; auto TransRange = EntryValTransfers.equal_range(TRInst); - for (auto &TDPair : llvm::make_range(TransRange.first, TransRange.second)) { + for (auto &TDPair : llvm::make_range(TransRange)) { const VarLoc &EmittedEV = VarLocIDs[TDPair.second]; if (std::tie(EntryVL.Var, EntryVL.Locs[0].Value.RegNo, EntryVL.Expr) == std::tie(EmittedEV.Var, EmittedEV.Locs[0].Value.RegNo, diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 01a16ccd688f43..70e3af941bf77b 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -3488,9 +3488,9 @@ void AssemblyWriter::printTypeIdInfo( continue; } // Print all type id that correspond to this GUID. - for (auto It = TidIter.first; It != TidIter.second; ++It) { + for (const auto &[GUID, TypeIdPair] : make_range(TidIter)) { Out << FS; - auto Slot = Machine.getTypeIdSlot(It->second.first); + auto Slot = Machine.getTypeIdSlot(TypeIdPair.first); assert(Slot != -1); Out << "^" << Slot; } @@ -3529,10 +3529,10 @@ void AssemblyWriter::printVFuncId(const FunctionSummary::VFuncId VFId) { } // Print all type id that correspond to this GUID. 
FieldSeparator FS; - for (auto It = TidIter.first; It != TidIter.second; ++It) { + for (const auto &[GUID, TypeIdPair] : make_range(TidIter)) { Out << FS; Out << "vFuncId: ("; - auto Slot = Machine.getTypeIdSlot(It->second.first); + auto Slot = Machine.getTypeIdSlot(TypeIdPair.first); assert(Slot != -1); Out << "^" << Slot; Out << ", offset: " << VFId.Offset; diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp index 574a9c9f52bf18..a458ffcb96b41a 100644 --- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp +++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp @@ -423,8 +423,8 @@ void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) { GuidFilter.insert(Function::getGUID(F->FuncName)); for (auto &Range : F->Ranges) { auto GUIDs = StartAddrToSymMap.equal_range(Range.first); - for (auto I = GUIDs.first; I != GUIDs.second; ++I) - FuncStartAddresses[I->second] = I->first; + for (const auto &[StartAddr, Func] : make_range(GUIDs)) + FuncStartAddresses[Func] = StartAddr; } } } From dbd7ce0ccd3a88f2c1d6e47d31da63a48cafdc8f Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 23 Aug 2024 17:32:52 -0700 Subject: [PATCH 386/426] [IR] Introduce ModuleToSummariesForIndexTy (NFC) (#105906) This patch introduces type alias ModuleToSummariesForIndexTy. I'm planning to change the type slightly to allow heterogeneous lookup (that is, std::map>) in a subsequent patch. The problem is that changing the type affects many places. Using a type alias reduces the impact.
--- llvm/include/llvm/Bitcode/BitcodeWriter.h | 15 +++++++-------- llvm/include/llvm/IR/ModuleSummaryIndex.h | 4 ++++ .../llvm/LTO/legacy/ThinLTOCodeGenerator.h | 2 +- .../include/llvm/Transforms/IPO/FunctionImport.h | 8 ++++---- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 16 ++++++++-------- llvm/lib/LTO/LTO.cpp | 2 +- llvm/lib/LTO/ThinLTOCodeGenerator.cpp | 4 ++-- llvm/lib/Transforms/IPO/FunctionImport.cpp | 4 ++-- llvm/tools/llvm-lto/llvm-lto.cpp | 2 +- 9 files changed, 30 insertions(+), 27 deletions(-) diff --git a/llvm/include/llvm/Bitcode/BitcodeWriter.h b/llvm/include/llvm/Bitcode/BitcodeWriter.h index 770e249290c3c3..2823b438f80bf4 100644 --- a/llvm/include/llvm/Bitcode/BitcodeWriter.h +++ b/llvm/include/llvm/Bitcode/BitcodeWriter.h @@ -100,10 +100,9 @@ class BitcodeWriter { void writeThinLinkBitcode(const Module &M, const ModuleSummaryIndex &Index, const ModuleHash &ModHash); - void writeIndex( - const ModuleSummaryIndex *Index, - const std::map *ModuleToSummariesForIndex, - const GVSummaryPtrSet *DecSummaries); + void writeIndex(const ModuleSummaryIndex *Index, + const ModuleToSummariesForIndexTy *ModuleToSummariesForIndex, + const GVSummaryPtrSet *DecSummaries); }; /// Write the specified module to the specified raw output stream. @@ -150,10 +149,10 @@ void writeThinLinkBitcodeToFile(const Module &M, raw_ostream &Out, /// index for a distributed backend, provide the \p ModuleToSummariesForIndex /// map. \p DecSummaries specifies the set of summaries for which the /// corresponding value should be imported as a declaration (prototype). 
-void writeIndexToFile(const ModuleSummaryIndex &Index, raw_ostream &Out, - const std::map - *ModuleToSummariesForIndex = nullptr, - const GVSummaryPtrSet *DecSummaries = nullptr); +void writeIndexToFile( + const ModuleSummaryIndex &Index, raw_ostream &Out, + const ModuleToSummariesForIndexTy *ModuleToSummariesForIndex = nullptr, + const GVSummaryPtrSet *DecSummaries = nullptr); /// If EmbedBitcode is set, save a copy of the llvm IR as data in the /// __LLVM,__bitcode section (.llvmbc on non-MacOS). diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index e3455f02878f03..b16596e454bdcc 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -1291,6 +1291,10 @@ using ModulePathStringTableTy = StringMap; /// a particular module, and provide efficient access to their summary. using GVSummaryMapTy = DenseMap; +/// Map of a module name to the GUIDs and summaries we will import from that +/// module. +using ModuleToSummariesForIndexTy = std::map; + /// A set of global value summary pointers. 
using GVSummaryPtrSet = std::unordered_set; diff --git a/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h b/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h index f1337e82485c94..7eb30d56e10c10 100644 --- a/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h +++ b/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h @@ -276,7 +276,7 @@ class ThinLTOCodeGenerator { */ void gatherImportedSummariesForModule( Module &Module, ModuleSummaryIndex &Index, - std::map &ModuleToSummariesForIndex, + ModuleToSummariesForIndexTy &ModuleToSummariesForIndex, GVSummaryPtrSet &DecSummaries, const lto::InputFile &File); /** diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h index 0c8380db74314f..93d831c26938bb 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h @@ -262,13 +262,13 @@ void gatherImportedSummariesForModule( StringRef ModulePath, const DenseMap &ModuleToDefinedGVSummaries, const FunctionImporter::ImportMapTy &ImportList, - std::map &ModuleToSummariesForIndex, + ModuleToSummariesForIndexTy &ModuleToSummariesForIndex, GVSummaryPtrSet &DecSummaries); /// Emit into \p OutputFilename the files module \p ModulePath will import from. 
-std::error_code EmitImportsFiles( - StringRef ModulePath, StringRef OutputFilename, - const std::map &ModuleToSummariesForIndex); +std::error_code +EmitImportsFiles(StringRef ModulePath, StringRef OutputFilename, + const ModuleToSummariesForIndexTy &ModuleToSummariesForIndex); /// Based on the information recorded in the summaries during global /// summary-based analysis: diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index e4b4339b5e5b19..2f3e90d6e3821b 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -432,7 +432,7 @@ class IndexBitcodeWriter : public BitcodeWriterBase { /// When writing a subset of the index for distributed backends, client /// provides a map of modules to the corresponding GUIDs/summaries to write. - const std::map *ModuleToSummariesForIndex; + const ModuleToSummariesForIndexTy *ModuleToSummariesForIndex; /// Map that holds the correspondence between the GUID used in the combined /// index and a value id generated by this class to use in references. @@ -461,11 +461,11 @@ class IndexBitcodeWriter : public BitcodeWriterBase { /// If provided, \p DecSummaries specifies the set of summaries for which /// the corresponding functions or aliased functions should be imported as a /// declaration (but not definition) for each module. 
- IndexBitcodeWriter(BitstreamWriter &Stream, StringTableBuilder &StrtabBuilder, - const ModuleSummaryIndex &Index, - const GVSummaryPtrSet *DecSummaries = nullptr, - const std::map - *ModuleToSummariesForIndex = nullptr) + IndexBitcodeWriter( + BitstreamWriter &Stream, StringTableBuilder &StrtabBuilder, + const ModuleSummaryIndex &Index, + const GVSummaryPtrSet *DecSummaries = nullptr, + const ModuleToSummariesForIndexTy *ModuleToSummariesForIndex = nullptr) : BitcodeWriterBase(Stream, StrtabBuilder), Index(Index), DecSummaries(DecSummaries), ModuleToSummariesForIndex(ModuleToSummariesForIndex) { @@ -5102,7 +5102,7 @@ void BitcodeWriter::writeModule(const Module &M, void BitcodeWriter::writeIndex( const ModuleSummaryIndex *Index, - const std::map *ModuleToSummariesForIndex, + const ModuleToSummariesForIndexTy *ModuleToSummariesForIndex, const GVSummaryPtrSet *DecSummaries) { IndexBitcodeWriter IndexWriter(*Stream, StrtabBuilder, *Index, DecSummaries, ModuleToSummariesForIndex); @@ -5159,7 +5159,7 @@ void IndexBitcodeWriter::write() { // index for a distributed backend, provide a \p ModuleToSummariesForIndex map. 
void llvm::writeIndexToFile( const ModuleSummaryIndex &Index, raw_ostream &Out, - const std::map *ModuleToSummariesForIndex, + const ModuleToSummariesForIndexTy *ModuleToSummariesForIndex, const GVSummaryPtrSet *DecSummaries) { SmallVector Buffer; Buffer.reserve(256 * 1024); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index ee0193344d5ac9..bd031338e8f39b 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1399,7 +1399,7 @@ class lto::ThinBackendProc { Error emitFiles(const FunctionImporter::ImportMapTy &ImportList, llvm::StringRef ModulePath, const std::string &NewModulePath) { - std::map ModuleToSummariesForIndex; + ModuleToSummariesForIndexTy ModuleToSummariesForIndex; GVSummaryPtrSet DeclarationSummaries; std::error_code EC; diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 0ba3093637aacf..f74202781a5f4b 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -762,7 +762,7 @@ void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule, */ void ThinLTOCodeGenerator::gatherImportedSummariesForModule( Module &TheModule, ModuleSummaryIndex &Index, - std::map &ModuleToSummariesForIndex, + ModuleToSummariesForIndexTy &ModuleToSummariesForIndex, GVSummaryPtrSet &DecSummaries, const lto::InputFile &File) { auto ModuleCount = Index.modulePaths().size(); auto ModuleIdentifier = TheModule.getModuleIdentifier(); @@ -833,7 +833,7 @@ void ThinLTOCodeGenerator::emitImports(Module &TheModule, StringRef OutputName, // the set of keys in `ModuleToSummariesForIndex` should be a superset of keys // in `DecSummaries`, so no need to use `DecSummaries` in `EmitImportFiles`. 
GVSummaryPtrSet DecSummaries; - std::map ModuleToSummariesForIndex; + ModuleToSummariesForIndexTy ModuleToSummariesForIndex; llvm::gatherImportedSummariesForModule( ModuleIdentifier, ModuleToDefinedGVSummaries, ImportLists[ModuleIdentifier], ModuleToSummariesForIndex, DecSummaries); diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index b26c15b665b590..74a71cbf101b5d 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -1485,7 +1485,7 @@ void llvm::gatherImportedSummariesForModule( StringRef ModulePath, const DenseMap &ModuleToDefinedGVSummaries, const FunctionImporter::ImportMapTy &ImportList, - std::map &ModuleToSummariesForIndex, + ModuleToSummariesForIndexTy &ModuleToSummariesForIndex, GVSummaryPtrSet &DecSummaries) { // Include all summaries from the importing module. ModuleToSummariesForIndex[std::string(ModulePath)] = @@ -1511,7 +1511,7 @@ void llvm::gatherImportedSummariesForModule( /// Emit the files \p ModulePath will import from into \p OutputFilename. std::error_code llvm::EmitImportsFiles( StringRef ModulePath, StringRef OutputFilename, - const std::map &ModuleToSummariesForIndex) { + const ModuleToSummariesForIndexTy &ModuleToSummariesForIndex) { std::error_code EC; raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::OF_Text); if (EC) diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp index 8218bd5a74ea3e..f5076f0b975178 100644 --- a/llvm/tools/llvm-lto/llvm-lto.cpp +++ b/llvm/tools/llvm-lto/llvm-lto.cpp @@ -691,7 +691,7 @@ class ThinLTOProcessing { // Build a map of module to the GUIDs and summary objects that should // be written to its index. 
- std::map ModuleToSummariesForIndex; + ModuleToSummariesForIndexTy ModuleToSummariesForIndex; GVSummaryPtrSet DecSummaries; ThinGenerator.gatherImportedSummariesForModule( *TheModule, *Index, ModuleToSummariesForIndex, DecSummaries, *Input); From 1f89cd4a1970fee65f5ecb189c4d1a0a376d9bb2 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 23 Aug 2024 17:34:04 -0700 Subject: [PATCH 387/426] Revert "[compiler-rt][nsan] Add support for nan detection" (#105909) Reverts llvm/llvm-project#101531 Fails https://lab.llvm.org/buildbot/#/builders/66/builds/3051 --- compiler-rt/lib/nsan/nsan.cpp | 26 ------------- compiler-rt/lib/nsan/nsan_flags.inc | 2 - compiler-rt/test/nsan/nan.cpp | 25 ------------ compiler-rt/test/nsan/softmax.cpp | 54 -------------------------- compiler-rt/test/nsan/vec_sqrt.cpp | 34 ---------------- compiler-rt/test/nsan/vec_sqrt_ext.cpp | 25 ------------ 6 files changed, 166 deletions(-) delete mode 100644 compiler-rt/test/nsan/nan.cpp delete mode 100644 compiler-rt/test/nsan/softmax.cpp delete mode 100644 compiler-rt/test/nsan/vec_sqrt.cpp delete mode 100644 compiler-rt/test/nsan/vec_sqrt_ext.cpp diff --git a/compiler-rt/lib/nsan/nsan.cpp b/compiler-rt/lib/nsan/nsan.cpp index 5bb0cf2b694d5d..bfa55c317cfe79 100644 --- a/compiler-rt/lib/nsan/nsan.cpp +++ b/compiler-rt/lib/nsan/nsan.cpp @@ -445,32 +445,6 @@ int32_t checkFT(const FT value, ShadowFT Shadow, CheckTypeT CheckType, const InternalFT check_value = value; const InternalFT check_shadow = Shadow; - // We only check for NaNs in the value, not the shadow. - if (flags().check_nan && isnan(check_value)) { - GET_CALLER_PC_BP; - BufferedStackTrace stack; - stack.Unwind(pc, bp, nullptr, false); - if (GetSuppressionForStack(&stack, CheckKind::Consistency)) { - // FIXME: optionally print. - return flags().resume_after_suppression ? 
kResumeFromValue - : kContinueWithShadow; - } - Decorator D; - Printf("%s", D.Warning()); - Printf("WARNING: NumericalStabilitySanitizer: NaN detected\n"); - Printf("%s", D.Default()); - stack.Print(); - if (flags().halt_on_error) { - if (common_flags()->abort_on_error) - Printf("ABORTING\n"); - else - Printf("Exiting\n"); - Die(); - } - // Performing other tests for NaN values is meaningless when dealing with numbers. - return kResumeFromValue; - } - // See this article for an interesting discussion of how to compare floats: // https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ static constexpr const FT Eps = FTInfo::kEpsilon; diff --git a/compiler-rt/lib/nsan/nsan_flags.inc b/compiler-rt/lib/nsan/nsan_flags.inc index 7c9e579d91fc33..658cd5b3b01bf4 100644 --- a/compiler-rt/lib/nsan/nsan_flags.inc +++ b/compiler-rt/lib/nsan/nsan_flags.inc @@ -48,5 +48,3 @@ NSAN_FLAG(bool, enable_loadtracking_stats, false, "due to invalid or unknown types.") NSAN_FLAG(bool, poison_in_free, true, "") NSAN_FLAG(bool, print_stats_on_exit, false, "If true, print stats on exit.") -NSAN_FLAG(bool, check_nan, false, - "If true, check the floating-point number is nan") \ No newline at end of file diff --git a/compiler-rt/test/nsan/nan.cpp b/compiler-rt/test/nsan/nan.cpp deleted file mode 100644 index 59fc391a3e0a6b..00000000000000 --- a/compiler-rt/test/nsan/nan.cpp +++ /dev/null @@ -1,25 +0,0 @@ -// RUN: %clangxx_nsan -O0 -g %s -o %t -// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s - -// RUN: %clangxx_nsan -O3 -g %s -o %t -// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s - -// RUN: %clangxx_nsan -O0 -g %s -o %t -// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=1 not %run %t - -#include -#include - -// This function returns a NaN value for triggering the NaN detection. 
-__attribute__((noinline)) float ReturnNaN(float p, float q) { - float ret = p / q; - return ret; - // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected -} - -int main() { - float val = ReturnNaN(0., 0.); - printf("%f\n", val); - // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected - return 0; -} diff --git a/compiler-rt/test/nsan/softmax.cpp b/compiler-rt/test/nsan/softmax.cpp deleted file mode 100644 index 29eaa2f9607a20..00000000000000 --- a/compiler-rt/test/nsan/softmax.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// RUN: %clangxx_nsan -O0 -g -DSOFTMAX=softmax %s -o %t -// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0,log2_max_relative_error=19 %run %t 2>&1 | FileCheck %s - -// RUN: %clangxx_nsan -O3 -g -DSOFTMAX=softmax %s -o %t -// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0,log2_max_relative_error=19 %run %t 2>&1 | FileCheck %s - -// RUN: %clangxx_nsan -O0 -g -DSOFTMAX=stable_softmax %s -o %t -// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=1,log2_max_relative_error=19 %run %t - -// RUN: %clangxx_nsan -O3 -g -DSOFTMAX=stable_softmax %s -o %t -// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=1,log2_max_relative_error=19 %run %t - -#include -#include -#include -#include - -// unstable softmax -template -__attribute__((noinline)) void softmax(std::vector &values) { - T sum_exp = 0.0; - for (auto &i: values) { - i = std::exp(i); - sum_exp += i; - } - for (auto &i: values) { - i /= sum_exp; - } -} - -// use max value to avoid overflow -// \sigma_i exp(x_i) / \sum_j exp(x_j) = \sigma_i exp(x_i - max(x)) / \sum_j exp(x_j - max(x)) -template -__attribute__((noinline)) void stable_softmax(std::vector &values) { - T sum_exp = 0.0; - T max_values = *std::max_element(values.begin(), values.end()); - for (auto &i: values) { - i = std::exp(i - max_values); - sum_exp += i; - } - for (auto &i:values) { - i /= sum_exp; - } -} - -int main() { - std::vector data = {1000, 1001, 1002}; - SOFTMAX(data); - for (auto i: data) { - printf("%f", i); - // CHECK: 
WARNING: NumericalStabilitySanitizer: NaN detected - } - return 0; -} \ No newline at end of file diff --git a/compiler-rt/test/nsan/vec_sqrt.cpp b/compiler-rt/test/nsan/vec_sqrt.cpp deleted file mode 100644 index d1ef0487858506..00000000000000 --- a/compiler-rt/test/nsan/vec_sqrt.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// RUN: %clangxx_nsan -O0 -g -mavx %s -o %t -// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s -// RUN: %clangxx_nsan -O3 -g -mavx %s -o %t -// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s - -#include -#include -#include - -void simd_sqrt(const float *input, float *output, size_t size) { - size_t i = 0; - for (; i + 7 < size; i += 8) { - __m256 vec = _mm256_loadu_ps(&input[i]); - __m256 result = _mm256_sqrt_ps(vec); - _mm256_storeu_ps(&output[i], result); - } - for (; i < size; ++i) { - output[i] = std::sqrt(input[i]); - // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected - } -} - -int main() { - float input[] = {1.0, 2.0, -3.0, 4.0, 5.0, 6.0, 7.0, - 8.0, 9.0, -10.0, 11.0, 12.0, 13.0, 14.0, - 15.0, -16.0, 17.0, -18.0, -19.0, -20.0}; - float output[20]; - simd_sqrt(input, output, 20); - for (int i = 0; i < 20; ++i) { - std::cout << output[i] << std::endl; - // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected - } - return 0; -} \ No newline at end of file diff --git a/compiler-rt/test/nsan/vec_sqrt_ext.cpp b/compiler-rt/test/nsan/vec_sqrt_ext.cpp deleted file mode 100644 index b39ce4b99bcab6..00000000000000 --- a/compiler-rt/test/nsan/vec_sqrt_ext.cpp +++ /dev/null @@ -1,25 +0,0 @@ -// RUN: %clangxx_nsan -O0 -g -mavx %s -o %t -// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s -// RUN: %clangxx_nsan -O3 -g -mavx %s -o %t -// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s -#include -#include - -typedef float v8sf __attribute__ ((vector_size(32))); - -v8sf simd_sqrt(v8sf a) { - return __builtin_elementwise_sqrt(a); - // 
CHECK: WARNING: NumericalStabilitySanitizer: NaN detected -} - -int main() { - v8sf a = {-1.0, -2.0, -3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; - a = simd_sqrt(a); - - // This prevents DCE. - for (size_t i = 0; i < 8; ++i) { - std::cout << a[i] << std::endl; - // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected - } - return 0; -} \ No newline at end of file From 74b538d7e6428921b0bc8f1f5d5dc287c430fa29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= Date: Sat, 24 Aug 2024 02:53:59 +0200 Subject: [PATCH 388/426] [include-cleaner] Turn new/delete usages to ambiguous references (#105844) In practice most of these expressions just resolve to implicitly provided `operator new` and standard says it's not necessary to include `` for that. Hence this is resulting in a lot of churn in cases where inclusion of `` doesn't matter, and might even be undesired by the developer. By switching to an ambiguous reference we try to find a middle ground here, ensuring that we don't drop providers of `operator new` when the developer explicitly listed them in the includes, and chose to believe it's the implicitly provided `operator new` and don't insert an include in other cases. 
--- clang-tools-extra/include-cleaner/lib/WalkAST.cpp | 4 ++-- clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp index a5ac3760a3be2a..598484d09712e5 100644 --- a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp +++ b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp @@ -351,11 +351,11 @@ class ASTWalker : public RecursiveASTVisitor { } bool VisitCXXNewExpr(CXXNewExpr *E) { - report(E->getExprLoc(), E->getOperatorNew()); + report(E->getExprLoc(), E->getOperatorNew(), RefType::Ambiguous); return true; } bool VisitCXXDeleteExpr(CXXDeleteExpr *E) { - report(E->getExprLoc(), E->getOperatorDelete()); + report(E->getExprLoc(), E->getOperatorDelete(), RefType::Ambiguous); return true; } }; diff --git a/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp b/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp index b0a4473d4ad2b7..6c8eacbff1cea3 100644 --- a/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp +++ b/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp @@ -557,9 +557,9 @@ TEST(WalkAST, FriendDecl) { } TEST(WalkAST, OperatorNewDelete) { - testWalk("void* $explicit^operator new(decltype(sizeof(int)), void*);", + testWalk("void* $ambiguous^operator new(decltype(sizeof(int)), void*);", "struct Bar { void foo() { Bar b; ^new (&b) Bar; } };"); - testWalk("struct A { static void $explicit^operator delete(void*); };", + testWalk("struct A { static void $ambiguous^operator delete(void*); };", "void foo() { A a; ^delete &a; }"); } } // namespace From e439fdf4ea0dbc6f001428f4d4956700bf26bb97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= Date: Sat, 24 Aug 2024 03:04:30 +0200 Subject: [PATCH 389/426] [clang-format] Treat new expressions as simple functions (#105168) ccae7b461be339e717d02f99ac857cf0bc7d17f improved handling for nested calls, 
but this resulted in a lot of changes near `new` expressions. This patch tries to restore previous behavior around new expressions, by treating them as simple functions, which seem to align with the concept. Fixes https://github.com/llvm/llvm-project/issues/105133. --- clang/lib/Format/ContinuationIndenter.cpp | 9 +++++++++ clang/unittests/Format/FormatTest.cpp | 12 ++++++++++++ 2 files changed, 21 insertions(+) diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 4fcb776db45b58..2422deee3b8d89 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -830,6 +830,12 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, const auto IsSimpleFunction = [&](const FormatToken &Tok) { if (!Tok.FakeLParens.empty() && Tok.FakeLParens.back() > prec::Unknown) return false; + // Nested calls that involve `new` expressions also look like simple + // function calls, eg: + // - foo(new Bar()) + // - foo(::new Bar()) + if (Tok.is(tok::kw_new) || Tok.startsSequence(tok::coloncolon, tok::kw_new)) + return true; const auto *Previous = Tok.Previous; if (!Previous || (!Previous->isOneOf(TT_FunctionDeclarationLParen, TT_LambdaDefinitionLParen) && @@ -852,6 +858,9 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, // caaaaaaaaaaaall( // caaaaaaaaaaaall( // caaaaaaaaaaaaaaaaaaaaaaall(aaaaaaaaaaaaaa, aaaaaaaaa)))); + // or + // caaaaaaaaaaaaaaaaaaaaal( + // new SomethingElseeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee()); !IsSimpleFunction(Current)) { CurrentState.NoLineBreak = true; } diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 779109976a4f77..a383a624434b1f 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -9383,6 +9383,18 @@ TEST_F(FormatTest, AlignsAfterOpenBracket) { " aaaaaaaaaaaaaaaaaaaa(aaaaaaaaaaaaaaaaa, aaaaaaaaaaaaaaaa)) &&\n" " 
aaaaaaaaaaaaaaaa);", Style); + verifyFormat( + "fooooooooooo(new BARRRRRRRRR(\n" + " XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXZZZZZZZZZZZZZZZZZZZZZZZZZ()));", + Style); + verifyFormat( + "fooooooooooo(::new BARRRRRRRRR(\n" + " XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXZZZZZZZZZZZZZZZZZZZZZZZZZ()));", + Style); + verifyFormat( + "fooooooooooo(new FOO::BARRRR(\n" + " XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXZZZZZZZZZZZZZZZZZZZZZZZZZ()));", + Style); Style.AlignAfterOpenBracket = FormatStyle::BAS_BlockIndent; Style.BinPackArguments = false; From d02132166a6ce56d54d6c8b2ca39e81b6466eb55 Mon Sep 17 00:00:00 2001 From: vporpo Date: Fri, 23 Aug 2024 20:13:56 -0700 Subject: [PATCH 390/426] [SandboxIR] Implement CleanupReturnInst (#105750) This patch implements sandboxir::CleanupReturnInst mirroring llvm::CleanupReturnInst. --- llvm/include/llvm/SandboxIR/SandboxIR.h | 35 +++++++++ .../llvm/SandboxIR/SandboxIRValues.def | 1 + llvm/lib/SandboxIR/SandboxIR.cpp | 57 +++++++++++++++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 71 +++++++++++++++++++ llvm/unittests/SandboxIR/TrackerTest.cpp | 51 +++++++++++++ 5 files changed, 215 insertions(+) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index c09e167d67bb1c..b8a28669cdd074 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -131,6 +131,7 @@ class FuncletPadInst; class CatchPadInst; class CleanupPadInst; class CatchReturnInst; +class CleanupReturnInst; class GetElementPtrInst; class CastInst; class PtrToIntInst; @@ -266,6 +267,7 @@ class Value { friend class CatchReturnInst; // For getting `Val`. friend class GetElementPtrInst; // For getting `Val`. friend class CatchSwitchInst; // For getting `Val`. + friend class CleanupReturnInst; // For getting `Val`. friend class SwitchInst; // For getting `Val`. friend class UnaryOperator; // For getting `Val`. friend class BinaryOperator; // For getting `Val`. 
@@ -690,6 +692,7 @@ class Instruction : public sandboxir::User { friend class CatchPadInst; // For getTopmostLLVMInstruction(). friend class CleanupPadInst; // For getTopmostLLVMInstruction(). friend class CatchReturnInst; // For getTopmostLLVMInstruction(). + friend class CleanupReturnInst; // For getTopmostLLVMInstruction(). friend class GetElementPtrInst; // For getTopmostLLVMInstruction(). friend class CatchSwitchInst; // For getTopmostLLVMInstruction(). friend class SwitchInst; // For getTopmostLLVMInstruction(). @@ -1941,6 +1944,36 @@ class CatchReturnInst } }; +class CleanupReturnInst + : public SingleLLVMInstructionImpl { + CleanupReturnInst(llvm::CleanupReturnInst *CRI, Context &Ctx) + : SingleLLVMInstructionImpl(ClassID::CleanupRet, Opcode::CleanupRet, CRI, + Ctx) {} + friend class Context; // For constructor. + +public: + static CleanupReturnInst *create(CleanupPadInst *CleanupPad, + BasicBlock *UnwindBB, BBIterator WhereIt, + BasicBlock *WhereBB, Context &Ctx); + bool hasUnwindDest() const { + return cast(Val)->hasUnwindDest(); + } + bool unwindsToCaller() const { + return cast(Val)->unwindsToCaller(); + } + CleanupPadInst *getCleanupPad() const; + void setCleanupPad(CleanupPadInst *CleanupPad); + unsigned getNumSuccessors() const { + return cast(Val)->getNumSuccessors(); + } + BasicBlock *getUnwindDest() const; + void setUnwindDest(BasicBlock *NewDest); + + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::CleanupRet; + } +}; + class GetElementPtrInst final : public SingleLLVMInstructionImpl { /// Use Context::createGetElementPtrInst(). 
Don't call @@ -2849,6 +2882,8 @@ class Context { friend CleanupPadInst; // For createCleanupPadInst() CatchReturnInst *createCatchReturnInst(llvm::CatchReturnInst *I); friend CatchReturnInst; // For createCatchReturnInst() + CleanupReturnInst *createCleanupReturnInst(llvm::CleanupReturnInst *I); + friend CleanupReturnInst; // For createCleanupReturnInst() GetElementPtrInst *createGetElementPtrInst(llvm::GetElementPtrInst *I); friend GetElementPtrInst; // For createGetElementPtrInst() CatchSwitchInst *createCatchSwitchInst(llvm::CatchSwitchInst *I); diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index b7b396e30dc3ca..14cb2d72ad3af6 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -49,6 +49,7 @@ DEF_INSTR(CallBr, OP(CallBr), CallBrInst) DEF_INSTR(CatchPad, OP(CatchPad), CatchPadInst) DEF_INSTR(CleanupPad, OP(CleanupPad), CleanupPadInst) DEF_INSTR(CatchRet, OP(CatchRet), CatchReturnInst) +DEF_INSTR(CleanupRet, OP(CleanupRet), CleanupReturnInst) DEF_INSTR(GetElementPtr, OP(GetElementPtr), GetElementPtrInst) DEF_INSTR(CatchSwitch, OP(CatchSwitch), CatchSwitchInst) DEF_INSTR(Switch, OP(Switch), SwitchInst) diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index b953e68c33180e..f92e9d38125139 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -1149,6 +1149,51 @@ Value *CatchReturnInst::getCatchSwitchParentPad() const { cast(Val)->getCatchSwitchParentPad()); } +CleanupReturnInst *CleanupReturnInst::create(CleanupPadInst *CleanupPad, + BasicBlock *UnwindBB, + BBIterator WhereIt, + BasicBlock *WhereBB, + Context &Ctx) { + auto &Builder = Ctx.getLLVMIRBuilder(); + if (WhereIt != WhereBB->end()) + Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction()); + else + Builder.SetInsertPoint(cast(WhereBB->Val)); + auto *LLVMUnwindBB = + UnwindBB != nullptr ? 
cast(UnwindBB->Val) : nullptr; + llvm::CleanupReturnInst *LLVMI = Builder.CreateCleanupRet( + cast(CleanupPad->Val), LLVMUnwindBB); + return Ctx.createCleanupReturnInst(LLVMI); +} + +CleanupPadInst *CleanupReturnInst::getCleanupPad() const { + return cast( + Ctx.getValue(cast(Val)->getCleanupPad())); +} + +void CleanupReturnInst::setCleanupPad(CleanupPadInst *CleanupPad) { + Ctx.getTracker() + .emplaceIfTracking>( + this); + cast(Val)->setCleanupPad( + cast(CleanupPad->Val)); +} + +BasicBlock *CleanupReturnInst::getUnwindDest() const { + return cast_or_null( + Ctx.getValue(cast(Val)->getUnwindDest())); +} + +void CleanupReturnInst::setUnwindDest(BasicBlock *NewDest) { + Ctx.getTracker() + .emplaceIfTracking>( + this); + cast(Val)->setUnwindDest( + cast(NewDest->Val)); +} + Value *GetElementPtrInst::create(Type *Ty, Value *Ptr, ArrayRef IdxList, BasicBlock::iterator WhereIt, @@ -2188,6 +2233,12 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { std::unique_ptr(new CatchReturnInst(LLVMCRI, *this)); return It->second.get(); } + case llvm::Instruction::CleanupRet: { + auto *LLVMCRI = cast(LLVMV); + It->second = std::unique_ptr( + new CleanupReturnInst(LLVMCRI, *this)); + return It->second.get(); + } case llvm::Instruction::GetElementPtr: { auto *LLVMGEP = cast(LLVMV); It->second = std::unique_ptr( @@ -2376,6 +2427,12 @@ CatchReturnInst *Context::createCatchReturnInst(llvm::CatchReturnInst *I) { auto NewPtr = std::unique_ptr(new CatchReturnInst(I, *this)); return cast(registerValue(std::move(NewPtr))); } +CleanupReturnInst * +Context::createCleanupReturnInst(llvm::CleanupReturnInst *I) { + auto NewPtr = + std::unique_ptr(new CleanupReturnInst(I, *this)); + return cast(registerValue(std::move(NewPtr))); +} GetElementPtrInst * Context::createGetElementPtrInst(llvm::GetElementPtrInst *I) { auto NewPtr = diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 76ca64caeeeb07..83edd954080e9f 
100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -2022,6 +2022,77 @@ define void @foo() { EXPECT_EQ(CRI->getSuccessor(), Catch); } +TEST_F(SandboxIRTest, CleanupReturnInst) { + parseIR(C, R"IR( +define void @foo() { +dispatch: + invoke void @foo() + to label %throw unwind label %cleanup +throw: + ret void +cleanup: + %cleanuppad = cleanuppad within none [] + cleanupret from %cleanuppad unwind label %cleanup2 +cleanup2: + %cleanuppad2 = cleanuppad within none [] + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + BasicBlock *LLVMCleanup = getBasicBlockByName(LLVMF, "cleanup"); + auto LLVMIt = LLVMCleanup->begin(); + [[maybe_unused]] auto *LLVMCP = cast(&*LLVMIt++); + auto *LLVMCRI = cast(&*LLVMIt++); + + sandboxir::Context Ctx(C); + [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF); + auto *Throw = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "throw"))); + auto *Cleanup = cast(Ctx.getValue(LLVMCleanup)); + auto *Cleanup2 = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "cleanup2"))); + auto It = Cleanup->begin(); + [[maybe_unused]] auto *CP = cast(&*It++); + auto *CRI = cast(&*It++); + It = Cleanup2->begin(); + auto *CP2 = cast(&*It++); + auto *Ret = cast(&*It++); + + // Check hasUnwindDest(). + EXPECT_EQ(CRI->hasUnwindDest(), LLVMCRI->hasUnwindDest()); + // Check unwindsToCaller(). + EXPECT_EQ(CRI->unwindsToCaller(), LLVMCRI->unwindsToCaller()); + // Check getCleanupPad(). + EXPECT_EQ(CRI->getCleanupPad(), Ctx.getValue(LLVMCRI->getCleanupPad())); + // Check setCleanupPad(). + auto *OrigCleanupPad = CRI->getCleanupPad(); + auto *NewCleanupPad = CP2; + EXPECT_NE(NewCleanupPad, OrigCleanupPad); + CRI->setCleanupPad(NewCleanupPad); + EXPECT_EQ(CRI->getCleanupPad(), NewCleanupPad); + CRI->setCleanupPad(OrigCleanupPad); + EXPECT_EQ(CRI->getCleanupPad(), OrigCleanupPad); + // Check setNumSuccessors(). 
+ EXPECT_EQ(CRI->getNumSuccessors(), LLVMCRI->getNumSuccessors()); + // Check getUnwindDest(). + EXPECT_EQ(CRI->getUnwindDest(), Ctx.getValue(LLVMCRI->getUnwindDest())); + // Check setUnwindDest(). + auto *OrigUnwindDest = CRI->getUnwindDest(); + auto *NewUnwindDest = Throw; + EXPECT_NE(NewUnwindDest, OrigUnwindDest); + CRI->setUnwindDest(NewUnwindDest); + EXPECT_EQ(CRI->getUnwindDest(), NewUnwindDest); + CRI->setUnwindDest(OrigUnwindDest); + EXPECT_EQ(CRI->getUnwindDest(), OrigUnwindDest); + // Check create(). + auto *UnwindBB = Cleanup; + auto *NewCRI = sandboxir::CleanupReturnInst::create( + CP2, UnwindBB, Ret->getIterator(), Ret->getParent(), Ctx); + EXPECT_EQ(NewCRI->getCleanupPad(), CP2); + EXPECT_EQ(NewCRI->getUnwindDest(), UnwindBB); + EXPECT_EQ(NewCRI->getNextNode(), Ret); +} + TEST_F(SandboxIRTest, GetElementPtrInstruction) { parseIR(C, R"IR( define void @foo(ptr %ptr, <2 x ptr> %ptrs) { diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp index 6614ab7fa248e1..f0d6a0d57b8c3e 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -763,6 +763,57 @@ define void @foo() { EXPECT_EQ(CR->getSuccessor(), OrigSucc); } +TEST_F(TrackerTest, CleanupReturnInstSetters) { + parseIR(C, R"IR( +define void @foo() { +dispatch: + invoke void @foo() + to label %throw unwind label %cleanup +throw: + ret void +cleanup: + %cleanuppad = cleanuppad within none [] + cleanupret from %cleanuppad unwind label %cleanup2 +cleanup2: + %cleanuppad2 = cleanuppad within none [] + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + BasicBlock *LLVMCleanup = getBasicBlockByName(LLVMF, "cleanup"); + + sandboxir::Context Ctx(C); + [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF); + auto *Throw = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "throw"))); + auto *Cleanup = cast(Ctx.getValue(LLVMCleanup)); + auto *Cleanup2 = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, 
"cleanup2"))); + auto It = Cleanup->begin(); + [[maybe_unused]] auto *CP = cast(&*It++); + auto *CRI = cast(&*It++); + auto *CP2 = cast(&*Cleanup2->begin()); + + // Check setCleanupPad(). + auto *OrigCleanupPad = CRI->getCleanupPad(); + auto *NewCleanupPad = CP2; + EXPECT_NE(NewCleanupPad, OrigCleanupPad); + Ctx.save(); + CRI->setCleanupPad(NewCleanupPad); + EXPECT_EQ(CRI->getCleanupPad(), NewCleanupPad); + Ctx.revert(); + EXPECT_EQ(CRI->getCleanupPad(), OrigCleanupPad); + // Check setUnwindDest(). + auto *OrigUnwindDest = CRI->getUnwindDest(); + auto *NewUnwindDest = Throw; + EXPECT_NE(NewUnwindDest, OrigUnwindDest); + Ctx.save(); + CRI->setUnwindDest(NewUnwindDest); + EXPECT_EQ(CRI->getUnwindDest(), NewUnwindDest); + Ctx.revert(); + EXPECT_EQ(CRI->getUnwindDest(), OrigUnwindDest); +} + TEST_F(TrackerTest, SwitchInstSetters) { parseIR(C, R"IR( define void @foo(i32 %cond0, i32 %cond1) { From 7615c0b2eb52b8c5d8e6dfc7f265a87a7a9f3ef5 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee Date: Fri, 23 Aug 2024 21:53:43 -0700 Subject: [PATCH 391/426] [StableHash] Implement with xxh3_64bits (#105849) This is a follow-up to address a suggestion from https://github.com/llvm/llvm-project/pull/105619. The main goal of this change is to efficiently implement stable hash functions using the xxh3 64bits API. `stable_hash_combine_range` and `stable_hash_combine_array` functions are removed and consolidated into a more general `stable_hash_combine` function that takes an `ArrayRef` as input. 
--- llvm/include/llvm/ADT/StableHashing.h | 75 ++++++-------------------- llvm/lib/CodeGen/MachineOperand.cpp | 3 +- llvm/lib/CodeGen/MachineStableHash.cpp | 27 ++++------ 3 files changed, 26 insertions(+), 79 deletions(-) diff --git a/llvm/include/llvm/ADT/StableHashing.h b/llvm/include/llvm/ADT/StableHashing.h index f675f828f702e5..7778f5d7c3a1c3 100644 --- a/llvm/include/llvm/ADT/StableHashing.h +++ b/llvm/include/llvm/ADT/StableHashing.h @@ -8,7 +8,10 @@ // // This file provides types and functions for computing and combining stable // hashes. Stable hashes can be useful for hashing across different modules, -// processes, or compiler runs. +// processes, machines, or compiler runs for a specific compiler version. It +// currently employs the xxh3_64bits hashing algorithm. Be aware that this +// implementation may be adjusted or updated as improvements to the compiler are +// made. // //===----------------------------------------------------------------------===// @@ -16,6 +19,7 @@ #define LLVM_ADT_STABLEHASHING_H #include "llvm/ADT/StringRef.h" +#include "llvm/Support/xxhash.h" namespace llvm { @@ -23,78 +27,29 @@ namespace llvm { /// deserialized, and is stable across processes and executions. 
using stable_hash = uint64_t; -// Implementation details -namespace hashing { -namespace detail { - -// Stable hashes are based on the 64-bit FNV-1 hash: -// https://en.wikipedia.org/wiki/Fowler-Noll-Vo_hash_function - -const uint64_t FNV_PRIME_64 = 1099511628211u; -const uint64_t FNV_OFFSET_64 = 14695981039346656037u; - -inline void stable_hash_append(stable_hash &Hash, const char Value) { - Hash = Hash ^ (Value & 0xFF); - Hash = Hash * FNV_PRIME_64; -} - -inline void stable_hash_append(stable_hash &Hash, stable_hash Value) { - for (unsigned I = 0; I < 8; ++I) { - stable_hash_append(Hash, static_cast(Value)); - Value >>= 8; - } +inline stable_hash stable_hash_combine(ArrayRef Buffer) { + const uint8_t *Ptr = reinterpret_cast(Buffer.data()); + size_t Size = Buffer.size() * sizeof(stable_hash); + return xxh3_64bits(ArrayRef(Ptr, Size)); } -} // namespace detail -} // namespace hashing - inline stable_hash stable_hash_combine(stable_hash A, stable_hash B) { - stable_hash Hash = hashing::detail::FNV_OFFSET_64; - hashing::detail::stable_hash_append(Hash, A); - hashing::detail::stable_hash_append(Hash, B); - return Hash; + stable_hash Hashes[2] = {A, B}; + return stable_hash_combine(Hashes); } inline stable_hash stable_hash_combine(stable_hash A, stable_hash B, stable_hash C) { - stable_hash Hash = hashing::detail::FNV_OFFSET_64; - hashing::detail::stable_hash_append(Hash, A); - hashing::detail::stable_hash_append(Hash, B); - hashing::detail::stable_hash_append(Hash, C); - return Hash; + stable_hash Hashes[3] = {A, B, C}; + return stable_hash_combine(Hashes); } inline stable_hash stable_hash_combine(stable_hash A, stable_hash B, stable_hash C, stable_hash D) { - stable_hash Hash = hashing::detail::FNV_OFFSET_64; - hashing::detail::stable_hash_append(Hash, A); - hashing::detail::stable_hash_append(Hash, B); - hashing::detail::stable_hash_append(Hash, C); - hashing::detail::stable_hash_append(Hash, D); - return Hash; -} - -/// Compute a stable_hash for a sequence of 
values. -/// -/// This hashes a sequence of values. It produces the same stable_hash as -/// 'stable_hash_combine(a, b, c, ...)', but can run over arbitrary sized -/// sequences and is significantly faster given pointers and types which -/// can be hashed as a sequence of bytes. -template -stable_hash stable_hash_combine_range(InputIteratorT First, - InputIteratorT Last) { - stable_hash Hash = hashing::detail::FNV_OFFSET_64; - for (auto I = First; I != Last; ++I) - hashing::detail::stable_hash_append(Hash, *I); - return Hash; + stable_hash Hashes[4] = {A, B, C, D}; + return stable_hash_combine(Hashes); } -inline stable_hash stable_hash_combine_array(const stable_hash *P, size_t C) { - stable_hash Hash = hashing::detail::FNV_OFFSET_64; - for (size_t I = 0; I < C; ++I) - hashing::detail::stable_hash_append(Hash, P[I]); - return Hash; -} } // namespace llvm #endif diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index ace05902d5df79..a0726ca64910ea 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -424,8 +424,7 @@ hash_code llvm::hash_value(const MachineOperand &MO) { const uint32_t *RegMask = MO.getRegMask(); std::vector RegMaskHashes(RegMask, RegMask + RegMaskSize); return hash_combine(MO.getType(), MO.getTargetFlags(), - stable_hash_combine_array(RegMaskHashes.data(), - RegMaskHashes.size())); + stable_hash_combine(RegMaskHashes)); } assert(0 && "MachineOperand not associated with any MachineFunction"); diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp index fb5e9a37d9b997..916acbf2d2cbf9 100644 --- a/llvm/lib/CodeGen/MachineStableHash.cpp +++ b/llvm/lib/CodeGen/MachineStableHash.cpp @@ -66,7 +66,7 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { SmallVector DefOpcodes; for (auto &Def : MRI.def_instructions(MO.getReg())) DefOpcodes.push_back(Def.getOpcode()); - return stable_hash_combine_range(DefOpcodes.begin(), 
DefOpcodes.end()); + return stable_hash_combine(DefOpcodes); } // Register operands don't have target flags. @@ -78,8 +78,8 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { case MachineOperand::MO_FPImmediate: { auto Val = MO.isCImm() ? MO.getCImm()->getValue() : MO.getFPImm()->getValueAPF().bitcastToAPInt(); - auto ValHash = - stable_hash_combine_array(Val.getRawData(), Val.getNumWords()); + auto ValHash = stable_hash_combine( + ArrayRef(Val.getRawData(), Val.getNumWords())); return stable_hash_combine(MO.getType(), MO.getTargetFlags(), ValHash); } @@ -126,10 +126,8 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { const uint32_t *RegMask = MO.getRegMask(); std::vector RegMaskHashes(RegMask, RegMask + RegMaskSize); - return stable_hash_combine( - MO.getType(), MO.getTargetFlags(), - stable_hash_combine_array(RegMaskHashes.data(), - RegMaskHashes.size())); + return stable_hash_combine(MO.getType(), MO.getTargetFlags(), + stable_hash_combine(RegMaskHashes)); } } } @@ -145,10 +143,8 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { MO.getShuffleMask(), std::back_inserter(ShuffleMaskHashes), [](int S) -> llvm::stable_hash { return llvm::stable_hash(S); }); - return stable_hash_combine( - MO.getType(), MO.getTargetFlags(), - stable_hash_combine_array(ShuffleMaskHashes.data(), - ShuffleMaskHashes.size())); + return stable_hash_combine(MO.getType(), MO.getTargetFlags(), + stable_hash_combine(ShuffleMaskHashes)); } case MachineOperand::MO_MCSymbol: { auto SymbolName = MO.getMCSymbol()->getName(); @@ -212,8 +208,7 @@ stable_hash llvm::stableHashValue(const MachineInstr &MI, bool HashVRegs, HashComponents.push_back(static_cast(Op->getFailureOrdering())); } - return stable_hash_combine_range(HashComponents.begin(), - HashComponents.end()); + return stable_hash_combine(HashComponents); } stable_hash llvm::stableHashValue(const MachineBasicBlock &MBB) { @@ -221,8 +216,7 @@ stable_hash llvm::stableHashValue(const MachineBasicBlock 
&MBB) { // TODO: Hash more stuff like block alignment and branch probabilities. for (const auto &MI : MBB) HashComponents.push_back(stableHashValue(MI)); - return stable_hash_combine_range(HashComponents.begin(), - HashComponents.end()); + return stable_hash_combine(HashComponents); } stable_hash llvm::stableHashValue(const MachineFunction &MF) { @@ -230,6 +224,5 @@ stable_hash llvm::stableHashValue(const MachineFunction &MF) { // TODO: Hash lots more stuff like function alignment and stack objects. for (const auto &MBB : MF) HashComponents.push_back(stableHashValue(MBB)); - return stable_hash_combine_range(HashComponents.begin(), - HashComponents.end()); + return stable_hash_combine(HashComponents); } From 62601250786244981026164b21f553c9c40a4602 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Fri, 23 Aug 2024 21:55:11 -0700 Subject: [PATCH 392/426] [docs] Fix links in github user guide - graphite section Mistakenly used markdown style rather than rst in #104499. --- llvm/docs/GitHub.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst index a33211bfef41de..bf6e915d4458f2 100644 --- a/llvm/docs/GitHub.rst +++ b/llvm/docs/GitHub.rst @@ -26,19 +26,19 @@ aren't associated with a pull-request **will be deleted**. Using Graphite for stacked Pull Requests ======================================== -[Graphite](https://app.graphite.dev/) is a stacked pull request tool supported -by the LLVM repo (the other being [reviewable.io](https://reviewable.io)). +`Graphite `_ is a stacked pull request tool supported +by the LLVM repo (the other being `reviewable.io `_). -Graphite will want to create branches under `llvm/llvm-project` rather than your +Graphite will want to create branches under ``llvm/llvm-project`` rather than your private fork, so the guidance above, about branch naming, is critical, otherwise -`gt submit` (i.e. publish your PRs for review) will fail. +``gt submit`` (i.e. 
publish your PRs for review) will fail. -Use `gt config` then `Branch naming settings` and `Set a prefix for branch names`. -Include the last `/`. +Use ``gt config`` then ``Branch naming settings`` and ``Set a prefix for branch names``. +Include the last ``/``. If you didn't do the above and Graphite created non-prefixed branches, a simple way to -unblock is to rename (`git -m `), and then checkout the branch -and `gt track`. +unblock is to rename (``git -m ``), and then checkout the branch +and ``gt track``. Pull Requests ============= From 75ef95584d8867d0039a43bad0bd8e53f3293f67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 22 Aug 2024 15:52:54 +0200 Subject: [PATCH 393/426] [clang][bytecode][NFC] Move test to verify=expected,both style --- clang/test/AST/ByteCode/invalid.cpp | 48 ++++++++++------------------- 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/clang/test/AST/ByteCode/invalid.cpp b/clang/test/AST/ByteCode/invalid.cpp index 3c142481f78119..13ba84bcad1040 100644 --- a/clang/test/AST/ByteCode/invalid.cpp +++ b/clang/test/AST/ByteCode/invalid.cpp @@ -5,65 +5,49 @@ namespace Throw { constexpr int ConditionalThrow(bool t) { if (t) - throw 4; // expected-note {{subexpression not valid in a constant expression}} \ - // ref-note {{subexpression not valid in a constant expression}} + throw 4; // both-note {{subexpression not valid in a constant expression}} return 0; } static_assert(ConditionalThrow(false) == 0, ""); - static_assert(ConditionalThrow(true) == 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'ConditionalThrow(true)'}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'ConditionalThrow(true)'}} + static_assert(ConditionalThrow(true) == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'ConditionalThrow(true)'}} - constexpr int Throw() { // expected-error {{never produces a 
constant expression}} \ - // ref-error {{never produces a constant expression}} - throw 5; // expected-note {{subexpression not valid in a constant expression}} \ - // ref-note {{subexpression not valid in a constant expression}} + constexpr int Throw() { // both-error {{never produces a constant expression}} + throw 5; // both-note {{subexpression not valid in a constant expression}} return 0; } - constexpr int NoSubExpr() { // ref-error {{never produces a constant expression}} \ - // expected-error {{never produces a constant expression}} - throw; // ref-note 2{{subexpression not valid}} \ - // expected-note 2{{subexpression not valid}} + constexpr int NoSubExpr() { // both-error {{never produces a constant expression}} + throw; // both-note 2{{subexpression not valid}} return 0; } - static_assert(NoSubExpr() == 0, ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} + static_assert(NoSubExpr() == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} } namespace Asm { constexpr int ConditionalAsm(bool t) { if (t) - asm(""); // expected-note {{subexpression not valid in a constant expression}} \ - // ref-note {{subexpression not valid in a constant expression}} + asm(""); // both-note {{subexpression not valid in a constant expression}} return 0; } static_assert(ConditionalAsm(false) == 0, ""); - static_assert(ConditionalAsm(true) == 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'ConditionalAsm(true)'}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'ConditionalAsm(true)'}} + static_assert(ConditionalAsm(true) == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'ConditionalAsm(true)'}} - constexpr int Asm() { // expected-error {{never produces a constant 
expression}} \ - // ref-error {{never produces a constant expression}} - __asm volatile(""); // expected-note {{subexpression not valid in a constant expression}} \ - // ref-note {{subexpression not valid in a constant expression}} + constexpr int Asm() { // both-error {{never produces a constant expression}} + __asm volatile(""); // both-note {{subexpression not valid in a constant expression}} return 0; } } namespace Casts { - constexpr int a = reinterpret_cast(12); // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{reinterpret_cast is not allowed}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{reinterpret_cast is not allowed}} + constexpr int a = reinterpret_cast(12); // both-error {{must be initialized by a constant expression}} \ + // both-note {{reinterpret_cast is not allowed}} void func() { struct B {}; From c81d6665601d648c1a5349b665ee6019f3786352 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Sat, 24 Aug 2024 07:50:23 +0200 Subject: [PATCH 394/426] [clang][bytecode] Fix IntegralAP::is{Positive,Negative} (#105924) This depends on signed-ness. 
--- clang/lib/AST/ByteCode/IntegralAP.h | 12 ++++++++++-- clang/test/AST/ByteCode/intap.cpp | 9 +++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h index b8aa21038256c7..209b0af7da5f30 100644 --- a/clang/lib/AST/ByteCode/IntegralAP.h +++ b/clang/lib/AST/ByteCode/IntegralAP.h @@ -136,8 +136,16 @@ template class IntegralAP final { APValue toAPValue(const ASTContext &) const { return APValue(toAPSInt()); } bool isZero() const { return V.isZero(); } - bool isPositive() const { return V.isNonNegative(); } - bool isNegative() const { return !V.isNonNegative(); } + bool isPositive() const { + if constexpr (Signed) + return V.isNonNegative(); + return true; + } + bool isNegative() const { + if constexpr (Signed) + return !V.isNonNegative(); + return false; + } bool isMin() const { return V.isMinValue(); } bool isMax() const { return V.isMaxValue(); } static constexpr bool isSigned() { return Signed; } diff --git a/clang/test/AST/ByteCode/intap.cpp b/clang/test/AST/ByteCode/intap.cpp index d4440124856915..d0ad641fe508cb 100644 --- a/clang/test/AST/ByteCode/intap.cpp +++ b/clang/test/AST/ByteCode/intap.cpp @@ -104,6 +104,15 @@ static_assert(INT128_MAX == 0, ""); // expected-error {{failed}} \ // ref-note {{evaluates to '170141183460469231731687303715884105727 == 0'}} static const __int128_t INT128_MIN = -INT128_MAX - 1; + +namespace PointerArithmeticOverflow { + int n; + constexpr int *p = (&n + 1) + (unsigned __int128)-1; // expected-error {{constant expression}} \ + // expected-note {{cannot refer to element 3402}} \ + // ref-error {{constant expression}} \ + // ref-note {{cannot refer to element 3402}} +} + namespace i128 { constexpr int128_t I128_1 = 12; From 68030f86aef11558c9ed14a34250433f57923c84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Sat, 24 Aug 2024 06:49:11 +0200 Subject: [PATCH 395/426] [clang][bytecode][NFC] Fix printing signed IntegralAP 
values --- clang/lib/AST/ByteCode/IntegralAP.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h index 209b0af7da5f30..a4d656433344b7 100644 --- a/clang/lib/AST/ByteCode/IntegralAP.h +++ b/clang/lib/AST/ByteCode/IntegralAP.h @@ -153,7 +153,7 @@ template class IntegralAP final { unsigned countLeadingZeros() const { return V.countl_zero(); } - void print(llvm::raw_ostream &OS) const { OS << V; } + void print(llvm::raw_ostream &OS) const { V.print(OS, Signed);} std::string toDiagnosticString(const ASTContext &Ctx) const { std::string NameStr; llvm::raw_string_ostream OS(NameStr); From 62e7b59f10d9af809dd54fc064e2f60f0b48938c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Sat, 24 Aug 2024 08:24:58 +0200 Subject: [PATCH 396/426] [clang][bytecode][NFC] Move test case to -verify=expected,both style --- clang/test/AST/ByteCode/intap.cpp | 134 ++++++++++-------------------- 1 file changed, 46 insertions(+), 88 deletions(-) diff --git a/clang/test/AST/ByteCode/intap.cpp b/clang/test/AST/ByteCode/intap.cpp index d0ad641fe508cb..3f952ddf626b58 100644 --- a/clang/test/AST/ByteCode/intap.cpp +++ b/clang/test/AST/ByteCode/intap.cpp @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -fms-extensions -std=c++11 -verify %s -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -fms-extensions -std=c++20 -verify %s -// RUN: %clang_cc1 -std=c++11 -fms-extensions -verify=ref %s -// RUN: %clang_cc1 -std=c++20 -fms-extensions -verify=ref %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -fms-extensions -std=c++11 -verify=expected,both %s +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -fms-extensions -std=c++20 -verify=expected,both %s +// RUN: %clang_cc1 -std=c++11 -fms-extensions -verify=ref,both %s +// RUN: %clang_cc1 -std=c++20 -fms-extensions -verify=ref,both %s using MaxBitInt = _BitInt(128); @@ -9,13 +9,10 @@ using 
MaxBitInt = _BitInt(128); constexpr _BitInt(2) A = 0; constexpr _BitInt(2) B = A + 1; -constexpr _BitInt(2) C = B + 1; // expected-warning {{from 2 to -2}} \ - // ref-warning {{from 2 to -2}} +constexpr _BitInt(2) C = B + 1; // both-warning {{from 2 to -2}} static_assert(C == -2, ""); -static_assert(C - B == A, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{value -3 is outside the range of representable values}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{value -3 is outside the range of representable values}} +static_assert(C - B == A, ""); // both-error {{not an integral constant expression}} \ + // both-note {{value -3 is outside the range of representable values}} static_assert(B - 1 == 0, ""); @@ -38,10 +35,8 @@ static_assert(BI1 == 3, ""); constexpr _BitInt(4) MulA = 5; constexpr _BitInt(4) MulB = 7; -static_assert(MulA * MulB == 50, ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{value 35 is outside the range of representable values of type '_BitInt(4)'}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{value 35 is outside the range of representable values of type '_BitInt(4)'}} +static_assert(MulA * MulB == 50, ""); // both-error {{not an integral constant expression}} \ + // both-note {{value 35 is outside the range of representable values of type '_BitInt(4)'}} static_assert(MulA * 5 == 25, ""); static_assert(-1 * MulB == -7, ""); @@ -50,29 +45,21 @@ constexpr _BitInt(4) DivA = 2; constexpr _BitInt(2) DivB = 1; static_assert(DivA / DivB == 2, ""); -constexpr _BitInt(4) DivC = DivA / 0; // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{division by zero}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{division by zero}} +constexpr _BitInt(4) DivC = DivA / 0; // both-error {{must be initialized by a constant expression}} \ + // both-note {{division by 
zero}} constexpr _BitInt(7) RemA = 47; constexpr _BitInt(6) RemB = 9; static_assert(RemA % RemB == 2, ""); -static_assert(RemA % 0 == 1, ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{division by zero}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{division by zero}} +static_assert(RemA % 0 == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{division by zero}} constexpr _BitInt(32) bottom = -1; constexpr _BitInt(32) top = INT_MIN; -constexpr _BitInt(32) nope = top / bottom; // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{value 2147483648 is outside the range}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{value 2147483648 is outside the range}} -constexpr _BitInt(32) noooo = top % bottom; // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{value 2147483648 is outside the range}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{value 2147483648 is outside the range}} +constexpr _BitInt(32) nope = top / bottom; // both-error {{must be initialized by a constant expression}} \ + // both-note {{value 2147483648 is outside the range}} +constexpr _BitInt(32) noooo = top % bottom; // both-error {{must be initialized by a constant expression}} \ + // both-note {{value 2147483648 is outside the range}} namespace APCast { constexpr _BitInt(10) A = 1; @@ -91,26 +78,20 @@ typedef __int128 int128_t; typedef unsigned __int128 uint128_t; static const __uint128_t UINT128_MAX =__uint128_t(__int128_t(-1L)); static_assert(UINT128_MAX == -1, ""); -static_assert(UINT128_MAX == 1, ""); // expected-error {{static assertion failed}} \ - // expected-note {{'340282366920938463463374607431768211455 == 1'}} \ - // ref-error {{static assertion failed}} \ - // ref-note {{'340282366920938463463374607431768211455 == 1'}} +static_assert(UINT128_MAX 
== 1, ""); // both-error {{static assertion failed}} \ + // both-note {{'340282366920938463463374607431768211455 == 1'}} static const __int128_t INT128_MAX = UINT128_MAX >> (__int128_t)1; static_assert(INT128_MAX != 0, ""); -static_assert(INT128_MAX == 0, ""); // expected-error {{failed}} \ - // expected-note {{evaluates to '170141183460469231731687303715884105727 == 0'}} \ - // ref-error {{failed}} \ - // ref-note {{evaluates to '170141183460469231731687303715884105727 == 0'}} +static_assert(INT128_MAX == 0, ""); // both-error {{failed}} \ + // both-note {{evaluates to '170141183460469231731687303715884105727 == 0'}} static const __int128_t INT128_MIN = -INT128_MAX - 1; namespace PointerArithmeticOverflow { int n; - constexpr int *p = (&n + 1) + (unsigned __int128)-1; // expected-error {{constant expression}} \ - // expected-note {{cannot refer to element 3402}} \ - // ref-error {{constant expression}} \ - // ref-note {{cannot refer to element 3402}} + constexpr int *p = (&n + 1) + (unsigned __int128)-1; // both-error {{constant expression}} \ + // both-note {{cannot refer to element 3402}} } namespace i128 { @@ -118,37 +99,27 @@ namespace i128 { constexpr int128_t I128_1 = 12; static_assert(I128_1 == 12, ""); static_assert(I128_1 != 10, ""); - static_assert(I128_1 != 12, ""); // expected-error{{failed}} \ - // ref-error{{failed}} \ - // expected-note{{evaluates to}} \ - // ref-note{{evaluates to}} + static_assert(I128_1 != 12, ""); // both-error{{failed}} \ + // both-note{{evaluates to}} static const __uint128_t UINT128_MAX =__uint128_t(__int128_t(-1L)); static_assert(UINT128_MAX == -1, ""); - static_assert(UINT128_MAX == 1, ""); // expected-error {{static assertion failed}} \ - // expected-note {{'340282366920938463463374607431768211455 == 1'}} \ - // ref-error {{static assertion failed}} \ - // ref-note {{'340282366920938463463374607431768211455 == 1'}} + static_assert(UINT128_MAX == 1, ""); // both-error {{static assertion failed}} \ + // both-note 
{{'340282366920938463463374607431768211455 == 1'}} constexpr uint128_t TooMuch = UINT128_MAX * 2; static const __int128_t INT128_MAX = UINT128_MAX >> (__int128_t)1; static_assert(INT128_MAX != 0, ""); - static_assert(INT128_MAX == 0, ""); // expected-error {{failed}} \ - // expected-note {{evaluates to '170141183460469231731687303715884105727 == 0'}} \ - // ref-error {{failed}} \ - // ref-note {{evaluates to '170141183460469231731687303715884105727 == 0'}} + static_assert(INT128_MAX == 0, ""); // both-error {{failed}} \ + // both-note {{evaluates to '170141183460469231731687303715884105727 == 0'}} - constexpr int128_t TooMuch2 = INT128_MAX * INT128_MAX; // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{value 28948022309329048855892746252171976962977213799489202546401021394546514198529 is outside the range of representable}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{value 28948022309329048855892746252171976962977213799489202546401021394546514198529 is outside the range of representable}} + constexpr int128_t TooMuch2 = INT128_MAX * INT128_MAX; // both-error {{must be initialized by a constant expression}} \ + // both-note {{value 28948022309329048855892746252171976962977213799489202546401021394546514198529 is outside the range of representable}} static const __int128_t INT128_MIN = -INT128_MAX - 1; - constexpr __int128 A = INT128_MAX + 1; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{value 170141183460469231731687303715884105728 is outside the range}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{value 170141183460469231731687303715884105728 is outside the range}} + constexpr __int128 A = INT128_MAX + 1; // both-error {{must be initialized by a constant expression}} \ + // both-note {{value 170141183460469231731687303715884105728 is outside the range}} constexpr int128_t Two = (int128_t)1 << 1ul; 
static_assert(Two == 2, ""); static_assert(Two, ""); @@ -214,22 +185,17 @@ namespace i128 { static_assert(CastTo(12) == 12, ""); #endif - constexpr int128_t Error = __LDBL_MAX__; // ref-warning {{implicit conversion of out of range value}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{is outside the range of representable values of type}} \ - // expected-warning {{implicit conversion of out of range value}} \ - // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{is outside the range of representable values of type}} + constexpr int128_t Error = __LDBL_MAX__; // both-warning {{implicit conversion of out of range value}} \ + // both-error {{must be initialized by a constant expression}} \ + // both-note {{is outside the range of representable values of type}} constexpr uint128_t Zero = 0; static_assert((Zero -1) == -1, ""); constexpr int128_t Five = 5; static_assert(Five - Zero == Five, ""); - constexpr int128_t Sub1 = INT128_MIN - 1; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{-170141183460469231731687303715884105729 is outside the range}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{-170141183460469231731687303715884105729 is outside the range}} + constexpr int128_t Sub1 = INT128_MIN - 1; // both-error {{must be initialized by a constant expression}} \ + // both-note {{-170141183460469231731687303715884105729 is outside the range}} } namespace AddSubOffset { @@ -245,16 +211,14 @@ namespace Bitfields { struct S1 { unsigned _BitInt(128) a : 2; }; - constexpr S1 s1{100}; // ref-warning {{changes value from 100 to 0}} \ - // expected-warning {{changes value from 100 to 0}} + constexpr S1 s1{100}; // both-warning {{changes value from 100 to 0}} constexpr S1 s12{3}; static_assert(s12.a == 3, ""); struct S2 { unsigned __int128 a : 2; }; - constexpr S2 s2{100}; // ref-warning {{changes value from 100 to 0}} \ - // 
expected-warning {{changes value from 100 to 0}} + constexpr S2 s2{100}; // both-warning {{changes value from 100 to 0}} } namespace BitOps { @@ -275,21 +239,15 @@ namespace IncDec { int128_t a = INT128_MAX; if (Pre) - ++a; // ref-note {{value 170141183460469231731687303715884105728 is outside the range}} \ - // expected-note {{value 170141183460469231731687303715884105728 is outside the range}} + ++a; // both-note {{value 170141183460469231731687303715884105728 is outside the range}} else - a++; // ref-note {{value 170141183460469231731687303715884105728 is outside the range}} \ - // expected-note {{value 170141183460469231731687303715884105728 is outside the range}} + a++; // both-note {{value 170141183460469231731687303715884105728 is outside the range}} return a; } - static_assert(maxPlus1(true) == 0, ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} - static_assert(maxPlus1(false) == 0, ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} + static_assert(maxPlus1(true) == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + static_assert(maxPlus1(false) == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} constexpr int128_t inc1(bool Pre) { int128_t A = 0; From e185850ce735ade5924129bec56a5954c443cf17 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 23 Aug 2024 23:29:09 -0700 Subject: [PATCH 397/426] Revert ""Reland "[asan] Remove debug tracing from `report_globals` (#104404)"" (#105926) Reverts llvm/llvm-project#105895 Still breaks the test https://green.lab.llvm.org/job/llvm.org/job/clang-stage1-RA/1864/ --- compiler-rt/lib/asan/asan_flags.inc | 7 +++++-- compiler-rt/lib/asan/asan_globals.cpp | 19 
+++++++++++-------- .../Linux/initialization-nobug-lld.cpp | 2 +- .../Linux/odr_indicator_unregister.cpp | 2 +- .../asan/TestCases/Linux/odr_indicators.cpp | 4 ++-- .../TestCases/Windows/dll_global_dead_strip.c | 4 ++-- ...eport_globals_symbolization_at_startup.cpp | 2 +- .../TestCases/Windows/global_dead_strip.c | 4 ++-- .../Windows/report_globals_vs_freelibrary.cpp | 2 +- .../asan/TestCases/initialization-nobug.cpp | 8 ++++---- 10 files changed, 30 insertions(+), 24 deletions(-) diff --git a/compiler-rt/lib/asan/asan_flags.inc b/compiler-rt/lib/asan/asan_flags.inc index 5e0ced9706e664..fad1577d912a5e 100644 --- a/compiler-rt/lib/asan/asan_flags.inc +++ b/compiler-rt/lib/asan/asan_flags.inc @@ -36,8 +36,11 @@ ASAN_FLAG(int, max_redzone, 2048, ASAN_FLAG( bool, debug, false, "If set, prints some debugging information and does additional checks.") -ASAN_FLAG(bool, report_globals, true, - "If set, detect and report errors on globals .") +ASAN_FLAG( + int, report_globals, 1, + "Controls the way to handle globals (0 - don't detect buffer overflow on " + "globals, 1 - detect buffer overflow, 2 - print data about registered " + "globals).") ASAN_FLAG(bool, check_initialization_order, false, "If set, attempts to catch initialization order issues.") ASAN_FLAG( diff --git a/compiler-rt/lib/asan/asan_globals.cpp b/compiler-rt/lib/asan/asan_globals.cpp index bf0edce937f06e..c83b782cb85f89 100644 --- a/compiler-rt/lib/asan/asan_globals.cpp +++ b/compiler-rt/lib/asan/asan_globals.cpp @@ -22,7 +22,6 @@ #include "asan_thread.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_dense_map.h" -#include "sanitizer_common/sanitizer_internal_defs.h" #include "sanitizer_common/sanitizer_list.h" #include "sanitizer_common/sanitizer_mutex.h" #include "sanitizer_common/sanitizer_placement_new.h" @@ -180,7 +179,7 @@ int GetGlobalsForAddress(uptr addr, Global *globals, u32 *reg_sites, int res = 0; for (const auto &l : list_of_all_globals) { const Global &g = 
*l.g; - if (UNLIKELY(common_flags()->verbosity >= 3)) + if (flags()->report_globals >= 2) ReportGlobal(g, "Search"); if (IsAddressNearGlobal(addr, g)) { internal_memcpy(&globals[res], &g, sizeof(g)); @@ -271,7 +270,7 @@ static inline bool UseODRIndicator(const Global *g) { // so we store the globals in a map. static void RegisterGlobal(const Global *g) SANITIZER_REQUIRES(mu_for_globals) { CHECK(AsanInited()); - if (UNLIKELY(common_flags()->verbosity >= 3)) + if (flags()->report_globals >= 2) ReportGlobal(*g, "Added"); CHECK(flags()->report_globals); CHECK(AddrIsInMem(g->beg)); @@ -308,7 +307,7 @@ static void RegisterGlobal(const Global *g) SANITIZER_REQUIRES(mu_for_globals) { static void UnregisterGlobal(const Global *g) SANITIZER_REQUIRES(mu_for_globals) { CHECK(AsanInited()); - if (UNLIKELY(common_flags()->verbosity >= 3)) + if (flags()->report_globals >= 2) ReportGlobal(*g, "Removed"); CHECK(flags()->report_globals); CHECK(AddrIsInMem(g->beg)); @@ -439,7 +438,7 @@ void __asan_register_globals(__asan_global *globals, uptr n) { } GlobalRegistrationSite site = {stack_id, &globals[0], &globals[n - 1]}; global_registration_site_vector->push_back(site); - if (UNLIKELY(common_flags()->verbosity >= 4)) { + if (flags()->report_globals >= 2) { PRINT_CURRENT_STACK(); Printf("=== ID %d; %p %p\n", stack_id, (void *)&globals[0], (void *)&globals[n - 1]); @@ -498,7 +497,9 @@ void __asan_before_dynamic_init(const char *module_name) { Lock lock(&mu_for_globals); if (current_dynamic_init_module_name == module_name) return; - VPrintf(2, "DynInitPoison module: %s\n", module_name); + if (flags()->report_globals >= 3) + Printf("DynInitPoison module: %s\n", module_name); + if (current_dynamic_init_module_name == nullptr) { // First call, poison all globals from other modules. 
DynInitGlobals().forEach([&](auto &kv) { @@ -544,7 +545,8 @@ static void UnpoisonBeforeMain(void) { return; allow_after_dynamic_init = true; } - VPrintf(2, "UnpoisonBeforeMain\n"); + if (flags()->report_globals >= 3) + Printf("UnpoisonBeforeMain\n"); __asan_after_dynamic_init(); } @@ -568,7 +570,8 @@ void __asan_after_dynamic_init() { if (!current_dynamic_init_module_name) return; - VPrintf(2, "DynInitUnpoison\n"); + if (flags()->report_globals >= 3) + Printf("DynInitUnpoison\n"); DynInitGlobals().forEach([&](auto &kv) { UnpoisonDynamicGlobals(kv.second, /*mark_initialized=*/false); diff --git a/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp b/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp index ef82c7a29575eb..5cec029811cbc8 100644 --- a/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/initialization-nobug-lld.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx_asan -O3 %S/../initialization-nobug.cpp %S/../Helpers/initialization-nobug-extra.cpp -fuse-ld=lld -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInit" +// RUN: %clangxx_asan -O3 %S/../initialization-nobug.cpp %S/../Helpers/initialization-nobug-extra.cpp -fuse-ld=lld -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInit" // Same as initialization-nobug.cpp, but with lld we expect just one // `DynInitUnpoison` executed after `AfterDynamicInit` at the end. 
diff --git a/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp b/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp index b75f5be101ef8a..0f2ed6597154bb 100644 --- a/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/odr_indicator_unregister.cpp @@ -4,7 +4,7 @@ // RUN: %clangxx_asan -g -O0 -DSHARED_LIB -DSIZE=1 %s -fPIC -shared -o %t-so-1.so // RUN: %clangxx_asan -g -O0 -DSHARED_LIB -DSIZE=2 %s -fPIC -shared -o %t-so-2.so // RUN: %clangxx_asan -g -O0 %s %libdl -Wl,--export-dynamic -o %t -// RUN: %env_asan_opts=report_globals=1:detect_odr_violation=1:verbosity=3 %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=report_globals=2:detect_odr_violation=1 %run %t 2>&1 | FileCheck %s // FIXME: Checks do not match on Android. // UNSUPPORTED: android diff --git a/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp b/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp index f28a9f6d07386d..8af3ec09be78c4 100644 --- a/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/odr_indicators.cpp @@ -1,8 +1,8 @@ // RUN: %clangxx_asan -fno-sanitize-address-use-odr-indicator -fPIC %s -o %t -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR0 +// RUN: %env_asan_opts=report_globals=2 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR0 // RUN: %clangxx_asan -fsanitize-address-use-odr-indicator -fPIC %s -o %t -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR1 +// RUN: %env_asan_opts=report_globals=2 %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,INDICATOR1 #include diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c b/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c index e5bd27bdf65fdf..a0c96622efeea4 100644 --- 
a/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c +++ b/compiler-rt/test/asan/TestCases/Windows/dll_global_dead_strip.c @@ -1,11 +1,11 @@ // RUN: %clang_cl_asan %Od %p/dll_host.cpp %Fe%t // // RUN: %clang_cl_nocxx_asan %Gw %LD %Od %s %Fe%t.dll -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=NOSTRIP +// RUN: %env_asan_opts=report_globals=2 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=NOSTRIP // RUN: %clang_cl_nocxx_asan %Gw %LD -O2 %s %Fe%t.dll \ // RUN: %if target={{.*-windows-gnu}} %{ -Wl,--gc-sections %} \ // RUN: %else %{ -link -opt:ref %} -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=STRIP +// RUN: %env_asan_opts=report_globals=2 %run %t %t.dll 2>&1 | FileCheck %s --check-prefix=STRIP #include diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp index c74b66f2b43b3e..06a632e6708b1e 100644 --- a/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp +++ b/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cl_asan %LD %Od -DDLL %s %Fe%t.dll \ // RUN: %if target={{.*-windows-gnu}} %{ -Wl,--out-implib,%t.lib %} // RUN: %clang_cl_asan %Od -DEXE %s %t.lib %Fe%te.exe -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %te.exe 2>&1 | FileCheck %s +// RUN: %env_asan_opts=report_globals=2 %run %te.exe 2>&1 | FileCheck %s // FIXME: Currently, the MT runtime build crashes on startup due to dbghelp.dll // initialization failure. 
diff --git a/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c b/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c index 7f2405fdfc8364..0e15120a46f776 100644 --- a/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c +++ b/compiler-rt/test/asan/TestCases/Windows/global_dead_strip.c @@ -1,9 +1,9 @@ // RUN: %clang_cl_nocxx_asan %Gw %Od %s %Fe%t.exe -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %t.exe 2>&1 | FileCheck %s --check-prefix=NOSTRIP +// RUN: %env_asan_opts=report_globals=2 %t.exe 2>&1 | FileCheck %s --check-prefix=NOSTRIP // RUN: %clang_cl_nocxx_asan %Gw -O2 %s %Fe%t.exe \ // RUN: %if target={{.*-windows-gnu}} %{ -Wl,--gc-sections %} \ // RUN: %else %{ -link -opt:ref %} -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %t.exe 2>&1 | FileCheck %s --check-prefix=STRIP +// RUN: %env_asan_opts=report_globals=2 %t.exe 2>&1 | FileCheck %s --check-prefix=STRIP #include int dead_global = 42; diff --git a/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp b/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp index 34ce18e146d677..7cad3f39be1ec2 100644 --- a/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp +++ b/compiler-rt/test/asan/TestCases/Windows/report_globals_vs_freelibrary.cpp @@ -1,6 +1,6 @@ // RUN: %clang_cl_asan %LD %Od -DDLL %s %Fe%t.dll // RUN: %clang_cl_asan %Od -DEXE %s %Fe%te.exe -// RUN: %env_asan_opts=report_globals=1:verbosity=3 %run %te.exe %t.dll 2>&1 | FileCheck %s +// RUN: %env_asan_opts=report_globals=2 %run %te.exe %t.dll 2>&1 | FileCheck %s #include #include diff --git a/compiler-rt/test/asan/TestCases/initialization-nobug.cpp b/compiler-rt/test/asan/TestCases/initialization-nobug.cpp index 61328b9de28ae6..f66d501124bc48 100644 --- a/compiler-rt/test/asan/TestCases/initialization-nobug.cpp +++ b/compiler-rt/test/asan/TestCases/initialization-nobug.cpp @@ -1,10 +1,10 @@ // A collection of various initializers which shouldn't trip up 
initialization // order checking. If successful, this will just return 0. -// RUN: %clangxx_asan -O0 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" -// RUN: %clangxx_asan -O1 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" -// RUN: %clangxx_asan -O2 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" -// RUN: %clangxx_asan -O3 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=1:verbosity=2 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O0 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O1 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O2 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" +// RUN: %clangxx_asan -O3 %s %p/Helpers/initialization-nobug-extra.cpp -o %t && %env_asan_opts=check_initialization_order=true:report_globals=3 %run %t 2>&1 | FileCheck %s --implicit-check-not "DynInitPoison" // Simple access: // Make sure that accessing a global in the same TU is safe From 76236fafda19ff3760443196edcd3cd9610ed733 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Fri, 23 Aug 2024 
23:33:23 -0700 Subject: [PATCH 398/426] [Clang] Overflow Pattern Exclusion - rename some patterns, enhance docs (#105709) From @vitalybuka's review on https://github.com/llvm/llvm-project/pull/104889: - [x] remove unused variable in tests - [x] rename `post-decr-while` --> `unsigned-post-decr-while` - [x] split `add-overflow-test` into `add-unsigned-overflow-test` and `add-signed-overflow-test` - [x] be more clear about defaults within docs - [x] add table to docs Here's a screenshot of the rendered table so you don't have to build the html docs yourself to inspect the layout: ![image](https://github.com/user-attachments/assets/5d3497c4-5f5a-4579-b29b-96a0fd192faa) CCs: @vitalybuka --------- Signed-off-by: Justin Stitt Co-authored-by: Vitaly Buka --- clang/docs/ReleaseNotes.rst | 20 ++++++---- clang/docs/UndefinedBehaviorSanitizer.rst | 37 +++++++++++++++---- clang/include/clang/Basic/LangOptions.h | 8 ++-- clang/include/clang/Driver/Options.td | 2 +- clang/lib/AST/Expr.cpp | 27 +++++++++++--- clang/lib/CodeGen/CGExprScalar.cpp | 2 +- clang/lib/Driver/SanitizerArgs.cpp | 7 +++- clang/lib/Frontend/CompilerInvocation.cpp | 8 +++- .../ignore-overflow-pattern-false-pos.c | 1 - clang/test/CodeGen/ignore-overflow-pattern.c | 7 +++- 10 files changed, 87 insertions(+), 32 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 798f59009af3c3..0ced2f779f7058 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -466,28 +466,34 @@ Sanitizers - Added the ``-fsanitize-undefined-ignore-overflow-pattern`` flag which can be used to disable specific overflow-dependent code patterns. The supported - patterns are: ``add-overflow-test``, ``negated-unsigned-const``, and - ``post-decr-while``. The sanitizer instrumentation can be toggled off for all - available patterns by specifying ``all``. Conversely, you can disable all - exclusions with ``none``. 
+ patterns are: ``add-signed-overflow-test``, ``add-unsigned-overflow-test``, + ``negated-unsigned-const``, and ``unsigned-post-decr-while``. The sanitizer + instrumentation can be toggled off for all available patterns by specifying + ``all``. Conversely, you may disable all exclusions with ``none`` which is + the default. .. code-block:: c++ - /// specified with ``-fsanitize-undefined-ignore-overflow-pattern=add-overflow-test`` + /// specified with ``-fsanitize-undefined-ignore-overflow-pattern=add-unsigned-overflow-test`` int common_overflow_check_pattern(unsigned base, unsigned offset) { if (base + offset < base) { /* ... */ } // The pattern of `a + b < a`, and other re-orderings, won't be instrumented } + /// specified with ``-fsanitize-undefined-ignore-overflow-pattern=add-signed-overflow-test`` + int common_overflow_check_pattern_signed(signed int base, signed int offset) { + if (base + offset < base) { /* ... */ } // The pattern of `a + b < a`, and other re-orderings, won't be instrumented + } + /// specified with ``-fsanitize-undefined-ignore-overflow-pattern=negated-unsigned-const`` void negation_overflow() { unsigned long foo = -1UL; // No longer causes a negation overflow warning unsigned long bar = -2UL; // and so on... } - /// specified with ``-fsanitize-undefined-ignore-overflow-pattern=post-decr-while`` + /// specified with ``-fsanitize-undefined-ignore-overflow-pattern=unsigned-post-decr-while`` void while_post_decrement() { unsigned char count = 16; - while (count--) { /* ... */} // No longer causes unsigned-integer-overflow sanitizer to trip + while (count--) { /* ... */ } // No longer causes unsigned-integer-overflow sanitizer to trip } Many existing projects have a large amount of these code patterns present. 
diff --git a/clang/docs/UndefinedBehaviorSanitizer.rst b/clang/docs/UndefinedBehaviorSanitizer.rst index 1c92907372f83c..0d1010b7dcb338 100644 --- a/clang/docs/UndefinedBehaviorSanitizer.rst +++ b/clang/docs/UndefinedBehaviorSanitizer.rst @@ -314,26 +314,49 @@ Currently, this option supports three overflow-dependent code idioms: unsigned long foo = -1UL; // No longer causes a negation overflow warning unsigned long bar = -2UL; // and so on... -``post-decr-while`` +``unsigned-post-decr-while`` .. code-block:: c++ - /// -fsanitize-undefined-ignore-overflow-pattern=post-decr-while + /// -fsanitize-undefined-ignore-overflow-pattern=unsigned-post-decr-while unsigned char count = 16; while (count--) { /* ... */ } // No longer causes unsigned-integer-overflow sanitizer to trip -``add-overflow-test`` +``add-signed-overflow-test,add-unsigned-overflow-test`` .. code-block:: c++ - /// -fsanitize-undefined-ignore-overflow-pattern=add-overflow-test + /// -fsanitize-undefined-ignore-overflow-pattern=add-(signed|unsigned)-overflow-test if (base + offset < base) { /* ... */ } // The pattern of `a + b < a`, and other re-orderings, - // won't be instrumented (same for signed types) + // won't be instrumented (signed or unsigned types) + +.. list-table:: Overflow Pattern Types + :widths: 30 50 + :header-rows: 1 + + * - Pattern + - Sanitizer + * - negated-unsigned-const + - unsigned-integer-overflow + * - unsigned-post-decr-while + - unsigned-integer-overflow + * - add-unsigned-overflow-test + - unsigned-integer-overflow + * - add-signed-overflow-test + - signed-integer-overflow + + + +Note: ``add-signed-overflow-test`` suppresses only the check for Undefined +Behavior. Eager Undefined Behavior optimizations are still possible. One may +remedy this with ``-fwrapv`` or ``-fno-strict-overflow``. You can enable all exclusions with ``-fsanitize-undefined-ignore-overflow-pattern=all`` or disable all exclusions -with ``-fsanitize-undefined-ignore-overflow-pattern=none``. 
Specifying ``none`` -has precedence over other values. +with ``-fsanitize-undefined-ignore-overflow-pattern=none``. If +``-fsanitize-undefined-ignore-overflow-pattern`` is not specified ``none`` is +implied. Specifying ``none`` alongside other values also implies ``none`` as +``none`` has precedence over other values -- including ``all``. Issue Suppression ================= diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index eb4cb4b5a7e93f..1c80ee89837cb3 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -375,11 +375,13 @@ class LangOptionsBase { /// Exclude all overflow patterns (below) All = 1 << 1, /// if (a + b < a) - AddOverflowTest = 1 << 2, + AddSignedOverflowTest = 1 << 2, + /// if (a + b < a) + AddUnsignedOverflowTest = 1 << 3, /// -1UL - NegUnsignedConst = 1 << 3, + NegUnsignedConst = 1 << 4, /// while (count--) - PostDecrInWhile = 1 << 4, + PostDecrInWhile = 1 << 5, }; enum class DefaultVisiblityExportMapping { diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 4bf604d46a0f70..1b9b3f2c6600a3 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2570,7 +2570,7 @@ defm sanitize_stats : BoolOption<"f", "sanitize-stats", def fsanitize_undefined_ignore_overflow_pattern_EQ : CommaJoined<["-"], "fsanitize-undefined-ignore-overflow-pattern=">, HelpText<"Specify the overflow patterns to exclude from artihmetic sanitizer instrumentation">, Visibility<[ClangOption, CC1Option]>, - Values<"none,all,add-overflow-test,negated-unsigned-const,post-decr-while">, + Values<"none,all,add-unsigned-overflow-test,add-signed-overflow-test,negated-unsigned-const,unsigned-post-decr-while">, MarshallingInfoStringVector>; def fsanitize_thread_memory_access : Flag<["-"], "fsanitize-thread-memory-access">, Group, diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 
25ab6f3b2addfb..3309619850f34a 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -4806,6 +4806,26 @@ getOverflowPatternBinOp(const BinaryOperator *E) { return {}; } +/// Compute and set the OverflowPatternExclusion bit based on whether the +/// BinaryOperator expression matches an overflow pattern being ignored by +/// -fsanitize-undefined-ignore-overflow-pattern=add-signed-overflow-test or +/// -fsanitize-undefined-ignore-overflow-pattern=add-unsigned-overflow-test +static void computeOverflowPatternExclusion(const ASTContext &Ctx, + const BinaryOperator *E) { + std::optional Result = getOverflowPatternBinOp(E); + if (!Result.has_value()) + return; + QualType AdditionResultType = Result.value()->getType(); + + if ((AdditionResultType->isSignedIntegerType() && + Ctx.getLangOpts().isOverflowPatternExcluded( + LangOptions::OverflowPatternExclusionKind::AddSignedOverflowTest)) || + (AdditionResultType->isUnsignedIntegerType() && + Ctx.getLangOpts().isOverflowPatternExcluded( + LangOptions::OverflowPatternExclusionKind::AddUnsignedOverflowTest))) + Result.value()->setExcludedOverflowPattern(true); +} + BinaryOperator::BinaryOperator(const ASTContext &Ctx, Expr *lhs, Expr *rhs, Opcode opc, QualType ResTy, ExprValueKind VK, ExprObjectKind OK, SourceLocation opLoc, @@ -4818,12 +4838,7 @@ BinaryOperator::BinaryOperator(const ASTContext &Ctx, Expr *lhs, Expr *rhs, BinaryOperatorBits.ExcludedOverflowPattern = false; SubExprs[LHS] = lhs; SubExprs[RHS] = rhs; - if (Ctx.getLangOpts().isOverflowPatternExcluded( - LangOptions::OverflowPatternExclusionKind::AddOverflowTest)) { - std::optional Result = getOverflowPatternBinOp(this); - if (Result.has_value()) - Result.value()->BinaryOperatorBits.ExcludedOverflowPattern = true; - } + computeOverflowPatternExclusion(Ctx, this); BinaryOperatorBits.HasFPFeatures = FPFeatures.requiresTrailingStorage(); if (hasStoredFPFeatures()) setStoredFPFeatures(FPFeatures); diff --git a/clang/lib/CodeGen/CGExprScalar.cpp 
b/clang/lib/CodeGen/CGExprScalar.cpp index 2a726bba2dd304..af11bc20a3b639 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -2785,7 +2785,7 @@ static bool matchesPostDecrInWhile(const UnaryOperator *UO, bool isInc, if (isInc || isPre) return false; - // -fsanitize-undefined-ignore-overflow-pattern=post-decr-while + // -fsanitize-undefined-ignore-overflow-pattern=unsigned-post-decr-while if (!Ctx.getLangOpts().isOverflowPatternExcluded( LangOptions::OverflowPatternExclusionKind::PostDecrInWhile)) return false; diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 09262f40b5b50c..18bb35a563167e 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -1457,9 +1457,12 @@ static int parseOverflowPatternExclusionValues(const Driver &D, llvm::StringSwitch(Value) .Case("none", LangOptionsBase::None) .Case("all", LangOptionsBase::All) - .Case("add-overflow-test", LangOptionsBase::AddOverflowTest) + .Case("add-unsigned-overflow-test", + LangOptionsBase::AddUnsignedOverflowTest) + .Case("add-signed-overflow-test", + LangOptionsBase::AddSignedOverflowTest) .Case("negated-unsigned-const", LangOptionsBase::NegUnsignedConst) - .Case("post-decr-while", LangOptionsBase::PostDecrInWhile) + .Case("unsigned-post-decr-while", LangOptionsBase::PostDecrInWhile) .Default(0); if (E == 0) D.Diag(clang::diag::err_drv_unsupported_option_argument) diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index f510d3067d4d58..0bb4175dd021ee 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4274,9 +4274,13 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, llvm::StringSwitch(A->getValue(i)) .Case("none", LangOptionsBase::None) .Case("all", LangOptionsBase::All) - .Case("add-overflow-test", LangOptionsBase::AddOverflowTest) + .Case("add-unsigned-overflow-test", 
+ LangOptionsBase::AddUnsignedOverflowTest) + .Case("add-signed-overflow-test", + LangOptionsBase::AddSignedOverflowTest) .Case("negated-unsigned-const", LangOptionsBase::NegUnsignedConst) - .Case("post-decr-while", LangOptionsBase::PostDecrInWhile) + .Case("unsigned-post-decr-while", + LangOptionsBase::PostDecrInWhile) .Default(0); } } diff --git a/clang/test/CodeGen/ignore-overflow-pattern-false-pos.c b/clang/test/CodeGen/ignore-overflow-pattern-false-pos.c index 40193e0c3e2671..b4811443b95192 100644 --- a/clang/test/CodeGen/ignore-overflow-pattern-false-pos.c +++ b/clang/test/CodeGen/ignore-overflow-pattern-false-pos.c @@ -3,7 +3,6 @@ // Check for potential false positives from patterns that _almost_ match classic overflow-dependent or overflow-prone code patterns extern unsigned a, b, c; -extern int u, v, w; extern unsigned some(void); diff --git a/clang/test/CodeGen/ignore-overflow-pattern.c b/clang/test/CodeGen/ignore-overflow-pattern.c index b7d700258f8538..c4a9d07b07aaac 100644 --- a/clang/test/CodeGen/ignore-overflow-pattern.c +++ b/clang/test/CodeGen/ignore-overflow-pattern.c @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsanitize=signed-integer-overflow,unsigned-integer-overflow -fsanitize-undefined-ignore-overflow-pattern=all %s -emit-llvm -o - | FileCheck %s // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsanitize=signed-integer-overflow,unsigned-integer-overflow -fsanitize-undefined-ignore-overflow-pattern=all -fwrapv %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsanitize=signed-integer-overflow,unsigned-integer-overflow -fsanitize-undefined-ignore-overflow-pattern=add-overflow-test %s -emit-llvm -o - | FileCheck %s --check-prefix=ADD +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsanitize=signed-integer-overflow,unsigned-integer-overflow -fsanitize-undefined-ignore-overflow-pattern=add-signed-overflow-test,add-unsigned-overflow-test %s -emit-llvm -o - | FileCheck %s 
--check-prefix=ADD // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsanitize=signed-integer-overflow,unsigned-integer-overflow -fsanitize-undefined-ignore-overflow-pattern=negated-unsigned-const %s -emit-llvm -o - | FileCheck %s --check-prefix=NEGATE -// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsanitize=signed-integer-overflow,unsigned-integer-overflow -fsanitize-undefined-ignore-overflow-pattern=post-decr-while %s -emit-llvm -o - | FileCheck %s --check-prefix=WHILE +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsanitize=signed-integer-overflow,unsigned-integer-overflow -fsanitize-undefined-ignore-overflow-pattern=unsigned-post-decr-while %s -emit-llvm -o - | FileCheck %s --check-prefix=WHILE // Ensure some common overflow-dependent or overflow-prone code patterns don't // trigger the overflow sanitizers. In many cases, overflow warnings caused by @@ -25,6 +25,7 @@ // CHECK-NOT: handle{{.*}}overflow extern unsigned a, b, c; +extern int u, v; extern unsigned some(void); // ADD-LABEL: @basic_commutativity @@ -50,6 +51,8 @@ void basic_commutativity(void) { c = 9; if (b > b + a) c = 9; + if (u + v < u) + c = 9; } // ADD-LABEL: @arguments_and_commutativity From 99b85cae628c1cc5641944290712cd84ccf1f6c8 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Sat, 24 Aug 2024 09:23:25 +0200 Subject: [PATCH 399/426] [clang][bytecode][NFC] Add an additional assertion (#105927) Since this must be true, add an assertion instead of just documenting it via the comment. 
--- clang/lib/AST/ByteCode/Interp.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 7ba51f737db491..81c547991c3d7d 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -2608,9 +2608,11 @@ inline bool Call(InterpState &S, CodePtr OpPC, const Function *Func, // the function we're about to call is a lambda call operator, // skip the CheckInvoke, since the ThisPtr is a null pointer // anyway. - if (!(S.Current->getFunction() && - S.Current->getFunction()->isLambdaStaticInvoker() && - Func->isLambdaCallOperator())) { + if (S.Current->getFunction() && + S.Current->getFunction()->isLambdaStaticInvoker() && + Func->isLambdaCallOperator()) { + assert(ThisPtr.isZero()); + } else { if (!CheckInvoke(S, OpPC, ThisPtr)) return false; } From 43c6fb29a64b9443367bf4085a11ca68f7cd4492 Mon Sep 17 00:00:00 2001 From: c8ef Date: Sat, 24 Aug 2024 19:56:59 +0800 Subject: [PATCH 400/426] [InstCombine] Update the `select` operand when the `cond` is `trunc` and has the `nuw` or `nsw` property. (#105914) This patch updates the select operand when the cond has the nuw or nsw property. Considering the semantics of the nuw and nsw flag, if there is no poison value in this expression, this code assumes that X can only be 0, 1 or -1. 
close: #96765 alive2: https://alive2.llvm.org/ce/z/3n3n2Q --- .../InstCombine/InstCombineSelect.cpp | 19 ++++++ .../InstCombine/fold-select-trunc.ll | 68 +++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/fold-select-trunc.ll diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 18ffc209f259e0..fcd11126073bf1 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -4201,5 +4201,24 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { } } + // select (trunc nuw X to i1), X, Y --> select (trunc nuw X to i1), 1, Y + // select (trunc nuw X to i1), Y, X --> select (trunc nuw X to i1), Y, 0 + // select (trunc nsw X to i1), X, Y --> select (trunc nsw X to i1), -1, Y + // select (trunc nsw X to i1), Y, X --> select (trunc nsw X to i1), Y, 0 + Value *Trunc; + if (match(CondVal, m_NUWTrunc(m_Value(Trunc)))) { + if (TrueVal == Trunc) + return replaceOperand(SI, 1, ConstantInt::get(TrueVal->getType(), 1)); + if (FalseVal == Trunc) + return replaceOperand(SI, 2, ConstantInt::get(FalseVal->getType(), 0)); + } + if (match(CondVal, m_NSWTrunc(m_Value(Trunc)))) { + if (TrueVal == Trunc) + return replaceOperand(SI, 1, + Constant::getAllOnesValue(TrueVal->getType())); + if (FalseVal == Trunc) + return replaceOperand(SI, 2, ConstantInt::get(FalseVal->getType(), 0)); + } + return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/fold-select-trunc.ll b/llvm/test/Transforms/InstCombine/fold-select-trunc.ll new file mode 100644 index 00000000000000..5567d7d5e1fca9 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/fold-select-trunc.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +define i8 @fold_select_trunc_nuw_true(i8 %x, i8 %y) { +; CHECK-LABEL: 
@fold_select_trunc_nuw_true( +; CHECK-NEXT: [[TRUNC:%.*]] = trunc nuw i8 [[X:%.*]] to i1 +; CHECK-NEXT: [[RET:%.*]] = select i1 [[TRUNC]], i8 1, i8 [[Y:%.*]] +; CHECK-NEXT: ret i8 [[RET]] +; + %trunc = trunc nuw i8 %x to i1 + %ret = select i1 %trunc, i8 %x, i8 %y + ret i8 %ret +} + +define i8 @fold_select_trunc_nuw_false(i8 %x, i8 %y) { +; CHECK-LABEL: @fold_select_trunc_nuw_false( +; CHECK-NEXT: [[TRUNC:%.*]] = trunc nuw i8 [[X:%.*]] to i1 +; CHECK-NEXT: [[RET:%.*]] = select i1 [[TRUNC]], i8 [[Y:%.*]], i8 0 +; CHECK-NEXT: ret i8 [[RET]] +; + %trunc = trunc nuw i8 %x to i1 + %ret = select i1 %trunc, i8 %y, i8 %x + ret i8 %ret +} + +define i128 @fold_select_trunc_nsw_true(i128 %x, i128 %y) { +; CHECK-LABEL: @fold_select_trunc_nsw_true( +; CHECK-NEXT: [[TRUNC:%.*]] = trunc nsw i128 [[X:%.*]] to i1 +; CHECK-NEXT: [[RET:%.*]] = select i1 [[TRUNC]], i128 -1, i128 [[Y:%.*]] +; CHECK-NEXT: ret i128 [[RET]] +; + %trunc = trunc nsw i128 %x to i1 + %ret = select i1 %trunc, i128 %x, i128 %y + ret i128 %ret +} + +define i8 @fold_select_trunc_nsw_false(i8 %x, i8 %y) { +; CHECK-LABEL: @fold_select_trunc_nsw_false( +; CHECK-NEXT: [[TRUNC:%.*]] = trunc nsw i8 [[X:%.*]] to i1 +; CHECK-NEXT: [[RET:%.*]] = select i1 [[TRUNC]], i8 [[Y:%.*]], i8 0 +; CHECK-NEXT: ret i8 [[RET]] +; + %trunc = trunc nsw i8 %x to i1 + %ret = select i1 %trunc, i8 %y, i8 %x + ret i8 %ret +} + +define i8 @fold_select_trunc_negative(i8 %x, i8 %y) { +; CHECK-LABEL: @fold_select_trunc_negative( +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[X:%.*]] to i1 +; CHECK-NEXT: [[RET:%.*]] = select i1 [[TRUNC]], i8 [[X]], i8 [[Y:%.*]] +; CHECK-NEXT: ret i8 [[RET]] +; + %trunc = trunc i8 %x to i1 + %ret = select i1 %trunc, i8 %x, i8 %y + ret i8 %ret +} + +define <2 x i8> @fold_select_trunc_vector(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: @fold_select_trunc_vector( +; CHECK-NEXT: [[TRUNC:%.*]] = trunc nuw <2 x i8> [[X:%.*]] to <2 x i1> +; CHECK-NEXT: [[RET:%.*]] = select <2 x i1> [[TRUNC]], <2 x i8> , <2 x i8> [[Y:%.*]] +; 
CHECK-NEXT: ret <2 x i8> [[RET]] +; + %trunc = trunc nuw <2 x i8> %x to <2 x i1> + %ret = select <2 x i1> %trunc, <2 x i8> %x, <2 x i8> %y + ret <2 x i8> %ret +} From 001e423ac6261283f0289a774bf5e7577adb1ea6 Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 24 Aug 2024 13:03:52 +0100 Subject: [PATCH 401/426] [Tests] Attempt to fix PowerPC buildbots. The intent is that the tests should not be running on PowerPC as the fp128 type will differ. This attempts to fix the bots by using __powerpc__ instead, which appears to be defined in godbolt. --- llvm/cmake/config-ix.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index 471dd1615c2e7b..e7ed839ad68101 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -260,7 +260,7 @@ if(C_SUPPORTS_WERROR_UNGUARDED_AVAILABILITY_NEW) endif() check_cxx_symbol_exists(logf128 cmath HAS_LOGF128) -check_symbol_exists(__powerpc64le__ "" __PPC64LE) +check_symbol_exists(__powerpc__ "" __PPC64LE) if(HAS_LOGF128 AND NOT __PPC64LE) set(LLVM_HAS_LOGF128 On) add_compile_definitions(HAS_LOGF128) From be5ecc35efc902a4742669d41a88cfd88babb245 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Sat, 24 Aug 2024 20:14:28 +0800 Subject: [PATCH 402/426] [RISCV] Don't move source if passthru already dominates in vmv.v.v peephole (#105792) Currently we move the source down to where vmv.v.v to make sure that the new passthru dominates, but we do this even if it already does. This adds a simple local dominance check (taken from X86FastPreTileConfig.cpp) and avoids doing the move if it can. It also modifies the move to only move it to just past the passthru definition, and not all the way down to the vmv.v.v. This allows folding to succeed in some edge cases, which prevents regressions in an upcoming patch. 
--- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 31 +++++++++++++++---- .../CodeGen/RISCV/rvv/vmv.v.v-peephole.mir | 20 ++++++++++++ 2 files changed, 45 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 9772782ad3d6db..822ab492c710b4 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -437,6 +437,22 @@ static bool isSafeToMove(const MachineInstr &From, const MachineInstr &To) { return From.isSafeToMove(SawStore); } +/// Given A and B are in the same MBB, returns true if A comes before B. +static bool dominates(MachineBasicBlock::const_iterator A, + MachineBasicBlock::const_iterator B) { + assert(A->getParent() == B->getParent()); + const MachineBasicBlock *MBB = A->getParent(); + auto MBBEnd = MBB->end(); + if (B == MBBEnd) + return true; + + MachineBasicBlock::const_iterator I = MBB->begin(); + for (; &*I != A && &*I != B; ++I) + ; + + return &*I == A; +} + /// If a PseudoVMV_V_V is the only user of its input, fold its passthru and VL /// into it. /// @@ -481,12 +497,15 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { if (!isVLKnownLE(SrcVL, MI.getOperand(3))) return false; - // If Src ends up using MI's passthru/VL, move it so it can access it. - // TODO: We don't need to do this if they already dominate Src. - if (!SrcPassthru.isIdenticalTo(Passthru)) { - if (!isSafeToMove(*Src, MI)) - return false; - Src->moveBefore(&MI); + // If the new passthru doesn't dominate Src, try to move Src so it does. 
+ if (Passthru.getReg() != RISCV::NoRegister) { + MachineInstr *PassthruDef = MRI->getVRegDef(Passthru.getReg()); + if (PassthruDef->getParent() == Src->getParent() && + !dominates(PassthruDef, Src)) { + if (!isSafeToMove(*Src, *PassthruDef->getNextNode())) + return false; + Src->moveBefore(PassthruDef->getNextNode()); + } } if (SrcPassthru.getReg() != Passthru.getReg()) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir new file mode 100644 index 00000000000000..b2526c6df6939e --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir @@ -0,0 +1,20 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v -run-pass=riscv-vector-peephole \ +# RUN: -verify-machineinstrs | FileCheck %s + +--- +name: move_src +body: | + bb.0: + liveins: $v8 + ; CHECK-LABEL: name: move_src + ; CHECK: liveins: $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %passthru:vr = COPY $v8 + ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 %passthru, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:gpr = ADDI $x0, 1 + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ + %passthru:vr = COPY $v8 + %y:gpr = ADDI $x0, 1 + %z:vr = PseudoVMV_V_V_M1 %passthru, %x, 4, 5 /* e32 */, 0 /* tu, mu */ +... From 40975da950c95124155b752cd683d945f7d203fd Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 24 Aug 2024 13:22:52 +0100 Subject: [PATCH 403/426] [VPlan] Wrap planContainsAdditionalSimplifications in NDEBUG (NFC) Only used for an assertion. 
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b12121d4688c65..6fd89ef76f9aae 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7239,11 +7239,12 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, return Cost; } +#ifndef NDEBUG /// Return true if the original loop \ TheLoop contains any instructions that do /// not have corresponding recipes in \p Plan and are not marked to be ignored /// in \p CostCtx. This means the VPlan contains simplification that the legacy /// cost-model did not account for. -[[maybe_unused]] static bool +static bool planContainsAdditionalSimplifications(VPlan &Plan, ElementCount VF, VPCostContext &CostCtx, Loop *TheLoop, LoopVectorizationCostModel &CM) { @@ -7288,6 +7289,7 @@ planContainsAdditionalSimplifications(VPlan &Plan, ElementCount VF, }); }); } +#endif VectorizationFactor LoopVectorizationPlanner::computeBestVF() { if (VPlans.empty()) From 83a5c7cb62e404a713a35445b755cf0109650279 Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 24 Aug 2024 14:39:20 +0100 Subject: [PATCH 404/426] [ConstantFolding] Ensure TLI is valid when simplifying fp128 intrinsics. TLI might not be valid for all contexts that constant folding is performed. Add a quick guard that it is not null. 
--- llvm/lib/Analysis/ConstantFolding.cpp | 2 +- llvm/test/Transforms/Inline/simplify-fp128.ll | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/Inline/simplify-fp128.ll diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 81c4d4ec5be412..26d9304cb73672 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -2140,7 +2140,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, return GetConstantFoldFPValue128(Result, Ty); } LibFunc Fp128Func = NotLibFunc; - if (TLI->getLibFunc(Name, Fp128Func) && TLI->has(Fp128Func) && + if (TLI && TLI->getLibFunc(Name, Fp128Func) && TLI->has(Fp128Func) && Fp128Func == LibFunc_logl) return ConstantFoldFP128(logf128, Op->getValueAPF(), Ty); } diff --git a/llvm/test/Transforms/Inline/simplify-fp128.ll b/llvm/test/Transforms/Inline/simplify-fp128.ll new file mode 100644 index 00000000000000..73e63702cefcba --- /dev/null +++ b/llvm/test/Transforms/Inline/simplify-fp128.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=inline -S | FileCheck %s + +define void @fli() { +; CHECK-LABEL: define void @fli() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call fp128 @llvm.floor.f128(fp128 0xL999999999999999A4001199999999999) +; CHECK-NEXT: ret void +; +entry: + call void @sc() + ret void +} + +define void @sc() { +; CHECK-LABEL: define void @sc() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call fp128 @llvm.floor.f128(fp128 0xL999999999999999A4001199999999999) +; CHECK-NEXT: ret void +; +entry: + %0 = tail call fp128 @llvm.floor.f128(fp128 0xL999999999999999A4001199999999999) + ret void +} From 08acc3f73b64bed578d18812a04015cb537c9c82 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 24 Aug 2024 07:13:42 -0700 Subject: [PATCH 405/426] [Analysis] Copy-construct 
SmallVector (NFC) (#105911) --- llvm/lib/Analysis/LazyValueInfo.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 47d3dac73083ee..615d8b7ccd8ccf 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -512,8 +512,8 @@ class LazyValueInfoImpl { } // namespace llvm void LazyValueInfoImpl::solve() { - SmallVector, 8> StartingStack( - BlockValueStack.begin(), BlockValueStack.end()); + SmallVector, 8> StartingStack = + BlockValueStack; unsigned processedCount = 0; while (!BlockValueStack.empty()) { From 65b7cbbd8735b90933369364153b982d498f649a Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Sat, 24 Aug 2024 09:20:14 -0700 Subject: [PATCH 406/426] [lit] Export env vars in script to avoid pruning (#105759) On macOS the dynamic loader prunes dyld specific environment variables such as `DYLD_INSERT_LIBRARIES`, `DYLD_LIBRARY_PATH`, etc. If these are set in the lit config it's safe to assume that the user actually wanted their subprocesses to run with these variables, versus the python interpreter that gets executed with them before they are pruned. This change exports all known variables in the shell script instead of relying on them being passed through. --- llvm/utils/lit/lit/TestRunner.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 2d9af9fbbb3634..4dad1412436d93 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -1226,6 +1226,16 @@ def executeScript(test, litConfig, tmpBase, commands, cwd): commands[i] += f" && {{ {command}; }}" if test.config.pipefail: f.write(b"set -o pipefail;" if mode == "wb" else "set -o pipefail;") + + # Manually export any DYLD_* variables used by dyld on macOS because + # otherwise they are lost when the shell executable is run, before the + # lit test is executed. 
+ env_str = "\n".join( + "export {}={};".format(k, shlex.quote(v)) + for k, v in test.config.environment.items() + if k.startswith("DYLD_") + ) + f.write(bytes(env_str, "utf-8") if mode == "wb" else env_str) f.write(b"set -x;" if mode == "wb" else "set -x;") if sys.version_info > (3, 0) and mode == "wb": f.write(bytes("{ " + "; } &&\n{ ".join(commands) + "; }", "utf-8")) From 7036394048a963dd23f1a2da269089224e30d0b2 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 24 Aug 2024 09:51:08 -0700 Subject: [PATCH 407/426] Update Python requirements to fix more CVEs (#105853) Followup to #90109. In Microsoft, our automated scans are warning that LLVM has vulnerable dependencies. Specifically: * [CVE-2024-35195](https://nvd.nist.gov/vuln/detail/CVE-2024-35195) was fixed in `requests` 2.32.0. * [CVE-2024-37891](https://nvd.nist.gov/vuln/detail/CVE-2024-37891) was fixed in `urllib3` 2.2.2. I've updated LLVM's dependencies by running the following commands in `llvm/utils/git`: ``` pip-compile --upgrade --generate-hashes --output-file=requirements.txt requirements.txt.in pip-compile --upgrade --generate-hashes --output-file=requirements_formatting.txt requirements_formatting.txt.in ``` Note that for `requirements_formatting.txt` this adds `--generate-hashes` (according to my vague understanding, it's highly desirable and was already used for `requirements.txt`) and was locally run within `llvm/utils/git` (changing the recorded command, which apparently was originally run from the repo root - again, `requirements.txt` was already being regenerated with a locally run command, so this increases consistency). I observe that this has updated the relevant components to pick up the CVE fixes. Note that I am largely clueless in this area, so I hope that (like #90109) no other changes will be necessary. 
--- llvm/utils/git/requirements.txt | 212 ++++++------ llvm/utils/git/requirements_formatting.txt | 370 +++++++++++++++++++-- 2 files changed, 456 insertions(+), 126 deletions(-) diff --git a/llvm/utils/git/requirements.txt b/llvm/utils/git/requirements.txt index e354c91a4d5bd5..0fa6b31d860473 100644 --- a/llvm/utils/git/requirements.txt +++ b/llvm/utils/git/requirements.txt @@ -4,65 +4,80 @@ # # pip-compile --generate-hashes --output-file=requirements.txt requirements.txt.in # -certifi==2024.2.2 \ - --hash=sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f \ - --hash=sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1 +certifi==2024.7.4 \ + --hash=sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b \ + --hash=sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90 # via # -r requirements.txt.in # requests -cffi==1.16.0 \ - --hash=sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc \ - --hash=sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a \ - --hash=sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417 \ - --hash=sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab \ - --hash=sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520 \ - --hash=sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36 \ - --hash=sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743 \ - --hash=sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8 \ - --hash=sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed \ - --hash=sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684 \ - --hash=sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56 \ - --hash=sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324 \ - --hash=sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d \ - 
--hash=sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235 \ - --hash=sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e \ - --hash=sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088 \ - --hash=sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000 \ - --hash=sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7 \ - --hash=sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e \ - --hash=sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673 \ - --hash=sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c \ - --hash=sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe \ - --hash=sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2 \ - --hash=sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098 \ - --hash=sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8 \ - --hash=sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a \ - --hash=sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0 \ - --hash=sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b \ - --hash=sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896 \ - --hash=sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e \ - --hash=sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9 \ - --hash=sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2 \ - --hash=sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b \ - --hash=sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6 \ - --hash=sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404 \ - --hash=sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f \ - --hash=sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0 \ - 
--hash=sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4 \ - --hash=sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc \ - --hash=sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936 \ - --hash=sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba \ - --hash=sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872 \ - --hash=sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb \ - --hash=sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614 \ - --hash=sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1 \ - --hash=sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d \ - --hash=sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969 \ - --hash=sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b \ - --hash=sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4 \ - --hash=sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627 \ - --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ - --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 +cffi==1.17.0 \ + --hash=sha256:011aff3524d578a9412c8b3cfaa50f2c0bd78e03eb7af7aa5e0df59b158efb2f \ + --hash=sha256:0a048d4f6630113e54bb4b77e315e1ba32a5a31512c31a273807d0027a7e69ab \ + --hash=sha256:0bb15e7acf8ab35ca8b24b90af52c8b391690ef5c4aec3d31f38f0d37d2cc499 \ + --hash=sha256:0d46ee4764b88b91f16661a8befc6bfb24806d885e27436fdc292ed7e6f6d058 \ + --hash=sha256:0e60821d312f99d3e1569202518dddf10ae547e799d75aef3bca3a2d9e8ee693 \ + --hash=sha256:0fdacad9e0d9fc23e519efd5ea24a70348305e8d7d85ecbb1a5fa66dc834e7fb \ + --hash=sha256:14b9cbc8f7ac98a739558eb86fabc283d4d564dafed50216e7f7ee62d0d25377 \ + --hash=sha256:17c6d6d3260c7f2d94f657e6872591fe8733872a86ed1345bda872cfc8c74885 \ + 
--hash=sha256:1a2ddbac59dc3716bc79f27906c010406155031a1c801410f1bafff17ea304d2 \ + --hash=sha256:2404f3de742f47cb62d023f0ba7c5a916c9c653d5b368cc966382ae4e57da401 \ + --hash=sha256:24658baf6224d8f280e827f0a50c46ad819ec8ba380a42448e24459daf809cf4 \ + --hash=sha256:24aa705a5f5bd3a8bcfa4d123f03413de5d86e497435693b638cbffb7d5d8a1b \ + --hash=sha256:2770bb0d5e3cc0e31e7318db06efcbcdb7b31bcb1a70086d3177692a02256f59 \ + --hash=sha256:331ad15c39c9fe9186ceaf87203a9ecf5ae0ba2538c9e898e3a6967e8ad3db6f \ + --hash=sha256:3aa9d43b02a0c681f0bfbc12d476d47b2b2b6a3f9287f11ee42989a268a1833c \ + --hash=sha256:41f4915e09218744d8bae14759f983e466ab69b178de38066f7579892ff2a555 \ + --hash=sha256:4304d4416ff032ed50ad6bb87416d802e67139e31c0bde4628f36a47a3164bfa \ + --hash=sha256:435a22d00ec7d7ea533db494da8581b05977f9c37338c80bc86314bec2619424 \ + --hash=sha256:45f7cd36186db767d803b1473b3c659d57a23b5fa491ad83c6d40f2af58e4dbb \ + --hash=sha256:48b389b1fd5144603d61d752afd7167dfd205973a43151ae5045b35793232aa2 \ + --hash=sha256:4e67d26532bfd8b7f7c05d5a766d6f437b362c1bf203a3a5ce3593a645e870b8 \ + --hash=sha256:516a405f174fd3b88829eabfe4bb296ac602d6a0f68e0d64d5ac9456194a5b7e \ + --hash=sha256:5ba5c243f4004c750836f81606a9fcb7841f8874ad8f3bf204ff5e56332b72b9 \ + --hash=sha256:5bdc0f1f610d067c70aa3737ed06e2726fd9d6f7bfee4a351f4c40b6831f4e82 \ + --hash=sha256:6107e445faf057c118d5050560695e46d272e5301feffda3c41849641222a828 \ + --hash=sha256:6327b572f5770293fc062a7ec04160e89741e8552bf1c358d1a23eba68166759 \ + --hash=sha256:669b29a9eca6146465cc574659058ed949748f0809a2582d1f1a324eb91054dc \ + --hash=sha256:6ce01337d23884b21c03869d2f68c5523d43174d4fc405490eb0091057943118 \ + --hash=sha256:6d872186c1617d143969defeadac5a904e6e374183e07977eedef9c07c8953bf \ + --hash=sha256:6f76a90c345796c01d85e6332e81cab6d70de83b829cf1d9762d0a3da59c7932 \ + --hash=sha256:70d2aa9fb00cf52034feac4b913181a6e10356019b18ef89bc7c12a283bf5f5a \ + --hash=sha256:7cbc78dc018596315d4e7841c8c3a7ae31cc4d638c9b627f87d52e8abaaf2d29 \ + 
--hash=sha256:856bf0924d24e7f93b8aee12a3a1095c34085600aa805693fb7f5d1962393206 \ + --hash=sha256:8a98748ed1a1df4ee1d6f927e151ed6c1a09d5ec21684de879c7ea6aa96f58f2 \ + --hash=sha256:93a7350f6706b31f457c1457d3a3259ff9071a66f312ae64dc024f049055f72c \ + --hash=sha256:964823b2fc77b55355999ade496c54dde161c621cb1f6eac61dc30ed1b63cd4c \ + --hash=sha256:a003ac9edc22d99ae1286b0875c460351f4e101f8c9d9d2576e78d7e048f64e0 \ + --hash=sha256:a0ce71725cacc9ebf839630772b07eeec220cbb5f03be1399e0457a1464f8e1a \ + --hash=sha256:a47eef975d2b8b721775a0fa286f50eab535b9d56c70a6e62842134cf7841195 \ + --hash=sha256:a8b5b9712783415695663bd463990e2f00c6750562e6ad1d28e072a611c5f2a6 \ + --hash=sha256:a9015f5b8af1bb6837a3fcb0cdf3b874fe3385ff6274e8b7925d81ccaec3c5c9 \ + --hash=sha256:aec510255ce690d240f7cb23d7114f6b351c733a74c279a84def763660a2c3bc \ + --hash=sha256:b00e7bcd71caa0282cbe3c90966f738e2db91e64092a877c3ff7f19a1628fdcb \ + --hash=sha256:b50aaac7d05c2c26dfd50c3321199f019ba76bb650e346a6ef3616306eed67b0 \ + --hash=sha256:b7b6ea9e36d32582cda3465f54c4b454f62f23cb083ebc7a94e2ca6ef011c3a7 \ + --hash=sha256:bb9333f58fc3a2296fb1d54576138d4cf5d496a2cc118422bd77835e6ae0b9cb \ + --hash=sha256:c1c13185b90bbd3f8b5963cd8ce7ad4ff441924c31e23c975cb150e27c2bf67a \ + --hash=sha256:c3b8bd3133cd50f6b637bb4322822c94c5ce4bf0d724ed5ae70afce62187c492 \ + --hash=sha256:c5d97162c196ce54af6700949ddf9409e9833ef1003b4741c2b39ef46f1d9720 \ + --hash=sha256:c815270206f983309915a6844fe994b2fa47e5d05c4c4cef267c3b30e34dbe42 \ + --hash=sha256:cab2eba3830bf4f6d91e2d6718e0e1c14a2f5ad1af68a89d24ace0c6b17cced7 \ + --hash=sha256:d1df34588123fcc88c872f5acb6f74ae59e9d182a2707097f9e28275ec26a12d \ + --hash=sha256:d6bdcd415ba87846fd317bee0774e412e8792832e7805938987e4ede1d13046d \ + --hash=sha256:db9a30ec064129d605d0f1aedc93e00894b9334ec74ba9c6bdd08147434b33eb \ + --hash=sha256:dbc183e7bef690c9abe5ea67b7b60fdbca81aa8da43468287dae7b5c046107d4 \ + --hash=sha256:dca802c8db0720ce1c49cce1149ff7b06e91ba15fa84b1d59144fef1a1bc7ac2 \ + 
--hash=sha256:dec6b307ce928e8e112a6bb9921a1cb00a0e14979bf28b98e084a4b8a742bd9b \ + --hash=sha256:df8bb0010fdd0a743b7542589223a2816bdde4d94bb5ad67884348fa2c1c67e8 \ + --hash=sha256:e4094c7b464cf0a858e75cd14b03509e84789abf7b79f8537e6a72152109c76e \ + --hash=sha256:e4760a68cab57bfaa628938e9c2971137e05ce48e762a9cb53b76c9b569f1204 \ + --hash=sha256:eb09b82377233b902d4c3fbeeb7ad731cdab579c6c6fda1f763cd779139e47c3 \ + --hash=sha256:eb862356ee9391dc5a0b3cbc00f416b48c1b9a52d252d898e5b7696a5f9fe150 \ + --hash=sha256:ef9528915df81b8f4c7612b19b8628214c65c9b7f74db2e34a646a0a2a0da2d4 \ + --hash=sha256:f3157624b7558b914cb039fd1af735e5e8049a87c817cc215109ad1c8779df76 \ + --hash=sha256:f3e0992f23bbb0be00a921eae5363329253c3b86287db27092461c887b791e5e \ + --hash=sha256:f9338cc05451f1942d0d8203ec2c346c830f8e86469903d5126c1f0a13a2bcbb \ + --hash=sha256:ffef8fd58a36fb5f1196919638f73dd3ae0db1a878982b27a9a5a176ede4ba91 # via # cryptography # pynacl @@ -158,39 +173,34 @@ charset-normalizer==3.3.2 \ --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via requests -cryptography==42.0.5 \ - --hash=sha256:0270572b8bd2c833c3981724b8ee9747b3ec96f699a9665470018594301439ee \ - --hash=sha256:111a0d8553afcf8eb02a4fea6ca4f59d48ddb34497aa8706a6cf536f1a5ec576 \ - --hash=sha256:16a48c23a62a2f4a285699dba2e4ff2d1cff3115b9df052cdd976a18856d8e3d \ - --hash=sha256:1b95b98b0d2af784078fa69f637135e3c317091b615cd0905f8b8a087e86fa30 \ - --hash=sha256:1f71c10d1e88467126f0efd484bd44bca5e14c664ec2ede64c32f20875c0d413 \ - --hash=sha256:2424ff4c4ac7f6b8177b53c17ed5d8fa74ae5955656867f5a8affaca36a27abb \ - --hash=sha256:2bce03af1ce5a5567ab89bd90d11e7bbdff56b8af3acbbec1faded8f44cb06da \ - --hash=sha256:329906dcc7b20ff3cad13c069a78124ed8247adcac44b10bea1130e36caae0b4 \ - --hash=sha256:37dd623507659e08be98eec89323469e8c7b4c1407c85112634ae3dbdb926fdd \ - 
--hash=sha256:3eaafe47ec0d0ffcc9349e1708be2aaea4c6dd4978d76bf6eb0cb2c13636c6fc \ - --hash=sha256:5e6275c09d2badf57aea3afa80d975444f4be8d3bc58f7f80d2a484c6f9485c8 \ - --hash=sha256:6fe07eec95dfd477eb9530aef5bead34fec819b3aaf6c5bd6d20565da607bfe1 \ - --hash=sha256:7367d7b2eca6513681127ebad53b2582911d1736dc2ffc19f2c3ae49997496bc \ - --hash=sha256:7cde5f38e614f55e28d831754e8a3bacf9ace5d1566235e39d91b35502d6936e \ - --hash=sha256:9481ffe3cf013b71b2428b905c4f7a9a4f76ec03065b05ff499bb5682a8d9ad8 \ - --hash=sha256:98d8dc6d012b82287f2c3d26ce1d2dd130ec200c8679b6213b3c73c08b2b7940 \ - --hash=sha256:a011a644f6d7d03736214d38832e030d8268bcff4a41f728e6030325fea3e400 \ - --hash=sha256:a2913c5375154b6ef2e91c10b5720ea6e21007412f6437504ffea2109b5a33d7 \ - --hash=sha256:a30596bae9403a342c978fb47d9b0ee277699fa53bbafad14706af51fe543d16 \ - --hash=sha256:b03c2ae5d2f0fc05f9a2c0c997e1bc18c8229f392234e8a0194f202169ccd278 \ - --hash=sha256:b6cd2203306b63e41acdf39aa93b86fb566049aeb6dc489b70e34bcd07adca74 \ - --hash=sha256:b7ffe927ee6531c78f81aa17e684e2ff617daeba7f189f911065b2ea2d526dec \ - --hash=sha256:b8cac287fafc4ad485b8a9b67d0ee80c66bf3574f655d3b97ef2e1082360faf1 \ - --hash=sha256:ba334e6e4b1d92442b75ddacc615c5476d4ad55cc29b15d590cc6b86efa487e2 \ - --hash=sha256:ba3e4a42397c25b7ff88cdec6e2a16c2be18720f317506ee25210f6d31925f9c \ - --hash=sha256:c41fb5e6a5fe9ebcd58ca3abfeb51dffb5d83d6775405305bfa8715b76521922 \ - --hash=sha256:cd2030f6650c089aeb304cf093f3244d34745ce0cfcc39f20c6fbfe030102e2a \ - --hash=sha256:cd65d75953847815962c84a4654a84850b2bb4aed3f26fadcc1c13892e1e29f6 \ - --hash=sha256:e4985a790f921508f36f81831817cbc03b102d643b5fcb81cd33df3fa291a1a1 \ - --hash=sha256:e807b3188f9eb0eaa7bbb579b462c5ace579f1cedb28107ce8b48a9f7ad3679e \ - --hash=sha256:f12764b8fffc7a123f641d7d049d382b73f96a34117e0b637b80643169cec8ac \ - --hash=sha256:f8837fe1d6ac4a8052a9a8ddab256bc006242696f03368a4009be7ee3075cdb7 +cryptography==43.0.0 \ + 
--hash=sha256:0663585d02f76929792470451a5ba64424acc3cd5227b03921dab0e2f27b1709 \ + --hash=sha256:08a24a7070b2b6804c1940ff0f910ff728932a9d0e80e7814234269f9d46d069 \ + --hash=sha256:232ce02943a579095a339ac4b390fbbe97f5b5d5d107f8a08260ea2768be8cc2 \ + --hash=sha256:2905ccf93a8a2a416f3ec01b1a7911c3fe4073ef35640e7ee5296754e30b762b \ + --hash=sha256:299d3da8e00b7e2b54bb02ef58d73cd5f55fb31f33ebbf33bd00d9aa6807df7e \ + --hash=sha256:2c6d112bf61c5ef44042c253e4859b3cbbb50df2f78fa8fae6747a7814484a70 \ + --hash=sha256:31e44a986ceccec3d0498e16f3d27b2ee5fdf69ce2ab89b52eaad1d2f33d8778 \ + --hash=sha256:3d9a1eca329405219b605fac09ecfc09ac09e595d6def650a437523fcd08dd22 \ + --hash=sha256:3dcdedae5c7710b9f97ac6bba7e1052b95c7083c9d0e9df96e02a1932e777895 \ + --hash=sha256:47ca71115e545954e6c1d207dd13461ab81f4eccfcb1345eac874828b5e3eaaf \ + --hash=sha256:4a997df8c1c2aae1e1e5ac49c2e4f610ad037fc5a3aadc7b64e39dea42249431 \ + --hash=sha256:51956cf8730665e2bdf8ddb8da0056f699c1a5715648c1b0144670c1ba00b48f \ + --hash=sha256:5bcb8a5620008a8034d39bce21dc3e23735dfdb6a33a06974739bfa04f853947 \ + --hash=sha256:64c3f16e2a4fc51c0d06af28441881f98c5d91009b8caaff40cf3548089e9c74 \ + --hash=sha256:6e2b11c55d260d03a8cf29ac9b5e0608d35f08077d8c087be96287f43af3ccdc \ + --hash=sha256:7b3f5fe74a5ca32d4d0f302ffe6680fcc5c28f8ef0dc0ae8f40c0f3a1b4fca66 \ + --hash=sha256:844b6d608374e7d08f4f6e6f9f7b951f9256db41421917dfb2d003dde4cd6b66 \ + --hash=sha256:9a8d6802e0825767476f62aafed40532bd435e8a5f7d23bd8b4f5fd04cc80ecf \ + --hash=sha256:aae4d918f6b180a8ab8bf6511a419473d107df4dbb4225c7b48c5c9602c38c7f \ + --hash=sha256:ac1955ce000cb29ab40def14fd1bbfa7af2017cca696ee696925615cafd0dce5 \ + --hash=sha256:b88075ada2d51aa9f18283532c9f60e72170041bba88d7f37e49cbb10275299e \ + --hash=sha256:cb013933d4c127349b3948aa8aaf2f12c0353ad0eccd715ca789c8a0f671646f \ + --hash=sha256:cc70b4b581f28d0a254d006f26949245e3657d40d8857066c2ae22a61222ef55 \ + --hash=sha256:e9c5266c432a1e23738d178e51c2c7a5e2ddf790f248be939448c0ba2021f9d1 \ + 
--hash=sha256:ea9e57f8ea880eeea38ab5abf9fbe39f923544d7884228ec67d666abd60f5a47 \ + --hash=sha256:ee0c405832ade84d4de74b9029bedb7b31200600fa524d218fc29bfa371e97f5 \ + --hash=sha256:fdcb265de28585de5b859ae13e3846a8e805268a823a12a4da2597f1f5afc9f0 # via pyjwt deprecated==1.2.14 \ --hash=sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c \ @@ -204,9 +214,9 @@ gitpython==3.1.43 \ --hash=sha256:35f314a9f878467f5453cc1fee295c3e18e52f1b99f10f6cf5b1682e968a9e7c \ --hash=sha256:eec7ec56b92aad751f9912a73404bc02ba212a23adb2c7098ee668417051a1ff # via -r requirements.txt.in -idna==3.7 \ - --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ - --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 +idna==3.8 \ + --hash=sha256:050b4e5baadcd44d760cedbd2b8e639f2ff89bbc7a5730fcc662954303377aac \ + --hash=sha256:d838c2c0ed6fced7693d5e8ab8e734d5f8fda53a039c0164afb0b82e771e3603 # via requests pycparser==2.22 \ --hash=sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6 \ @@ -216,9 +226,9 @@ pygithub==1.59.1 \ --hash=sha256:3d87a822e6c868142f0c2c4bf16cce4696b5a7a4d142a7bd160e1bdf75bc54a9 \ --hash=sha256:c44e3a121c15bf9d3a5cc98d94c9a047a5132a9b01d22264627f58ade9ddc217 # via -r requirements.txt.in -pyjwt[crypto]==2.8.0 \ - --hash=sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de \ - --hash=sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320 +pyjwt[crypto]==2.9.0 \ + --hash=sha256:3b02fb0f44517787776cf48f2ae25d8e14f300e6d7545a4315cee571a415e850 \ + --hash=sha256:7e1e5b56cc735432a7369cbfa0efe50fa113ebecdc04ae6922deba8b84582d0c # via pygithub pynacl==1.5.0 \ --hash=sha256:06b8f6fa7f5de8d5d2f7573fe8c863c051225a27b61e6860fd047b1775807858 \ @@ -232,17 +242,17 @@ pynacl==1.5.0 \ --hash=sha256:a422368fc821589c228f4c49438a368831cb5bbc0eab5ebe1d7fac9dded6567b \ --hash=sha256:e46dae94e34b085175f8abb3b0aaa7da40767865ac82c928eeb9e57e1ea8a543 # via pygithub 
-requests==2.31.0 \ - --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \ - --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1 +requests==2.32.3 \ + --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ + --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 # via pygithub smmap==5.0.1 \ --hash=sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62 \ --hash=sha256:e6d8668fa5f93e706934a62d7b4db19c8d9eb8cf2adbb75ef1b675aa332b69da # via gitdb -urllib3==2.2.1 \ - --hash=sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d \ - --hash=sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19 +urllib3==2.2.2 \ + --hash=sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472 \ + --hash=sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168 # via requests wrapt==1.16.0 \ --hash=sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc \ diff --git a/llvm/utils/git/requirements_formatting.txt b/llvm/utils/git/requirements_formatting.txt index 4c9dbd8755ab50..18e2626c79460c 100644 --- a/llvm/utils/git/requirements_formatting.txt +++ b/llvm/utils/git/requirements_formatting.txt @@ -2,51 +2,371 @@ # This file is autogenerated by pip-compile with Python 3.11 # by the following command: # -# pip-compile --output-file=llvm/utils/git/requirements_formatting.txt llvm/utils/git/requirements_formatting.txt.in +# pip-compile --generate-hashes --output-file=requirements_formatting.txt requirements_formatting.txt.in # -black==23.12.1 +black==23.12.1 \ + --hash=sha256:0808494f2b2df923ffc5723ed3c7b096bd76341f6213989759287611e9837d50 \ + --hash=sha256:1fa88a0f74e50e4487477bc0bb900c6781dbddfdfa32691e780bf854c3b4a47f \ + --hash=sha256:25e57fd232a6d6ff3f4478a6fd0580838e47c93c83eaf1ccc92d4faf27112c4e \ + --hash=sha256:2d9e13db441c509a3763a7a3d9a49ccc1b4e974a47be4e08ade2a228876500ec \ + 
--hash=sha256:3e1b38b3135fd4c025c28c55ddfc236b05af657828a8a6abe5deec419a0b7055 \ + --hash=sha256:3fa4be75ef2a6b96ea8d92b1587dd8cb3a35c7e3d51f0738ced0781c3aa3a5a3 \ + --hash=sha256:4ce3ef14ebe8d9509188014d96af1c456a910d5b5cbf434a09fef7e024b3d0d5 \ + --hash=sha256:4f0031eaa7b921db76decd73636ef3a12c942ed367d8c3841a0739412b260a54 \ + --hash=sha256:602cfb1196dc692424c70b6507593a2b29aac0547c1be9a1d1365f0d964c353b \ + --hash=sha256:6d1bd9c210f8b109b1762ec9fd36592fdd528485aadb3f5849b2740ef17e674e \ + --hash=sha256:78baad24af0f033958cad29731e27363183e140962595def56423e626f4bee3e \ + --hash=sha256:8d4df77958a622f9b5a4c96edb4b8c0034f8434032ab11077ec6c56ae9f384ba \ + --hash=sha256:97e56155c6b737854e60a9ab1c598ff2533d57e7506d97af5481141671abf3ea \ + --hash=sha256:9c4352800f14be5b4864016882cdba10755bd50805c95f728011bcb47a4afd59 \ + --hash=sha256:a4d6a9668e45ad99d2f8ec70d5c8c04ef4f32f648ef39048d010b0689832ec6d \ + --hash=sha256:a920b569dc6b3472513ba6ddea21f440d4b4c699494d2e972a1753cdc25df7b0 \ + --hash=sha256:ae76c22bde5cbb6bfd211ec343ded2163bba7883c7bc77f6b756a1049436fbb9 \ + --hash=sha256:b18fb2ae6c4bb63eebe5be6bd869ba2f14fd0259bda7d18a46b764d8fb86298a \ + --hash=sha256:c04b6d9d20e9c13f43eee8ea87d44156b8505ca8a3c878773f68b4e4812a421e \ + --hash=sha256:c88b3711d12905b74206227109272673edce0cb29f27e1385f33b0163c414bba \ + --hash=sha256:dd15245c8b68fe2b6bd0f32c1556509d11bb33aec9b5d0866dd8e2ed3dba09c2 \ + --hash=sha256:e0aaf6041986767a5e0ce663c7a2f0e9eaf21e6ff87a5f95cbf3675bfd4c41d2 # via - # -r llvm/utils/git/requirements_formatting.txt.in + # -r requirements_formatting.txt.in # darker -certifi==2024.2.2 +certifi==2024.7.4 \ + --hash=sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b \ + --hash=sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90 # via requests -cffi==1.16.0 +cffi==1.17.0 \ + --hash=sha256:011aff3524d578a9412c8b3cfaa50f2c0bd78e03eb7af7aa5e0df59b158efb2f \ + 
--hash=sha256:0a048d4f6630113e54bb4b77e315e1ba32a5a31512c31a273807d0027a7e69ab \ + --hash=sha256:0bb15e7acf8ab35ca8b24b90af52c8b391690ef5c4aec3d31f38f0d37d2cc499 \ + --hash=sha256:0d46ee4764b88b91f16661a8befc6bfb24806d885e27436fdc292ed7e6f6d058 \ + --hash=sha256:0e60821d312f99d3e1569202518dddf10ae547e799d75aef3bca3a2d9e8ee693 \ + --hash=sha256:0fdacad9e0d9fc23e519efd5ea24a70348305e8d7d85ecbb1a5fa66dc834e7fb \ + --hash=sha256:14b9cbc8f7ac98a739558eb86fabc283d4d564dafed50216e7f7ee62d0d25377 \ + --hash=sha256:17c6d6d3260c7f2d94f657e6872591fe8733872a86ed1345bda872cfc8c74885 \ + --hash=sha256:1a2ddbac59dc3716bc79f27906c010406155031a1c801410f1bafff17ea304d2 \ + --hash=sha256:2404f3de742f47cb62d023f0ba7c5a916c9c653d5b368cc966382ae4e57da401 \ + --hash=sha256:24658baf6224d8f280e827f0a50c46ad819ec8ba380a42448e24459daf809cf4 \ + --hash=sha256:24aa705a5f5bd3a8bcfa4d123f03413de5d86e497435693b638cbffb7d5d8a1b \ + --hash=sha256:2770bb0d5e3cc0e31e7318db06efcbcdb7b31bcb1a70086d3177692a02256f59 \ + --hash=sha256:331ad15c39c9fe9186ceaf87203a9ecf5ae0ba2538c9e898e3a6967e8ad3db6f \ + --hash=sha256:3aa9d43b02a0c681f0bfbc12d476d47b2b2b6a3f9287f11ee42989a268a1833c \ + --hash=sha256:41f4915e09218744d8bae14759f983e466ab69b178de38066f7579892ff2a555 \ + --hash=sha256:4304d4416ff032ed50ad6bb87416d802e67139e31c0bde4628f36a47a3164bfa \ + --hash=sha256:435a22d00ec7d7ea533db494da8581b05977f9c37338c80bc86314bec2619424 \ + --hash=sha256:45f7cd36186db767d803b1473b3c659d57a23b5fa491ad83c6d40f2af58e4dbb \ + --hash=sha256:48b389b1fd5144603d61d752afd7167dfd205973a43151ae5045b35793232aa2 \ + --hash=sha256:4e67d26532bfd8b7f7c05d5a766d6f437b362c1bf203a3a5ce3593a645e870b8 \ + --hash=sha256:516a405f174fd3b88829eabfe4bb296ac602d6a0f68e0d64d5ac9456194a5b7e \ + --hash=sha256:5ba5c243f4004c750836f81606a9fcb7841f8874ad8f3bf204ff5e56332b72b9 \ + --hash=sha256:5bdc0f1f610d067c70aa3737ed06e2726fd9d6f7bfee4a351f4c40b6831f4e82 \ + --hash=sha256:6107e445faf057c118d5050560695e46d272e5301feffda3c41849641222a828 \ + 
--hash=sha256:6327b572f5770293fc062a7ec04160e89741e8552bf1c358d1a23eba68166759 \ + --hash=sha256:669b29a9eca6146465cc574659058ed949748f0809a2582d1f1a324eb91054dc \ + --hash=sha256:6ce01337d23884b21c03869d2f68c5523d43174d4fc405490eb0091057943118 \ + --hash=sha256:6d872186c1617d143969defeadac5a904e6e374183e07977eedef9c07c8953bf \ + --hash=sha256:6f76a90c345796c01d85e6332e81cab6d70de83b829cf1d9762d0a3da59c7932 \ + --hash=sha256:70d2aa9fb00cf52034feac4b913181a6e10356019b18ef89bc7c12a283bf5f5a \ + --hash=sha256:7cbc78dc018596315d4e7841c8c3a7ae31cc4d638c9b627f87d52e8abaaf2d29 \ + --hash=sha256:856bf0924d24e7f93b8aee12a3a1095c34085600aa805693fb7f5d1962393206 \ + --hash=sha256:8a98748ed1a1df4ee1d6f927e151ed6c1a09d5ec21684de879c7ea6aa96f58f2 \ + --hash=sha256:93a7350f6706b31f457c1457d3a3259ff9071a66f312ae64dc024f049055f72c \ + --hash=sha256:964823b2fc77b55355999ade496c54dde161c621cb1f6eac61dc30ed1b63cd4c \ + --hash=sha256:a003ac9edc22d99ae1286b0875c460351f4e101f8c9d9d2576e78d7e048f64e0 \ + --hash=sha256:a0ce71725cacc9ebf839630772b07eeec220cbb5f03be1399e0457a1464f8e1a \ + --hash=sha256:a47eef975d2b8b721775a0fa286f50eab535b9d56c70a6e62842134cf7841195 \ + --hash=sha256:a8b5b9712783415695663bd463990e2f00c6750562e6ad1d28e072a611c5f2a6 \ + --hash=sha256:a9015f5b8af1bb6837a3fcb0cdf3b874fe3385ff6274e8b7925d81ccaec3c5c9 \ + --hash=sha256:aec510255ce690d240f7cb23d7114f6b351c733a74c279a84def763660a2c3bc \ + --hash=sha256:b00e7bcd71caa0282cbe3c90966f738e2db91e64092a877c3ff7f19a1628fdcb \ + --hash=sha256:b50aaac7d05c2c26dfd50c3321199f019ba76bb650e346a6ef3616306eed67b0 \ + --hash=sha256:b7b6ea9e36d32582cda3465f54c4b454f62f23cb083ebc7a94e2ca6ef011c3a7 \ + --hash=sha256:bb9333f58fc3a2296fb1d54576138d4cf5d496a2cc118422bd77835e6ae0b9cb \ + --hash=sha256:c1c13185b90bbd3f8b5963cd8ce7ad4ff441924c31e23c975cb150e27c2bf67a \ + --hash=sha256:c3b8bd3133cd50f6b637bb4322822c94c5ce4bf0d724ed5ae70afce62187c492 \ + --hash=sha256:c5d97162c196ce54af6700949ddf9409e9833ef1003b4741c2b39ef46f1d9720 \ + 
--hash=sha256:c815270206f983309915a6844fe994b2fa47e5d05c4c4cef267c3b30e34dbe42 \ + --hash=sha256:cab2eba3830bf4f6d91e2d6718e0e1c14a2f5ad1af68a89d24ace0c6b17cced7 \ + --hash=sha256:d1df34588123fcc88c872f5acb6f74ae59e9d182a2707097f9e28275ec26a12d \ + --hash=sha256:d6bdcd415ba87846fd317bee0774e412e8792832e7805938987e4ede1d13046d \ + --hash=sha256:db9a30ec064129d605d0f1aedc93e00894b9334ec74ba9c6bdd08147434b33eb \ + --hash=sha256:dbc183e7bef690c9abe5ea67b7b60fdbca81aa8da43468287dae7b5c046107d4 \ + --hash=sha256:dca802c8db0720ce1c49cce1149ff7b06e91ba15fa84b1d59144fef1a1bc7ac2 \ + --hash=sha256:dec6b307ce928e8e112a6bb9921a1cb00a0e14979bf28b98e084a4b8a742bd9b \ + --hash=sha256:df8bb0010fdd0a743b7542589223a2816bdde4d94bb5ad67884348fa2c1c67e8 \ + --hash=sha256:e4094c7b464cf0a858e75cd14b03509e84789abf7b79f8537e6a72152109c76e \ + --hash=sha256:e4760a68cab57bfaa628938e9c2971137e05ce48e762a9cb53b76c9b569f1204 \ + --hash=sha256:eb09b82377233b902d4c3fbeeb7ad731cdab579c6c6fda1f763cd779139e47c3 \ + --hash=sha256:eb862356ee9391dc5a0b3cbc00f416b48c1b9a52d252d898e5b7696a5f9fe150 \ + --hash=sha256:ef9528915df81b8f4c7612b19b8628214c65c9b7f74db2e34a646a0a2a0da2d4 \ + --hash=sha256:f3157624b7558b914cb039fd1af735e5e8049a87c817cc215109ad1c8779df76 \ + --hash=sha256:f3e0992f23bbb0be00a921eae5363329253c3b86287db27092461c887b791e5e \ + --hash=sha256:f9338cc05451f1942d0d8203ec2c346c830f8e86469903d5126c1f0a13a2bcbb \ + --hash=sha256:ffef8fd58a36fb5f1196919638f73dd3ae0db1a878982b27a9a5a176ede4ba91 # via # cryptography # pynacl -charset-normalizer==3.3.2 +charset-normalizer==3.3.2 \ + --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ + --hash=sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087 \ + --hash=sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786 \ + --hash=sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8 \ + --hash=sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09 \ + 
--hash=sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185 \ + --hash=sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574 \ + --hash=sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e \ + --hash=sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519 \ + --hash=sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898 \ + --hash=sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269 \ + --hash=sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3 \ + --hash=sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f \ + --hash=sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6 \ + --hash=sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8 \ + --hash=sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a \ + --hash=sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73 \ + --hash=sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc \ + --hash=sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714 \ + --hash=sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2 \ + --hash=sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc \ + --hash=sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce \ + --hash=sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d \ + --hash=sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e \ + --hash=sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6 \ + --hash=sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269 \ + --hash=sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96 \ + --hash=sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d \ + --hash=sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a \ + 
--hash=sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4 \ + --hash=sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77 \ + --hash=sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d \ + --hash=sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0 \ + --hash=sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed \ + --hash=sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068 \ + --hash=sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac \ + --hash=sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25 \ + --hash=sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8 \ + --hash=sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab \ + --hash=sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26 \ + --hash=sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2 \ + --hash=sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db \ + --hash=sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f \ + --hash=sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5 \ + --hash=sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99 \ + --hash=sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c \ + --hash=sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d \ + --hash=sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811 \ + --hash=sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa \ + --hash=sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a \ + --hash=sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03 \ + --hash=sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b \ + --hash=sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04 \ + 
--hash=sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c \ + --hash=sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001 \ + --hash=sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458 \ + --hash=sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389 \ + --hash=sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99 \ + --hash=sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985 \ + --hash=sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537 \ + --hash=sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238 \ + --hash=sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f \ + --hash=sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d \ + --hash=sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796 \ + --hash=sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a \ + --hash=sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143 \ + --hash=sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8 \ + --hash=sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c \ + --hash=sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5 \ + --hash=sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5 \ + --hash=sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711 \ + --hash=sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4 \ + --hash=sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6 \ + --hash=sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c \ + --hash=sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7 \ + --hash=sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4 \ + --hash=sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b \ + 
--hash=sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae \ + --hash=sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12 \ + --hash=sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c \ + --hash=sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae \ + --hash=sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8 \ + --hash=sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887 \ + --hash=sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b \ + --hash=sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4 \ + --hash=sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f \ + --hash=sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5 \ + --hash=sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33 \ + --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ + --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via requests -click==8.1.7 +click==8.1.7 \ + --hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ + --hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de # via black -cryptography==42.0.5 +colorama==0.4.6 \ + --hash=sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44 \ + --hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6 + # via click +cryptography==43.0.0 \ + --hash=sha256:0663585d02f76929792470451a5ba64424acc3cd5227b03921dab0e2f27b1709 \ + --hash=sha256:08a24a7070b2b6804c1940ff0f910ff728932a9d0e80e7814234269f9d46d069 \ + --hash=sha256:232ce02943a579095a339ac4b390fbbe97f5b5d5d107f8a08260ea2768be8cc2 \ + --hash=sha256:2905ccf93a8a2a416f3ec01b1a7911c3fe4073ef35640e7ee5296754e30b762b \ + --hash=sha256:299d3da8e00b7e2b54bb02ef58d73cd5f55fb31f33ebbf33bd00d9aa6807df7e \ + 
--hash=sha256:2c6d112bf61c5ef44042c253e4859b3cbbb50df2f78fa8fae6747a7814484a70 \ + --hash=sha256:31e44a986ceccec3d0498e16f3d27b2ee5fdf69ce2ab89b52eaad1d2f33d8778 \ + --hash=sha256:3d9a1eca329405219b605fac09ecfc09ac09e595d6def650a437523fcd08dd22 \ + --hash=sha256:3dcdedae5c7710b9f97ac6bba7e1052b95c7083c9d0e9df96e02a1932e777895 \ + --hash=sha256:47ca71115e545954e6c1d207dd13461ab81f4eccfcb1345eac874828b5e3eaaf \ + --hash=sha256:4a997df8c1c2aae1e1e5ac49c2e4f610ad037fc5a3aadc7b64e39dea42249431 \ + --hash=sha256:51956cf8730665e2bdf8ddb8da0056f699c1a5715648c1b0144670c1ba00b48f \ + --hash=sha256:5bcb8a5620008a8034d39bce21dc3e23735dfdb6a33a06974739bfa04f853947 \ + --hash=sha256:64c3f16e2a4fc51c0d06af28441881f98c5d91009b8caaff40cf3548089e9c74 \ + --hash=sha256:6e2b11c55d260d03a8cf29ac9b5e0608d35f08077d8c087be96287f43af3ccdc \ + --hash=sha256:7b3f5fe74a5ca32d4d0f302ffe6680fcc5c28f8ef0dc0ae8f40c0f3a1b4fca66 \ + --hash=sha256:844b6d608374e7d08f4f6e6f9f7b951f9256db41421917dfb2d003dde4cd6b66 \ + --hash=sha256:9a8d6802e0825767476f62aafed40532bd435e8a5f7d23bd8b4f5fd04cc80ecf \ + --hash=sha256:aae4d918f6b180a8ab8bf6511a419473d107df4dbb4225c7b48c5c9602c38c7f \ + --hash=sha256:ac1955ce000cb29ab40def14fd1bbfa7af2017cca696ee696925615cafd0dce5 \ + --hash=sha256:b88075ada2d51aa9f18283532c9f60e72170041bba88d7f37e49cbb10275299e \ + --hash=sha256:cb013933d4c127349b3948aa8aaf2f12c0353ad0eccd715ca789c8a0f671646f \ + --hash=sha256:cc70b4b581f28d0a254d006f26949245e3657d40d8857066c2ae22a61222ef55 \ + --hash=sha256:e9c5266c432a1e23738d178e51c2c7a5e2ddf790f248be939448c0ba2021f9d1 \ + --hash=sha256:ea9e57f8ea880eeea38ab5abf9fbe39f923544d7884228ec67d666abd60f5a47 \ + --hash=sha256:ee0c405832ade84d4de74b9029bedb7b31200600fa524d218fc29bfa371e97f5 \ + --hash=sha256:fdcb265de28585de5b859ae13e3846a8e805268a823a12a4da2597f1f5afc9f0 # via pyjwt -darker==1.7.2 - # via -r llvm/utils/git/requirements_formatting.txt.in -deprecated==1.2.14 +darker==1.7.2 \ + 
--hash=sha256:ec5b7c382d9537611c164f3ecca2e1b8a7923bc5a02bf22f6e7f6c8bcbdf593a \ + --hash=sha256:ec9d130ab2a0f7fa49ab68a08fd231a5bec66147ecbbf94c92a1f33d97b5ef6f + # via -r requirements_formatting.txt.in +deprecated==1.2.14 \ + --hash=sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c \ + --hash=sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3 # via pygithub -idna==3.7 +idna==3.8 \ + --hash=sha256:050b4e5baadcd44d760cedbd2b8e639f2ff89bbc7a5730fcc662954303377aac \ + --hash=sha256:d838c2c0ed6fced7693d5e8ab8e734d5f8fda53a039c0164afb0b82e771e3603 # via requests -mypy-extensions==1.0.0 +mypy-extensions==1.0.0 \ + --hash=sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d \ + --hash=sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782 # via black -packaging==24.0 +packaging==24.1 \ + --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ + --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 # via black -pathspec==0.12.1 +pathspec==0.12.1 \ + --hash=sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08 \ + --hash=sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712 # via black -platformdirs==4.2.1 +platformdirs==4.2.2 \ + --hash=sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee \ + --hash=sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3 # via black -pycparser==2.22 +pycparser==2.22 \ + --hash=sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6 \ + --hash=sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc # via cffi -pygithub==1.59.1 - # via -r llvm/utils/git/requirements_formatting.txt.in -pyjwt[crypto]==2.8.0 +pygithub==1.59.1 \ + --hash=sha256:3d87a822e6c868142f0c2c4bf16cce4696b5a7a4d142a7bd160e1bdf75bc54a9 \ + --hash=sha256:c44e3a121c15bf9d3a5cc98d94c9a047a5132a9b01d22264627f58ade9ddc217 + # via -r 
requirements_formatting.txt.in +pyjwt[crypto]==2.9.0 \ + --hash=sha256:3b02fb0f44517787776cf48f2ae25d8e14f300e6d7545a4315cee571a415e850 \ + --hash=sha256:7e1e5b56cc735432a7369cbfa0efe50fa113ebecdc04ae6922deba8b84582d0c # via pygithub -pynacl==1.5.0 +pynacl==1.5.0 \ + --hash=sha256:06b8f6fa7f5de8d5d2f7573fe8c863c051225a27b61e6860fd047b1775807858 \ + --hash=sha256:0c84947a22519e013607c9be43706dd42513f9e6ae5d39d3613ca1e142fba44d \ + --hash=sha256:20f42270d27e1b6a29f54032090b972d97f0a1b0948cc52392041ef7831fee93 \ + --hash=sha256:401002a4aaa07c9414132aaed7f6836ff98f59277a234704ff66878c2ee4a0d1 \ + --hash=sha256:52cb72a79269189d4e0dc537556f4740f7f0a9ec41c1322598799b0bdad4ef92 \ + --hash=sha256:61f642bf2378713e2c2e1de73444a3778e5f0a38be6fee0fe532fe30060282ff \ + --hash=sha256:8ac7448f09ab85811607bdd21ec2464495ac8b7c66d146bf545b0f08fb9220ba \ + --hash=sha256:a36d4a9dda1f19ce6e03c9a784a2921a4b726b02e1c736600ca9c22029474394 \ + --hash=sha256:a422368fc821589c228f4c49438a368831cb5bbc0eab5ebe1d7fac9dded6567b \ + --hash=sha256:e46dae94e34b085175f8abb3b0aaa7da40767865ac82c928eeb9e57e1ea8a543 # via pygithub -requests==2.31.0 +requests==2.32.3 \ + --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ + --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 # via pygithub -toml==0.10.2 +toml==0.10.2 \ + --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ + --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f # via darker -urllib3==2.2.1 +urllib3==2.2.2 \ + --hash=sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472 \ + --hash=sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168 # via requests -wrapt==1.16.0 +wrapt==1.16.0 \ + --hash=sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc \ + --hash=sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81 \ + 
--hash=sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09 \ + --hash=sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e \ + --hash=sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca \ + --hash=sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0 \ + --hash=sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb \ + --hash=sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487 \ + --hash=sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40 \ + --hash=sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c \ + --hash=sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060 \ + --hash=sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202 \ + --hash=sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41 \ + --hash=sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9 \ + --hash=sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b \ + --hash=sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664 \ + --hash=sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d \ + --hash=sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362 \ + --hash=sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00 \ + --hash=sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc \ + --hash=sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1 \ + --hash=sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267 \ + --hash=sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956 \ + --hash=sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966 \ + --hash=sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1 \ + --hash=sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228 \ + 
--hash=sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72 \ + --hash=sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d \ + --hash=sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292 \ + --hash=sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0 \ + --hash=sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0 \ + --hash=sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36 \ + --hash=sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c \ + --hash=sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5 \ + --hash=sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f \ + --hash=sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73 \ + --hash=sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b \ + --hash=sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2 \ + --hash=sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593 \ + --hash=sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39 \ + --hash=sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389 \ + --hash=sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf \ + --hash=sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf \ + --hash=sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89 \ + --hash=sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c \ + --hash=sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c \ + --hash=sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f \ + --hash=sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440 \ + --hash=sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465 \ + --hash=sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136 \ + 
--hash=sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b \ + --hash=sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8 \ + --hash=sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3 \ + --hash=sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8 \ + --hash=sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6 \ + --hash=sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e \ + --hash=sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f \ + --hash=sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c \ + --hash=sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e \ + --hash=sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8 \ + --hash=sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2 \ + --hash=sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020 \ + --hash=sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35 \ + --hash=sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d \ + --hash=sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3 \ + --hash=sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537 \ + --hash=sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809 \ + --hash=sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d \ + --hash=sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a \ + --hash=sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4 # via deprecated From 886b76128fba5f995c8c8e24aaa2030b59dec01a Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 24 Aug 2024 09:55:17 -0700 Subject: [PATCH 408/426] [libc++][test] Fix `msvc_is_lock_free_macro_value()` (#105876) Followup to #99570. * `TEST_COMPILER_MSVC` must be tested for `defined`ness, as it is everywhere else. 
+ Definition: https://github.com/llvm/llvm-project/blob/52a7116f5c6ada234f47f7794aaf501a3692b997/libcxx/test/support/test_macros.h#L71-L72 + Example usage: https://github.com/llvm/llvm-project/blob/52a7116f5c6ada234f47f7794aaf501a3692b997/libcxx/test/std/utilities/function.objects/func.not_fn/not_fn.pass.cpp#L248 + Fixes: `llvm-project\libcxx\test\support\atomic_helpers.h(33): fatal error C1017: invalid integer constant expression` * Fix bogus return type: `msvc_is_lock_free_macro_value()` returns `2` or `0`, so it needs to return `int`. + Fixes: `llvm-project\libcxx\test\support\atomic_helpers.h(41): warning C4305: 'return': truncation from 'int' to 'bool'` * Clarity improvement: also add parens when mixing bitwise with arithmetic operators. --- libcxx/test/support/atomic_helpers.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libcxx/test/support/atomic_helpers.h b/libcxx/test/support/atomic_helpers.h index d2f2b751cb47de..2b3a3caa06a589 100644 --- a/libcxx/test/support/atomic_helpers.h +++ b/libcxx/test/support/atomic_helpers.h @@ -30,15 +30,15 @@ # define TEST_ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE # define TEST_ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE # define TEST_ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE -#elif TEST_COMPILER_MSVC +#elif defined(TEST_COMPILER_MSVC) // This is lifted from STL/stl/inc/atomic on github for the purposes of // keeping the tests compiling for MSVC's STL. It's not a perfect solution // but at least the tests will keep running. // // Note MSVC's STL never produces a type that is sometimes lock free, but not always lock free. template -constexpr bool msvc_is_lock_free_macro_value() { - return (Size <= 8 && (Size & Size - 1) == 0) ? 2 : 0; +constexpr int msvc_is_lock_free_macro_value() { + return (Size <= 8 && (Size & (Size - 1)) == 0) ? 
2 : 0; } # define TEST_ATOMIC_CHAR_LOCK_FREE ::msvc_is_lock_free_macro_value() # define TEST_ATOMIC_SHORT_LOCK_FREE ::msvc_is_lock_free_macro_value() From a5d89d5048b6d62e6e327ba01eff276f398b7a51 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 24 Aug 2024 10:02:01 -0700 Subject: [PATCH 409/426] [Target] Use llvm::replace (NFC) (#105942) --- llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 4 +--- llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp | 5 +---- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 +--- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index b55b9a42e52cdf..e42623cb385637 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2517,9 +2517,7 @@ static void updateRegisterMapForDbgValueListAfterMove( if (RegIt == RegisterMap.end()) return; auto &InstrVec = RegIt->getSecond(); - for (unsigned I = 0; I < InstrVec.size(); I++) - if (InstrVec[I] == InstrToReplace) - InstrVec[I] = DbgValueListInstr; + llvm::replace(InstrVec, InstrToReplace, DbgValueListInstr); }); } diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index ec5435949ae4a7..4e6b80284c46b3 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -2002,10 +2002,7 @@ SmallVector HvxSelector::getPerfectCompletions(ShuffleMask SM, if ((unsigned)llvm::popcount(P) < Count) { // Reset all occurences of P, if there are more occurrences of P // than there are bits in P. 
- for (unsigned &Q : Worklist) { - if (Q == P) - Q = 0; - } + llvm::replace(Worklist, P, 0U); } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 97775ce40aee4f..1a6be4eb5af1ef 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -35781,9 +35781,7 @@ X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI, // Zero out any registers that are already used. for (const auto &MO : MI.operands()) { if (MO.isReg() && MO.isUse()) - for (unsigned &Reg : AvailableRegs) - if (Reg == MO.getReg()) - Reg = 0; + llvm::replace(AvailableRegs, static_cast(MO.getReg()), 0U); } // Choose the first remaining non-zero available register. From 31b4bf938b46001abbf2a58875047bf13ba083dd Mon Sep 17 00:00:00 2001 From: Jeff Niu Date: Sat, 24 Aug 2024 13:19:44 -0400 Subject: [PATCH 410/426] [llvm][NVPTX] Fix RAUW bug in NVPTXProxyRegErasure (#105871) Fix bug introduced in #105730 The bug is in how the batch RAUW is implemented. If we have ``` %0 = mov %src %1 = mov %0 use %0 use %1 ``` The use of `%1` is rewritten to `%0`, not `%src`. This PR just looks for a replacement when it maps to the src register, which should transitively propagate the replacements. 
--- .../lib/Target/NVPTX/NVPTXProxyRegErasure.cpp | 6 +- .../CodeGen/NVPTX/proxy-reg-erasure-mir.ll | 25 ----- llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir | 98 +++++++++++++++++++ 3 files changed, 103 insertions(+), 26 deletions(-) delete mode 100644 llvm/test/CodeGen/NVPTX/proxy-reg-erasure-mir.ll create mode 100644 llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir diff --git a/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp b/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp index f3a3362addb0ea..16c2b307efabfb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp @@ -78,7 +78,11 @@ bool NVPTXProxyRegErasure::runOnMachineFunction(MachineFunction &MF) { assert(InOp.isReg() && "ProxyReg input should be a register."); assert(OutOp.isReg() && "ProxyReg output should be a register."); RemoveList.push_back(&MI); - RAUWBatch.try_emplace(OutOp.getReg(), InOp.getReg()); + Register replacement = InOp.getReg(); + // Check if the replacement itself has been replaced. + if (auto it = RAUWBatch.find(replacement); it != RAUWBatch.end()) + replacement = it->second; + RAUWBatch.try_emplace(OutOp.getReg(), replacement); break; } } diff --git a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-mir.ll b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-mir.ll deleted file mode 100644 index 6bfbe2aea8196c..00000000000000 --- a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-mir.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc -march=nvptx64 -stop-before=nvptx-proxyreg-erasure < %s 2>&1 \ -; RUN: | FileCheck %s --check-prefix=MIR --check-prefix=MIR-BEFORE - -; RUN: llc -march=nvptx64 -stop-after=nvptx-proxyreg-erasure < %s 2>&1 \ -; RUN: | FileCheck %s --check-prefix=MIR --check-prefix=MIR-AFTER - -; Check ProxyRegErasure pass MIR manipulation. 
- -declare <4 x i32> @callee_vec_i32() -define <4 x i32> @check_vec_i32() { - ; MIR: body: - ; MIR-DAG: Callseq_Start {{[0-9]+}}, {{[0-9]+}} - ; MIR-DAG: %0:int32regs, %1:int32regs, %2:int32regs, %3:int32regs = LoadParamMemV4I32 0 - ; MIR-DAG: Callseq_End {{[0-9]+}} - - ; MIR-BEFORE-DAG: %4:int32regs = ProxyRegI32 killed %0 - ; MIR-BEFORE-DAG: %5:int32regs = ProxyRegI32 killed %1 - ; MIR-BEFORE-DAG: %6:int32regs = ProxyRegI32 killed %2 - ; MIR-BEFORE-DAG: %7:int32regs = ProxyRegI32 killed %3 - ; MIR-BEFORE-DAG: StoreRetvalV4I32 killed %4, killed %5, killed %6, killed %7, 0 - ; MIR-AFTER-DAG: StoreRetvalV4I32 killed %0, killed %1, killed %2, killed %3, 0 - - %ret = call <4 x i32> @callee_vec_i32() - ret <4 x i32> %ret -} diff --git a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir new file mode 100644 index 00000000000000..7f80d011901d34 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir @@ -0,0 +1,98 @@ +# RUN: llc %s --run-pass=nvptx-proxyreg-erasure -march=nvptx64 -o - | FileCheck %s + +--- | + ; ModuleID = 'third-party/llvm-project/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-mir.ll' + source_filename = "third-party/llvm-project/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-mir.ll" + target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" + + declare <4 x i32> @callee_vec_i32() + + define <4 x i32> @check_vec_i32() { + %ret = call <4 x i32> @callee_vec_i32() + ret <4 x i32> %ret + } + +... 
+--- +name: check_vec_i32 +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: int32regs, preferred-register: '' } + - { id: 1, class: int32regs, preferred-register: '' } + - { id: 2, class: int32regs, preferred-register: '' } + - { id: 3, class: int32regs, preferred-register: '' } + - { id: 4, class: int32regs, preferred-register: '' } + - { id: 5, class: int32regs, preferred-register: '' } + - { id: 6, class: int32regs, preferred-register: '' } + - { id: 7, class: int32regs, preferred-register: '' } + - { id: 8, class: int32regs, preferred-register: '' } + - { id: 9, class: int32regs, preferred-register: '' } + - { id: 10, class: int32regs, preferred-register: '' } + - { id: 11, class: int32regs, preferred-register: '' } +liveins: [] +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: true + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0: + %0:int32regs, %1:int32regs, %2:int32regs, %3:int32regs = LoadParamMemV4I32 0 + ; CHECK-NOT: ProxyReg + %4:int32regs = ProxyRegI32 killed %0 + %5:int32regs = ProxyRegI32 killed %1 + %6:int32regs = ProxyRegI32 killed %2 + %7:int32regs = ProxyRegI32 
killed %3 + ; CHECK: StoreRetvalV4I32 killed %0, killed %1, killed %2, killed %3 + StoreRetvalV4I32 killed %4, killed %5, killed %6, killed %7, 0 + + %8:int32regs = LoadParamMemI32 0 + ; CHECK-NOT: ProxyReg + %9:int32regs = ProxyRegI32 killed %8 + %10:int32regs = ProxyRegI32 killed %9 + %11:int32regs = ProxyRegI32 killed %10 + ; CHECK: StoreRetvalI32 killed %8 + StoreRetvalI32 killed %11, 0 + Return + +... From 2cb25d5608453655a2ed39d8177034ab7773aac2 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Sat, 24 Aug 2024 10:47:57 -0700 Subject: [PATCH 411/426] [DAG][RISCV] Use vp_reduce_fadd/fmul when widening types for FP reductions (#105840) This is a follow up to #105455 which updates the VPIntrinsic mappings for the fadd and fmul cases, and supports both ordered and unordered reductions. This allows the use a single wider operation with a restricted EVL instead of padding the vector with the neutral element. This has all the same tradeoffs as the previous patch. --- llvm/include/llvm/IR/VPIntrinsics.def | 12 ++- .../SelectionDAG/LegalizeVectorTypes.cpp | 20 +++- .../rvv/fixed-vectors-reduction-formation.ll | 9 +- .../RISCV/rvv/fixed-vectors-reduction-fp.ll | 12 --- .../RISCV/rvv/vreductions-fp-sdnode.ll | 100 ++++++++---------- 5 files changed, 67 insertions(+), 86 deletions(-) diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index 9333f6be5b516d..521cbc2dc278f9 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -722,13 +722,15 @@ HELPER_REGISTER_REDUCTION_VP(vp_reduce_fminimum, VP_REDUCE_FMINIMUM, #error \ "The internal helper macro HELPER_REGISTER_REDUCTION_SEQ_VP is already defined!" 
#endif -#define HELPER_REGISTER_REDUCTION_SEQ_VP(VPID, VPSD, SEQ_VPSD, INTRIN) \ +#define HELPER_REGISTER_REDUCTION_SEQ_VP(VPID, VPSD, SEQ_VPSD, SDOPC, SEQ_SDOPC, INTRIN) \ BEGIN_REGISTER_VP_INTRINSIC(VPID, 2, 3) \ BEGIN_REGISTER_VP_SDNODE(VPSD, 1, VPID, 2, 3) \ VP_PROPERTY_REDUCTION(0, 1) \ + VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) \ END_REGISTER_VP_SDNODE(VPSD) \ BEGIN_REGISTER_VP_SDNODE(SEQ_VPSD, 1, VPID, 2, 3) \ HELPER_MAP_VPID_TO_VPSD(VPID, SEQ_VPSD) \ + VP_PROPERTY_FUNCTIONAL_SDOPC(SEQ_SDOPC) \ VP_PROPERTY_REDUCTION(0, 1) \ END_REGISTER_VP_SDNODE(SEQ_VPSD) \ VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTRIN) \ @@ -736,13 +738,13 @@ HELPER_REGISTER_REDUCTION_VP(vp_reduce_fminimum, VP_REDUCE_FMINIMUM, // llvm.vp.reduce.fadd(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_SEQ_VP(vp_reduce_fadd, VP_REDUCE_FADD, - VP_REDUCE_SEQ_FADD, - vector_reduce_fadd) + VP_REDUCE_SEQ_FADD, VECREDUCE_FADD, + VECREDUCE_SEQ_FADD, vector_reduce_fadd) // llvm.vp.reduce.fmul(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_SEQ_VP(vp_reduce_fmul, VP_REDUCE_FMUL, - VP_REDUCE_SEQ_FMUL, - vector_reduce_fmul) + VP_REDUCE_SEQ_FMUL, VECREDUCE_FMUL, + VECREDUCE_SEQ_FMUL, vector_reduce_fmul) #undef HELPER_REGISTER_REDUCTION_SEQ_VP diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 5745c147e3502d..475d5806467d98 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -7311,8 +7311,6 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) { // Generate a vp.reduce_op if it is custom/legal for the target. This avoids // needing to pad the source vector, because the inactive lanes can simply be // disabled and not contribute to the result. - // TODO: VECREDUCE_FADD, VECREDUCE_FMUL aren't currently mapped correctly, - // and thus don't take this path. 
if (auto VPOpcode = ISD::getVPForBaseOpcode(Opc); VPOpcode && TLI.isOperationLegalOrCustom(*VPOpcode, WideVT)) { SDValue Start = NeutralElem; @@ -7351,6 +7349,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE_SEQ(SDNode *N) { SDValue VecOp = N->getOperand(1); SDValue Op = GetWidenedVector(VecOp); + EVT VT = N->getValueType(0); EVT OrigVT = VecOp.getValueType(); EVT WideVT = Op.getValueType(); EVT ElemVT = OrigVT.getVectorElementType(); @@ -7364,6 +7363,19 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE_SEQ(SDNode *N) { unsigned OrigElts = OrigVT.getVectorMinNumElements(); unsigned WideElts = WideVT.getVectorMinNumElements(); + // Generate a vp.reduce_op if it is custom/legal for the target. This avoids + // needing to pad the source vector, because the inactive lanes can simply be + // disabled and not contribute to the result. + if (auto VPOpcode = ISD::getVPForBaseOpcode(Opc); + VPOpcode && TLI.isOperationLegalOrCustom(*VPOpcode, WideVT)) { + EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WideVT.getVectorElementCount()); + SDValue Mask = DAG.getAllOnesConstant(dl, WideMaskVT); + SDValue EVL = DAG.getElementCount(dl, TLI.getVPExplicitVectorLengthTy(), + OrigVT.getVectorElementCount()); + return DAG.getNode(*VPOpcode, dl, VT, {AccOp, Op, Mask, EVL}, Flags); + } + if (WideVT.isScalableVector()) { unsigned GCD = std::gcd(OrigElts, WideElts); EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, @@ -7372,14 +7384,14 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE_SEQ(SDNode *N) { for (unsigned Idx = OrigElts; Idx < WideElts; Idx = Idx + GCD) Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Op, SplatNeutral, DAG.getVectorIdxConstant(Idx, dl)); - return DAG.getNode(Opc, dl, N->getValueType(0), AccOp, Op, Flags); + return DAG.getNode(Opc, dl, VT, AccOp, Op, Flags); } for (unsigned Idx = OrigElts; Idx < WideElts; Idx++) Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, Op, NeutralElem, DAG.getVectorIdxConstant(Idx, dl)); - return 
DAG.getNode(Opc, dl, N->getValueType(0), AccOp, Op, Flags); + return DAG.getNode(Opc, dl, VT, AccOp, Op, Flags); } SDValue DAGTypeLegalizer::WidenVecOp_VP_REDUCE(SDNode *N) { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll index fa56412e71c678..6e5ab436fc02d0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll @@ -791,12 +791,7 @@ define float @reduce_fadd_16xi32_prefix5(ptr %p) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 524288 ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 5 -; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 6 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 7 +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vfredusum.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -880,7 +875,7 @@ define float @reduce_fadd_4xi32_non_associative(ptr %p) { ; CHECK-NEXT: vfmv.f.s fa5, v9 ; CHECK-NEXT: lui a0, 524288 ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vslideup.vi v8, v9, 3 +; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vfredusum.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa4, v8 ; CHECK-NEXT: fadd.s fa0, fa4, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index 26dc11aef2805b..566c9070eab512 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -98,10 +98,6 @@ define half @vreduce_fadd_v7f16(ptr %x, half %s) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 7, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: lui a0, 1048568 -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, 
ma -; CHECK-NEXT: vslideup.vi v8, v9, 7 ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vfredusum.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -470,10 +466,6 @@ define float @vreduce_fadd_v7f32(ptr %x, float %s) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 7 ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vfredusum.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -488,10 +480,6 @@ define float @vreduce_ord_fadd_v7f32(ptr %x, float %s) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 7 ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vfredosum.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll index 5b140299070b94..c2ad7e76a26c75 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll @@ -889,17 +889,12 @@ define half @vreduce_ord_fadd_nxv3f16( %v, half %s) { ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a1, a1, a0 ; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: lui a2, 1048568 -; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.x v9, a2 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v9, a1 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfredosum.vs v9, v8, v9 +; CHECK-NEXT: vfmv.f.s 
fa0, v9 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fadd.nxv3f16(half %s, %v) ret half %red @@ -910,18 +905,15 @@ declare half @llvm.vector.reduce.fadd.nxv6f16(half, ) define half @vreduce_ord_fadd_nxv6f16( %v, half %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv6f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 1048568 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v9, v10, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfredosum.vs v10, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fadd.nxv6f16(half %s, %v) ret half %red @@ -932,22 +924,15 @@ declare half @llvm.vector.reduce.fadd.nxv10f16(half, ) define half @vreduce_ord_fadd_nxv10f16( %v, half %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv10f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 1048568 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.x v12, a0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v10, v12, a0 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma -; CHECK-NEXT: vmv.v.v v11, v12 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v11, v12, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: li a1, 10 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v12 -; 
CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfredosum.vs v12, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fadd.nxv10f16(half %s, %v) ret half %red @@ -958,13 +943,16 @@ declare half @llvm.vector.reduce.fadd.nxv12f16(half, ) define half @vreduce_ord_fadd_nxv12f16( %v, half %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv12f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 1048568 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.x v11, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: slli a1, a0, 2 +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfredosum.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfredosum.vs v12, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fadd.nxv12f16(half %s, %v) ret half %red @@ -977,17 +965,14 @@ define half @vreduce_fadd_nxv3f16( %v, half %s) { ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a1, a1, a0 ; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: lui a2, 1048568 -; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.x v9, a2 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v9, a1 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: lui a1, 1048568 +; CHECK-NEXT: vmv.s.x v10, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfredusum.vs v10, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v10 ; CHECK-NEXT: ret %red = call reassoc half 
@llvm.vector.reduce.fadd.nxv3f16(half %s, %v) ret half %red @@ -996,18 +981,17 @@ define half @vreduce_fadd_nxv3f16( %v, half %s) { define half @vreduce_fadd_nxv6f16( %v, half %s) { ; CHECK-LABEL: vreduce_fadd_nxv6f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 1048568 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v9, v10, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: lui a1, 1048568 +; CHECK-NEXT: vmv.s.x v11, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfredusum.vs v11, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v11 ; CHECK-NEXT: ret %red = call reassoc half @llvm.vector.reduce.fadd.nxv6f16(half %s, %v) ret half %red From d252365a272b702e32220038f5fdad7e511dbf58 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 24 Aug 2024 10:53:28 -0700 Subject: [PATCH 412/426] [IR] Modernize StructuralHashImpl (NFC) (#105951) --- llvm/lib/IR/StructuralHash.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/IR/StructuralHash.cpp b/llvm/lib/IR/StructuralHash.cpp index b6de1ed725d7d4..fb4f33a021a96b 100644 --- a/llvm/lib/IR/StructuralHash.cpp +++ b/llvm/lib/IR/StructuralHash.cpp @@ -24,7 +24,7 @@ namespace { // by the MergeFunctions pass. 
class StructuralHashImpl { - uint64_t Hash; + uint64_t Hash = 4; void hash(uint64_t V) { Hash = hashing::detail::hash_16_bytes(Hash, V); } @@ -43,7 +43,7 @@ class StructuralHashImpl { } public: - StructuralHashImpl() : Hash(4) {} + StructuralHashImpl() = default; void updateOperand(Value *Operand) { hashType(Operand->getType()); From 6f618a7b8249e7baa3b2d18f8bbec3c5b6f6d24e Mon Sep 17 00:00:00 2001 From: Qiu Chaofan Date: Sun, 25 Aug 2024 02:17:15 +0800 Subject: [PATCH 413/426] Update my email --- .mailmap | 2 +- llvm/CREDITS.TXT | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.mailmap b/.mailmap index 6211922135257c..851394b3e9dca1 100644 --- a/.mailmap +++ b/.mailmap @@ -30,7 +30,7 @@ - + Jianjian GUAN diff --git a/llvm/CREDITS.TXT b/llvm/CREDITS.TXT index a6f042779da2e1..fc02828e601c25 100644 --- a/llvm/CREDITS.TXT +++ b/llvm/CREDITS.TXT @@ -432,10 +432,6 @@ W: http://vladimir_prus.blogspot.com E: ghost@cs.msu.su D: Made inst_iterator behave like a proper iterator, LowerConstantExprs pass -N: QIU Chaofan -E: qiucofan@cn.ibm.com -D: PowerPC Backend Developer - N: Kalle Raiskila E: kalle.rasikila@nokia.com D: Some bugfixes to CellSPU From 9f82f6daa5e470652f4ffced628547d0c24aac2c Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 24 Aug 2024 20:20:23 +0100 Subject: [PATCH 414/426] [ARM] Add a number of extra vmovimm tests for BE. 
NFC --- llvm/test/CodeGen/ARM/big-endian-vmov.ll | 100 +++ llvm/test/CodeGen/Thumb2/mve-vmovimm.ll | 818 ++++++++++++++++++++++- 2 files changed, 900 insertions(+), 18 deletions(-) diff --git a/llvm/test/CodeGen/ARM/big-endian-vmov.ll b/llvm/test/CodeGen/ARM/big-endian-vmov.ll index 2cb22b4d5fbc26..1cb7a030d58c26 100644 --- a/llvm/test/CodeGen/ARM/big-endian-vmov.ll +++ b/llvm/test/CodeGen/ARM/big-endian-vmov.ll @@ -134,3 +134,103 @@ define arm_aapcs_vfpcc <1 x i64> @vmov_i64_b() { ; CHECK-NEXT: bx lr ret <1 x i64> } + +define arm_aapcs_vfpcc <2 x i64> @vmov_v2i64_b() { +; CHECK-LABEL: vmov_v2i64_b: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i64 q0, #0xffff00ff0000ff +; CHECK-NEXT: bx lr + ret <2 x i64> +} + +define arm_aapcs_vfpcc <4 x i32> @vmov_v4i32_b() { +; CHECK-LE-LABEL: vmov_v4i32_b: +; CHECK-LE: @ %bb.0: +; CHECK-LE-NEXT: vmov.i64 q0, #0xff0000ff00ffff00 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: vmov_v4i32_b: +; CHECK-BE: @ %bb.0: +; CHECK-BE-NEXT: vmov.i64 q0, #0xffff00ff0000ff +; CHECK-BE-NEXT: bx lr + ret <4 x i32> +} + +define arm_aapcs_vfpcc <2 x i64> @and_v2i64_b(<2 x i64> %a) { +; CHECK-LABEL: and_v2i64_b: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i64 q8, #0xffff00ff0000ff +; CHECK-NEXT: vand q0, q0, q8 +; CHECK-NEXT: bx lr + %b = and <2 x i64> %a, + ret <2 x i64> %b +} + +define arm_aapcs_vfpcc <4 x i32> @and_v4i32_b(<4 x i32> %a) { +; CHECK-LE-LABEL: and_v4i32_b: +; CHECK-LE: @ %bb.0: +; CHECK-LE-NEXT: vmov.i64 q8, #0xff0000ff00ffff00 +; CHECK-LE-NEXT: vand q0, q0, q8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: and_v4i32_b: +; CHECK-BE: @ %bb.0: +; CHECK-BE-NEXT: vmov.i64 q8, #0xffff00ff0000ff +; CHECK-BE-NEXT: vrev64.32 q9, q0 +; CHECK-BE-NEXT: vrev64.32 q8, q8 +; CHECK-BE-NEXT: vand q8, q9, q8 +; CHECK-BE-NEXT: vrev64.32 q0, q8 +; CHECK-BE-NEXT: bx lr + %b = and <4 x i32> %a, + ret <4 x i32> %b +} + +define arm_aapcs_vfpcc <8 x i16> @vmvn_v16i8_m1() { +; CHECK-LE-LABEL: vmvn_v16i8_m1: +; CHECK-LE: @ %bb.0: +; CHECK-LE-NEXT: vmvn.i32 q0, #0x10000 
+; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: vmvn_v16i8_m1: +; CHECK-BE: @ %bb.0: +; CHECK-BE-NEXT: vmvn.i32 q0, #0x1 +; CHECK-BE-NEXT: bx lr + ret <8 x i16> +} + +; FIXME: This is incorrect for BE +define arm_aapcs_vfpcc <8 x i16> @and_v8i16_m1(<8 x i16> %a) { +; CHECK-LE-LABEL: and_v8i16_m1: +; CHECK-LE: @ %bb.0: +; CHECK-LE-NEXT: vbic.i32 q0, #0x10000 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: and_v8i16_m1: +; CHECK-BE: @ %bb.0: +; CHECK-BE-NEXT: vrev64.32 q8, q0 +; CHECK-BE-NEXT: vbic.i32 q8, #0x10000 +; CHECK-BE-NEXT: vrev64.32 q0, q8 +; CHECK-BE-NEXT: bx lr + %b = and <8 x i16> %a, + ret <8 x i16> %b +} + +; FIXME: This is incorrect for BE +define arm_aapcs_vfpcc <8 x i16> @xor_v8i16_m1(<8 x i16> %a) { +; CHECK-LE-LABEL: xor_v8i16_m1: +; CHECK-LE: @ %bb.0: +; CHECK-LE-NEXT: vmvn.i32 q8, #0x10000 +; CHECK-LE-NEXT: veor q0, q0, q8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: xor_v8i16_m1: +; CHECK-BE: @ %bb.0: +; CHECK-BE-NEXT: vmvn.i32 q8, #0x10000 +; CHECK-BE-NEXT: vrev64.16 q9, q0 +; CHECK-BE-NEXT: vrev32.16 q8, q8 +; CHECK-BE-NEXT: veor q8, q9, q8 +; CHECK-BE-NEXT: vrev64.16 q0, q8 +; CHECK-BE-NEXT: bx lr + %b = xor <8 x i16> %a, + ret <8 x i16> %b +} diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll b/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll index 97abc539557131..729e4c5e89c75e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll @@ -12,6 +12,25 @@ entry: ret <16 x i8> } +define arm_aapcs_vfpcc <16 x i8> @xor_int8_1(<16 x i8> %a) { +; CHECKLE-LABEL: xor_int8_1: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i8 q1, #0x1 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int8_1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i8 q1, #0x1 +; CHECKBE-NEXT: vrev64.8 q2, q0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.8 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <16 x i8> %a, + ret <16 x i8> %b +} + define arm_aapcs_vfpcc <16 x i8> @mov_int8_m1() { ; 
CHECK-LABEL: mov_int8_m1: ; CHECK: @ %bb.0: @ %entry @@ -21,6 +40,23 @@ entry: ret <16 x i8> } +define arm_aapcs_vfpcc <16 x i8> @xor_int8_m1(<16 x i8> %a) { +; CHECKLE-LABEL: xor_int8_m1: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmvn q0, q0 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int8_m1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vrev64.8 q1, q0 +; CHECKBE-NEXT: vmvn q1, q1 +; CHECKBE-NEXT: vrev64.8 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <16 x i8> %a, + ret <16 x i8> %b +} + ; This has 0x01020304 or 0x04030201 vdup.32'd to q reg depending on endianness. ; The big endian is different as there is an implicit vrev64.8 out of the ; function, which gets constant folded away. @@ -42,6 +78,98 @@ entry: ret <16 x i8> } +define arm_aapcs_vfpcc <16 x i8> @xor_int8_1234(<16 x i8> %a) { +; CHECKLE-LABEL: xor_int8_1234: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: movw r0, #513 +; CHECKLE-NEXT: movt r0, #1027 +; CHECKLE-NEXT: vdup.32 q1, r0 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int8_1234: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: movw r0, #513 +; CHECKBE-NEXT: vrev64.8 q1, q0 +; CHECKBE-NEXT: movt r0, #1027 +; CHECKBE-NEXT: vdup.32 q0, r0 +; CHECKBE-NEXT: veor q1, q1, q0 +; CHECKBE-NEXT: vrev64.8 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <16 x i8> %a, + ret <16 x i8> %b +} + +define arm_aapcs_vfpcc <16 x i8> @mov_int8_32() { +; CHECKLE-LABEL: mov_int8_32: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i32 q0, #0x1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: mov_int8_32: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i32 q0, #0x1000000 +; CHECKBE-NEXT: bx lr +entry: + ret <16 x i8> +} + +; FIXME: This is incorrect for BE +define arm_aapcs_vfpcc <16 x i8> @xor_int8_32(<16 x i8> %a) { +; CHECKLE-LABEL: xor_int8_32: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i32 q1, #0x1 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int8_32: +; CHECKBE: @ 
%bb.0: @ %entry +; CHECKBE-NEXT: vmov.i32 q1, #0x1 +; CHECKBE-NEXT: vrev64.8 q2, q0 +; CHECKBE-NEXT: vrev32.8 q1, q1 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.8 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <16 x i8> %a, + ret <16 x i8> %b +} + +define arm_aapcs_vfpcc <16 x i8> @mov_int8_64() { +; CHECKLE-LABEL: mov_int8_64: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i64 q0, #0xffff00ffff0000ff +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: mov_int8_64: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i64 q0, #0xff0000ffff00ffff +; CHECKBE-NEXT: bx lr +entry: + ret <16 x i8> +} + +define arm_aapcs_vfpcc <16 x i8> @xor_int8_64(<16 x i8> %a) { +; CHECKLE-LABEL: xor_int8_64: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i64 q1, #0xffff00ffff0000ff +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int8_64: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i64 q1, #0xff0000ffff00ffff +; CHECKBE-NEXT: vrev64.8 q2, q1 +; CHECKBE-NEXT: vrev64.8 q1, q0 +; CHECKBE-NEXT: veor q1, q1, q2 +; CHECKBE-NEXT: vrev64.8 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <16 x i8> %a, + ret <16 x i8> %b +} + define arm_aapcs_vfpcc <8 x i16> @mov_int16_1() { ; CHECK-LABEL: mov_int16_1: ; CHECK: @ %bb.0: @ %entry @@ -51,6 +179,25 @@ entry: ret <8 x i16> } +define arm_aapcs_vfpcc <8 x i16> @xor_int16_1(<8 x i16> %a) { +; CHECKLE-LABEL: xor_int16_1: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i16 q1, #0x1 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int16_1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i16 q1, #0x1 +; CHECKBE-NEXT: vrev64.16 q2, q0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <8 x i16> %a, + ret <8 x i16> %b +} + define arm_aapcs_vfpcc <8 x i16> @mov_int16_m1() { ; CHECK-LABEL: mov_int16_m1: ; CHECK: @ %bb.0: @ %entry @@ -60,6 +207,24 @@ entry: ret <8 x i16> } +define arm_aapcs_vfpcc <8 x i16> 
@xor_int16_m1(<8 x i16> %a) { +; CHECKLE-LABEL: xor_int16_m1: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmvn q0, q0 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int16_m1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i8 q1, #0xff +; CHECKBE-NEXT: vrev64.16 q2, q0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <8 x i16> %a, + ret <8 x i16> %b +} + define arm_aapcs_vfpcc <8 x i16> @mov_int16_256() { ; CHECK-LABEL: mov_int16_256: ; CHECK: @ %bb.0: @ %entry @@ -69,6 +234,25 @@ entry: ret <8 x i16> } +define arm_aapcs_vfpcc <8 x i16> @xor_int16_256(<8 x i16> %a) { +; CHECKLE-LABEL: xor_int16_256: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i16 q1, #0x100 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int16_256: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i16 q1, #0x100 +; CHECKBE-NEXT: vrev64.16 q2, q0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <8 x i16> %a, + ret <8 x i16> %b +} + define arm_aapcs_vfpcc <8 x i16> @mov_int16_257() { ; CHECK-LABEL: mov_int16_257: ; CHECK: @ %bb.0: @ %entry @@ -78,6 +262,25 @@ entry: ret <8 x i16> } +define arm_aapcs_vfpcc <8 x i16> @xor_int16_257(<8 x i16> %a) { +; CHECKLE-LABEL: xor_int16_257: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i8 q1, #0x1 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int16_257: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i8 q1, #0x1 +; CHECKBE-NEXT: vrev64.16 q2, q0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <8 x i16> %a, + ret <8 x i16> %b +} + define arm_aapcs_vfpcc <8 x i16> @mov_int16_258() { ; CHECK-LABEL: mov_int16_258: ; CHECK: @ %bb.0: @ %entry @@ -88,6 +291,97 @@ entry: ret <8 x i16> } +define arm_aapcs_vfpcc <8 x i16> @xor_int16_258(<8 x i16> %a) { +; CHECKLE-LABEL: xor_int16_258: +; CHECKLE: @ 
%bb.0: @ %entry +; CHECKLE-NEXT: mov.w r0, #258 +; CHECKLE-NEXT: vdup.16 q1, r0 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int16_258: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: mov.w r0, #258 +; CHECKBE-NEXT: vrev64.16 q2, q0 +; CHECKBE-NEXT: vdup.16 q1, r0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <8 x i16> %a, + ret <8 x i16> %b +} + +define arm_aapcs_vfpcc <8 x i16> @mov_int16_32() { +; CHECKLE-LABEL: mov_int16_32: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: movw r0, #257 +; CHECKLE-NEXT: movt r0, #256 +; CHECKLE-NEXT: vdup.32 q0, r0 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: mov_int16_32: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: movw r0, #256 +; CHECKBE-NEXT: movt r0, #257 +; CHECKBE-NEXT: vdup.32 q0, r0 +; CHECKBE-NEXT: bx lr +entry: + ret <8 x i16> +} + +define arm_aapcs_vfpcc <8 x i16> @xor_int16_32(<8 x i16> %a) { +; CHECKLE-LABEL: xor_int16_32: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: movw r0, #257 +; CHECKLE-NEXT: movt r0, #256 +; CHECKLE-NEXT: vdup.32 q1, r0 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int16_32: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: movw r0, #257 +; CHECKBE-NEXT: vrev64.16 q1, q0 +; CHECKBE-NEXT: movt r0, #256 +; CHECKBE-NEXT: vdup.32 q0, r0 +; CHECKBE-NEXT: veor q1, q1, q0 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <8 x i16> %a, + ret <8 x i16> %b +} + +define arm_aapcs_vfpcc <8 x i16> @mov_int16_64() { +; CHECK-LABEL: mov_int16_64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i64 q0, #0xff0000000000ff +; CHECK-NEXT: bx lr +entry: + ret <8 x i16> +} + +define arm_aapcs_vfpcc <8 x i16> @xor_int16_64(<8 x i16> %a) { +; CHECKLE-LABEL: xor_int16_64: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i64 q1, #0xff0000000000ff +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int16_64: +; CHECKBE: @ %bb.0: @ 
%entry +; CHECKBE-NEXT: vmov.i64 q1, #0xff0000000000ff +; CHECKBE-NEXT: vrev64.16 q2, q1 +; CHECKBE-NEXT: vrev64.16 q1, q0 +; CHECKBE-NEXT: veor q1, q1, q2 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <8 x i16> %a, + ret <8 x i16> %b +} + define arm_aapcs_vfpcc <4 x i32> @mov_int32_1() { ; CHECK-LABEL: mov_int32_1: ; CHECK: @ %bb.0: @ %entry @@ -97,6 +391,25 @@ entry: ret <4 x i32> } +define arm_aapcs_vfpcc <4 x i32> @xor_int32_1(<4 x i32> %a) { +; CHECKLE-LABEL: xor_int32_1: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i32 q1, #0x1 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int32_1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i32 q1, #0x1 +; CHECKBE-NEXT: vrev64.32 q2, q0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <4 x i32> %a, + ret <4 x i32> %b +} + define arm_aapcs_vfpcc <4 x i32> @mov_int32_256() { ; CHECK-LABEL: mov_int32_256: ; CHECK: @ %bb.0: @ %entry @@ -106,6 +419,25 @@ entry: ret <4 x i32> } +define arm_aapcs_vfpcc <4 x i32> @xor_int32_256(<4 x i32> %a) { +; CHECKLE-LABEL: xor_int32_256: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i32 q1, #0x100 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int32_256: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i32 q1, #0x100 +; CHECKBE-NEXT: vrev64.32 q2, q0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <4 x i32> %a, + ret <4 x i32> %b +} + define arm_aapcs_vfpcc <4 x i32> @mov_int32_65536() { ; CHECK-LABEL: mov_int32_65536: ; CHECK: @ %bb.0: @ %entry @@ -115,6 +447,25 @@ entry: ret <4 x i32> } +define arm_aapcs_vfpcc <4 x i32> @xor_int32_65536(<4 x i32> %a) { +; CHECKLE-LABEL: xor_int32_65536: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i32 q1, #0x10000 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int32_65536: +; 
CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i32 q1, #0x10000 +; CHECKBE-NEXT: vrev64.32 q2, q0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <4 x i32> %a, + ret <4 x i32> %b +} + define arm_aapcs_vfpcc <4 x i32> @mov_int32_16777216() { ; CHECK-LABEL: mov_int32_16777216: ; CHECK: @ %bb.0: @ %entry @@ -124,6 +475,25 @@ entry: ret <4 x i32> } +define arm_aapcs_vfpcc <4 x i32> @xor_int32_16777216(<4 x i32> %a) { +; CHECKLE-LABEL: xor_int32_16777216: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i32 q1, #0x1000000 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int32_16777216: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i32 q1, #0x1000000 +; CHECKBE-NEXT: vrev64.32 q2, q0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <4 x i32> %a, + ret <4 x i32> %b +} + define arm_aapcs_vfpcc <4 x i32> @mov_int32_16777217() { ; CHECK-LABEL: mov_int32_16777217: ; CHECK: @ %bb.0: @ %entry @@ -135,6 +505,29 @@ entry: ret <4 x i32> } +define arm_aapcs_vfpcc <4 x i32> @xor_int32_16777217(<4 x i32> %a) { +; CHECKLE-LABEL: xor_int32_16777217: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: movs r0, #1 +; CHECKLE-NEXT: movt r0, #256 +; CHECKLE-NEXT: vdup.32 q1, r0 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int32_16777217: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: movs r0, #1 +; CHECKBE-NEXT: vrev64.32 q1, q0 +; CHECKBE-NEXT: movt r0, #256 +; CHECKBE-NEXT: vdup.32 q0, r0 +; CHECKBE-NEXT: veor q1, q1, q0 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <4 x i32> %a, + ret <4 x i32> %b +} + define arm_aapcs_vfpcc <4 x i32> @mov_int32_17919() { ; CHECK-LABEL: mov_int32_17919: ; CHECK: @ %bb.0: @ %entry @@ -144,6 +537,25 @@ entry: ret <4 x i32> } +define arm_aapcs_vfpcc <4 x i32> @xor_int32_17919(<4 x i32> %a) { +; CHECKLE-LABEL: xor_int32_17919: +; 
CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i32 q1, #0x45ff +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int32_17919: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i32 q1, #0x45ff +; CHECKBE-NEXT: vrev64.32 q2, q0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <4 x i32> %a, + ret <4 x i32> %b +} + define arm_aapcs_vfpcc <4 x i32> @mov_int32_4587519() { ; CHECK-LABEL: mov_int32_4587519: ; CHECK: @ %bb.0: @ %entry @@ -153,6 +565,25 @@ entry: ret <4 x i32> } +define arm_aapcs_vfpcc <4 x i32> @xor_int32_4587519(<4 x i32> %a) { +; CHECKLE-LABEL: xor_int32_4587519: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i32 q1, #0x45ffff +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int32_4587519: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i32 q1, #0x45ffff +; CHECKBE-NEXT: vrev64.32 q2, q0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <4 x i32> %a, + ret <4 x i32> %b +} + define arm_aapcs_vfpcc <4 x i32> @mov_int32_m1() { ; CHECK-LABEL: mov_int32_m1: ; CHECK: @ %bb.0: @ %entry @@ -162,6 +593,24 @@ entry: ret <4 x i32> } +define arm_aapcs_vfpcc <4 x i32> @xor_int32_m1(<4 x i32> %a) { +; CHECKLE-LABEL: xor_int32_m1: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmvn q0, q0 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int32_m1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i8 q1, #0xff +; CHECKBE-NEXT: vrev64.32 q2, q0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <4 x i32> %a, + ret <4 x i32> %b +} + define arm_aapcs_vfpcc <4 x i32> @mov_int32_4294901760() { ; CHECK-LABEL: mov_int32_4294901760: ; CHECK: @ %bb.0: @ %entry @@ -171,6 +620,25 @@ entry: ret <4 x i32> } +define arm_aapcs_vfpcc <4 x i32> @xor_int32_4294901760(<4 x i32> %a) { +; CHECKLE-LABEL: xor_int32_4294901760: +; CHECKLE: @ 
%bb.0: @ %entry +; CHECKLE-NEXT: vmvn.i32 q1, #0xffff +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int32_4294901760: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmvn.i32 q1, #0xffff +; CHECKBE-NEXT: vrev64.32 q2, q0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <4 x i32> %a, + ret <4 x i32> %b +} + define arm_aapcs_vfpcc <4 x i32> @mov_int32_4278190335() { ; CHECK-LABEL: mov_int32_4278190335: ; CHECK: @ %bb.0: @ %entry @@ -182,6 +650,29 @@ entry: ret <4 x i32> } +define arm_aapcs_vfpcc <4 x i32> @xor_int32_4278190335(<4 x i32> %a) { +; CHECKLE-LABEL: xor_int32_4278190335: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: movs r0, #255 +; CHECKLE-NEXT: movt r0, #65280 +; CHECKLE-NEXT: vdup.32 q1, r0 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int32_4278190335: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: movs r0, #255 +; CHECKBE-NEXT: vrev64.32 q1, q0 +; CHECKBE-NEXT: movt r0, #65280 +; CHECKBE-NEXT: vdup.32 q0, r0 +; CHECKBE-NEXT: veor q1, q1, q0 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <4 x i32> %a, + ret <4 x i32> %b +} + define arm_aapcs_vfpcc <4 x i32> @mov_int32_4278255615() { ; CHECK-LABEL: mov_int32_4278255615: ; CHECK: @ %bb.0: @ %entry @@ -191,6 +682,25 @@ entry: ret <4 x i32> } +define arm_aapcs_vfpcc <4 x i32> @xor_int32_4278255615(<4 x i32> %a) { +; CHECKLE-LABEL: xor_int32_4278255615: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmvn.i32 q1, #0xff0000 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int32_4278255615: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmvn.i32 q1, #0xff0000 +; CHECKBE-NEXT: vrev64.32 q2, q0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <4 x i32> %a, + ret <4 x i32> %b +} + define arm_aapcs_vfpcc <4 x i32> @mov_int32_16908546() { ; CHECK-LABEL: 
mov_int32_16908546: ; CHECK: @ %bb.0: @ %entry @@ -201,15 +711,70 @@ entry: ret <4 x i32> } +define arm_aapcs_vfpcc <4 x i32> @xor_int32_16908546(<4 x i32> %a) { +; CHECKLE-LABEL: xor_int32_16908546: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: mov.w r0, #258 +; CHECKLE-NEXT: vdup.16 q1, r0 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int32_16908546: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: mov.w r0, #258 +; CHECKBE-NEXT: vrev64.32 q2, q0 +; CHECKBE-NEXT: vdup.16 q1, r0 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <4 x i32> %a, + ret <4 x i32> %b +} + +define arm_aapcs_vfpcc <4 x i32> @mov_int32_64() { +; CHECKLE-LABEL: mov_int32_64: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i64 q0, #0xff00ffff00ff00 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: mov_int32_64: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i64 q0, #0xff00ff0000ff00ff +; CHECKBE-NEXT: bx lr +entry: + ret <4 x i32> +} + +define arm_aapcs_vfpcc <4 x i32> @xor_int32_64(<4 x i32> %a) { +; CHECKLE-LABEL: xor_int32_64: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i64 q1, #0xff00ffff00ff00 +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int32_64: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i64 q1, #0xff00ff0000ff00ff +; CHECKBE-NEXT: vrev64.32 q2, q1 +; CHECKBE-NEXT: vrev64.32 q1, q0 +; CHECKBE-NEXT: veor q1, q1, q2 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <4 x i32> %a, + ret <4 x i32> %b +} + define arm_aapcs_vfpcc <2 x i64> @mov_int64_1() { ; CHECKLE-LABEL: mov_int64_1: ; CHECKLE: @ %bb.0: @ %entry -; CHECKLE-NEXT: adr r0, .LCPI20_0 +; CHECKLE-NEXT: adr r0, .LCPI50_0 ; CHECKLE-NEXT: vldrw.u32 q0, [r0] ; CHECKLE-NEXT: bx lr ; CHECKLE-NEXT: .p2align 4 ; CHECKLE-NEXT: @ %bb.1: -; CHECKLE-NEXT: .LCPI20_0: +; CHECKLE-NEXT: .LCPI50_0: ; CHECKLE-NEXT: .long 1 @ double 4.9406564584124654E-324 ; CHECKLE-NEXT: 
.long 0 ; CHECKLE-NEXT: .long 1 @ double 4.9406564584124654E-324 @@ -217,13 +782,13 @@ define arm_aapcs_vfpcc <2 x i64> @mov_int64_1() { ; ; CHECKBE-LABEL: mov_int64_1: ; CHECKBE: @ %bb.0: @ %entry -; CHECKBE-NEXT: adr r0, .LCPI20_0 +; CHECKBE-NEXT: adr r0, .LCPI50_0 ; CHECKBE-NEXT: vldrb.u8 q1, [r0] ; CHECKBE-NEXT: vrev64.8 q0, q1 ; CHECKBE-NEXT: bx lr ; CHECKBE-NEXT: .p2align 4 ; CHECKBE-NEXT: @ %bb.1: -; CHECKBE-NEXT: .LCPI20_0: +; CHECKBE-NEXT: .LCPI50_0: ; CHECKBE-NEXT: .long 0 @ double 4.9406564584124654E-324 ; CHECKBE-NEXT: .long 1 ; CHECKBE-NEXT: .long 0 @ double 4.9406564584124654E-324 @@ -232,13 +797,58 @@ entry: ret <2 x i64> } +define arm_aapcs_vfpcc <2 x i64> @xor_int64_1(<2 x i64> %a) { +; CHECKLE-LABEL: xor_int64_1: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: adr r0, .LCPI51_0 +; CHECKLE-NEXT: vldrw.u32 q1, [r0] +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; CHECKLE-NEXT: .p2align 4 +; CHECKLE-NEXT: @ %bb.1: +; CHECKLE-NEXT: .LCPI51_0: +; CHECKLE-NEXT: .long 1 @ 0x1 +; CHECKLE-NEXT: .long 0 @ 0x0 +; CHECKLE-NEXT: .long 1 @ 0x1 +; CHECKLE-NEXT: .long 0 @ 0x0 +; +; CHECKBE-LABEL: xor_int64_1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: adr r0, .LCPI51_0 +; CHECKBE-NEXT: vldrb.u8 q1, [r0] +; CHECKBE-NEXT: vrev64.8 q2, q1 +; CHECKBE-NEXT: veor q0, q0, q2 +; CHECKBE-NEXT: bx lr +; CHECKBE-NEXT: .p2align 4 +; CHECKBE-NEXT: @ %bb.1: +; CHECKBE-NEXT: .LCPI51_0: +; CHECKBE-NEXT: .long 0 @ 0x0 +; CHECKBE-NEXT: .long 1 @ 0x1 +; CHECKBE-NEXT: .long 0 @ 0x0 +; CHECKBE-NEXT: .long 1 @ 0x1 +entry: + %b = xor <2 x i64> %a, + ret <2 x i64> %b +} + define arm_aapcs_vfpcc <2 x i64> @mov_int64_ff() { ; CHECK-LABEL: mov_int64_ff: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i64 q0, #0xff ; CHECK-NEXT: bx lr entry: - ret <2 x i64> < i64 255, i64 255 > + ret <2 x i64> +} + +define arm_aapcs_vfpcc <2 x i64> @xor_int64_ff(<2 x i64> %a) { +; CHECK-LABEL: xor_int64_ff: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i64 q1, #0xff +; CHECK-NEXT: veor q0, q0, 
q1 +; CHECK-NEXT: bx lr +entry: + %b = xor <2 x i64> %a, + ret <2 x i64> %b } define arm_aapcs_vfpcc <2 x i64> @mov_int64_m1() { @@ -247,7 +857,23 @@ define arm_aapcs_vfpcc <2 x i64> @mov_int64_m1() { ; CHECK-NEXT: vmov.i8 q0, #0xff ; CHECK-NEXT: bx lr entry: - ret <2 x i64> < i64 -1, i64 -1 > + ret <2 x i64> +} + +define arm_aapcs_vfpcc <2 x i64> @xor_int64_m1(<2 x i64> %a) { +; CHECKLE-LABEL: xor_int64_m1: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmvn q0, q0 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int64_m1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i8 q1, #0xff +; CHECKBE-NEXT: veor q0, q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <2 x i64> %a, + ret <2 x i64> %b } define arm_aapcs_vfpcc <2 x i64> @mov_int64_ff0000ff0000ffff() { @@ -256,18 +882,29 @@ define arm_aapcs_vfpcc <2 x i64> @mov_int64_ff0000ff0000ffff() { ; CHECK-NEXT: vmov.i64 q0, #0xff0000ff0000ffff ; CHECK-NEXT: bx lr entry: - ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 > + ret <2 x i64> +} + +define arm_aapcs_vfpcc <2 x i64> @xor_int64_ff0000ff0000ffff(<2 x i64> %a) { +; CHECK-LABEL: xor_int64_ff0000ff0000ffff: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i64 q1, #0xff0000ff0000ffff +; CHECK-NEXT: veor q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %b = xor <2 x i64> %a, + ret <2 x i64> %b } define arm_aapcs_vfpcc <2 x i64> @mov_int64_f_0() { ; CHECKLE-LABEL: mov_int64_f_0: ; CHECKLE: @ %bb.0: @ %entry -; CHECKLE-NEXT: adr r0, .LCPI24_0 +; CHECKLE-NEXT: adr r0, .LCPI58_0 ; CHECKLE-NEXT: vldrw.u32 q0, [r0] ; CHECKLE-NEXT: bx lr ; CHECKLE-NEXT: .p2align 4 ; CHECKLE-NEXT: @ %bb.1: -; CHECKLE-NEXT: .LCPI24_0: +; CHECKLE-NEXT: .LCPI58_0: ; CHECKLE-NEXT: .long 255 @ double 1.2598673968951787E-321 ; CHECKLE-NEXT: .long 0 ; CHECKLE-NEXT: .long 0 @ double 0 @@ -275,19 +912,53 @@ define arm_aapcs_vfpcc <2 x i64> @mov_int64_f_0() { ; ; CHECKBE-LABEL: mov_int64_f_0: ; CHECKBE: @ %bb.0: @ %entry -; CHECKBE-NEXT: adr r0, .LCPI24_0 +; CHECKBE-NEXT: adr r0, .LCPI58_0 
; CHECKBE-NEXT: vldrb.u8 q1, [r0] ; CHECKBE-NEXT: vrev64.8 q0, q1 ; CHECKBE-NEXT: bx lr ; CHECKBE-NEXT: .p2align 4 ; CHECKBE-NEXT: @ %bb.1: -; CHECKBE-NEXT: .LCPI24_0: +; CHECKBE-NEXT: .LCPI58_0: ; CHECKBE-NEXT: .long 0 @ double 1.2598673968951787E-321 ; CHECKBE-NEXT: .long 255 ; CHECKBE-NEXT: .long 0 @ double 0 ; CHECKBE-NEXT: .long 0 entry: - ret <2 x i64> < i64 255, i64 0 > + ret <2 x i64> +} + +define arm_aapcs_vfpcc <2 x i64> @xor_int64_f_0(<2 x i64> %a) { +; CHECKLE-LABEL: xor_int64_f_0: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: adr r0, .LCPI59_0 +; CHECKLE-NEXT: vldrw.u32 q1, [r0] +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; CHECKLE-NEXT: .p2align 4 +; CHECKLE-NEXT: @ %bb.1: +; CHECKLE-NEXT: .LCPI59_0: +; CHECKLE-NEXT: .long 255 @ 0xff +; CHECKLE-NEXT: .long 0 @ 0x0 +; CHECKLE-NEXT: .long 0 @ 0x0 +; CHECKLE-NEXT: .long 0 @ 0x0 +; +; CHECKBE-LABEL: xor_int64_f_0: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: adr r0, .LCPI59_0 +; CHECKBE-NEXT: vldrb.u8 q1, [r0] +; CHECKBE-NEXT: vrev64.8 q2, q1 +; CHECKBE-NEXT: veor q0, q0, q2 +; CHECKBE-NEXT: bx lr +; CHECKBE-NEXT: .p2align 4 +; CHECKBE-NEXT: @ %bb.1: +; CHECKBE-NEXT: .LCPI59_0: +; CHECKBE-NEXT: .long 0 @ 0x0 +; CHECKBE-NEXT: .long 255 @ 0xff +; CHECKBE-NEXT: .long 0 @ 0x0 +; CHECKBE-NEXT: .long 0 @ 0x0 +entry: + %b = xor <2 x i64> %a, + ret <2 x i64> %b } define arm_aapcs_vfpcc <16 x i8> @mov_int64_0f000f0f() { @@ -304,6 +975,26 @@ entry: ret <16 x i8> } +define arm_aapcs_vfpcc <16 x i8> @xor_int64_0f000f0f(<16 x i8> %a) { +; CHECKLE-LABEL: xor_int64_0f000f0f: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i64 q1, #0xff000000ff00ff +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int64_0f000f0f: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i64 q1, #0xff00ff000000ff00 +; CHECKBE-NEXT: vrev64.8 q2, q1 +; CHECKBE-NEXT: vrev64.8 q1, q0 +; CHECKBE-NEXT: veor q1, q1, q2 +; CHECKBE-NEXT: vrev64.8 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <16 x 
i8> %a, + ret <16 x i8> %b +} + define arm_aapcs_vfpcc <8 x i16> @mov_int64_ff00ffff() { ; CHECKLE-LABEL: mov_int64_ff00ffff: ; CHECKLE: @ %bb.0: @ %entry @@ -318,6 +1009,26 @@ entry: ret <8 x i16> } +define arm_aapcs_vfpcc <8 x i16> @xor_int64_ff00ffff(<8 x i16> %a) { +; CHECKLE-LABEL: xor_int64_ff00ffff: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i64 q1, #0xffffffff0000ffff +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int64_ff00ffff: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i64 q1, #0xffff0000ffffffff +; CHECKBE-NEXT: vrev64.16 q2, q1 +; CHECKBE-NEXT: vrev64.16 q1, q0 +; CHECKBE-NEXT: veor q1, q1, q2 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <8 x i16> %a, + ret <8 x i16> %b +} + define arm_aapcs_vfpcc <16 x i8> @mov_int64_0f0f0f0f0f0f0f0f() { ; CHECKLE-LABEL: mov_int64_0f0f0f0f0f0f0f0f: ; CHECKLE: @ %bb.0: @ %entry @@ -332,6 +1043,27 @@ entry: ret <16 x i8> } +; FIXME: This is incorrect for BE +define arm_aapcs_vfpcc <16 x i8> @xor_int64_0f0f0f0f0f0f0f0f(<16 x i8> %a) { +; CHECKLE-LABEL: xor_int64_0f0f0f0f0f0f0f0f: +; CHECKLE: @ %bb.0: @ %entry +; CHECKLE-NEXT: vmov.i16 q1, #0xff +; CHECKLE-NEXT: veor q0, q0, q1 +; CHECKLE-NEXT: bx lr +; +; CHECKBE-LABEL: xor_int64_0f0f0f0f0f0f0f0f: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i16 q1, #0xff +; CHECKBE-NEXT: vrev64.8 q2, q0 +; CHECKBE-NEXT: vrev16.8 q1, q1 +; CHECKBE-NEXT: veor q1, q2, q1 +; CHECKBE-NEXT: vrev64.8 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = xor <16 x i8> %a, + ret <16 x i8> %b +} + define arm_aapcs_vfpcc <4 x float> @mov_float_1() { ; CHECK-LABEL: mov_float_1: ; CHECK: @ %bb.0: @ %entry @@ -342,6 +1074,19 @@ entry: ret <4 x float> } +define arm_aapcs_vfpcc <4 x float> @fadd_float_1(<4 x float> %a) { +; CHECKBE-LABEL: fadd_float_1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.f32 q1, #1.000000e+00 +; CHECKBE-NEXT: vrev64.32 q2, q0 +; CHECKBE-NEXT: vadd.f32 q1, q2, q1 +; CHECKBE-NEXT: vrev64.32 
q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = fadd <4 x float> %a, + ret <4 x float> %b +} + define arm_aapcs_vfpcc <4 x float> @mov_float_m3() { ; CHECK-LABEL: mov_float_m3: ; CHECK: @ %bb.0: @ %entry @@ -353,35 +1098,72 @@ entry: ret <4 x float> } +define arm_aapcs_vfpcc <4 x float> @fadd_float_m3(<4 x float> %a) { +; CHECKBE-LABEL: fadd_float_m3: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.f32 q1, #-3.000000e+00 +; CHECKBE-NEXT: vrev64.32 q2, q0 +; CHECKBE-NEXT: vadd.f32 q1, q2, q1 +; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = fadd <4 x float> %a, + ret <4 x float> %b +} + define arm_aapcs_vfpcc <8 x half> @mov_float16_1() { ; CHECK-LABEL: mov_float16_1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i16 q0, #0x3c00 ; CHECK-NEXT: bx lr - entry: ret <8 x half> } +define arm_aapcs_vfpcc <8 x half> @fadd_float16_1(<8 x half> %a) { +; CHECKBE-LABEL: fadd_float16_1: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i16 q1, #0x3c00 +; CHECKBE-NEXT: vrev64.16 q2, q0 +; CHECKBE-NEXT: vadd.f16 q1, q2, q1 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = fadd <8 x half> %a, + ret <8 x half> %b +} + define arm_aapcs_vfpcc <8 x half> @mov_float16_m3() { ; CHECK-LABEL: mov_float16_m3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i16 q0, #0xc200 ; CHECK-NEXT: bx lr - entry: ret <8 x half> } +define arm_aapcs_vfpcc <8 x half> @fadd_float16_m3(<8 x half> %a) { +; CHECKBE-LABEL: fadd_float16_m3: +; CHECKBE: @ %bb.0: @ %entry +; CHECKBE-NEXT: vmov.i16 q1, #0xc200 +; CHECKBE-NEXT: vrev64.16 q2, q0 +; CHECKBE-NEXT: vadd.f16 q1, q2, q1 +; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: bx lr +entry: + %b = fadd <8 x half> %a, + ret <8 x half> %b +} + define arm_aapcs_vfpcc <2 x double> @mov_double_1() { ; CHECKLE-LABEL: mov_double_1: ; CHECKLE: @ %bb.0: @ %entry -; CHECKLE-NEXT: adr r0, .LCPI32_0 +; CHECKLE-NEXT: adr r0, .LCPI74_0 ; CHECKLE-NEXT: vldrw.u32 q0, [r0] ; CHECKLE-NEXT: bx lr ; CHECKLE-NEXT: .p2align 4 ; 
CHECKLE-NEXT: @ %bb.1: -; CHECKLE-NEXT: .LCPI32_0: +; CHECKLE-NEXT: .LCPI74_0: ; CHECKLE-NEXT: .long 0 @ double 1 ; CHECKLE-NEXT: .long 1072693248 ; CHECKLE-NEXT: .long 0 @ double 1 @@ -389,13 +1171,13 @@ define arm_aapcs_vfpcc <2 x double> @mov_double_1() { ; ; CHECKBE-LABEL: mov_double_1: ; CHECKBE: @ %bb.0: @ %entry -; CHECKBE-NEXT: adr r0, .LCPI32_0 +; CHECKBE-NEXT: adr r0, .LCPI74_0 ; CHECKBE-NEXT: vldrb.u8 q1, [r0] ; CHECKBE-NEXT: vrev64.8 q0, q1 ; CHECKBE-NEXT: bx lr ; CHECKBE-NEXT: .p2align 4 ; CHECKBE-NEXT: @ %bb.1: -; CHECKBE-NEXT: .LCPI32_0: +; CHECKBE-NEXT: .LCPI74_0: ; CHECKBE-NEXT: .long 1072693248 @ double 1 ; CHECKBE-NEXT: .long 0 ; CHECKBE-NEXT: .long 1072693248 @ double 1 From 43b88851cefe68645aa59b1fccc8390a8a31f469 Mon Sep 17 00:00:00 2001 From: Max Winkler Date: Sat, 24 Aug 2024 12:25:46 -0700 Subject: [PATCH 415/426] [clang-cl] [AST] Reapply #102848 Fix placeholder return type name mangling for MSVC 1920+ / VS2019+ (#104722) Reapply https://github.com/llvm/llvm-project/pull/102848. The description in this PR will detail the changes from the reverted original PR above. For `auto&&` return types that can partake in reference collapsing we weren't properly handling that mangling that can arise. When collapsing occurs an inner reference is created with the collapsed reference type. If we return `int&` from such a function then an inner reference of `int&` is created within the `auto&&` return type. `getPointeeType` on a reference type goes through all inner references before returning the pointee type which ends up being a builtin type, `int`, which is unexpected. We can use `getPointeeTypeAsWritten` to get the `AutoType` as expected however for the instantiated template declaration reference collapsing already occurred on the return type. This means `auto&&` is turned into `auto&` in our example above. We end up mangling an lvalue reference type. 
This is unintended as MSVC mangles on the declaration of the return type, `auto&&` in this case, which is treated as an rvalue reference. ``` template auto&& AutoReferenceCollapseT(int& x) { return static_cast(x); } void test() { int x = 1; auto&& rref = AutoReferenceCollapseT(x); // "??$AutoReferenceCollapseT@X@@YA$$QEA_PAEAH@Z" // Mangled as an rvalue reference to auto } ``` If we are mangling a template with a placeholder return type we want to get the first template declaration and use its return type to do the mangling of any instantiations. This fixes the bug reported in the original PR that caused the revert with libcxx `std::variant`. I also tested locally with libcxx and the following test code which fails in the original PR but now works in this PR. ``` #include void test() { std::variant v{ 1 }; int& r = std::get<0>(v); (void)r; } ``` --- clang/docs/ReleaseNotes.rst | 2 + clang/lib/AST/MicrosoftMangle.cpp | 170 +++++++- .../test/CodeGenCXX/mangle-ms-auto-return.cpp | 383 ++++++++++++++++++ .../mangle-ms-auto-templates-memptrs.cpp | 12 +- .../mangle-ms-auto-templates-nullptr.cpp | 2 +- .../CodeGenCXX/mangle-ms-auto-templates.cpp | 6 +- 6 files changed, 556 insertions(+), 19 deletions(-) create mode 100644 clang/test/CodeGenCXX/mangle-ms-auto-return.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 0ced2f779f7058..6e1db41a55cbe0 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -77,6 +77,8 @@ C++ Specific Potentially Breaking Changes ABI Changes in This Version --------------------------- +- Fixed Microsoft name mangling of placeholder, auto and decltype(auto), return types for MSVC 1920+. This change resolves incompatibilities with code compiled by MSVC 1920+ but will introduce incompatibilities with code compiled by earlier versions of Clang unless such code is built with the compiler option -fms-compatibility-version=19.14 to imitate the MSVC 1914 mangling behavior. 
+ AST Dumping Potentially Breaking Changes ---------------------------------------- diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index ed8d1cf1b98dd8..b539681984ef7c 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -408,6 +408,8 @@ class MicrosoftCXXNameMangler { void mangleSourceName(StringRef Name); void mangleNestedName(GlobalDecl GD); + void mangleAutoReturnType(QualType T, QualifierMangleMode QMM); + private: bool isStructorDecl(const NamedDecl *ND) const { return ND == Structor || getStructor(ND) == Structor; @@ -477,6 +479,11 @@ class MicrosoftCXXNameMangler { SourceRange Range); void mangleObjCKindOfType(const ObjCObjectType *T, Qualifiers Quals, SourceRange Range); + + void mangleAutoReturnType(const MemberPointerType *T, Qualifiers Quals); + void mangleAutoReturnType(const PointerType *T, Qualifiers Quals); + void mangleAutoReturnType(const LValueReferenceType *T, Qualifiers Quals); + void mangleAutoReturnType(const RValueReferenceType *T, Qualifiers Quals); }; } @@ -2494,6 +2501,57 @@ void MicrosoftCXXNameMangler::mangleAddressSpaceType(QualType T, mangleArtificialTagType(TagTypeKind::Struct, ASMangling, {"__clang"}); } +void MicrosoftCXXNameMangler::mangleAutoReturnType(QualType T, + QualifierMangleMode QMM) { + assert(getASTContext().getLangOpts().isCompatibleWithMSVC( + LangOptions::MSVC2019) && + "Cannot mangle MSVC 2017 auto return types!"); + + if (isa(T)) { + const auto *AT = T->getContainedAutoType(); + Qualifiers Quals = T.getLocalQualifiers(); + + if (QMM == QMM_Result) + Out << '?'; + if (QMM != QMM_Drop) + mangleQualifiers(Quals, false); + Out << (AT->isDecltypeAuto() ? 
"_T" : "_P"); + return; + } + + T = T.getDesugaredType(getASTContext()); + Qualifiers Quals = T.getLocalQualifiers(); + + switch (QMM) { + case QMM_Drop: + case QMM_Result: + break; + case QMM_Mangle: + mangleQualifiers(Quals, false); + break; + default: + llvm_unreachable("QMM_Escape unexpected"); + } + + const Type *ty = T.getTypePtr(); + switch (ty->getTypeClass()) { + case Type::MemberPointer: + mangleAutoReturnType(cast(ty), Quals); + break; + case Type::Pointer: + mangleAutoReturnType(cast(ty), Quals); + break; + case Type::LValueReference: + mangleAutoReturnType(cast(ty), Quals); + break; + case Type::RValueReference: + mangleAutoReturnType(cast(ty), Quals); + break; + default: + llvm_unreachable("Invalid type expected"); + } +} + void MicrosoftCXXNameMangler::mangleType(QualType T, SourceRange Range, QualifierMangleMode QMM) { // Don't use the canonical types. MSVC includes things like 'const' on @@ -2907,17 +2965,60 @@ void MicrosoftCXXNameMangler::mangleFunctionType(const FunctionType *T, // can differ by their calling convention and are typically deduced. So // we make sure that this type gets mangled properly. mangleType(ResultType, Range, QMM_Result); - } else if (const auto *AT = dyn_cast_or_null( - ResultType->getContainedAutoType())) { - Out << '?'; - mangleQualifiers(ResultType.getLocalQualifiers(), /*IsMember=*/false); - Out << '?'; + } else if (IsInLambda) { + if (const auto *AT = ResultType->getContainedAutoType()) { + assert(AT->getKeyword() == AutoTypeKeyword::Auto && + "should only need to mangle auto!"); + (void)AT; + Out << '?'; + mangleQualifiers(ResultType.getLocalQualifiers(), /*IsMember=*/false); + Out << '?'; + mangleSourceName(""); + Out << '@'; + } else { + Out << '@'; + } + } else if (const auto *AT = ResultType->getContainedAutoType()) { assert(AT->getKeyword() != AutoTypeKeyword::GNUAutoType && "shouldn't need to mangle __auto_type!"); - mangleSourceName(AT->isDecltypeAuto() ? 
"" : ""); - Out << '@'; - } else if (IsInLambda) { - Out << '@'; + + // If we have any pointer types with the clang address space extension + // then defer to the custom clang mangling to keep backwards + // compatibility. See `mangleType(const PointerType *T, Qualifiers Quals, + // SourceRange Range)` for details. + auto UseClangMangling = [](QualType ResultType) { + QualType T = ResultType; + while (isa(T.getTypePtr())) { + T = T->getPointeeType(); + if (T.getQualifiers().hasAddressSpace()) + return true; + } + return false; + }; + + if (getASTContext().getLangOpts().isCompatibleWithMSVC( + LangOptions::MSVC2019) && + !UseClangMangling(ResultType)) { + if (D && !D->getPrimaryTemplate()) { + Out << '@'; + } else { + if (D && D->getPrimaryTemplate()) { + const FunctionProtoType *FPT = D->getPrimaryTemplate() + ->getTemplatedDecl() + ->getFirstDecl() + ->getType() + ->castAs(); + ResultType = FPT->getReturnType(); + } + mangleAutoReturnType(ResultType, QMM_Result); + } + } else { + Out << '?'; + mangleQualifiers(ResultType.getLocalQualifiers(), /*IsMember=*/false); + Out << '?'; + mangleSourceName(AT->isDecltypeAuto() ? 
"" : ""); + Out << '@'; + } } else { if (ResultType->isVoidType()) ResultType = ResultType.getUnqualifiedType(); @@ -4220,6 +4321,57 @@ void MicrosoftMangleContextImpl::mangleStringLiteral(const StringLiteral *SL, Mangler.getStream() << '@'; } +void MicrosoftCXXNameMangler::mangleAutoReturnType(const MemberPointerType *T, + Qualifiers Quals) { + QualType PointeeType = T->getPointeeType(); + manglePointerCVQualifiers(Quals); + manglePointerExtQualifiers(Quals, PointeeType); + if (const FunctionProtoType *FPT = PointeeType->getAs()) { + Out << '8'; + mangleName(T->getClass()->castAs()->getDecl()); + mangleFunctionType(FPT, nullptr, true); + } else { + mangleQualifiers(PointeeType.getQualifiers(), true); + mangleName(T->getClass()->castAs()->getDecl()); + mangleAutoReturnType(PointeeType, QMM_Drop); + } +} + +void MicrosoftCXXNameMangler::mangleAutoReturnType(const PointerType *T, + Qualifiers Quals) { + QualType PointeeType = T->getPointeeType(); + assert(!PointeeType.getQualifiers().hasAddressSpace() && + "Unexpected address space mangling required"); + + manglePointerCVQualifiers(Quals); + manglePointerExtQualifiers(Quals, PointeeType); + + if (const FunctionProtoType *FPT = PointeeType->getAs()) { + Out << '6'; + mangleFunctionType(FPT); + } else { + mangleAutoReturnType(PointeeType, QMM_Mangle); + } +} + +void MicrosoftCXXNameMangler::mangleAutoReturnType(const LValueReferenceType *T, + Qualifiers Quals) { + QualType PointeeType = T->getPointeeType(); + assert(!Quals.hasConst() && !Quals.hasVolatile() && "unexpected qualifier!"); + Out << 'A'; + manglePointerExtQualifiers(Quals, PointeeType); + mangleAutoReturnType(PointeeType, QMM_Mangle); +} + +void MicrosoftCXXNameMangler::mangleAutoReturnType(const RValueReferenceType *T, + Qualifiers Quals) { + QualType PointeeType = T->getPointeeType(); + assert(!Quals.hasConst() && !Quals.hasVolatile() && "unexpected qualifier!"); + Out << "$$Q"; + manglePointerExtQualifiers(Quals, PointeeType); + 
mangleAutoReturnType(PointeeType, QMM_Mangle); +} + MicrosoftMangleContext *MicrosoftMangleContext::create(ASTContext &Context, DiagnosticsEngine &Diags, bool IsAux) { diff --git a/clang/test/CodeGenCXX/mangle-ms-auto-return.cpp b/clang/test/CodeGenCXX/mangle-ms-auto-return.cpp new file mode 100644 index 00000000000000..5b18dcc0820ee6 --- /dev/null +++ b/clang/test/CodeGenCXX/mangle-ms-auto-return.cpp @@ -0,0 +1,383 @@ +// RUN: %clang_cc1 -std=c++17 -fms-compatibility-version=19.20 -emit-llvm %s -o - -fms-extensions -fdelayed-template-parsing -triple=x86_64-pc-windows-msvc | FileCheck %s + +struct StructA {}; + +template +auto AutoT() { return T(); } + +template +const auto AutoConstT() { return T(); } + +template +volatile auto AutoVolatileT() { return T(); } + +template +const volatile auto AutoConstVolatileT() { return T(); } + +// The qualifiers of the return type should always be emitted even for void types. +// Void types usually have their qualifers stripped in the mangled name for MSVC ABI. 
+void test_template_auto_void() { + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@X@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@$$CBX@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@$$CCX@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@$$CDX@@YA?A_PXZ" + + AutoConstT(); + // CHECK: call {{.*}} @"??$AutoConstT@X@@YA?B_PXZ" + + AutoVolatileT(); + // CHECK: call {{.*}} @"??$AutoVolatileT@X@@YA?C_PXZ" + + AutoConstVolatileT(); + // CHECK: call {{.*}} @"??$AutoConstVolatileT@X@@YA?D_PXZ" +} + +void test_template_auto_int() { + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@H@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@$$CBH@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@$$CCH@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@$$CDH@@YA?A_PXZ" + + AutoConstT(); + // CHECK: call {{.*}} @"??$AutoConstT@H@@YA?B_PXZ" + + AutoVolatileT(); + // CHECK: call {{.*}} @"??$AutoVolatileT@H@@YA?C_PXZ" + + AutoConstVolatileT(); + // CHECK: call {{.*}} @"??$AutoConstVolatileT@H@@YA?D_PXZ" +} + +void test_template_auto_struct() { + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@UStructA@@@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@$$CBUStructA@@@@YA?A_PXZ" + + AutoConstT(); + // CHECK: call {{.*}} @"??$AutoConstT@UStructA@@@@YA?B_PXZ" + + AutoVolatileT(); + // CHECK: call {{.*}} @"??$AutoVolatileT@UStructA@@@@YA?C_PXZ" + + AutoConstVolatileT(); + // CHECK: call {{.*}} @"??$AutoConstVolatileT@UStructA@@@@YA?D_PXZ" +} + +void test_template_auto_ptr() { + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@PEAH@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@PEBH@@YA?A_PXZ" + + AutoT(); + // CHECK: call {{.*}} @"??$AutoT@QEBH@@YA?A_PXZ" + + AutoConstT(); + // CHECK: call {{.*}} @"??$AutoConstT@PEAH@@YA?B_PXZ" + + AutoVolatileT(); + // CHECK: call {{.*}} @"??$AutoVolatileT@PEAH@@YA?C_PXZ" + + AutoConstVolatileT(); + // CHECK: call {{.*}} @"??$AutoConstVolatileT@PEAH@@YA?D_PXZ" +} + +template +auto* PtrAutoT() 
{ return T(); } + +template +const auto* PtrAutoConstT() { return T(); } + +template +volatile auto* PtrAutoVolatileT() { return T(); } + +template +const volatile auto* PtrAutoConstVolatileT() { return T(); } + +void test_template_ptr_auto() { + PtrAutoT(); + // CHECK: call {{.*}} @"??$PtrAutoT@PEAH@@YAPEA_PXZ" + + PtrAutoT(); + // CHECK: call {{.*}} @"??$PtrAutoT@PEBH@@YAPEA_PXZ" + + PtrAutoT(); + // CHECK: call {{.*}} @"??$PtrAutoT@QEBH@@YAPEA_PXZ" + + PtrAutoConstT(); + // CHECK: call {{.*}} @"??$PtrAutoConstT@PEAH@@YAPEB_PXZ" + + PtrAutoVolatileT(); + // CHECK: call {{.*}} @"??$PtrAutoVolatileT@PEAH@@YAPEC_PXZ" + + PtrAutoConstVolatileT(); + // CHECK: call {{.*}} @"??$PtrAutoConstVolatileT@PEAH@@YAPED_PXZ" +} + +int func_int(); +const int func_constint(); +void func_void(); +int* func_intptr(); + +template +auto (*FuncPtrAutoT())() { return v; } + +void test_template_func_ptr_auto() { + FuncPtrAutoT(); + // CHECK: call {{.*}} @"??$FuncPtrAutoT@P6AHXZ$1?func_int@@YAHXZ@@YAP6A?A_PXZXZ" + + FuncPtrAutoT(); + // CHECK: call {{.*}} @"??$FuncPtrAutoT@P6A?BHXZ$1?func_constint@@YA?BHXZ@@YAP6A?A_PXZXZ" + + FuncPtrAutoT(); + // CHECK: call {{.*}} @"??$FuncPtrAutoT@P6AXXZ$1?func_void@@YAXXZ@@YAP6A?A_PXZXZ" + + FuncPtrAutoT(); + // CHECK: call {{.*}} @"??$FuncPtrAutoT@P6APEAHXZ$1?func_intptr@@YAPEAHXZ@@YAP6A?A_PXZXZ" +} + +template +auto& RefAutoT(T& x) { return x; } + +template +const auto& ConstRefAutoT(T& x) { return x; } + +template +auto&& RRefAutoT(T& x) { return static_cast(x); } + +void test_template_ref_auto() { + int x; + + RefAutoT(x); + // CHECK: call {{.*}} @"??$RefAutoT@H@@YAAEA_PAEAH@Z" + + ConstRefAutoT(x); + // CHECK: call {{.*}} @"??$ConstRefAutoT@H@@YAAEB_PAEAH@Z" + + RRefAutoT(x); + // CHECK: call {{.*}} @"??$RRefAutoT@H@@YA$$QEA_PAEAH@Z" +} + +template +decltype(auto) DecltypeAutoT() { return T(); } + +template +decltype(auto) DecltypeAutoT2(T& x) { return static_cast(x); } + +void test_template_decltypeauto() { + DecltypeAutoT(); + // CHECK: call 
{{.*}} @"??$DecltypeAutoT@X@@YA?A_TXZ" + + DecltypeAutoT(); + // CHECK: call {{.*}} @"??$DecltypeAutoT@$$CBX@@YA?A_TXZ" + + DecltypeAutoT(); + // CHECK: call {{.*}} @"??$DecltypeAutoT@$$CCX@@YA?A_TXZ" + + DecltypeAutoT(); + // CHECK: call {{.*}} @"??$DecltypeAutoT@$$CDX@@YA?A_TXZ" + + DecltypeAutoT(); + // CHECK: call {{.*}} @"??$DecltypeAutoT@H@@YA?A_TXZ" + + DecltypeAutoT(); + // CHECK: call {{.*}} @"??$DecltypeAutoT@$$CBH@@YA?A_TXZ" + + DecltypeAutoT(); + // CHECK: call {{.*}} @"??$DecltypeAutoT@$$CCH@@YA?A_TXZ" + + DecltypeAutoT(); + // CHECK: call {{.*}} @"??$DecltypeAutoT@$$CDH@@YA?A_TXZ" + + int x; + + DecltypeAutoT2(x); + // CHECK: call {{.*}} @"??$DecltypeAutoT2@H@@YA?A_TAEAH@Z" +} + +// Still want to use clang's custom mangling for lambdas to keep backwards compatibility until +// MSVC lambda name mangling has been deciphered. +void test_lambda() { + auto lambdaIntRetAuto = []() { return 0; }; + lambdaIntRetAuto(); + // CHECK: call {{.*}} @"??R@?0??test_lambda@@YAXXZ@QEBA?A?@@XZ" + + auto lambdaIntRet = []() -> int { return 0; }; + lambdaIntRet(); + // CHECK: call {{.*}} @"??R@?0??test_lambda@@YAXXZ@QEBA@XZ" + + auto lambdaGenericIntIntRetAuto = [](auto a) { return a; }; + lambdaGenericIntIntRetAuto(0); + // CHECK: call {{.*}} @"??$?RH@@?0??test_lambda@@YAXXZ@QEBA?A?@@H@Z" +} + +auto TestTrailingInt() -> int { + return 0; +} + +auto TestTrailingConstVolatileVoid() -> const volatile void { +} + +auto TestTrailingStructA() -> StructA { + return StructA{}; +} + +void test_trailing_return() { + TestTrailingInt(); + // CHECK: call {{.*}} @"?TestTrailingInt@@YAHXZ" + + TestTrailingConstVolatileVoid(); + // CHECK: call {{.*}} @"?TestTrailingConstVolatileVoid@@YAXXZ" + + TestTrailingStructA(); + // CHECK: call {{.*}} @"?TestTrailingStructA@@YA?AUStructA@@XZ" +} + +auto TestNonTemplateAutoInt() { + return 0; +} + +auto TestNonTemplateAutoVoid() { + return; +} + +auto TestNonTemplateAutoStructA() { + return StructA{}; +} + +const auto TestNonTemplateConstAutoInt() 
{ + return 0; +} + +const auto TestNonTemplateConstAutoVoid() { + return; +} + +const auto TestNonTemplateConstAutoStructA() { + return StructA{}; +} + +void test_nontemplate_auto() { + TestNonTemplateAutoInt(); + // CHECK: call {{.*}} @"?TestNonTemplateAutoInt@@YA@XZ" + + TestNonTemplateAutoVoid(); + // CHECK: call {{.*}} @"?TestNonTemplateAutoVoid@@YA@XZ" + + TestNonTemplateAutoStructA(); + // CHECK: call {{.*}} @"?TestNonTemplateAutoStructA@@YA@XZ" + + TestNonTemplateConstAutoInt(); + // CHECK: call {{.*}} @"?TestNonTemplateConstAutoInt@@YA@XZ" + + TestNonTemplateConstAutoVoid(); + // CHECK: call {{.*}} @"?TestNonTemplateConstAutoVoid@@YA@XZ" + + TestNonTemplateConstAutoStructA(); + // CHECK: call {{.*}} @"?TestNonTemplateConstAutoStructA@@YA@XZ" +} + +decltype(auto) TestNonTemplateDecltypeAutoInt() { + return 0; +} + +decltype(auto) TestNonTemplateDecltypeAutoVoid() { + return; +} + +decltype(auto) TestNonTemplateDecltypeAutoStructA() { + return StructA{}; +} + +void test_nontemplate_decltypeauto() { + TestNonTemplateDecltypeAutoInt(); + // CHECK: call {{.*}} @"?TestNonTemplateDecltypeAutoInt@@YA@XZ" + + TestNonTemplateDecltypeAutoVoid(); + // CHECK: call {{.*}} @"?TestNonTemplateDecltypeAutoVoid@@YA@XZ" + + TestNonTemplateDecltypeAutoStructA(); + // CHECK: call {{.*}} @"?TestNonTemplateDecltypeAutoStructA@@YA@XZ" +} + +struct StructB { + int x; +}; + +template +auto StructB::* AutoMemberDataPtrT(T x) { return x; } + +template +const auto StructB::* AutoConstMemberDataPtrT(T x) { return x; } + +void test_template_auto_member_data_ptr() { + AutoMemberDataPtrT(&StructB::x); + // CHECK: call {{.*}} @"??$AutoMemberDataPtrT@PEQStructB@@H@@YAPEQStructB@@_PPEQ0@H@Z" + + AutoConstMemberDataPtrT(&StructB::x); + // CHECK: call {{.*}} @"??$AutoConstMemberDataPtrT@PEQStructB@@H@@YAPERStructB@@_PPEQ0@H@Z" +} + +struct StructC { + void test() {} +}; + +struct StructD { + const int test() { return 0; } +}; + +template +auto (StructC::*AutoMemberFuncPtrT(T x))() { return x; } 
+ +template +const auto (StructD::*AutoConstMemberFuncPtrT(T x))() { return x; } + +void test_template_auto_member_func_ptr() { + AutoMemberFuncPtrT(&StructC::test); + // CHECK: call {{.*}} @"??$AutoMemberFuncPtrT@P8StructC@@EAAXXZ@@YAP8StructC@@EAA?A_PXZP80@EAAXXZ@Z" + + AutoConstMemberFuncPtrT(&StructD::test); + // CHECK: call {{.*}} @"??$AutoConstMemberFuncPtrT@P8StructD@@EAA?BHXZ@@YAP8StructD@@EAA?B_PXZP80@EAA?BHXZ@Z" +} + +template +auto * __attribute__((address_space(1))) * AutoPtrAddressSpaceT() { + T * __attribute__((address_space(1))) * p = nullptr; + return p; +} + +void test_template_auto_address_space_ptr() { + AutoPtrAddressSpaceT(); + // CHECK: call {{.*}} @"??$AutoPtrAddressSpaceT@H@@YA?A?@@XZ" +} + +template +auto&& AutoReferenceCollapseT(T& x) { return static_cast(x); } + +auto&& AutoReferenceCollapse(int& x) { return static_cast(x); } + +void test2() { + int x = 1; + auto&& rref0 = AutoReferenceCollapseT(x); + // CHECK: call {{.*}} @"??$AutoReferenceCollapseT@H@@YA$$QEA_PAEAH@Z" + + auto&& rref1 = AutoReferenceCollapse(x); + // CHECK: call {{.*}} @"?AutoReferenceCollapse@@YA@AEAH@Z" +} diff --git a/clang/test/CodeGenCXX/mangle-ms-auto-templates-memptrs.cpp b/clang/test/CodeGenCXX/mangle-ms-auto-templates-memptrs.cpp index 360ebdecc5562b..b7bc3953f0b438 100644 --- a/clang/test/CodeGenCXX/mangle-ms-auto-templates-memptrs.cpp +++ b/clang/test/CodeGenCXX/mangle-ms-auto-templates-memptrs.cpp @@ -34,15 +34,15 @@ void template_mangling() { // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$I?f@V@@QEAAXXZA@A@@@QEAA@XZ" AutoFunc<&S::f>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MP8S@@EAAXXZ1?f@1@QEAAXXZ@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MP8S@@EAAXXZ1?f@1@QEAAXXZ@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$1?f@S@@QEAAXXZ@@YA?A?@@XZ" AutoFunc<&M::f>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MP8M@@EAAXXZH?f@1@QEAAXXZA@@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MP8M@@EAAXXZH?f@1@QEAAXXZA@@@YA?A_PXZ" // BEFORE: call {{.*}} 
@"??$AutoFunc@$H?f@M@@QEAAXXZA@@@YA?A?@@XZ" AutoFunc<&V::f>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MP8V@@EAAXXZI?f@1@QEAAXXZA@A@@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MP8V@@EAAXXZI?f@1@QEAAXXZA@A@@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$I?f@V@@QEAAXXZA@A@@@YA?A?@@XZ" AutoParmTemplate<&S::a> auto_data_single_inheritance; @@ -58,14 +58,14 @@ void template_mangling() { // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$FBA@A@@@QEAA@XZ" AutoFunc<&S::a>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MPEQS@@H07@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MPEQS@@H07@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$07@@YA?A?@@XZ" AutoFunc<&M::a>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MPEQM@@H0M@@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MPEQM@@H0M@@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$0M@@@YA?A?@@XZ" AutoFunc<&V::a>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MPEQV@@HFBA@A@@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MPEQV@@HFBA@A@@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$FBA@A@@@YA?A?@@XZ" } diff --git a/clang/test/CodeGenCXX/mangle-ms-auto-templates-nullptr.cpp b/clang/test/CodeGenCXX/mangle-ms-auto-templates-nullptr.cpp index 8f98c1e59f73d7..251d9219c01ce2 100644 --- a/clang/test/CodeGenCXX/mangle-ms-auto-templates-nullptr.cpp +++ b/clang/test/CodeGenCXX/mangle-ms-auto-templates-nullptr.cpp @@ -19,6 +19,6 @@ void template_mangling() { // BEFORE: call {{.*}} @"??0?$AutoParmTemplate@$0A@@@QEAA@XZ" AutoFunc(); - // AFTER: call {{.*}} @"??$AutoFunc@$M$$T0A@@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$M$$T0A@@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$0A@@@YA?A?@@XZ" } diff --git a/clang/test/CodeGenCXX/mangle-ms-auto-templates.cpp b/clang/test/CodeGenCXX/mangle-ms-auto-templates.cpp index ff5395cea75eb7..effcc31ee31103 100644 --- a/clang/test/CodeGenCXX/mangle-ms-auto-templates.cpp +++ b/clang/test/CodeGenCXX/mangle-ms-auto-templates.cpp @@ -26,7 +26,7 @@ int j; void template_mangling() { 
AutoFunc<1>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MH00@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MH00@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$00@@YA?A?@@XZ" AutoParmTemplate<0> auto_int; // AFTER: call {{.*}} @"??0?$AutoParmTemplate@$MH0A@@@QEAA@XZ" @@ -52,7 +52,7 @@ void template_mangling() { // BEFORE: call {{.*}} @"??0?$AutoParmsTemplate@$00$0HPPPPPPPPPPPPPPP@@@QEAA@XZ" AutoFunc<&i>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MPEAH1?i@@3HA@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MPEAH1?i@@3HA@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$1?i@@3HA@@YA?A?@@XZ" AutoParmTemplate<&i> auto_int_ptr; @@ -64,7 +64,7 @@ void template_mangling() { // BEFORE: call {{.*}} @"??0?$AutoParmsTemplate@$1?i@@3HA$1?j@@3HA@@QEAA@XZ" AutoFunc<&Func>(); - // AFTER: call {{.*}} @"??$AutoFunc@$MP6AHXZ1?Func@@YAHXZ@@YA?A?@@XZ" + // AFTER: call {{.*}} @"??$AutoFunc@$MP6AHXZ1?Func@@YAHXZ@@YA?A_PXZ" // BEFORE: call {{.*}} @"??$AutoFunc@$1?Func@@YAHXZ@@YA?A?@@XZ" AutoParmTemplate<&Func> auto_func_ptr; From 77fccb35ac08f66d52bb152735e27572bf9f3f93 Mon Sep 17 00:00:00 2001 From: hanbeom Date: Sun, 25 Aug 2024 04:30:40 +0900 Subject: [PATCH 416/426] [AArch64] Replace AND with LSL#2 for LDR target (#34101) (#89531) Currently, process of replacing bitwise operations consisting of `LSR`/`LSL` with `And` is performed by `DAGCombiner`. However, in certain cases, the `AND` generated by this process can be removed. Consider following case: ``` lsr x8, x8, #56 and x8, x8, #0xfc ldr w0, [x2, x8] ret ``` In this case, we can remove the `AND` by changing the target of `LDR` to `[X2, X8, LSL #2]` and right-shifting amount change to 56 to 58. after changed: ``` lsr x8, x8, #58 ldr w0, [x2, x8, lsl #2] ret ``` This patch checks to see if the `SHIFTING` + `AND` operation on load target can be optimized and optimizes it if it can. 
--- .../Target/AArch64/AArch64ISelLowering.cpp | 17 +++ llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll | 138 ++++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8c2f85657ff87e..5ac5b7f8a5ab18 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18023,6 +18023,23 @@ bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask( return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue()); } + // We do not need to fold when this shifting used in specific load case: + // (ldr x, (add x, (shl (srl x, c1) 2))) + if (N->getOpcode() == ISD::SHL && N->hasOneUse()) { + if (auto C2 = dyn_cast(N->getOperand(1))) { + unsigned ShlAmt = C2->getZExtValue(); + if (auto ShouldADD = *N->use_begin(); + ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) { + if (auto ShouldLOAD = dyn_cast(*ShouldADD->use_begin())) { + unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8; + if ((1ULL << ShlAmt) == ByteVT && + isIndexedLoadLegal(ISD::PRE_INC, ShouldLOAD->getMemoryVT())) + return false; + } + } + } + } + return true; } diff --git a/llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll b/llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll new file mode 100644 index 00000000000000..9dfc8df703ce64 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=aarch64 | FileCheck %s +; + +define i16 @load16_shr63(i64 %a, i64 %b, ptr %table) { +; CHECK-LABEL: load16_shr63: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul x8, x1, x0 +; CHECK-NEXT: lsr x8, x8, #63 +; CHECK-NEXT: ldrh w0, [x2, x8, lsl #1] +; CHECK-NEXT: ret +entry: + %mul = mul i64 %b, %a + %shr = lshr i64 %mul, 63 + %arrayidx = 
getelementptr inbounds i16, ptr %table, i64 %shr + %0 = load i16, ptr %arrayidx, align 2 + ret i16 %0 +} + +define i16 @load16_shr2(i64 %a, i64 %b, ptr %table) { +; CHECK-LABEL: load16_shr2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul x8, x1, x0 +; CHECK-NEXT: lsr x8, x8, #2 +; CHECK-NEXT: ldrh w0, [x2, x8, lsl #1] +; CHECK-NEXT: ret +entry: + %mul = mul i64 %b, %a + %shr = lshr i64 %mul, 2 + %arrayidx = getelementptr inbounds i16, ptr %table, i64 %shr + %0 = load i16, ptr %arrayidx, align 2 + ret i16 %0 +} + +define i16 @load16_shr1(i64 %a, i64 %b, ptr %table) { +; CHECK-LABEL: load16_shr1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul x8, x1, x0 +; CHECK-NEXT: lsr x8, x8, #1 +; CHECK-NEXT: ldrh w0, [x2, x8, lsl #1] +; CHECK-NEXT: ret +entry: + %mul = mul i64 %b, %a + %shr = lshr i64 %mul, 1 + %arrayidx = getelementptr inbounds i16, ptr %table, i64 %shr + %0 = load i16, ptr %arrayidx, align 2 + ret i16 %0 +} + +define i32 @load32_shr63(i64 %a, i64 %b, ptr %table) { +; CHECK-LABEL: load32_shr63: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul x8, x1, x0 +; CHECK-NEXT: lsr x8, x8, #63 +; CHECK-NEXT: ldr w0, [x2, x8, lsl #2] +; CHECK-NEXT: ret +entry: + %mul = mul i64 %b, %a + %shr = lshr i64 %mul, 63 + %arrayidx = getelementptr inbounds i32, ptr %table, i64 %shr + %0 = load i32, ptr %arrayidx, align 4 + ret i32 %0 +} + +define i32 @load32_shr2(i64 %a, i64 %b, ptr %table) { +; CHECK-LABEL: load32_shr2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul x8, x1, x0 +; CHECK-NEXT: lsr x8, x8, #2 +; CHECK-NEXT: ldr w0, [x2, x8, lsl #2] +; CHECK-NEXT: ret +entry: + %mul = mul i64 %b, %a + %shr = lshr i64 %mul, 2 + %arrayidx = getelementptr inbounds i32, ptr %table, i64 %shr + %0 = load i32, ptr %arrayidx, align 4 + ret i32 %0 +} + +define i32 @load32_shr1(i64 %a, i64 %b, ptr %table) { +; CHECK-LABEL: load32_shr1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul x8, x1, x0 +; CHECK-NEXT: lsr x8, x8, #1 +; CHECK-NEXT: ldr w0, [x2, x8, lsl #2] +; CHECK-NEXT: ret 
+entry: + %mul = mul i64 %b, %a + %shr = lshr i64 %mul, 1 + %arrayidx = getelementptr inbounds i32, ptr %table, i64 %shr + %0 = load i32, ptr %arrayidx, align 4 + ret i32 %0 +} + +define i64 @load64_shr63(i64 %a, i64 %b, ptr %table) { +; CHECK-LABEL: load64_shr63: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul x8, x1, x0 +; CHECK-NEXT: lsr x8, x8, #63 +; CHECK-NEXT: ldr x0, [x2, x8, lsl #3] +; CHECK-NEXT: ret +entry: + %mul = mul i64 %b, %a + %shr = lshr i64 %mul, 63 + %arrayidx = getelementptr inbounds i64, ptr %table, i64 %shr + %0 = load i64, ptr %arrayidx, align 8 + ret i64 %0 +} + +define i64 @load64_shr2(i64 %a, i64 %b, ptr %table) { +; CHECK-LABEL: load64_shr2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul x8, x1, x0 +; CHECK-NEXT: lsr x8, x8, #2 +; CHECK-NEXT: ldr x0, [x2, x8, lsl #3] +; CHECK-NEXT: ret +entry: + %mul = mul i64 %b, %a + %shr = lshr i64 %mul, 2 + %arrayidx = getelementptr inbounds i64, ptr %table, i64 %shr + %0 = load i64, ptr %arrayidx, align 8 + ret i64 %0 +} + +define i64 @load64_shr1(i64 %a, i64 %b, ptr %table) { +; CHECK-LABEL: load64_shr1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul x8, x1, x0 +; CHECK-NEXT: lsr x8, x8, #1 +; CHECK-NEXT: ldr x0, [x2, x8, lsl #3] +; CHECK-NEXT: ret +entry: + %mul = mul i64 %b, %a + %shr = lshr i64 %mul, 1 + %arrayidx = getelementptr inbounds i64, ptr %table, i64 %shr + %0 = load i64, ptr %arrayidx, align 8 + ret i64 %0 +} From b9a02765504f8b83701ffffc097531638c4fc22e Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 24 Aug 2024 21:21:27 +0100 Subject: [PATCH 417/426] [ARM] Add VECTOR_REG_CAST identity fold. v16i8 VECTOR_REG_CAST (v16i8 Op) can use v16i8 Op directly, as the VECTOR_REG_CAST is a noop. 
--- llvm/lib/Target/ARM/ARMISelLowering.cpp | 3 +++ llvm/test/CodeGen/Thumb2/mve-be.ll | 7 +++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 1e8bb8a495e68b..4ab0433069ae66 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -15444,6 +15444,9 @@ static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, if (ST->isLittle()) return DAG.getNode(ISD::BITCAST, dl, VT, Op); + // VT VECTOR_REG_CAST (VT Op) -> Op + if (Op.getValueType() == VT) + return Op; // VECTOR_REG_CAST undef -> undef if (Op.isUndef()) return DAG.getUNDEF(VT); diff --git a/llvm/test/CodeGen/Thumb2/mve-be.ll b/llvm/test/CodeGen/Thumb2/mve-be.ll index 522d6f8704b6af..2f2ecc76472374 100644 --- a/llvm/test/CodeGen/Thumb2/mve-be.ll +++ b/llvm/test/CodeGen/Thumb2/mve-be.ll @@ -278,10 +278,9 @@ define arm_aapcs_vfpcc <4 x i32> @test(ptr %data) { ; ; CHECK-BE-LABEL: test: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: movs r1, #1 -; CHECK-BE-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-BE-NEXT: vdup.32 q0, r1 -; CHECK-BE-NEXT: vadd.i32 q0, q1, q0 +; CHECK-BE-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-BE-NEXT: movs r0, #1 +; CHECK-BE-NEXT: vadd.i32 q0, q0, r0 ; CHECK-BE-NEXT: vrev32.8 q0, q0 ; CHECK-BE-NEXT: @APP ; CHECK-BE-NEXT: vmullb.s32 q1, q0, q0 From a6f87abf73be02be0b7c50083b3d93ac81a80c29 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 24 Aug 2024 16:20:38 -0700 Subject: [PATCH 418/426] [Mips] Remove a trivial variable (NFC) (#105940) We assign I->getNumOperands() to J and immediately print that out as a debug message. We don't need to keep J across iterations. 
--- llvm/lib/Target/Mips/MipsConstantIslandPass.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp index 0341af0caac46e..60bb10369df4fa 100644 --- a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -1630,8 +1630,6 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) { } void MipsConstantIslands::prescanForConstants() { - unsigned J = 0; - (void)J; for (MachineBasicBlock &B : *MF) { for (MachineBasicBlock::instr_iterator I = B.instr_begin(), EB = B.instr_end(); @@ -1640,8 +1638,7 @@ void MipsConstantIslands::prescanForConstants() { case Mips::LwConstant32: { PrescannedForConstants = true; LLVM_DEBUG(dbgs() << "constant island constant " << *I << "\n"); - J = I->getNumOperands(); - LLVM_DEBUG(dbgs() << "num operands " << J << "\n"); + LLVM_DEBUG(dbgs() << "num operands " << I->getNumOperands() << "\n"); MachineOperand& Literal = I->getOperand(1); if (Literal.isImm()) { int64_t V = Literal.getImm(); From 3ef64f7ab5b8651eab500cd944984379fce5f639 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Sun, 25 Aug 2024 08:17:48 +0900 Subject: [PATCH 419/426] Revert "Enable logf128 constant folding for hosts with 128bit long double (#104929)" ConstantFolding behaves differently depending on host's `HAS_IEE754_FLOAT128`. LLVM should not change the behavior depending on host configurations. This reverts commit 14c7e4a1844904f3db9b2dc93b722925a8c66b27. 
(llvmorg-20-init-3262-g14c7e4a18449 and llvmorg-20-init-3498-g001e423ac626) --- llvm/CMakeLists.txt | 2 ++ llvm/cmake/config-ix.cmake | 18 +++++++++------- llvm/include/llvm/ADT/APFloat.h | 15 +++++++++++--- llvm/include/llvm/ADT/APInt.h | 8 +++++++ llvm/include/llvm/Support/float128.h | 14 +++++++------ llvm/lib/Analysis/CMakeLists.txt | 6 ++++++ llvm/lib/Analysis/ConstantFolding.cpp | 30 ++++++--------------------- llvm/lib/Support/APFloat.cpp | 24 +++++++++++++++++++-- 8 files changed, 75 insertions(+), 42 deletions(-) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index b03d89a43c34b0..d681b1ccab6299 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -560,6 +560,8 @@ set(LLVM_USE_STATIC_ZSTD FALSE CACHE BOOL "Use static version of zstd. Can be TR set(LLVM_ENABLE_CURL "OFF" CACHE STRING "Use libcurl for the HTTP client if available. Can be ON, OFF, or FORCE_ON") +set(LLVM_HAS_LOGF128 "OFF" CACHE STRING "Use logf128 to constant fold fp128 logarithm calls. Can be ON, OFF, or FORCE_ON") + set(LLVM_ENABLE_HTTPLIB "OFF" CACHE STRING "Use cpp-httplib HTTP server library if available. 
Can be ON, OFF, or FORCE_ON") set(LLVM_Z3_INSTALL_DIR "" CACHE STRING "Install directory of the Z3 solver.") diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index e7ed839ad68101..f76eacb9d51366 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -246,6 +246,17 @@ else() set(HAVE_LIBEDIT 0) endif() +if(LLVM_HAS_LOGF128) + include(CheckCXXSymbolExists) + check_cxx_symbol_exists(logf128 math.h HAS_LOGF128) + + if(LLVM_HAS_LOGF128 STREQUAL FORCE_ON AND NOT HAS_LOGF128) + message(FATAL_ERROR "Failed to configure logf128") + endif() + + set(LLVM_HAS_LOGF128 "${HAS_LOGF128}") +endif() + # function checks check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM) find_package(Backtrace) @@ -259,13 +270,6 @@ if(C_SUPPORTS_WERROR_UNGUARDED_AVAILABILITY_NEW) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror=unguarded-availability-new") endif() -check_cxx_symbol_exists(logf128 cmath HAS_LOGF128) -check_symbol_exists(__powerpc__ "" __PPC64LE) -if(HAS_LOGF128 AND NOT __PPC64LE) - set(LLVM_HAS_LOGF128 On) - add_compile_definitions(HAS_LOGF128) -endif() - # Determine whether we can register EH tables. 
check_symbol_exists(__register_frame "${CMAKE_CURRENT_LIST_DIR}/unwind.h" HAVE_REGISTER_FRAME) check_symbol_exists(__deregister_frame "${CMAKE_CURRENT_LIST_DIR}/unwind.h" HAVE_DEREGISTER_FRAME) diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index 925d03d4c06670..7039e961bff82d 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -19,6 +19,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/FloatingPointMode.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/float128.h" #include #define APFLOAT_DISPATCH_ON_SEMANTICS(METHOD_CALL) \ @@ -377,6 +378,9 @@ class IEEEFloat final : public APFloatBase { Expected convertFromString(StringRef, roundingMode); APInt bitcastToAPInt() const; double convertToDouble() const; +#ifdef HAS_IEE754_FLOAT128 + float128 convertToQuad() const; +#endif float convertToFloat() const; /// @} @@ -1270,9 +1274,14 @@ class APFloat : public APFloatBase { /// shorter semantics, like IEEEsingle and others. double convertToDouble() const; - /// Return true if this APFloat has quadruple precision floating point - /// semantics - bool isValidIEEEQuad() const; + /// Converts this APFloat to host float value. + /// + /// \pre The APFloat must be built using semantics, that can be represented by + /// the host float type without loss of precision. It can be IEEEquad and + /// shorter semantics, like IEEEdouble and others. +#ifdef HAS_IEE754_FLOAT128 + float128 convertToQuad() const; +#endif /// Converts this APFloat to host float value. /// diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 13837413ae49fe..65ba3f15305c78 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -17,6 +17,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/float128.h" #include #include #include @@ -1676,6 +1677,13 @@ class [[nodiscard]] APInt { /// any bit width. 
Exactly 64 bits will be translated. double bitsToDouble() const { return llvm::bit_cast(getWord(0)); } +#ifdef HAS_IEE754_FLOAT128 + float128 bitsToQuad() const { + __uint128_t ul = ((__uint128_t)U.pVal[1] << 64) + U.pVal[0]; + return llvm::bit_cast(ul); + } +#endif + /// Converts APInt bits to a float /// /// The conversion does not do a translation from integer to float, it just diff --git a/llvm/include/llvm/Support/float128.h b/llvm/include/llvm/Support/float128.h index 618b320086ba59..e15a98dc5a6779 100644 --- a/llvm/include/llvm/Support/float128.h +++ b/llvm/include/llvm/Support/float128.h @@ -9,16 +9,18 @@ #ifndef LLVM_FLOAT128 #define LLVM_FLOAT128 -#include - namespace llvm { -#ifdef HAS_LOGF128 -#if !defined(__LONG_DOUBLE_IBM128__) && (__SIZEOF_INT128__ == 16) -typedef decltype(logf128(0.)) float128; +#if defined(__clang__) && defined(__FLOAT128__) && \ + defined(__SIZEOF_INT128__) && !defined(__LONG_DOUBLE_IBM128__) +#define HAS_IEE754_FLOAT128 +typedef __float128 float128; +#elif defined(__FLOAT128__) && defined(__SIZEOF_INT128__) && \ + !defined(__LONG_DOUBLE_IBM128__) && \ + (defined(__GNUC__) || defined(__GNUG__)) #define HAS_IEE754_FLOAT128 +typedef _Float128 float128; #endif -#endif // HAS_LOGF128 } // namespace llvm #endif // LLVM_FLOAT128 diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index 3127f45cc54cb1..393803fad89383 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -162,3 +162,9 @@ add_llvm_component_library(LLVMAnalysis Support TargetParser ) + +include(CheckCXXSymbolExists) +check_cxx_symbol_exists(logf128 math.h HAS_LOGF128) +if(HAS_LOGF128) + target_compile_definitions(LLVMAnalysis PRIVATE HAS_LOGF128) +endif() diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 26d9304cb73672..a7a6de3f3b97b0 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -54,7 +54,6 @@ #include 
"llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/float128.h" #include #include #include @@ -1742,7 +1741,7 @@ Constant *GetConstantFoldFPValue(double V, Type *Ty) { llvm_unreachable("Can only constant fold half/float/double"); } -#if defined(HAS_IEE754_FLOAT128) +#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128) Constant *GetConstantFoldFPValue128(float128 V, Type *Ty) { if (Ty->isFP128Ty()) return ConstantFP::get(Ty, V); @@ -1782,25 +1781,11 @@ Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V, return GetConstantFoldFPValue(Result, Ty); } -#if defined(HAS_IEE754_FLOAT128) -float128 ConvertToQuad(const APFloat &Apf) { - APInt Api = Apf.bitcastToAPInt(); - __uint128_t Uint128 = - ((__uint128_t)Api.extractBitsAsZExtValue(64, 64) << 64) + - Api.extractBitsAsZExtValue(64, 0); - return llvm::bit_cast(Uint128); -} -#endif - -#if defined(HAS_IEE754_FLOAT128) +#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128) Constant *ConstantFoldFP128(float128 (*NativeFP)(float128), const APFloat &V, Type *Ty) { llvm_fenv_clearexcept(); - if (!V.isValidIEEEQuad()) - return nullptr; - - float128 Result = NativeFP(ConvertToQuad(V)); - + float128 Result = NativeFP(V.convertToQuad()); if (llvm_fenv_testexcept()) { llvm_fenv_clearexcept(); return nullptr; @@ -2129,16 +2114,13 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, if (IntrinsicID == Intrinsic::canonicalize) return constantFoldCanonicalize(Ty, Call, U); -#if defined(HAS_IEE754_FLOAT128) +#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128) if (Ty->isFP128Ty()) { if (IntrinsicID == Intrinsic::log) { - APFloat Value = Op->getValueAPF(); - if (!Value.isValidIEEEQuad()) - return nullptr; - - float128 Result = logf128(ConvertToQuad(Value)); + float128 Result = logf128(Op->getValueAPF().convertToQuad()); return GetConstantFoldFPValue128(Result, Ty); } + LibFunc Fp128Func = NotLibFunc; if (TLI && 
TLI->getLibFunc(Name, Fp128Func) && TLI->has(Fp128Func) && Fp128Func == LibFunc_logl) diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 2ddf99f56f88d5..7f68c5ab9b7cf7 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -3749,6 +3749,15 @@ double IEEEFloat::convertToDouble() const { return api.bitsToDouble(); } +#ifdef HAS_IEE754_FLOAT128 +float128 IEEEFloat::convertToQuad() const { + assert(semantics == (const llvm::fltSemantics *)&semIEEEquad && + "Float semantics are not IEEEquads"); + APInt api = bitcastToAPInt(); + return api.bitsToQuad(); +} +#endif + /// Integer bit is explicit in this format. Intel hardware (387 and later) /// does not support these bit patterns: /// exponent = all 1's, integer bit 0, significand 0 ("pseudoinfinity") @@ -5397,9 +5406,20 @@ double APFloat::convertToDouble() const { return Temp.getIEEE().convertToDouble(); } -bool APFloat::isValidIEEEQuad() const { - return (&getSemantics() == (const llvm::fltSemantics *)&semIEEEquad); +#ifdef HAS_IEE754_FLOAT128 +float128 APFloat::convertToQuad() const { + if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEquad) + return getIEEE().convertToQuad(); + assert(getSemantics().isRepresentableBy(semIEEEquad) && + "Float semantics is not representable by IEEEquad"); + APFloat Temp = *this; + bool LosesInfo; + opStatus St = Temp.convert(semIEEEquad, rmNearestTiesToEven, &LosesInfo); + assert(!(St & opInexact) && !LosesInfo && "Unexpected imprecision"); + (void)St; + return Temp.getIEEE().convertToQuad(); } +#endif float APFloat::convertToFloat() const { if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEsingle) From 6bc225e0630f28e83290a43c3d9b25b057fc815a Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 24 Aug 2024 19:12:15 -0700 Subject: [PATCH 420/426] [clang-format] Fix a misannotation of redundant r_paren as CastRParen (#105921) Fixes #105880. 
--- clang/lib/Format/TokenAnnotator.cpp | 2 ++ clang/unittests/Format/TokenAnnotatorTest.cpp | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index f8bf8d9570d9a8..7c35171ab35232 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -2875,6 +2875,8 @@ class AnnotatingParser { // Search for unexpected tokens. for (auto *Prev = BeforeRParen; Prev != LParen; Prev = Prev->Previous) { if (Prev->is(tok::r_paren)) { + if (Prev->is(TT_CastRParen)) + return false; Prev = Prev->MatchingParen; if (!Prev) return false; diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 99798de43e70ff..834430fa931129 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -747,6 +747,12 @@ TEST_F(TokenAnnotatorTest, UnderstandsCasts) { EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_CastRParen); EXPECT_TOKEN(Tokens[10], tok::amp, TT_UnaryOperator); + Tokens = annotate("int result = ((int)a) - b;"); + ASSERT_EQ(Tokens.size(), 13u) << Tokens; + EXPECT_TOKEN(Tokens[6], tok::r_paren, TT_CastRParen); + EXPECT_TOKEN(Tokens[8], tok::r_paren, TT_Unknown); + EXPECT_TOKEN(Tokens[9], tok::minus, TT_BinaryOperator); + auto Style = getLLVMStyle(); Style.TypeNames.push_back("Foo"); Tokens = annotate("#define FOO(bar) foo((Foo)&bar)", Style); From 0916ae49b89db6eb9eee9f6fee4f1a65fd9cdb74 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 24 Aug 2024 20:10:03 -0700 Subject: [PATCH 421/426] [clang-format] Fix a misannotation of less/greater as angle brackets (#105941) Fixes #105877. 
--- clang/lib/Format/TokenAnnotator.cpp | 2 +- clang/unittests/Format/TokenAnnotatorTest.cpp | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 7c35171ab35232..f15330098a2395 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -250,7 +250,7 @@ class AnnotatingParser { if (Precedence > prec::Conditional && Precedence < prec::Relational) return false; } - if (Prev.is(TT_ConditionalExpr)) + if (Prev.isOneOf(tok::question, tok::colon) && !Style.isProto()) SeenTernaryOperator = true; updateParameterCount(Left, CurrentToken); if (Style.Language == FormatStyle::LK_Proto) { diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 834430fa931129..db44d418a84484 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -620,6 +620,11 @@ TEST_F(TokenAnnotatorTest, UnderstandsNonTemplateAngleBrackets) { EXPECT_TOKEN(Tokens[2], tok::less, TT_BinaryOperator); EXPECT_TOKEN(Tokens[8], tok::greater, TT_BinaryOperator); + Tokens = annotate("return checklower ? 
a < b : a > b;"); + ASSERT_EQ(Tokens.size(), 12u) << Tokens; + EXPECT_TOKEN(Tokens[4], tok::less, TT_BinaryOperator); + EXPECT_TOKEN(Tokens[8], tok::greater, TT_BinaryOperator); + Tokens = annotate("return A < B ^ A > B;"); ASSERT_EQ(Tokens.size(), 10u) << Tokens; EXPECT_TOKEN(Tokens[2], tok::less, TT_BinaryOperator); From 5c94dd73b2df1f6b469e858ff29055ac117e8494 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Sun, 25 Aug 2024 14:01:32 +0800 Subject: [PATCH 422/426] [X86][AMX] Avoid to construct invalid shape for checking, NFCI (#105973) --- llvm/include/llvm/CodeGen/VirtRegMap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/CodeGen/VirtRegMap.h b/llvm/include/llvm/CodeGen/VirtRegMap.h index 42e8d294a637ad..864eb23e133ebd 100644 --- a/llvm/include/llvm/CodeGen/VirtRegMap.h +++ b/llvm/include/llvm/CodeGen/VirtRegMap.h @@ -114,7 +114,7 @@ class TargetInstrInfo; bool isShapeMapEmpty() const { return Virt2ShapeMap.empty(); } bool hasShape(Register virtReg) const { - return getShape(virtReg).isValid(); + return Virt2ShapeMap.contains(virtReg); } ShapeT getShape(Register virtReg) const { From 579fd59ab920a3a5723393727f94716706f2cea2 Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Sun, 25 Aug 2024 14:40:03 +0800 Subject: [PATCH 423/426] [RISCV][ISel] Move VCIX ISDs to correct position. NFC (#105934) Current VCIX ISDs are placed after FIRST_TARGET_STRICTFP_OPCODE which is not expected, it should be in normal OPCODE area. --- llvm/lib/Target/RISCV/RISCVISelLowering.h | 52 +++++++++++------------ 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 2298998b47357d..1b91ab43a4637f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -415,32 +415,6 @@ enum NodeType : unsigned { /// operand 1 is the target address. SW_GUARDED_BRIND, - // FP to 32 bit int conversions for RV64. 
These are used to keep track of the - // result being sign extended to 64 bit. These saturate out of range inputs. - STRICT_FCVT_W_RV64 = ISD::FIRST_TARGET_STRICTFP_OPCODE, - STRICT_FCVT_WU_RV64, - STRICT_FADD_VL, - STRICT_FSUB_VL, - STRICT_FMUL_VL, - STRICT_FDIV_VL, - STRICT_FSQRT_VL, - STRICT_VFMADD_VL, - STRICT_VFNMADD_VL, - STRICT_VFMSUB_VL, - STRICT_VFNMSUB_VL, - STRICT_FP_ROUND_VL, - STRICT_FP_EXTEND_VL, - STRICT_VFNCVT_ROD_VL, - STRICT_SINT_TO_FP_VL, - STRICT_UINT_TO_FP_VL, - STRICT_VFCVT_RM_X_F_VL, - STRICT_VFCVT_RTZ_X_F_VL, - STRICT_VFCVT_RTZ_XU_F_VL, - STRICT_FSETCC_VL, - STRICT_FSETCCS_VL, - STRICT_VFROUND_NOEXCEPT_VL, - LAST_RISCV_STRICTFP_OPCODE = STRICT_VFROUND_NOEXCEPT_VL, - SF_VC_XV_SE, SF_VC_IV_SE, SF_VC_VV_SE, @@ -468,6 +442,32 @@ enum NodeType : unsigned { SF_VC_V_VVW_SE, SF_VC_V_FVW_SE, + // FP to 32 bit int conversions for RV64. These are used to keep track of the + // result being sign extended to 64 bit. These saturate out of range inputs. + STRICT_FCVT_W_RV64 = ISD::FIRST_TARGET_STRICTFP_OPCODE, + STRICT_FCVT_WU_RV64, + STRICT_FADD_VL, + STRICT_FSUB_VL, + STRICT_FMUL_VL, + STRICT_FDIV_VL, + STRICT_FSQRT_VL, + STRICT_VFMADD_VL, + STRICT_VFNMADD_VL, + STRICT_VFMSUB_VL, + STRICT_VFNMSUB_VL, + STRICT_FP_ROUND_VL, + STRICT_FP_EXTEND_VL, + STRICT_VFNCVT_ROD_VL, + STRICT_SINT_TO_FP_VL, + STRICT_UINT_TO_FP_VL, + STRICT_VFCVT_RM_X_F_VL, + STRICT_VFCVT_RTZ_X_F_VL, + STRICT_VFCVT_RTZ_XU_F_VL, + STRICT_FSETCC_VL, + STRICT_FSETCCS_VL, + STRICT_VFROUND_NOEXCEPT_VL, + LAST_RISCV_STRICTFP_OPCODE = STRICT_VFROUND_NOEXCEPT_VL, + // WARNING: Do not add anything in the end unless you want the node to // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all // opcodes will be thought as target memory ops! From f22b1da8791edd557ce34c87190e329df2e1c892 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 24 Aug 2024 23:37:37 -0700 Subject: [PATCH 424/426] [CodeGen] Replace MCPhysReg with MCRegister in MachineBasicBlock::isLiveIn/removeLiveIn. 
NFC We already used it for addLiveIn. --- llvm/include/llvm/CodeGen/MachineBasicBlock.h | 4 ++-- llvm/lib/CodeGen/MachineBasicBlock.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index 2d238326ee1a30..6efb17c55493a9 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -478,11 +478,11 @@ class MachineBasicBlock Register addLiveIn(MCRegister PhysReg, const TargetRegisterClass *RC); /// Remove the specified register from the live in set. - void removeLiveIn(MCPhysReg Reg, + void removeLiveIn(MCRegister Reg, LaneBitmask LaneMask = LaneBitmask::getAll()); /// Return true if the specified register is in the live in set. - bool isLiveIn(MCPhysReg Reg, + bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask = LaneBitmask::getAll()) const; // Iteration support for live in sets. These sets are kept in sorted diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index d681d00b5d8c4d..5d06af3ebf3360 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -595,7 +595,7 @@ void MachineBasicBlock::printAsOperand(raw_ostream &OS, printName(OS, 0); } -void MachineBasicBlock::removeLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) { +void MachineBasicBlock::removeLiveIn(MCRegister Reg, LaneBitmask LaneMask) { LiveInVector::iterator I = find_if( LiveIns, [Reg](const RegisterMaskPair &LI) { return LI.PhysReg == Reg; }); if (I == LiveIns.end()) @@ -613,7 +613,7 @@ MachineBasicBlock::removeLiveIn(MachineBasicBlock::livein_iterator I) { return LiveIns.erase(LI); } -bool MachineBasicBlock::isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) const { +bool MachineBasicBlock::isLiveIn(MCRegister Reg, LaneBitmask LaneMask) const { livein_iterator I = find_if( LiveIns, [Reg](const RegisterMaskPair &LI) { return LI.PhysReg == Reg; }); return I != 
livein_end() && (I->LaneMask & LaneMask).any(); From 2847020dbd9b8f932ee564651ec72ce15fa37d07 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Sun, 25 Aug 2024 10:03:16 +0100 Subject: [PATCH 425/426] [lldb][TypeSystemClang][NFC] Log failure to InitBuiltinTypes If we fail to initialize the ASTContext builtins, LLDB may crash in non-obvious ways down-the-line, e.g., when it tries to call `ASTContext::getTypeSize` on a builtin like `ast.UnsignedCharTy`, which would dereference a `null` `QualType`. The initialization can fail if we either didn't set the `TypeSystemClang` target triple, or if the embedded clang isn't enabled for a certain target. This patch attempts to help pin-point the failure case post-mortem by adding a log message here that prints the triple. rdar://134260837 --- lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 8f646803848096..695801da9da69a 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -697,6 +697,10 @@ void TypeSystemClang::CreateASTContext() { TargetInfo *target_info = getTargetInfo(); if (target_info) m_ast_up->InitBuiltinTypes(*target_info); + else if (auto *log = GetLog(LLDBLog::Expressions)) + LLDB_LOG(log, + "Failed to initialize builtin ASTContext types for target '{0}'", + m_target_triple); GetASTMap().Insert(m_ast_up.get(), this); From 51365212362c4d0e32a0c747ab85bbf3919944b8 Mon Sep 17 00:00:00 2001 From: Alexander Shaposhnikov Date: Sun, 25 Aug 2024 10:15:55 +0000 Subject: [PATCH 426/426] Reapply "[compiler-rt][nsan] Add support for nan detection" (#105909) This reverts commit 1f89cd4a1970fee65f5ecb189c4d1a0a376d9bb2.
--- compiler-rt/lib/nsan/nsan.cpp | 34 ++++++++++++++-- compiler-rt/lib/nsan/nsan_flags.inc | 2 + compiler-rt/test/nsan/nan.cpp | 25 ++++++++++++ compiler-rt/test/nsan/softmax.cpp | 54 ++++++++++++++++++++++++++ compiler-rt/test/nsan/vec_sqrt.cpp | 34 ++++++++++++++++ compiler-rt/test/nsan/vec_sqrt_ext.cpp | 25 ++++++++++++ 6 files changed, 170 insertions(+), 4 deletions(-) create mode 100644 compiler-rt/test/nsan/nan.cpp create mode 100644 compiler-rt/test/nsan/softmax.cpp create mode 100644 compiler-rt/test/nsan/vec_sqrt.cpp create mode 100644 compiler-rt/test/nsan/vec_sqrt_ext.cpp diff --git a/compiler-rt/lib/nsan/nsan.cpp b/compiler-rt/lib/nsan/nsan.cpp index bfa55c317cfe79..ce161a18fa8f70 100644 --- a/compiler-rt/lib/nsan/nsan.cpp +++ b/compiler-rt/lib/nsan/nsan.cpp @@ -409,21 +409,21 @@ __nsan_dump_shadow_mem(const u8 *addr, size_t size_bytes, size_t bytes_per_line, } } -alignas(16) SANITIZER_INTERFACE_ATTRIBUTE +alignas(64) SANITIZER_INTERFACE_ATTRIBUTE thread_local uptr __nsan_shadow_ret_tag = 0; -alignas(16) SANITIZER_INTERFACE_ATTRIBUTE +alignas(64) SANITIZER_INTERFACE_ATTRIBUTE thread_local char __nsan_shadow_ret_ptr[kMaxVectorWidth * sizeof(__float128)]; -alignas(16) SANITIZER_INTERFACE_ATTRIBUTE +alignas(64) SANITIZER_INTERFACE_ATTRIBUTE thread_local uptr __nsan_shadow_args_tag = 0; // Maximum number of args. This should be enough for anyone (tm). An alternate // scheme is to have the generated code create an alloca and make // __nsan_shadow_args_ptr point ot the alloca. constexpr const int kMaxNumArgs = 128; -alignas(16) SANITIZER_INTERFACE_ATTRIBUTE +alignas(64) SANITIZER_INTERFACE_ATTRIBUTE thread_local char __nsan_shadow_args_ptr[kMaxVectorWidth * kMaxNumArgs * sizeof(__float128)]; @@ -445,6 +445,32 @@ int32_t checkFT(const FT value, ShadowFT Shadow, CheckTypeT CheckType, const InternalFT check_value = value; const InternalFT check_shadow = Shadow; + // We only check for NaNs in the value, not the shadow. 
+ if (flags().check_nan && isnan(check_value)) { + GET_CALLER_PC_BP; + BufferedStackTrace stack; + stack.Unwind(pc, bp, nullptr, false); + if (GetSuppressionForStack(&stack, CheckKind::Consistency)) { + // FIXME: optionally print. + return flags().resume_after_suppression ? kResumeFromValue + : kContinueWithShadow; + } + Decorator D; + Printf("%s", D.Warning()); + Printf("WARNING: NumericalStabilitySanitizer: NaN detected\n"); + Printf("%s", D.Default()); + stack.Print(); + if (flags().halt_on_error) { + if (common_flags()->abort_on_error) + Printf("ABORTING\n"); + else + Printf("Exiting\n"); + Die(); + } + // Performing other tests for NaN values is meaningless when dealing with numbers. + return kResumeFromValue; + } + // See this article for an interesting discussion of how to compare floats: // https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ static constexpr const FT Eps = FTInfo::kEpsilon; diff --git a/compiler-rt/lib/nsan/nsan_flags.inc b/compiler-rt/lib/nsan/nsan_flags.inc index 658cd5b3b01bf4..7c9e579d91fc33 100644 --- a/compiler-rt/lib/nsan/nsan_flags.inc +++ b/compiler-rt/lib/nsan/nsan_flags.inc @@ -48,3 +48,5 @@ NSAN_FLAG(bool, enable_loadtracking_stats, false, "due to invalid or unknown types.") NSAN_FLAG(bool, poison_in_free, true, "") NSAN_FLAG(bool, print_stats_on_exit, false, "If true, print stats on exit.") +NSAN_FLAG(bool, check_nan, false, + "If true, check the floating-point number is nan") \ No newline at end of file diff --git a/compiler-rt/test/nsan/nan.cpp b/compiler-rt/test/nsan/nan.cpp new file mode 100644 index 00000000000000..59fc391a3e0a6b --- /dev/null +++ b/compiler-rt/test/nsan/nan.cpp @@ -0,0 +1,25 @@ +// RUN: %clangxx_nsan -O0 -g %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s + +// RUN: %clangxx_nsan -O3 -g %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s + +// RUN: %clangxx_nsan -O0 -g %s -o %t +// RUN: 
NSAN_OPTIONS=check_nan=true,halt_on_error=1 not %run %t + +#include +#include + +// This function returns a NaN value for triggering the NaN detection. +__attribute__((noinline)) float ReturnNaN(float p, float q) { + float ret = p / q; + return ret; + // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected +} + +int main() { + float val = ReturnNaN(0., 0.); + printf("%f\n", val); + // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected + return 0; +} diff --git a/compiler-rt/test/nsan/softmax.cpp b/compiler-rt/test/nsan/softmax.cpp new file mode 100644 index 00000000000000..29eaa2f9607a20 --- /dev/null +++ b/compiler-rt/test/nsan/softmax.cpp @@ -0,0 +1,54 @@ +// RUN: %clangxx_nsan -O0 -g -DSOFTMAX=softmax %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0,log2_max_relative_error=19 %run %t 2>&1 | FileCheck %s + +// RUN: %clangxx_nsan -O3 -g -DSOFTMAX=softmax %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0,log2_max_relative_error=19 %run %t 2>&1 | FileCheck %s + +// RUN: %clangxx_nsan -O0 -g -DSOFTMAX=stable_softmax %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=1,log2_max_relative_error=19 %run %t + +// RUN: %clangxx_nsan -O3 -g -DSOFTMAX=stable_softmax %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=1,log2_max_relative_error=19 %run %t + +#include +#include +#include +#include + +// unstable softmax +template +__attribute__((noinline)) void softmax(std::vector &values) { + T sum_exp = 0.0; + for (auto &i: values) { + i = std::exp(i); + sum_exp += i; + } + for (auto &i: values) { + i /= sum_exp; + } +} + +// use max value to avoid overflow +// \sigma_i exp(x_i) / \sum_j exp(x_j) = \sigma_i exp(x_i - max(x)) / \sum_j exp(x_j - max(x)) +template +__attribute__((noinline)) void stable_softmax(std::vector &values) { + T sum_exp = 0.0; + T max_values = *std::max_element(values.begin(), values.end()); + for (auto &i: values) { + i = std::exp(i - max_values); + sum_exp += i; + } + for (auto &i:values) { + i /= 
sum_exp; + } +} + +int main() { + std::vector data = {1000, 1001, 1002}; + SOFTMAX(data); + for (auto i: data) { + printf("%f", i); + // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected + } + return 0; +} \ No newline at end of file diff --git a/compiler-rt/test/nsan/vec_sqrt.cpp b/compiler-rt/test/nsan/vec_sqrt.cpp new file mode 100644 index 00000000000000..d1ef0487858506 --- /dev/null +++ b/compiler-rt/test/nsan/vec_sqrt.cpp @@ -0,0 +1,34 @@ +// RUN: %clangxx_nsan -O0 -g -mavx %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s +// RUN: %clangxx_nsan -O3 -g -mavx %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s + +#include +#include +#include + +void simd_sqrt(const float *input, float *output, size_t size) { + size_t i = 0; + for (; i + 7 < size; i += 8) { + __m256 vec = _mm256_loadu_ps(&input[i]); + __m256 result = _mm256_sqrt_ps(vec); + _mm256_storeu_ps(&output[i], result); + } + for (; i < size; ++i) { + output[i] = std::sqrt(input[i]); + // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected + } +} + +int main() { + float input[] = {1.0, 2.0, -3.0, 4.0, 5.0, 6.0, 7.0, + 8.0, 9.0, -10.0, 11.0, 12.0, 13.0, 14.0, + 15.0, -16.0, 17.0, -18.0, -19.0, -20.0}; + float output[20]; + simd_sqrt(input, output, 20); + for (int i = 0; i < 20; ++i) { + std::cout << output[i] << std::endl; + // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected + } + return 0; +} \ No newline at end of file diff --git a/compiler-rt/test/nsan/vec_sqrt_ext.cpp b/compiler-rt/test/nsan/vec_sqrt_ext.cpp new file mode 100644 index 00000000000000..b39ce4b99bcab6 --- /dev/null +++ b/compiler-rt/test/nsan/vec_sqrt_ext.cpp @@ -0,0 +1,25 @@ +// RUN: %clangxx_nsan -O0 -g -mavx %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s +// RUN: %clangxx_nsan -O3 -g -mavx %s -o %t +// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s +#include 
+#include + +typedef float v8sf __attribute__ ((vector_size(32))); + +v8sf simd_sqrt(v8sf a) { + return __builtin_elementwise_sqrt(a); + // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected +} + +int main() { + v8sf a = {-1.0, -2.0, -3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; + a = simd_sqrt(a); + + // This prevents DCE. + for (size_t i = 0; i < 8; ++i) { + std::cout << a[i] << std::endl; + // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected + } + return 0; +} \ No newline at end of file